{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1914,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001567398119122257,
"grad_norm": 47.04829723887942,
"learning_rate": 0.0,
"loss": 11.4214,
"step": 1
},
{
"epoch": 0.003134796238244514,
"grad_norm": 43.946534217723,
"learning_rate": 2.604166666666667e-07,
"loss": 11.4756,
"step": 2
},
{
"epoch": 0.004702194357366771,
"grad_norm": 43.08546020079388,
"learning_rate": 5.208333333333334e-07,
"loss": 11.5966,
"step": 3
},
{
"epoch": 0.006269592476489028,
"grad_norm": 44.64806373570963,
"learning_rate": 7.8125e-07,
"loss": 11.4776,
"step": 4
},
{
"epoch": 0.007836990595611285,
"grad_norm": 47.85186512634389,
"learning_rate": 1.0416666666666667e-06,
"loss": 11.5148,
"step": 5
},
{
"epoch": 0.009404388714733543,
"grad_norm": 43.4289376617681,
"learning_rate": 1.3020833333333335e-06,
"loss": 11.5386,
"step": 6
},
{
"epoch": 0.0109717868338558,
"grad_norm": 46.443240118413485,
"learning_rate": 1.5625e-06,
"loss": 11.0058,
"step": 7
},
{
"epoch": 0.012539184952978056,
"grad_norm": 48.07484042452407,
"learning_rate": 1.8229166666666669e-06,
"loss": 11.1704,
"step": 8
},
{
"epoch": 0.014106583072100314,
"grad_norm": 48.003085699137905,
"learning_rate": 2.0833333333333334e-06,
"loss": 11.2467,
"step": 9
},
{
"epoch": 0.01567398119122257,
"grad_norm": 70.56128743954308,
"learning_rate": 2.3437500000000002e-06,
"loss": 10.3353,
"step": 10
},
{
"epoch": 0.017241379310344827,
"grad_norm": 75.2140924061546,
"learning_rate": 2.604166666666667e-06,
"loss": 9.9202,
"step": 11
},
{
"epoch": 0.018808777429467086,
"grad_norm": 78.3720693875648,
"learning_rate": 2.8645833333333334e-06,
"loss": 9.9333,
"step": 12
},
{
"epoch": 0.02037617554858934,
"grad_norm": 84.01385239723572,
"learning_rate": 3.125e-06,
"loss": 5.5962,
"step": 13
},
{
"epoch": 0.0219435736677116,
"grad_norm": 70.53790445674674,
"learning_rate": 3.3854166666666665e-06,
"loss": 4.6695,
"step": 14
},
{
"epoch": 0.023510971786833857,
"grad_norm": 53.78136554438082,
"learning_rate": 3.6458333333333337e-06,
"loss": 3.7144,
"step": 15
},
{
"epoch": 0.025078369905956112,
"grad_norm": 45.61652483434186,
"learning_rate": 3.90625e-06,
"loss": 3.4577,
"step": 16
},
{
"epoch": 0.02664576802507837,
"grad_norm": 11.396007252848847,
"learning_rate": 4.166666666666667e-06,
"loss": 2.099,
"step": 17
},
{
"epoch": 0.02821316614420063,
"grad_norm": 6.277397174300771,
"learning_rate": 4.427083333333334e-06,
"loss": 1.8814,
"step": 18
},
{
"epoch": 0.029780564263322883,
"grad_norm": 5.299678043755367,
"learning_rate": 4.6875000000000004e-06,
"loss": 1.7146,
"step": 19
},
{
"epoch": 0.03134796238244514,
"grad_norm": 4.113043933899011,
"learning_rate": 4.947916666666666e-06,
"loss": 1.5247,
"step": 20
},
{
"epoch": 0.032915360501567396,
"grad_norm": 3.5162634253765415,
"learning_rate": 5.208333333333334e-06,
"loss": 1.4683,
"step": 21
},
{
"epoch": 0.034482758620689655,
"grad_norm": 2.9574729072514256,
"learning_rate": 5.46875e-06,
"loss": 1.6099,
"step": 22
},
{
"epoch": 0.03605015673981191,
"grad_norm": 2.2776102495593604,
"learning_rate": 5.729166666666667e-06,
"loss": 1.3839,
"step": 23
},
{
"epoch": 0.03761755485893417,
"grad_norm": 2.1763328839897373,
"learning_rate": 5.9895833333333335e-06,
"loss": 1.4349,
"step": 24
},
{
"epoch": 0.03918495297805643,
"grad_norm": 2.1180367820440895,
"learning_rate": 6.25e-06,
"loss": 1.2569,
"step": 25
},
{
"epoch": 0.04075235109717868,
"grad_norm": 1.3577721417844781,
"learning_rate": 6.510416666666667e-06,
"loss": 1.3011,
"step": 26
},
{
"epoch": 0.04231974921630094,
"grad_norm": 1.223525587065415,
"learning_rate": 6.770833333333333e-06,
"loss": 1.3484,
"step": 27
},
{
"epoch": 0.0438871473354232,
"grad_norm": 0.9378262658242363,
"learning_rate": 7.031250000000001e-06,
"loss": 1.2283,
"step": 28
},
{
"epoch": 0.045454545454545456,
"grad_norm": 0.8475061582128701,
"learning_rate": 7.2916666666666674e-06,
"loss": 1.1581,
"step": 29
},
{
"epoch": 0.047021943573667714,
"grad_norm": 0.7698789968420477,
"learning_rate": 7.552083333333333e-06,
"loss": 1.0707,
"step": 30
},
{
"epoch": 0.048589341692789965,
"grad_norm": 0.7699833375213356,
"learning_rate": 7.8125e-06,
"loss": 1.0581,
"step": 31
},
{
"epoch": 0.050156739811912224,
"grad_norm": 0.7238635985933511,
"learning_rate": 8.072916666666667e-06,
"loss": 1.1366,
"step": 32
},
{
"epoch": 0.05172413793103448,
"grad_norm": 0.6500839051752617,
"learning_rate": 8.333333333333334e-06,
"loss": 1.029,
"step": 33
},
{
"epoch": 0.05329153605015674,
"grad_norm": 0.6324756201783983,
"learning_rate": 8.59375e-06,
"loss": 1.1305,
"step": 34
},
{
"epoch": 0.054858934169279,
"grad_norm": 0.6580587881569693,
"learning_rate": 8.854166666666667e-06,
"loss": 1.1712,
"step": 35
},
{
"epoch": 0.05642633228840126,
"grad_norm": 0.6865616559772267,
"learning_rate": 9.114583333333334e-06,
"loss": 0.9816,
"step": 36
},
{
"epoch": 0.05799373040752351,
"grad_norm": 0.6189796243210208,
"learning_rate": 9.375000000000001e-06,
"loss": 1.0658,
"step": 37
},
{
"epoch": 0.05956112852664577,
"grad_norm": 0.5197248747319764,
"learning_rate": 9.635416666666668e-06,
"loss": 0.9551,
"step": 38
},
{
"epoch": 0.061128526645768025,
"grad_norm": 0.6365179537126,
"learning_rate": 9.895833333333333e-06,
"loss": 1.0242,
"step": 39
},
{
"epoch": 0.06269592476489028,
"grad_norm": 0.6892788851559103,
"learning_rate": 1.0156250000000001e-05,
"loss": 1.287,
"step": 40
},
{
"epoch": 0.06426332288401254,
"grad_norm": 0.5617816200299977,
"learning_rate": 1.0416666666666668e-05,
"loss": 1.0643,
"step": 41
},
{
"epoch": 0.06583072100313479,
"grad_norm": 0.5927608146444618,
"learning_rate": 1.0677083333333333e-05,
"loss": 1.0782,
"step": 42
},
{
"epoch": 0.06739811912225706,
"grad_norm": 0.5365992557754803,
"learning_rate": 1.09375e-05,
"loss": 1.1096,
"step": 43
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.49213341105629616,
"learning_rate": 1.1197916666666668e-05,
"loss": 0.9635,
"step": 44
},
{
"epoch": 0.07053291536050156,
"grad_norm": 1.2174955277533845,
"learning_rate": 1.1458333333333333e-05,
"loss": 1.0144,
"step": 45
},
{
"epoch": 0.07210031347962383,
"grad_norm": 0.5446626965221274,
"learning_rate": 1.171875e-05,
"loss": 1.0665,
"step": 46
},
{
"epoch": 0.07366771159874608,
"grad_norm": 0.49371851206527106,
"learning_rate": 1.1979166666666667e-05,
"loss": 1.0556,
"step": 47
},
{
"epoch": 0.07523510971786834,
"grad_norm": 0.44311555453669416,
"learning_rate": 1.2239583333333334e-05,
"loss": 0.9364,
"step": 48
},
{
"epoch": 0.0768025078369906,
"grad_norm": 0.4059478507987885,
"learning_rate": 1.25e-05,
"loss": 0.9471,
"step": 49
},
{
"epoch": 0.07836990595611286,
"grad_norm": 0.440774084909754,
"learning_rate": 1.2760416666666666e-05,
"loss": 0.9889,
"step": 50
},
{
"epoch": 0.07993730407523511,
"grad_norm": 0.46793149842855114,
"learning_rate": 1.3020833333333334e-05,
"loss": 0.9441,
"step": 51
},
{
"epoch": 0.08150470219435736,
"grad_norm": 0.39140609081800326,
"learning_rate": 1.3281250000000001e-05,
"loss": 0.9428,
"step": 52
},
{
"epoch": 0.08307210031347963,
"grad_norm": 5.84156950137597,
"learning_rate": 1.3541666666666666e-05,
"loss": 1.0649,
"step": 53
},
{
"epoch": 0.08463949843260188,
"grad_norm": 0.3814930364237649,
"learning_rate": 1.3802083333333335e-05,
"loss": 0.8278,
"step": 54
},
{
"epoch": 0.08620689655172414,
"grad_norm": 0.4402736271628929,
"learning_rate": 1.4062500000000001e-05,
"loss": 1.0358,
"step": 55
},
{
"epoch": 0.0877742946708464,
"grad_norm": 0.37104402221958355,
"learning_rate": 1.4322916666666666e-05,
"loss": 0.8755,
"step": 56
},
{
"epoch": 0.08934169278996865,
"grad_norm": 0.3456930393355076,
"learning_rate": 1.4583333333333335e-05,
"loss": 0.906,
"step": 57
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.3791681027447875,
"learning_rate": 1.484375e-05,
"loss": 0.842,
"step": 58
},
{
"epoch": 0.09247648902821316,
"grad_norm": 0.3545791496979038,
"learning_rate": 1.5104166666666667e-05,
"loss": 0.8078,
"step": 59
},
{
"epoch": 0.09404388714733543,
"grad_norm": 0.35113405873897646,
"learning_rate": 1.5364583333333335e-05,
"loss": 0.8403,
"step": 60
},
{
"epoch": 0.09561128526645768,
"grad_norm": 0.3713326125081008,
"learning_rate": 1.5625e-05,
"loss": 0.7789,
"step": 61
},
{
"epoch": 0.09717868338557993,
"grad_norm": 0.3100548288078542,
"learning_rate": 1.5885416666666665e-05,
"loss": 0.7739,
"step": 62
},
{
"epoch": 0.0987460815047022,
"grad_norm": 0.35008896127230327,
"learning_rate": 1.6145833333333334e-05,
"loss": 0.8365,
"step": 63
},
{
"epoch": 0.10031347962382445,
"grad_norm": 0.4075911969314728,
"learning_rate": 1.6406250000000002e-05,
"loss": 1.1141,
"step": 64
},
{
"epoch": 0.10188087774294671,
"grad_norm": 0.8069604622712818,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.7811,
"step": 65
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.3084969368866898,
"learning_rate": 1.6927083333333336e-05,
"loss": 0.6745,
"step": 66
},
{
"epoch": 0.10501567398119123,
"grad_norm": 0.3194713901565365,
"learning_rate": 1.71875e-05,
"loss": 0.8369,
"step": 67
},
{
"epoch": 0.10658307210031348,
"grad_norm": 0.3213654292987939,
"learning_rate": 1.7447916666666666e-05,
"loss": 0.8724,
"step": 68
},
{
"epoch": 0.10815047021943573,
"grad_norm": 0.3328561211408985,
"learning_rate": 1.7708333333333335e-05,
"loss": 0.7867,
"step": 69
},
{
"epoch": 0.109717868338558,
"grad_norm": 0.29992255723226025,
"learning_rate": 1.796875e-05,
"loss": 0.7069,
"step": 70
},
{
"epoch": 0.11128526645768025,
"grad_norm": 0.3602513000961509,
"learning_rate": 1.8229166666666668e-05,
"loss": 0.7406,
"step": 71
},
{
"epoch": 0.11285266457680251,
"grad_norm": 0.31867899669425803,
"learning_rate": 1.8489583333333337e-05,
"loss": 0.852,
"step": 72
},
{
"epoch": 0.11442006269592477,
"grad_norm": 0.5274118817345942,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.9487,
"step": 73
},
{
"epoch": 0.11598746081504702,
"grad_norm": 0.3091575864347062,
"learning_rate": 1.9010416666666667e-05,
"loss": 0.7956,
"step": 74
},
{
"epoch": 0.11755485893416928,
"grad_norm": 0.4036300104012909,
"learning_rate": 1.9270833333333335e-05,
"loss": 0.8399,
"step": 75
},
{
"epoch": 0.11912225705329153,
"grad_norm": 0.31589900498181267,
"learning_rate": 1.953125e-05,
"loss": 0.787,
"step": 76
},
{
"epoch": 0.1206896551724138,
"grad_norm": 0.3064495805561625,
"learning_rate": 1.9791666666666665e-05,
"loss": 0.7665,
"step": 77
},
{
"epoch": 0.12225705329153605,
"grad_norm": 0.30932917343504795,
"learning_rate": 2.0052083333333334e-05,
"loss": 0.7749,
"step": 78
},
{
"epoch": 0.1238244514106583,
"grad_norm": 0.3623654093367351,
"learning_rate": 2.0312500000000002e-05,
"loss": 0.8394,
"step": 79
},
{
"epoch": 0.12539184952978055,
"grad_norm": 0.3071519771573341,
"learning_rate": 2.0572916666666668e-05,
"loss": 0.8556,
"step": 80
},
{
"epoch": 0.12695924764890282,
"grad_norm": 0.3248932869182534,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.8353,
"step": 81
},
{
"epoch": 0.12852664576802508,
"grad_norm": 0.3274818082222233,
"learning_rate": 2.109375e-05,
"loss": 0.7125,
"step": 82
},
{
"epoch": 0.13009404388714735,
"grad_norm": 0.35045140247387685,
"learning_rate": 2.1354166666666666e-05,
"loss": 0.9437,
"step": 83
},
{
"epoch": 0.13166144200626959,
"grad_norm": 0.31189086800089705,
"learning_rate": 2.1614583333333335e-05,
"loss": 0.9113,
"step": 84
},
{
"epoch": 0.13322884012539185,
"grad_norm": 0.3777931368188779,
"learning_rate": 2.1875e-05,
"loss": 1.0618,
"step": 85
},
{
"epoch": 0.13479623824451412,
"grad_norm": 0.2784245256699354,
"learning_rate": 2.2135416666666668e-05,
"loss": 0.7531,
"step": 86
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.30279607485439763,
"learning_rate": 2.2395833333333337e-05,
"loss": 0.7718,
"step": 87
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.28223626767049836,
"learning_rate": 2.2656250000000002e-05,
"loss": 0.74,
"step": 88
},
{
"epoch": 0.13949843260188088,
"grad_norm": 0.2892557686011558,
"learning_rate": 2.2916666666666667e-05,
"loss": 0.7303,
"step": 89
},
{
"epoch": 0.14106583072100312,
"grad_norm": 0.3273321643903824,
"learning_rate": 2.3177083333333335e-05,
"loss": 0.8565,
"step": 90
},
{
"epoch": 0.1426332288401254,
"grad_norm": 0.28896683262276707,
"learning_rate": 2.34375e-05,
"loss": 0.7491,
"step": 91
},
{
"epoch": 0.14420062695924765,
"grad_norm": 0.3626980002175838,
"learning_rate": 2.3697916666666666e-05,
"loss": 0.8322,
"step": 92
},
{
"epoch": 0.14576802507836992,
"grad_norm": 0.3194730867510284,
"learning_rate": 2.3958333333333334e-05,
"loss": 0.8278,
"step": 93
},
{
"epoch": 0.14733542319749215,
"grad_norm": 0.27801935054054877,
"learning_rate": 2.4218750000000003e-05,
"loss": 0.7143,
"step": 94
},
{
"epoch": 0.14890282131661442,
"grad_norm": 0.48350915028589525,
"learning_rate": 2.4479166666666668e-05,
"loss": 0.8935,
"step": 95
},
{
"epoch": 0.15047021943573669,
"grad_norm": 0.3049446737092653,
"learning_rate": 2.4739583333333336e-05,
"loss": 0.7667,
"step": 96
},
{
"epoch": 0.15203761755485892,
"grad_norm": 0.35024300165268374,
"learning_rate": 2.5e-05,
"loss": 0.7653,
"step": 97
},
{
"epoch": 0.1536050156739812,
"grad_norm": 0.59143739851361,
"learning_rate": 2.526041666666667e-05,
"loss": 0.8685,
"step": 98
},
{
"epoch": 0.15517241379310345,
"grad_norm": 0.3622179686809848,
"learning_rate": 2.552083333333333e-05,
"loss": 0.8222,
"step": 99
},
{
"epoch": 0.15673981191222572,
"grad_norm": 0.31745663647344796,
"learning_rate": 2.578125e-05,
"loss": 0.821,
"step": 100
},
{
"epoch": 0.15830721003134796,
"grad_norm": 0.42208937965757665,
"learning_rate": 2.604166666666667e-05,
"loss": 0.8616,
"step": 101
},
{
"epoch": 0.15987460815047022,
"grad_norm": 0.3238158297387486,
"learning_rate": 2.6302083333333333e-05,
"loss": 0.8887,
"step": 102
},
{
"epoch": 0.1614420062695925,
"grad_norm": 0.32156995065433236,
"learning_rate": 2.6562500000000002e-05,
"loss": 0.6132,
"step": 103
},
{
"epoch": 0.16300940438871472,
"grad_norm": 0.3098965087999306,
"learning_rate": 2.682291666666667e-05,
"loss": 0.8247,
"step": 104
},
{
"epoch": 0.164576802507837,
"grad_norm": 0.3242215722091957,
"learning_rate": 2.7083333333333332e-05,
"loss": 0.9176,
"step": 105
},
{
"epoch": 0.16614420062695925,
"grad_norm": 0.7035283867161487,
"learning_rate": 2.734375e-05,
"loss": 0.9075,
"step": 106
},
{
"epoch": 0.1677115987460815,
"grad_norm": 0.5927704948163743,
"learning_rate": 2.760416666666667e-05,
"loss": 0.8833,
"step": 107
},
{
"epoch": 0.16927899686520376,
"grad_norm": 0.2965563792710556,
"learning_rate": 2.7864583333333334e-05,
"loss": 0.7889,
"step": 108
},
{
"epoch": 0.17084639498432602,
"grad_norm": 0.321064004807903,
"learning_rate": 2.8125000000000003e-05,
"loss": 0.8961,
"step": 109
},
{
"epoch": 0.1724137931034483,
"grad_norm": 0.29583491832362163,
"learning_rate": 2.838541666666667e-05,
"loss": 0.7791,
"step": 110
},
{
"epoch": 0.17398119122257052,
"grad_norm": 0.32948207055829726,
"learning_rate": 2.8645833333333333e-05,
"loss": 0.7315,
"step": 111
},
{
"epoch": 0.1755485893416928,
"grad_norm": 0.3039977446238745,
"learning_rate": 2.890625e-05,
"loss": 0.785,
"step": 112
},
{
"epoch": 0.17711598746081506,
"grad_norm": 0.3305386110174328,
"learning_rate": 2.916666666666667e-05,
"loss": 0.7608,
"step": 113
},
{
"epoch": 0.1786833855799373,
"grad_norm": 0.31333379795633476,
"learning_rate": 2.942708333333333e-05,
"loss": 0.7,
"step": 114
},
{
"epoch": 0.18025078369905956,
"grad_norm": 0.3125772698566396,
"learning_rate": 2.96875e-05,
"loss": 0.7622,
"step": 115
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.33676409004225016,
"learning_rate": 2.994791666666667e-05,
"loss": 0.8056,
"step": 116
},
{
"epoch": 0.1833855799373041,
"grad_norm": 0.3218656193668577,
"learning_rate": 3.0208333333333334e-05,
"loss": 0.7956,
"step": 117
},
{
"epoch": 0.18495297805642633,
"grad_norm": 0.2969117729842054,
"learning_rate": 3.0468750000000002e-05,
"loss": 0.7434,
"step": 118
},
{
"epoch": 0.1865203761755486,
"grad_norm": 1.9411833119336097,
"learning_rate": 3.072916666666667e-05,
"loss": 0.7512,
"step": 119
},
{
"epoch": 0.18808777429467086,
"grad_norm": 0.3767500380437524,
"learning_rate": 3.0989583333333336e-05,
"loss": 0.7378,
"step": 120
},
{
"epoch": 0.1896551724137931,
"grad_norm": 0.30105778013044776,
"learning_rate": 3.125e-05,
"loss": 0.7274,
"step": 121
},
{
"epoch": 0.19122257053291536,
"grad_norm": 0.306580889404461,
"learning_rate": 3.151041666666667e-05,
"loss": 0.7462,
"step": 122
},
{
"epoch": 0.19278996865203762,
"grad_norm": 0.36971079246051763,
"learning_rate": 3.177083333333333e-05,
"loss": 0.8986,
"step": 123
},
{
"epoch": 0.19435736677115986,
"grad_norm": 0.3244364837855066,
"learning_rate": 3.203125e-05,
"loss": 0.7464,
"step": 124
},
{
"epoch": 0.19592476489028213,
"grad_norm": 0.36371536251466147,
"learning_rate": 3.229166666666667e-05,
"loss": 0.7283,
"step": 125
},
{
"epoch": 0.1974921630094044,
"grad_norm": 0.3475079068978037,
"learning_rate": 3.255208333333333e-05,
"loss": 0.7125,
"step": 126
},
{
"epoch": 0.19905956112852666,
"grad_norm": 0.38445036269196864,
"learning_rate": 3.2812500000000005e-05,
"loss": 0.8212,
"step": 127
},
{
"epoch": 0.2006269592476489,
"grad_norm": 0.35904120535135803,
"learning_rate": 3.307291666666667e-05,
"loss": 0.744,
"step": 128
},
{
"epoch": 0.20219435736677116,
"grad_norm": 0.32602249831155816,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.6818,
"step": 129
},
{
"epoch": 0.20376175548589343,
"grad_norm": 0.3780448948047033,
"learning_rate": 3.359375e-05,
"loss": 0.7463,
"step": 130
},
{
"epoch": 0.20532915360501566,
"grad_norm": 0.31956853454007544,
"learning_rate": 3.385416666666667e-05,
"loss": 0.6819,
"step": 131
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.3569305296443554,
"learning_rate": 3.411458333333333e-05,
"loss": 0.7797,
"step": 132
},
{
"epoch": 0.2084639498432602,
"grad_norm": 0.3147837394732672,
"learning_rate": 3.4375e-05,
"loss": 0.6795,
"step": 133
},
{
"epoch": 0.21003134796238246,
"grad_norm": 0.32669800154009265,
"learning_rate": 3.463541666666667e-05,
"loss": 0.7258,
"step": 134
},
{
"epoch": 0.2115987460815047,
"grad_norm": 0.4306283730197053,
"learning_rate": 3.489583333333333e-05,
"loss": 0.7446,
"step": 135
},
{
"epoch": 0.21316614420062696,
"grad_norm": 0.350741707267517,
"learning_rate": 3.5156250000000004e-05,
"loss": 0.7643,
"step": 136
},
{
"epoch": 0.21473354231974923,
"grad_norm": 0.39008817917566596,
"learning_rate": 3.541666666666667e-05,
"loss": 0.7016,
"step": 137
},
{
"epoch": 0.21630094043887146,
"grad_norm": 0.3264573228434128,
"learning_rate": 3.5677083333333334e-05,
"loss": 0.7506,
"step": 138
},
{
"epoch": 0.21786833855799373,
"grad_norm": 0.41226987388173447,
"learning_rate": 3.59375e-05,
"loss": 0.6878,
"step": 139
},
{
"epoch": 0.219435736677116,
"grad_norm": 0.3060453350386506,
"learning_rate": 3.619791666666667e-05,
"loss": 0.762,
"step": 140
},
{
"epoch": 0.22100313479623823,
"grad_norm": 0.4121853634133045,
"learning_rate": 3.6458333333333336e-05,
"loss": 0.7339,
"step": 141
},
{
"epoch": 0.2225705329153605,
"grad_norm": 0.3009107642838843,
"learning_rate": 3.671875e-05,
"loss": 0.6925,
"step": 142
},
{
"epoch": 0.22413793103448276,
"grad_norm": 0.41861834212675914,
"learning_rate": 3.697916666666667e-05,
"loss": 0.7766,
"step": 143
},
{
"epoch": 0.22570532915360503,
"grad_norm": 0.3810413317288267,
"learning_rate": 3.723958333333333e-05,
"loss": 0.6759,
"step": 144
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.5643981372030594,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.8419,
"step": 145
},
{
"epoch": 0.22884012539184953,
"grad_norm": 0.5315511706016451,
"learning_rate": 3.776041666666667e-05,
"loss": 0.7624,
"step": 146
},
{
"epoch": 0.2304075235109718,
"grad_norm": 0.35303080142576654,
"learning_rate": 3.8020833333333334e-05,
"loss": 0.739,
"step": 147
},
{
"epoch": 0.23197492163009403,
"grad_norm": 0.5176438221712008,
"learning_rate": 3.828125e-05,
"loss": 0.7648,
"step": 148
},
{
"epoch": 0.2335423197492163,
"grad_norm": 0.37966710272587695,
"learning_rate": 3.854166666666667e-05,
"loss": 0.6294,
"step": 149
},
{
"epoch": 0.23510971786833856,
"grad_norm": 0.4493088907893656,
"learning_rate": 3.8802083333333336e-05,
"loss": 0.7129,
"step": 150
},
{
"epoch": 0.23667711598746083,
"grad_norm": 2.2342908463328452,
"learning_rate": 3.90625e-05,
"loss": 0.7408,
"step": 151
},
{
"epoch": 0.23824451410658307,
"grad_norm": 0.5828375961864183,
"learning_rate": 3.932291666666667e-05,
"loss": 0.6704,
"step": 152
},
{
"epoch": 0.23981191222570533,
"grad_norm": 0.5454486184645655,
"learning_rate": 3.958333333333333e-05,
"loss": 0.7604,
"step": 153
},
{
"epoch": 0.2413793103448276,
"grad_norm": 0.36259997810294775,
"learning_rate": 3.984375e-05,
"loss": 0.7031,
"step": 154
},
{
"epoch": 0.24294670846394983,
"grad_norm": 0.4831816563598834,
"learning_rate": 4.010416666666667e-05,
"loss": 0.775,
"step": 155
},
{
"epoch": 0.2445141065830721,
"grad_norm": 0.34261387835906687,
"learning_rate": 4.036458333333333e-05,
"loss": 0.7051,
"step": 156
},
{
"epoch": 0.24608150470219436,
"grad_norm": 0.5371757835146368,
"learning_rate": 4.0625000000000005e-05,
"loss": 0.7863,
"step": 157
},
{
"epoch": 0.2476489028213166,
"grad_norm": 0.4582221018060355,
"learning_rate": 4.088541666666667e-05,
"loss": 0.6085,
"step": 158
},
{
"epoch": 0.24921630094043887,
"grad_norm": 0.4769444678679491,
"learning_rate": 4.1145833333333335e-05,
"loss": 0.7668,
"step": 159
},
{
"epoch": 0.2507836990595611,
"grad_norm": 0.4463545256447175,
"learning_rate": 4.140625e-05,
"loss": 0.73,
"step": 160
},
{
"epoch": 0.25235109717868337,
"grad_norm": 0.48191400471360285,
"learning_rate": 4.166666666666667e-05,
"loss": 0.6397,
"step": 161
},
{
"epoch": 0.25391849529780564,
"grad_norm": 0.44701278024941393,
"learning_rate": 4.192708333333333e-05,
"loss": 0.7383,
"step": 162
},
{
"epoch": 0.2554858934169279,
"grad_norm": 0.4304308340404474,
"learning_rate": 4.21875e-05,
"loss": 0.6212,
"step": 163
},
{
"epoch": 0.25705329153605017,
"grad_norm": 0.514118794720696,
"learning_rate": 4.244791666666667e-05,
"loss": 0.7079,
"step": 164
},
{
"epoch": 0.25862068965517243,
"grad_norm": 0.4409299821779926,
"learning_rate": 4.270833333333333e-05,
"loss": 0.6642,
"step": 165
},
{
"epoch": 0.2601880877742947,
"grad_norm": 0.4569041405382737,
"learning_rate": 4.2968750000000004e-05,
"loss": 0.6707,
"step": 166
},
{
"epoch": 0.2617554858934169,
"grad_norm": 0.4774152382188804,
"learning_rate": 4.322916666666667e-05,
"loss": 0.8117,
"step": 167
},
{
"epoch": 0.26332288401253917,
"grad_norm": 0.3865092418967612,
"learning_rate": 4.3489583333333334e-05,
"loss": 0.6098,
"step": 168
},
{
"epoch": 0.26489028213166144,
"grad_norm": 0.5118338717176162,
"learning_rate": 4.375e-05,
"loss": 0.6633,
"step": 169
},
{
"epoch": 0.2664576802507837,
"grad_norm": 0.4567827748659261,
"learning_rate": 4.401041666666667e-05,
"loss": 0.7738,
"step": 170
},
{
"epoch": 0.26802507836990597,
"grad_norm": 0.4399847317147505,
"learning_rate": 4.4270833333333337e-05,
"loss": 0.6861,
"step": 171
},
{
"epoch": 0.26959247648902823,
"grad_norm": 0.4156384026765874,
"learning_rate": 4.453125e-05,
"loss": 0.6729,
"step": 172
},
{
"epoch": 0.2711598746081505,
"grad_norm": 0.4555923431557191,
"learning_rate": 4.4791666666666673e-05,
"loss": 0.6775,
"step": 173
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.4197970926982543,
"learning_rate": 4.505208333333333e-05,
"loss": 0.746,
"step": 174
},
{
"epoch": 0.274294670846395,
"grad_norm": 0.39629981920651186,
"learning_rate": 4.5312500000000004e-05,
"loss": 0.6634,
"step": 175
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.4324800848031094,
"learning_rate": 4.557291666666667e-05,
"loss": 0.6995,
"step": 176
},
{
"epoch": 0.2774294670846395,
"grad_norm": 0.44123158912151544,
"learning_rate": 4.5833333333333334e-05,
"loss": 0.664,
"step": 177
},
{
"epoch": 0.27899686520376177,
"grad_norm": 0.38123891742500443,
"learning_rate": 4.609375e-05,
"loss": 0.6144,
"step": 178
},
{
"epoch": 0.28056426332288403,
"grad_norm": 0.3630998942822551,
"learning_rate": 4.635416666666667e-05,
"loss": 0.7425,
"step": 179
},
{
"epoch": 0.28213166144200624,
"grad_norm": 0.35341288941104904,
"learning_rate": 4.6614583333333336e-05,
"loss": 0.7123,
"step": 180
},
{
"epoch": 0.2836990595611285,
"grad_norm": 0.33449905332561913,
"learning_rate": 4.6875e-05,
"loss": 0.6299,
"step": 181
},
{
"epoch": 0.2852664576802508,
"grad_norm": 0.3495255923774706,
"learning_rate": 4.713541666666667e-05,
"loss": 0.7273,
"step": 182
},
{
"epoch": 0.28683385579937304,
"grad_norm": 0.390181384509761,
"learning_rate": 4.739583333333333e-05,
"loss": 0.7257,
"step": 183
},
{
"epoch": 0.2884012539184953,
"grad_norm": 0.34913911482519716,
"learning_rate": 4.765625e-05,
"loss": 0.6436,
"step": 184
},
{
"epoch": 0.28996865203761757,
"grad_norm": 0.4959928030711952,
"learning_rate": 4.791666666666667e-05,
"loss": 0.7458,
"step": 185
},
{
"epoch": 0.29153605015673983,
"grad_norm": 0.4626012969308779,
"learning_rate": 4.817708333333333e-05,
"loss": 0.739,
"step": 186
},
{
"epoch": 0.29310344827586204,
"grad_norm": 0.42876091546213435,
"learning_rate": 4.8437500000000005e-05,
"loss": 0.7522,
"step": 187
},
{
"epoch": 0.2946708463949843,
"grad_norm": 0.44112288939521893,
"learning_rate": 4.869791666666667e-05,
"loss": 0.6851,
"step": 188
},
{
"epoch": 0.2962382445141066,
"grad_norm": 0.39661512836783747,
"learning_rate": 4.8958333333333335e-05,
"loss": 0.6014,
"step": 189
},
{
"epoch": 0.29780564263322884,
"grad_norm": 0.5221929761670279,
"learning_rate": 4.921875e-05,
"loss": 0.6339,
"step": 190
},
{
"epoch": 0.2993730407523511,
"grad_norm": 0.4766790805656907,
"learning_rate": 4.947916666666667e-05,
"loss": 0.72,
"step": 191
},
{
"epoch": 0.30094043887147337,
"grad_norm": 0.461462108134711,
"learning_rate": 4.973958333333333e-05,
"loss": 0.6635,
"step": 192
},
{
"epoch": 0.30250783699059564,
"grad_norm": 0.5275910082053545,
"learning_rate": 5e-05,
"loss": 0.7098,
"step": 193
},
{
"epoch": 0.30407523510971785,
"grad_norm": 0.47678503837915104,
"learning_rate": 4.997096399535424e-05,
"loss": 0.7175,
"step": 194
},
{
"epoch": 0.3056426332288401,
"grad_norm": 0.4007982969917202,
"learning_rate": 4.994192799070848e-05,
"loss": 0.6807,
"step": 195
},
{
"epoch": 0.3072100313479624,
"grad_norm": 0.49872537853046905,
"learning_rate": 4.991289198606272e-05,
"loss": 0.7701,
"step": 196
},
{
"epoch": 0.30877742946708464,
"grad_norm": 0.48912824059092896,
"learning_rate": 4.988385598141696e-05,
"loss": 0.7486,
"step": 197
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.40557658945067887,
"learning_rate": 4.98548199767712e-05,
"loss": 0.6821,
"step": 198
},
{
"epoch": 0.31191222570532917,
"grad_norm": 0.44177200739434963,
"learning_rate": 4.9825783972125436e-05,
"loss": 0.6344,
"step": 199
},
{
"epoch": 0.31347962382445144,
"grad_norm": 0.42927347788676246,
"learning_rate": 4.9796747967479676e-05,
"loss": 0.7714,
"step": 200
},
{
"epoch": 0.31504702194357365,
"grad_norm": 0.3523298695219205,
"learning_rate": 4.9767711962833916e-05,
"loss": 0.6239,
"step": 201
},
{
"epoch": 0.3166144200626959,
"grad_norm": 0.5620628119356224,
"learning_rate": 4.9738675958188156e-05,
"loss": 0.7067,
"step": 202
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.44397052100682377,
"learning_rate": 4.9709639953542396e-05,
"loss": 0.6307,
"step": 203
},
{
"epoch": 0.31974921630094044,
"grad_norm": 0.4991053443616419,
"learning_rate": 4.9680603948896636e-05,
"loss": 0.6593,
"step": 204
},
{
"epoch": 0.3213166144200627,
"grad_norm": 0.5793366664656031,
"learning_rate": 4.965156794425087e-05,
"loss": 0.7583,
"step": 205
},
{
"epoch": 0.322884012539185,
"grad_norm": 0.37903802785420715,
"learning_rate": 4.962253193960511e-05,
"loss": 0.6465,
"step": 206
},
{
"epoch": 0.32445141065830724,
"grad_norm": 0.5703443631076596,
"learning_rate": 4.959349593495935e-05,
"loss": 0.6694,
"step": 207
},
{
"epoch": 0.32601880877742945,
"grad_norm": 0.34003382126022313,
"learning_rate": 4.956445993031359e-05,
"loss": 0.7067,
"step": 208
},
{
"epoch": 0.3275862068965517,
"grad_norm": 0.5334225014679039,
"learning_rate": 4.953542392566783e-05,
"loss": 0.6391,
"step": 209
},
{
"epoch": 0.329153605015674,
"grad_norm": 0.40567890700435644,
"learning_rate": 4.950638792102207e-05,
"loss": 0.6825,
"step": 210
},
{
"epoch": 0.33072100313479624,
"grad_norm": 0.45316550772276887,
"learning_rate": 4.947735191637631e-05,
"loss": 0.6676,
"step": 211
},
{
"epoch": 0.3322884012539185,
"grad_norm": 0.4347510890006613,
"learning_rate": 4.944831591173055e-05,
"loss": 0.6611,
"step": 212
},
{
"epoch": 0.3338557993730408,
"grad_norm": 0.3870349164947141,
"learning_rate": 4.9419279907084783e-05,
"loss": 0.7165,
"step": 213
},
{
"epoch": 0.335423197492163,
"grad_norm": 0.5143517274590542,
"learning_rate": 4.9390243902439024e-05,
"loss": 0.6392,
"step": 214
},
{
"epoch": 0.33699059561128525,
"grad_norm": 0.4264746614842827,
"learning_rate": 4.9361207897793264e-05,
"loss": 0.7011,
"step": 215
},
{
"epoch": 0.3385579937304075,
"grad_norm": 0.5863351496505967,
"learning_rate": 4.9332171893147504e-05,
"loss": 0.7764,
"step": 216
},
{
"epoch": 0.3401253918495298,
"grad_norm": 0.33291465050003677,
"learning_rate": 4.9303135888501744e-05,
"loss": 0.716,
"step": 217
},
{
"epoch": 0.34169278996865204,
"grad_norm": 0.580680473810341,
"learning_rate": 4.9274099883855984e-05,
"loss": 0.7299,
"step": 218
},
{
"epoch": 0.3432601880877743,
"grad_norm": 0.33014325886048285,
"learning_rate": 4.9245063879210224e-05,
"loss": 0.6352,
"step": 219
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.46526438443917084,
"learning_rate": 4.9216027874564464e-05,
"loss": 0.6444,
"step": 220
},
{
"epoch": 0.3463949843260188,
"grad_norm": 0.3421550046357593,
"learning_rate": 4.9186991869918704e-05,
"loss": 0.7168,
"step": 221
},
{
"epoch": 0.34796238244514105,
"grad_norm": 0.5500083752174315,
"learning_rate": 4.9157955865272944e-05,
"loss": 0.7709,
"step": 222
},
{
"epoch": 0.3495297805642633,
"grad_norm": 0.4486349064548526,
"learning_rate": 4.9128919860627184e-05,
"loss": 0.5967,
"step": 223
},
{
"epoch": 0.3510971786833856,
"grad_norm": 0.4035392982583883,
"learning_rate": 4.9099883855981424e-05,
"loss": 0.6251,
"step": 224
},
{
"epoch": 0.35266457680250785,
"grad_norm": 0.429627017902352,
"learning_rate": 4.907084785133566e-05,
"loss": 0.6158,
"step": 225
},
{
"epoch": 0.3542319749216301,
"grad_norm": 0.41611021741129317,
"learning_rate": 4.90418118466899e-05,
"loss": 0.6382,
"step": 226
},
{
"epoch": 0.3557993730407524,
"grad_norm": 0.3822912033728707,
"learning_rate": 4.901277584204414e-05,
"loss": 0.6483,
"step": 227
},
{
"epoch": 0.3573667711598746,
"grad_norm": 0.40196238381850596,
"learning_rate": 4.898373983739837e-05,
"loss": 0.7369,
"step": 228
},
{
"epoch": 0.35893416927899685,
"grad_norm": 0.42153193080047036,
"learning_rate": 4.895470383275261e-05,
"loss": 0.6598,
"step": 229
},
{
"epoch": 0.3605015673981191,
"grad_norm": 1.524620354973828,
"learning_rate": 4.892566782810685e-05,
"loss": 0.6305,
"step": 230
},
{
"epoch": 0.3620689655172414,
"grad_norm": 0.4014258793676616,
"learning_rate": 4.889663182346109e-05,
"loss": 0.6568,
"step": 231
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.39917957265338694,
"learning_rate": 4.886759581881533e-05,
"loss": 0.6693,
"step": 232
},
{
"epoch": 0.3652037617554859,
"grad_norm": 0.3617125908280669,
"learning_rate": 4.883855981416957e-05,
"loss": 0.5835,
"step": 233
},
{
"epoch": 0.3667711598746082,
"grad_norm": 0.4346607092077126,
"learning_rate": 4.880952380952381e-05,
"loss": 0.6392,
"step": 234
},
{
"epoch": 0.3683385579937304,
"grad_norm": 0.3699730904366294,
"learning_rate": 4.878048780487805e-05,
"loss": 0.5608,
"step": 235
},
{
"epoch": 0.36990595611285265,
"grad_norm": 0.34958323489520826,
"learning_rate": 4.875145180023229e-05,
"loss": 0.5941,
"step": 236
},
{
"epoch": 0.3714733542319749,
"grad_norm": 0.4249382982615816,
"learning_rate": 4.872241579558653e-05,
"loss": 0.6227,
"step": 237
},
{
"epoch": 0.3730407523510972,
"grad_norm": 0.3248777426199452,
"learning_rate": 4.869337979094077e-05,
"loss": 0.6789,
"step": 238
},
{
"epoch": 0.37460815047021945,
"grad_norm": 0.37974135211323834,
"learning_rate": 4.8664343786295005e-05,
"loss": 0.5953,
"step": 239
},
{
"epoch": 0.3761755485893417,
"grad_norm": 0.38354855030631746,
"learning_rate": 4.8635307781649245e-05,
"loss": 0.7057,
"step": 240
},
{
"epoch": 0.3777429467084639,
"grad_norm": 0.43194800844539677,
"learning_rate": 4.8606271777003485e-05,
"loss": 0.6934,
"step": 241
},
{
"epoch": 0.3793103448275862,
"grad_norm": 0.32897315346118594,
"learning_rate": 4.8577235772357725e-05,
"loss": 0.7053,
"step": 242
},
{
"epoch": 0.38087774294670845,
"grad_norm": 0.3869035992347456,
"learning_rate": 4.8548199767711965e-05,
"loss": 0.6193,
"step": 243
},
{
"epoch": 0.3824451410658307,
"grad_norm": 0.3243650761215195,
"learning_rate": 4.8519163763066205e-05,
"loss": 0.7172,
"step": 244
},
{
"epoch": 0.384012539184953,
"grad_norm": 0.3729037272469332,
"learning_rate": 4.8490127758420445e-05,
"loss": 0.6413,
"step": 245
},
{
"epoch": 0.38557993730407525,
"grad_norm": 0.3299231931699701,
"learning_rate": 4.8461091753774685e-05,
"loss": 0.7091,
"step": 246
},
{
"epoch": 0.3871473354231975,
"grad_norm": 0.3828957210423206,
"learning_rate": 4.8432055749128926e-05,
"loss": 0.6585,
"step": 247
},
{
"epoch": 0.3887147335423197,
"grad_norm": 0.41594941752592457,
"learning_rate": 4.8403019744483166e-05,
"loss": 0.7818,
"step": 248
},
{
"epoch": 0.390282131661442,
"grad_norm": 0.3531648773880787,
"learning_rate": 4.8373983739837406e-05,
"loss": 0.619,
"step": 249
},
{
"epoch": 0.39184952978056425,
"grad_norm": 0.36844580229217766,
"learning_rate": 4.8344947735191646e-05,
"loss": 0.652,
"step": 250
},
{
"epoch": 0.3934169278996865,
"grad_norm": 0.3261087497021728,
"learning_rate": 4.831591173054588e-05,
"loss": 0.656,
"step": 251
},
{
"epoch": 0.3949843260188088,
"grad_norm": 0.37837634105258905,
"learning_rate": 4.828687572590012e-05,
"loss": 0.7171,
"step": 252
},
{
"epoch": 0.39655172413793105,
"grad_norm": 0.33448440125954376,
"learning_rate": 4.825783972125435e-05,
"loss": 0.734,
"step": 253
},
{
"epoch": 0.3981191222570533,
"grad_norm": 0.37108826752553414,
"learning_rate": 4.822880371660859e-05,
"loss": 0.7087,
"step": 254
},
{
"epoch": 0.3996865203761755,
"grad_norm": 0.33781007140153113,
"learning_rate": 4.819976771196283e-05,
"loss": 0.634,
"step": 255
},
{
"epoch": 0.4012539184952978,
"grad_norm": 0.38763992565690825,
"learning_rate": 4.817073170731707e-05,
"loss": 0.6718,
"step": 256
},
{
"epoch": 0.40282131661442006,
"grad_norm": 0.31540620007280695,
"learning_rate": 4.814169570267131e-05,
"loss": 0.5801,
"step": 257
},
{
"epoch": 0.4043887147335423,
"grad_norm": 0.3229847356721017,
"learning_rate": 4.811265969802555e-05,
"loss": 0.6876,
"step": 258
},
{
"epoch": 0.4059561128526646,
"grad_norm": 0.3232964880524149,
"learning_rate": 4.808362369337979e-05,
"loss": 0.5756,
"step": 259
},
{
"epoch": 0.40752351097178685,
"grad_norm": 0.31683721641923,
"learning_rate": 4.805458768873403e-05,
"loss": 0.64,
"step": 260
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.34852036167381356,
"learning_rate": 4.802555168408827e-05,
"loss": 0.747,
"step": 261
},
{
"epoch": 0.4106583072100313,
"grad_norm": 0.319639186532805,
"learning_rate": 4.799651567944251e-05,
"loss": 0.626,
"step": 262
},
{
"epoch": 0.4122257053291536,
"grad_norm": 0.3324507974178701,
"learning_rate": 4.796747967479675e-05,
"loss": 0.6808,
"step": 263
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.3357912380759631,
"learning_rate": 4.793844367015099e-05,
"loss": 0.6696,
"step": 264
},
{
"epoch": 0.4153605015673981,
"grad_norm": 0.30290523340505365,
"learning_rate": 4.7909407665505226e-05,
"loss": 0.6402,
"step": 265
},
{
"epoch": 0.4169278996865204,
"grad_norm": 0.3110030273209187,
"learning_rate": 4.7880371660859467e-05,
"loss": 0.5596,
"step": 266
},
{
"epoch": 0.41849529780564265,
"grad_norm": 0.3060397044268462,
"learning_rate": 4.7851335656213707e-05,
"loss": 0.6222,
"step": 267
},
{
"epoch": 0.4200626959247649,
"grad_norm": 0.31822875072798845,
"learning_rate": 4.782229965156795e-05,
"loss": 0.7499,
"step": 268
},
{
"epoch": 0.4216300940438871,
"grad_norm": 0.36122239988840815,
"learning_rate": 4.779326364692219e-05,
"loss": 0.5732,
"step": 269
},
{
"epoch": 0.4231974921630094,
"grad_norm": 0.3016492562139979,
"learning_rate": 4.776422764227643e-05,
"loss": 0.5618,
"step": 270
},
{
"epoch": 0.42476489028213166,
"grad_norm": 0.34127965868107674,
"learning_rate": 4.773519163763067e-05,
"loss": 0.7197,
"step": 271
},
{
"epoch": 0.4263322884012539,
"grad_norm": 0.3776251035726337,
"learning_rate": 4.770615563298491e-05,
"loss": 0.6619,
"step": 272
},
{
"epoch": 0.4278996865203762,
"grad_norm": 0.2784884256162241,
"learning_rate": 4.767711962833915e-05,
"loss": 0.5868,
"step": 273
},
{
"epoch": 0.42946708463949845,
"grad_norm": 0.4426982677154907,
"learning_rate": 4.764808362369339e-05,
"loss": 0.7764,
"step": 274
},
{
"epoch": 0.43103448275862066,
"grad_norm": 0.36996852640033745,
"learning_rate": 4.761904761904762e-05,
"loss": 0.6803,
"step": 275
},
{
"epoch": 0.43260188087774293,
"grad_norm": 0.35414579296717463,
"learning_rate": 4.759001161440186e-05,
"loss": 0.7042,
"step": 276
},
{
"epoch": 0.4341692789968652,
"grad_norm": 0.39007290278155815,
"learning_rate": 4.75609756097561e-05,
"loss": 0.6118,
"step": 277
},
{
"epoch": 0.43573667711598746,
"grad_norm": 0.30442087789309363,
"learning_rate": 4.7531939605110334e-05,
"loss": 0.6007,
"step": 278
},
{
"epoch": 0.4373040752351097,
"grad_norm": 0.40156907388631813,
"learning_rate": 4.7502903600464574e-05,
"loss": 0.6515,
"step": 279
},
{
"epoch": 0.438871473354232,
"grad_norm": 0.3285475288220044,
"learning_rate": 4.7473867595818814e-05,
"loss": 0.6363,
"step": 280
},
{
"epoch": 0.44043887147335425,
"grad_norm": 0.34762618708226484,
"learning_rate": 4.7444831591173054e-05,
"loss": 0.6719,
"step": 281
},
{
"epoch": 0.44200626959247646,
"grad_norm": 0.34495079555600205,
"learning_rate": 4.7415795586527294e-05,
"loss": 0.6922,
"step": 282
},
{
"epoch": 0.44357366771159873,
"grad_norm": 0.2958281247043729,
"learning_rate": 4.7386759581881534e-05,
"loss": 0.6256,
"step": 283
},
{
"epoch": 0.445141065830721,
"grad_norm": 0.3443018639238689,
"learning_rate": 4.7357723577235774e-05,
"loss": 0.7549,
"step": 284
},
{
"epoch": 0.44670846394984326,
"grad_norm": 0.32337041054638743,
"learning_rate": 4.7328687572590014e-05,
"loss": 0.6475,
"step": 285
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.3166803051733083,
"learning_rate": 4.7299651567944254e-05,
"loss": 0.6946,
"step": 286
},
{
"epoch": 0.4498432601880878,
"grad_norm": 0.32167258246754155,
"learning_rate": 4.7270615563298494e-05,
"loss": 0.679,
"step": 287
},
{
"epoch": 0.45141065830721006,
"grad_norm": 0.3033807754883045,
"learning_rate": 4.7241579558652734e-05,
"loss": 0.7106,
"step": 288
},
{
"epoch": 0.45297805642633227,
"grad_norm": 0.29097777465199653,
"learning_rate": 4.7212543554006975e-05,
"loss": 0.5897,
"step": 289
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.3282214807171184,
"learning_rate": 4.718350754936121e-05,
"loss": 0.6195,
"step": 290
},
{
"epoch": 0.4561128526645768,
"grad_norm": 0.3182488296082386,
"learning_rate": 4.715447154471545e-05,
"loss": 0.6432,
"step": 291
},
{
"epoch": 0.45768025078369906,
"grad_norm": 0.3849059224326338,
"learning_rate": 4.712543554006969e-05,
"loss": 0.6179,
"step": 292
},
{
"epoch": 0.4592476489028213,
"grad_norm": 0.2857966208341309,
"learning_rate": 4.709639953542393e-05,
"loss": 0.6978,
"step": 293
},
{
"epoch": 0.4608150470219436,
"grad_norm": 0.42189100113972805,
"learning_rate": 4.706736353077817e-05,
"loss": 0.6088,
"step": 294
},
{
"epoch": 0.46238244514106586,
"grad_norm": 0.2836051923428305,
"learning_rate": 4.703832752613241e-05,
"loss": 0.5955,
"step": 295
},
{
"epoch": 0.46394984326018807,
"grad_norm": 0.37864089343504936,
"learning_rate": 4.700929152148665e-05,
"loss": 0.6497,
"step": 296
},
{
"epoch": 0.46551724137931033,
"grad_norm": 0.3252222063202702,
"learning_rate": 4.698025551684089e-05,
"loss": 0.6301,
"step": 297
},
{
"epoch": 0.4670846394984326,
"grad_norm": 0.3681797362871755,
"learning_rate": 4.695121951219512e-05,
"loss": 0.6258,
"step": 298
},
{
"epoch": 0.46865203761755486,
"grad_norm": 2.7147267225995626,
"learning_rate": 4.692218350754936e-05,
"loss": 0.6394,
"step": 299
},
{
"epoch": 0.4702194357366771,
"grad_norm": 0.4197440941880559,
"learning_rate": 4.68931475029036e-05,
"loss": 0.7542,
"step": 300
},
{
"epoch": 0.4717868338557994,
"grad_norm": 0.375338293992843,
"learning_rate": 4.686411149825784e-05,
"loss": 0.5823,
"step": 301
},
{
"epoch": 0.47335423197492166,
"grad_norm": 0.30515219592681225,
"learning_rate": 4.683507549361208e-05,
"loss": 0.5916,
"step": 302
},
{
"epoch": 0.47492163009404387,
"grad_norm": 0.41847488831526025,
"learning_rate": 4.680603948896632e-05,
"loss": 0.6474,
"step": 303
},
{
"epoch": 0.47648902821316613,
"grad_norm": 0.37874666984828614,
"learning_rate": 4.6777003484320555e-05,
"loss": 0.6444,
"step": 304
},
{
"epoch": 0.4780564263322884,
"grad_norm": 0.37978849493705225,
"learning_rate": 4.6747967479674795e-05,
"loss": 0.6473,
"step": 305
},
{
"epoch": 0.47962382445141066,
"grad_norm": 0.3277831184743026,
"learning_rate": 4.6718931475029035e-05,
"loss": 0.667,
"step": 306
},
{
"epoch": 0.48119122257053293,
"grad_norm": 0.3308804665920976,
"learning_rate": 4.6689895470383275e-05,
"loss": 0.6716,
"step": 307
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.34774075525011255,
"learning_rate": 4.6660859465737516e-05,
"loss": 0.6442,
"step": 308
},
{
"epoch": 0.4843260188087774,
"grad_norm": 1.398983730475838,
"learning_rate": 4.6631823461091756e-05,
"loss": 0.6707,
"step": 309
},
{
"epoch": 0.48589341692789967,
"grad_norm": 0.40651164981259996,
"learning_rate": 4.6602787456445996e-05,
"loss": 0.5837,
"step": 310
},
{
"epoch": 0.48746081504702193,
"grad_norm": 0.4546951470139932,
"learning_rate": 4.6573751451800236e-05,
"loss": 0.6599,
"step": 311
},
{
"epoch": 0.4890282131661442,
"grad_norm": 0.4061119160576302,
"learning_rate": 4.6544715447154476e-05,
"loss": 0.6373,
"step": 312
},
{
"epoch": 0.49059561128526646,
"grad_norm": 0.40743818080238076,
"learning_rate": 4.6515679442508716e-05,
"loss": 0.6561,
"step": 313
},
{
"epoch": 0.49216300940438873,
"grad_norm": 0.3643646076547515,
"learning_rate": 4.6486643437862956e-05,
"loss": 0.6077,
"step": 314
},
{
"epoch": 0.493730407523511,
"grad_norm": 0.40661761292381376,
"learning_rate": 4.6457607433217196e-05,
"loss": 0.7144,
"step": 315
},
{
"epoch": 0.4952978056426332,
"grad_norm": 0.37269790832571625,
"learning_rate": 4.642857142857143e-05,
"loss": 0.6426,
"step": 316
},
{
"epoch": 0.49686520376175547,
"grad_norm": 0.3764750852929548,
"learning_rate": 4.639953542392567e-05,
"loss": 0.6824,
"step": 317
},
{
"epoch": 0.49843260188087773,
"grad_norm": 0.3754787284045529,
"learning_rate": 4.637049941927991e-05,
"loss": 0.6893,
"step": 318
},
{
"epoch": 0.5,
"grad_norm": 0.33359600845319776,
"learning_rate": 4.634146341463415e-05,
"loss": 0.6667,
"step": 319
},
{
"epoch": 0.5015673981191222,
"grad_norm": 0.373621963335094,
"learning_rate": 4.631242740998839e-05,
"loss": 0.618,
"step": 320
},
{
"epoch": 0.5031347962382445,
"grad_norm": 0.3012874937628567,
"learning_rate": 4.628339140534262e-05,
"loss": 0.6491,
"step": 321
},
{
"epoch": 0.5047021943573667,
"grad_norm": 0.37289398592555206,
"learning_rate": 4.625435540069686e-05,
"loss": 0.6349,
"step": 322
},
{
"epoch": 0.5062695924764891,
"grad_norm": 0.3241733786063918,
"learning_rate": 4.62253193960511e-05,
"loss": 0.6396,
"step": 323
},
{
"epoch": 0.5078369905956113,
"grad_norm": 0.3241041567536623,
"learning_rate": 4.619628339140534e-05,
"loss": 0.5823,
"step": 324
},
{
"epoch": 0.5094043887147336,
"grad_norm": 0.3182812581599756,
"learning_rate": 4.616724738675958e-05,
"loss": 0.5488,
"step": 325
},
{
"epoch": 0.5109717868338558,
"grad_norm": 0.3340162037009047,
"learning_rate": 4.613821138211382e-05,
"loss": 0.7173,
"step": 326
},
{
"epoch": 0.512539184952978,
"grad_norm": 0.7663773098821299,
"learning_rate": 4.610917537746806e-05,
"loss": 0.6723,
"step": 327
},
{
"epoch": 0.5141065830721003,
"grad_norm": 0.32549963076067745,
"learning_rate": 4.6080139372822303e-05,
"loss": 0.6393,
"step": 328
},
{
"epoch": 0.5156739811912225,
"grad_norm": 0.2892499546072762,
"learning_rate": 4.6051103368176543e-05,
"loss": 0.6019,
"step": 329
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.36149454004186027,
"learning_rate": 4.602206736353078e-05,
"loss": 0.6017,
"step": 330
},
{
"epoch": 0.5188087774294671,
"grad_norm": 0.27085483458744447,
"learning_rate": 4.599303135888502e-05,
"loss": 0.5961,
"step": 331
},
{
"epoch": 0.5203761755485894,
"grad_norm": 0.31832573236860795,
"learning_rate": 4.596399535423926e-05,
"loss": 0.5752,
"step": 332
},
{
"epoch": 0.5219435736677116,
"grad_norm": 6.6013630858810215,
"learning_rate": 4.59349593495935e-05,
"loss": 0.798,
"step": 333
},
{
"epoch": 0.5235109717868338,
"grad_norm": 0.4698608206009564,
"learning_rate": 4.590592334494774e-05,
"loss": 0.6797,
"step": 334
},
{
"epoch": 0.5250783699059561,
"grad_norm": 0.30037705694922096,
"learning_rate": 4.587688734030198e-05,
"loss": 0.6249,
"step": 335
},
{
"epoch": 0.5266457680250783,
"grad_norm": 0.481477349833268,
"learning_rate": 4.584785133565622e-05,
"loss": 0.6663,
"step": 336
},
{
"epoch": 0.5282131661442007,
"grad_norm": 1.7307534667017153,
"learning_rate": 4.581881533101046e-05,
"loss": 0.6506,
"step": 337
},
{
"epoch": 0.5297805642633229,
"grad_norm": 0.8464484361819015,
"learning_rate": 4.57897793263647e-05,
"loss": 0.6799,
"step": 338
},
{
"epoch": 0.5313479623824452,
"grad_norm": 0.34374480805915025,
"learning_rate": 4.576074332171894e-05,
"loss": 0.6121,
"step": 339
},
{
"epoch": 0.5329153605015674,
"grad_norm": 0.4219790195573184,
"learning_rate": 4.573170731707318e-05,
"loss": 0.6497,
"step": 340
},
{
"epoch": 0.5344827586206896,
"grad_norm": 0.40112503061723104,
"learning_rate": 4.570267131242742e-05,
"loss": 0.6536,
"step": 341
},
{
"epoch": 0.5360501567398119,
"grad_norm": 0.4362956706096327,
"learning_rate": 4.567363530778165e-05,
"loss": 0.5786,
"step": 342
},
{
"epoch": 0.5376175548589341,
"grad_norm": 0.37264633443497885,
"learning_rate": 4.564459930313589e-05,
"loss": 0.6722,
"step": 343
},
{
"epoch": 0.5391849529780565,
"grad_norm": 0.3578115812598959,
"learning_rate": 4.561556329849013e-05,
"loss": 0.6505,
"step": 344
},
{
"epoch": 0.5407523510971787,
"grad_norm": 1.591212057521109,
"learning_rate": 4.5586527293844364e-05,
"loss": 0.7345,
"step": 345
},
{
"epoch": 0.542319749216301,
"grad_norm": 0.5025893871076349,
"learning_rate": 4.5557491289198604e-05,
"loss": 0.6265,
"step": 346
},
{
"epoch": 0.5438871473354232,
"grad_norm": 1.8028130284350732,
"learning_rate": 4.5528455284552844e-05,
"loss": 0.761,
"step": 347
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.5008336970159709,
"learning_rate": 4.5499419279907084e-05,
"loss": 0.6348,
"step": 348
},
{
"epoch": 0.5470219435736677,
"grad_norm": 0.35031743138829397,
"learning_rate": 4.5470383275261325e-05,
"loss": 0.6924,
"step": 349
},
{
"epoch": 0.54858934169279,
"grad_norm": 0.5304825650671162,
"learning_rate": 4.5441347270615565e-05,
"loss": 0.6338,
"step": 350
},
{
"epoch": 0.5501567398119123,
"grad_norm": 0.3814609309183334,
"learning_rate": 4.5412311265969805e-05,
"loss": 0.6722,
"step": 351
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.4679483296725716,
"learning_rate": 4.5383275261324045e-05,
"loss": 0.7341,
"step": 352
},
{
"epoch": 0.5532915360501567,
"grad_norm": 0.44298805965902605,
"learning_rate": 4.5354239256678285e-05,
"loss": 0.5838,
"step": 353
},
{
"epoch": 0.554858934169279,
"grad_norm": 0.5859834884580318,
"learning_rate": 4.5325203252032525e-05,
"loss": 0.6785,
"step": 354
},
{
"epoch": 0.5564263322884012,
"grad_norm": 0.4853448630156449,
"learning_rate": 4.529616724738676e-05,
"loss": 0.6206,
"step": 355
},
{
"epoch": 0.5579937304075235,
"grad_norm": 0.3484991871727902,
"learning_rate": 4.5267131242741e-05,
"loss": 0.7485,
"step": 356
},
{
"epoch": 0.5595611285266457,
"grad_norm": 0.4086220184919094,
"learning_rate": 4.523809523809524e-05,
"loss": 0.601,
"step": 357
},
{
"epoch": 0.5611285266457681,
"grad_norm": 0.3532890385938265,
"learning_rate": 4.520905923344948e-05,
"loss": 0.7194,
"step": 358
},
{
"epoch": 0.5626959247648903,
"grad_norm": 0.39120909869314585,
"learning_rate": 4.518002322880372e-05,
"loss": 0.5672,
"step": 359
},
{
"epoch": 0.5642633228840125,
"grad_norm": 0.3643379775099069,
"learning_rate": 4.515098722415796e-05,
"loss": 0.6551,
"step": 360
},
{
"epoch": 0.5658307210031348,
"grad_norm": 0.3387937661468538,
"learning_rate": 4.51219512195122e-05,
"loss": 0.5854,
"step": 361
},
{
"epoch": 0.567398119122257,
"grad_norm": 0.2844501591631617,
"learning_rate": 4.509291521486644e-05,
"loss": 0.6544,
"step": 362
},
{
"epoch": 0.5689655172413793,
"grad_norm": 2.498416311860011,
"learning_rate": 4.506387921022068e-05,
"loss": 0.6858,
"step": 363
},
{
"epoch": 0.5705329153605015,
"grad_norm": 0.4409146364631813,
"learning_rate": 4.503484320557492e-05,
"loss": 0.6675,
"step": 364
},
{
"epoch": 0.5721003134796239,
"grad_norm": 1.960058084903314,
"learning_rate": 4.500580720092916e-05,
"loss": 0.631,
"step": 365
},
{
"epoch": 0.5736677115987461,
"grad_norm": 0.48320829331912285,
"learning_rate": 4.49767711962834e-05,
"loss": 0.6926,
"step": 366
},
{
"epoch": 0.5752351097178683,
"grad_norm": 0.3516586488979057,
"learning_rate": 4.494773519163763e-05,
"loss": 0.6743,
"step": 367
},
{
"epoch": 0.5768025078369906,
"grad_norm": 0.3880595890801754,
"learning_rate": 4.491869918699187e-05,
"loss": 0.6497,
"step": 368
},
{
"epoch": 0.5783699059561128,
"grad_norm": 0.4434212595833726,
"learning_rate": 4.4889663182346106e-05,
"loss": 0.6556,
"step": 369
},
{
"epoch": 0.5799373040752351,
"grad_norm": 0.48851722892846305,
"learning_rate": 4.4860627177700346e-05,
"loss": 0.6211,
"step": 370
},
{
"epoch": 0.5815047021943573,
"grad_norm": 0.36769561570747455,
"learning_rate": 4.4831591173054586e-05,
"loss": 0.6936,
"step": 371
},
{
"epoch": 0.5830721003134797,
"grad_norm": 0.38960913312831963,
"learning_rate": 4.4802555168408826e-05,
"loss": 0.6524,
"step": 372
},
{
"epoch": 0.5846394984326019,
"grad_norm": 1.1358720169976089,
"learning_rate": 4.4773519163763066e-05,
"loss": 0.6365,
"step": 373
},
{
"epoch": 0.5862068965517241,
"grad_norm": 3.1779877879744425,
"learning_rate": 4.4744483159117306e-05,
"loss": 0.6028,
"step": 374
},
{
"epoch": 0.5877742946708464,
"grad_norm": 0.5553168494387453,
"learning_rate": 4.4715447154471546e-05,
"loss": 0.5984,
"step": 375
},
{
"epoch": 0.5893416927899686,
"grad_norm": 0.38410585227016236,
"learning_rate": 4.4686411149825786e-05,
"loss": 0.633,
"step": 376
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.4057357635238573,
"learning_rate": 4.4657375145180026e-05,
"loss": 0.6757,
"step": 377
},
{
"epoch": 0.5924764890282131,
"grad_norm": 0.36024440697049104,
"learning_rate": 4.4628339140534266e-05,
"loss": 0.6335,
"step": 378
},
{
"epoch": 0.5940438871473355,
"grad_norm": 0.4339386820141435,
"learning_rate": 4.4599303135888506e-05,
"loss": 0.5938,
"step": 379
},
{
"epoch": 0.5956112852664577,
"grad_norm": 0.37622691231993677,
"learning_rate": 4.4570267131242746e-05,
"loss": 0.6087,
"step": 380
},
{
"epoch": 0.5971786833855799,
"grad_norm": 0.4044583412106807,
"learning_rate": 4.454123112659698e-05,
"loss": 0.6906,
"step": 381
},
{
"epoch": 0.5987460815047022,
"grad_norm": 0.3848229449304871,
"learning_rate": 4.451219512195122e-05,
"loss": 0.6304,
"step": 382
},
{
"epoch": 0.6003134796238244,
"grad_norm": 4.443345384716547,
"learning_rate": 4.448315911730546e-05,
"loss": 0.8514,
"step": 383
},
{
"epoch": 0.6018808777429467,
"grad_norm": 0.4755429150686569,
"learning_rate": 4.44541231126597e-05,
"loss": 0.6424,
"step": 384
},
{
"epoch": 0.603448275862069,
"grad_norm": 0.3854756229266763,
"learning_rate": 4.442508710801394e-05,
"loss": 0.6567,
"step": 385
},
{
"epoch": 0.6050156739811913,
"grad_norm": 0.4049602271475368,
"learning_rate": 4.439605110336818e-05,
"loss": 0.6266,
"step": 386
},
{
"epoch": 0.6065830721003135,
"grad_norm": 0.3638731314794061,
"learning_rate": 4.436701509872242e-05,
"loss": 0.683,
"step": 387
},
{
"epoch": 0.6081504702194357,
"grad_norm": 0.39009856184963637,
"learning_rate": 4.433797909407666e-05,
"loss": 0.6324,
"step": 388
},
{
"epoch": 0.609717868338558,
"grad_norm": 0.3790678465015395,
"learning_rate": 4.43089430894309e-05,
"loss": 0.597,
"step": 389
},
{
"epoch": 0.6112852664576802,
"grad_norm": 0.31068223879201406,
"learning_rate": 4.427990708478514e-05,
"loss": 0.6804,
"step": 390
},
{
"epoch": 0.6128526645768025,
"grad_norm": 0.43366870933703133,
"learning_rate": 4.4250871080139374e-05,
"loss": 0.6657,
"step": 391
},
{
"epoch": 0.6144200626959248,
"grad_norm": 0.26308546049003334,
"learning_rate": 4.4221835075493614e-05,
"loss": 0.5998,
"step": 392
},
{
"epoch": 0.6159874608150471,
"grad_norm": 0.30584665544105244,
"learning_rate": 4.4192799070847854e-05,
"loss": 0.5472,
"step": 393
},
{
"epoch": 0.6175548589341693,
"grad_norm": 0.4451094350250893,
"learning_rate": 4.4163763066202094e-05,
"loss": 0.5785,
"step": 394
},
{
"epoch": 0.6191222570532915,
"grad_norm": 0.3146847206512534,
"learning_rate": 4.413472706155633e-05,
"loss": 0.6891,
"step": 395
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.3811733173675268,
"learning_rate": 4.410569105691057e-05,
"loss": 0.6918,
"step": 396
},
{
"epoch": 0.622257053291536,
"grad_norm": 0.2849954597038435,
"learning_rate": 4.407665505226481e-05,
"loss": 0.6105,
"step": 397
},
{
"epoch": 0.6238244514106583,
"grad_norm": 0.3357304787171675,
"learning_rate": 4.404761904761905e-05,
"loss": 0.7043,
"step": 398
},
{
"epoch": 0.6253918495297806,
"grad_norm": 0.35374691651167883,
"learning_rate": 4.401858304297329e-05,
"loss": 0.6752,
"step": 399
},
{
"epoch": 0.6269592476489029,
"grad_norm": 0.34015061829935833,
"learning_rate": 4.398954703832753e-05,
"loss": 0.6235,
"step": 400
},
{
"epoch": 0.6285266457680251,
"grad_norm": 0.3734062849423573,
"learning_rate": 4.396051103368177e-05,
"loss": 0.7157,
"step": 401
},
{
"epoch": 0.6300940438871473,
"grad_norm": 0.325781498212067,
"learning_rate": 4.393147502903601e-05,
"loss": 0.6545,
"step": 402
},
{
"epoch": 0.6316614420062696,
"grad_norm": 0.3477090664156203,
"learning_rate": 4.390243902439025e-05,
"loss": 0.6185,
"step": 403
},
{
"epoch": 0.6332288401253918,
"grad_norm": 0.3053312496243885,
"learning_rate": 4.387340301974449e-05,
"loss": 0.6017,
"step": 404
},
{
"epoch": 0.6347962382445141,
"grad_norm": 2.4901930935905865,
"learning_rate": 4.384436701509873e-05,
"loss": 0.6829,
"step": 405
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.48854247833463954,
"learning_rate": 4.381533101045297e-05,
"loss": 0.6012,
"step": 406
},
{
"epoch": 0.6379310344827587,
"grad_norm": 0.36872367448985827,
"learning_rate": 4.37862950058072e-05,
"loss": 0.695,
"step": 407
},
{
"epoch": 0.6394984326018809,
"grad_norm": 0.4947038714173907,
"learning_rate": 4.375725900116144e-05,
"loss": 0.718,
"step": 408
},
{
"epoch": 0.6410658307210031,
"grad_norm": 0.3568617271267534,
"learning_rate": 4.372822299651568e-05,
"loss": 0.6902,
"step": 409
},
{
"epoch": 0.6426332288401254,
"grad_norm": 0.5865735757236012,
"learning_rate": 4.369918699186992e-05,
"loss": 0.5996,
"step": 410
},
{
"epoch": 0.6442006269592476,
"grad_norm": 0.3763171611660941,
"learning_rate": 4.367015098722416e-05,
"loss": 0.6801,
"step": 411
},
{
"epoch": 0.64576802507837,
"grad_norm": 0.5704266627503888,
"learning_rate": 4.36411149825784e-05,
"loss": 0.6577,
"step": 412
},
{
"epoch": 0.6473354231974922,
"grad_norm": 0.45914990199559125,
"learning_rate": 4.361207897793264e-05,
"loss": 0.6506,
"step": 413
},
{
"epoch": 0.6489028213166145,
"grad_norm": 0.2675304633204114,
"learning_rate": 4.358304297328688e-05,
"loss": 0.6006,
"step": 414
},
{
"epoch": 0.6504702194357367,
"grad_norm": 0.36353430473957776,
"learning_rate": 4.3554006968641115e-05,
"loss": 0.6241,
"step": 415
},
{
"epoch": 0.6520376175548589,
"grad_norm": 4.309189046564216,
"learning_rate": 4.3524970963995355e-05,
"loss": 0.7289,
"step": 416
},
{
"epoch": 0.6536050156739812,
"grad_norm": 0.48737867465669055,
"learning_rate": 4.3495934959349595e-05,
"loss": 0.7723,
"step": 417
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.3543814011213638,
"learning_rate": 4.3466898954703835e-05,
"loss": 0.666,
"step": 418
},
{
"epoch": 0.6567398119122257,
"grad_norm": 0.3808052737942801,
"learning_rate": 4.3437862950058075e-05,
"loss": 0.692,
"step": 419
},
{
"epoch": 0.658307210031348,
"grad_norm": 0.3758624643690562,
"learning_rate": 4.3408826945412315e-05,
"loss": 0.6238,
"step": 420
},
{
"epoch": 0.6598746081504702,
"grad_norm": 0.38488172754025846,
"learning_rate": 4.337979094076655e-05,
"loss": 0.6563,
"step": 421
},
{
"epoch": 0.6614420062695925,
"grad_norm": 1.8649696784051404,
"learning_rate": 4.335075493612079e-05,
"loss": 0.5935,
"step": 422
},
{
"epoch": 0.6630094043887147,
"grad_norm": 0.3958557829316864,
"learning_rate": 4.332171893147503e-05,
"loss": 0.5522,
"step": 423
},
{
"epoch": 0.664576802507837,
"grad_norm": 0.40003562483689326,
"learning_rate": 4.329268292682927e-05,
"loss": 0.6918,
"step": 424
},
{
"epoch": 0.6661442006269592,
"grad_norm": 1.4743612089024112,
"learning_rate": 4.326364692218351e-05,
"loss": 0.707,
"step": 425
},
{
"epoch": 0.6677115987460815,
"grad_norm": 0.41505425946842317,
"learning_rate": 4.323461091753775e-05,
"loss": 0.6805,
"step": 426
},
{
"epoch": 0.6692789968652038,
"grad_norm": 0.43412990725784295,
"learning_rate": 4.320557491289199e-05,
"loss": 0.6426,
"step": 427
},
{
"epoch": 0.670846394984326,
"grad_norm": 0.3300227694892137,
"learning_rate": 4.317653890824623e-05,
"loss": 0.6823,
"step": 428
},
{
"epoch": 0.6724137931034483,
"grad_norm": 0.3386178997612696,
"learning_rate": 4.314750290360047e-05,
"loss": 0.5853,
"step": 429
},
{
"epoch": 0.6739811912225705,
"grad_norm": 1.8095247543757185,
"learning_rate": 4.311846689895471e-05,
"loss": 0.8027,
"step": 430
},
{
"epoch": 0.6755485893416928,
"grad_norm": 0.4598174979484723,
"learning_rate": 4.308943089430895e-05,
"loss": 0.699,
"step": 431
},
{
"epoch": 0.677115987460815,
"grad_norm": 0.3564497697308691,
"learning_rate": 4.306039488966318e-05,
"loss": 0.7092,
"step": 432
},
{
"epoch": 0.6786833855799373,
"grad_norm": 0.5205368420888661,
"learning_rate": 4.303135888501742e-05,
"loss": 0.6635,
"step": 433
},
{
"epoch": 0.6802507836990596,
"grad_norm": 0.33843005622079836,
"learning_rate": 4.300232288037166e-05,
"loss": 0.6045,
"step": 434
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.3107807698177877,
"learning_rate": 4.29732868757259e-05,
"loss": 0.6206,
"step": 435
},
{
"epoch": 0.6833855799373041,
"grad_norm": 1.942838490658471,
"learning_rate": 4.294425087108014e-05,
"loss": 0.6102,
"step": 436
},
{
"epoch": 0.6849529780564263,
"grad_norm": 2.2088063472134394,
"learning_rate": 4.291521486643438e-05,
"loss": 0.5535,
"step": 437
},
{
"epoch": 0.6865203761755486,
"grad_norm": 0.44888910875757826,
"learning_rate": 4.2886178861788616e-05,
"loss": 0.533,
"step": 438
},
{
"epoch": 0.6880877742946708,
"grad_norm": 0.2926773495489017,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.5803,
"step": 439
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.46667562100728727,
"learning_rate": 4.2828106852497096e-05,
"loss": 0.5853,
"step": 440
},
{
"epoch": 0.6912225705329154,
"grad_norm": 0.35680895480518715,
"learning_rate": 4.2799070847851336e-05,
"loss": 0.619,
"step": 441
},
{
"epoch": 0.6927899686520376,
"grad_norm": 0.3147016077242331,
"learning_rate": 4.2770034843205577e-05,
"loss": 0.5743,
"step": 442
},
{
"epoch": 0.6943573667711599,
"grad_norm": 0.3992585401081134,
"learning_rate": 4.2740998838559817e-05,
"loss": 0.5386,
"step": 443
},
{
"epoch": 0.6959247648902821,
"grad_norm": 0.39530008665122773,
"learning_rate": 4.271196283391406e-05,
"loss": 0.6987,
"step": 444
},
{
"epoch": 0.6974921630094044,
"grad_norm": 0.25371346897898583,
"learning_rate": 4.26829268292683e-05,
"loss": 0.613,
"step": 445
},
{
"epoch": 0.6990595611285266,
"grad_norm": 0.35512056249942525,
"learning_rate": 4.265389082462253e-05,
"loss": 0.643,
"step": 446
},
{
"epoch": 0.700626959247649,
"grad_norm": 0.35213216046467644,
"learning_rate": 4.262485481997677e-05,
"loss": 0.5911,
"step": 447
},
{
"epoch": 0.7021943573667712,
"grad_norm": 0.3069468317616531,
"learning_rate": 4.259581881533101e-05,
"loss": 0.6211,
"step": 448
},
{
"epoch": 0.7037617554858934,
"grad_norm": 0.38056346006186087,
"learning_rate": 4.256678281068525e-05,
"loss": 0.6286,
"step": 449
},
{
"epoch": 0.7053291536050157,
"grad_norm": 0.29479855855937276,
"learning_rate": 4.253774680603949e-05,
"loss": 0.5861,
"step": 450
},
{
"epoch": 0.7068965517241379,
"grad_norm": 0.31530437666301986,
"learning_rate": 4.250871080139373e-05,
"loss": 0.6127,
"step": 451
},
{
"epoch": 0.7084639498432602,
"grad_norm": 0.31040419218209503,
"learning_rate": 4.247967479674797e-05,
"loss": 0.5966,
"step": 452
},
{
"epoch": 0.7100313479623824,
"grad_norm": 0.27974408686174956,
"learning_rate": 4.245063879210221e-05,
"loss": 0.57,
"step": 453
},
{
"epoch": 0.7115987460815048,
"grad_norm": 0.3354074543093332,
"learning_rate": 4.242160278745645e-05,
"loss": 0.5618,
"step": 454
},
{
"epoch": 0.713166144200627,
"grad_norm": 0.3041983074214006,
"learning_rate": 4.239256678281069e-05,
"loss": 0.6077,
"step": 455
},
{
"epoch": 0.7147335423197492,
"grad_norm": 0.3043403060903483,
"learning_rate": 4.236353077816493e-05,
"loss": 0.7194,
"step": 456
},
{
"epoch": 0.7163009404388715,
"grad_norm": 0.2597921942280548,
"learning_rate": 4.233449477351917e-05,
"loss": 0.5213,
"step": 457
},
{
"epoch": 0.7178683385579937,
"grad_norm": 0.33297727240229413,
"learning_rate": 4.2305458768873404e-05,
"loss": 0.6578,
"step": 458
},
{
"epoch": 0.719435736677116,
"grad_norm": 0.2819447028644682,
"learning_rate": 4.2276422764227644e-05,
"loss": 0.6114,
"step": 459
},
{
"epoch": 0.7210031347962382,
"grad_norm": 0.30853066466274703,
"learning_rate": 4.2247386759581884e-05,
"loss": 0.6446,
"step": 460
},
{
"epoch": 0.7225705329153606,
"grad_norm": 0.29613718566039543,
"learning_rate": 4.221835075493612e-05,
"loss": 0.5825,
"step": 461
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.29055460433825414,
"learning_rate": 4.218931475029036e-05,
"loss": 0.5874,
"step": 462
},
{
"epoch": 0.725705329153605,
"grad_norm": 0.282087944294198,
"learning_rate": 4.21602787456446e-05,
"loss": 0.6415,
"step": 463
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.5316004402515562,
"learning_rate": 4.213124274099884e-05,
"loss": 0.6776,
"step": 464
},
{
"epoch": 0.7288401253918495,
"grad_norm": 0.28607682315084804,
"learning_rate": 4.210220673635308e-05,
"loss": 0.6177,
"step": 465
},
{
"epoch": 0.7304075235109718,
"grad_norm": 0.2838890595401558,
"learning_rate": 4.207317073170732e-05,
"loss": 0.6377,
"step": 466
},
{
"epoch": 0.731974921630094,
"grad_norm": 0.3174863649213358,
"learning_rate": 4.204413472706156e-05,
"loss": 0.7339,
"step": 467
},
{
"epoch": 0.7335423197492164,
"grad_norm": 0.2653298580131076,
"learning_rate": 4.20150987224158e-05,
"loss": 0.5859,
"step": 468
},
{
"epoch": 0.7351097178683386,
"grad_norm": 0.2620221919560746,
"learning_rate": 4.198606271777004e-05,
"loss": 0.6088,
"step": 469
},
{
"epoch": 0.7366771159874608,
"grad_norm": 0.2997060077148446,
"learning_rate": 4.195702671312428e-05,
"loss": 0.6633,
"step": 470
},
{
"epoch": 0.7382445141065831,
"grad_norm": 0.29133301882567897,
"learning_rate": 4.192799070847852e-05,
"loss": 0.6689,
"step": 471
},
{
"epoch": 0.7398119122257053,
"grad_norm": 0.2665849017135695,
"learning_rate": 4.189895470383275e-05,
"loss": 0.6098,
"step": 472
},
{
"epoch": 0.7413793103448276,
"grad_norm": 0.26687122264594487,
"learning_rate": 4.186991869918699e-05,
"loss": 0.5882,
"step": 473
},
{
"epoch": 0.7429467084639498,
"grad_norm": 1.093676516035668,
"learning_rate": 4.184088269454123e-05,
"loss": 0.6345,
"step": 474
},
{
"epoch": 0.7445141065830722,
"grad_norm": 0.2861340337595245,
"learning_rate": 4.181184668989547e-05,
"loss": 0.6231,
"step": 475
},
{
"epoch": 0.7460815047021944,
"grad_norm": 0.27457277348542236,
"learning_rate": 4.178281068524971e-05,
"loss": 0.6315,
"step": 476
},
{
"epoch": 0.7476489028213166,
"grad_norm": 0.2637985596134815,
"learning_rate": 4.175377468060395e-05,
"loss": 0.6385,
"step": 477
},
{
"epoch": 0.7492163009404389,
"grad_norm": 0.2519755980750552,
"learning_rate": 4.172473867595819e-05,
"loss": 0.6349,
"step": 478
},
{
"epoch": 0.7507836990595611,
"grad_norm": 0.25961352779970687,
"learning_rate": 4.169570267131243e-05,
"loss": 0.5987,
"step": 479
},
{
"epoch": 0.7523510971786834,
"grad_norm": 0.27958886009441103,
"learning_rate": 4.166666666666667e-05,
"loss": 0.6889,
"step": 480
},
{
"epoch": 0.7539184952978056,
"grad_norm": 0.2644974769391109,
"learning_rate": 4.163763066202091e-05,
"loss": 0.6526,
"step": 481
},
{
"epoch": 0.7554858934169278,
"grad_norm": 0.2760545167224485,
"learning_rate": 4.160859465737515e-05,
"loss": 0.6679,
"step": 482
},
{
"epoch": 0.7570532915360502,
"grad_norm": 0.25351778860804824,
"learning_rate": 4.157955865272939e-05,
"loss": 0.7046,
"step": 483
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.275238590837339,
"learning_rate": 4.1550522648083626e-05,
"loss": 0.6567,
"step": 484
},
{
"epoch": 0.7601880877742947,
"grad_norm": 0.28579664629771834,
"learning_rate": 4.1521486643437866e-05,
"loss": 0.6764,
"step": 485
},
{
"epoch": 0.7617554858934169,
"grad_norm": 0.264432881111492,
"learning_rate": 4.14924506387921e-05,
"loss": 0.6053,
"step": 486
},
{
"epoch": 0.7633228840125392,
"grad_norm": 0.34446852752115725,
"learning_rate": 4.146341463414634e-05,
"loss": 0.5182,
"step": 487
},
{
"epoch": 0.7648902821316614,
"grad_norm": 0.32616264832222647,
"learning_rate": 4.143437862950058e-05,
"loss": 0.722,
"step": 488
},
{
"epoch": 0.7664576802507836,
"grad_norm": 0.26198273093328805,
"learning_rate": 4.140534262485482e-05,
"loss": 0.6733,
"step": 489
},
{
"epoch": 0.768025078369906,
"grad_norm": 0.2848401393408249,
"learning_rate": 4.137630662020906e-05,
"loss": 0.5746,
"step": 490
},
{
"epoch": 0.7695924764890282,
"grad_norm": 0.2715242769883968,
"learning_rate": 4.13472706155633e-05,
"loss": 0.6217,
"step": 491
},
{
"epoch": 0.7711598746081505,
"grad_norm": 0.5731642509889413,
"learning_rate": 4.131823461091754e-05,
"loss": 0.6432,
"step": 492
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.27464371861212766,
"learning_rate": 4.128919860627178e-05,
"loss": 0.559,
"step": 493
},
{
"epoch": 0.774294670846395,
"grad_norm": 0.2789717585662965,
"learning_rate": 4.126016260162602e-05,
"loss": 0.5736,
"step": 494
},
{
"epoch": 0.7758620689655172,
"grad_norm": 0.2678765379212865,
"learning_rate": 4.123112659698026e-05,
"loss": 0.6374,
"step": 495
},
{
"epoch": 0.7774294670846394,
"grad_norm": 0.2578807318192767,
"learning_rate": 4.12020905923345e-05,
"loss": 0.6365,
"step": 496
},
{
"epoch": 0.7789968652037618,
"grad_norm": 0.27201728340842485,
"learning_rate": 4.117305458768874e-05,
"loss": 0.6496,
"step": 497
},
{
"epoch": 0.780564263322884,
"grad_norm": 0.24556920587297879,
"learning_rate": 4.114401858304297e-05,
"loss": 0.5664,
"step": 498
},
{
"epoch": 0.7821316614420063,
"grad_norm": 0.25598792236532125,
"learning_rate": 4.111498257839721e-05,
"loss": 0.5814,
"step": 499
},
{
"epoch": 0.7836990595611285,
"grad_norm": 0.28254373818818224,
"learning_rate": 4.108594657375145e-05,
"loss": 0.6207,
"step": 500
},
{
"epoch": 0.7852664576802508,
"grad_norm": 0.27152785996512757,
"learning_rate": 4.105691056910569e-05,
"loss": 0.6384,
"step": 501
},
{
"epoch": 0.786833855799373,
"grad_norm": 0.26099779970743886,
"learning_rate": 4.102787456445993e-05,
"loss": 0.6431,
"step": 502
},
{
"epoch": 0.7884012539184952,
"grad_norm": 0.3182094000352917,
"learning_rate": 4.099883855981417e-05,
"loss": 0.7526,
"step": 503
},
{
"epoch": 0.7899686520376176,
"grad_norm": 0.47257222040770286,
"learning_rate": 4.0969802555168413e-05,
"loss": 0.601,
"step": 504
},
{
"epoch": 0.7915360501567398,
"grad_norm": 0.2964522644159519,
"learning_rate": 4.0940766550522653e-05,
"loss": 0.6728,
"step": 505
},
{
"epoch": 0.7931034482758621,
"grad_norm": 0.30213443967922515,
"learning_rate": 4.0911730545876894e-05,
"loss": 0.5924,
"step": 506
},
{
"epoch": 0.7946708463949843,
"grad_norm": 0.35847472884454135,
"learning_rate": 4.0882694541231134e-05,
"loss": 0.6573,
"step": 507
},
{
"epoch": 0.7962382445141066,
"grad_norm": 0.24789550554709405,
"learning_rate": 4.085365853658537e-05,
"loss": 0.5572,
"step": 508
},
{
"epoch": 0.7978056426332288,
"grad_norm": 0.3406800714083669,
"learning_rate": 4.082462253193961e-05,
"loss": 0.5831,
"step": 509
},
{
"epoch": 0.799373040752351,
"grad_norm": 0.30300076604800236,
"learning_rate": 4.079558652729385e-05,
"loss": 0.6244,
"step": 510
},
{
"epoch": 0.8009404388714734,
"grad_norm": 0.2499399302075023,
"learning_rate": 4.076655052264808e-05,
"loss": 0.5962,
"step": 511
},
{
"epoch": 0.8025078369905956,
"grad_norm": 0.3127275202981103,
"learning_rate": 4.073751451800232e-05,
"loss": 0.6107,
"step": 512
},
{
"epoch": 0.8040752351097179,
"grad_norm": 0.30491683082813964,
"learning_rate": 4.070847851335656e-05,
"loss": 0.6315,
"step": 513
},
{
"epoch": 0.8056426332288401,
"grad_norm": 0.30700013150155575,
"learning_rate": 4.06794425087108e-05,
"loss": 0.5855,
"step": 514
},
{
"epoch": 0.8072100313479624,
"grad_norm": 0.2862676057511448,
"learning_rate": 4.065040650406504e-05,
"loss": 0.5996,
"step": 515
},
{
"epoch": 0.8087774294670846,
"grad_norm": 0.26420261907706766,
"learning_rate": 4.062137049941928e-05,
"loss": 0.4921,
"step": 516
},
{
"epoch": 0.8103448275862069,
"grad_norm": 0.2801864496726549,
"learning_rate": 4.059233449477352e-05,
"loss": 0.6004,
"step": 517
},
{
"epoch": 0.8119122257053292,
"grad_norm": 0.31021961179028656,
"learning_rate": 4.056329849012776e-05,
"loss": 0.6238,
"step": 518
},
{
"epoch": 0.8134796238244514,
"grad_norm": 0.4463627984726597,
"learning_rate": 4.0534262485482e-05,
"loss": 0.5959,
"step": 519
},
{
"epoch": 0.8150470219435737,
"grad_norm": 0.35886637862758114,
"learning_rate": 4.050522648083624e-05,
"loss": 0.6965,
"step": 520
},
{
"epoch": 0.8166144200626959,
"grad_norm": 14.859144154580198,
"learning_rate": 4.047619047619048e-05,
"loss": 0.9688,
"step": 521
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.3606273039305309,
"learning_rate": 4.044715447154472e-05,
"loss": 0.687,
"step": 522
},
{
"epoch": 0.8197492163009404,
"grad_norm": 0.2641777673392744,
"learning_rate": 4.0418118466898954e-05,
"loss": 0.5455,
"step": 523
},
{
"epoch": 0.8213166144200627,
"grad_norm": 0.30901261897806276,
"learning_rate": 4.0389082462253194e-05,
"loss": 0.6062,
"step": 524
},
{
"epoch": 0.822884012539185,
"grad_norm": 0.27063859564634596,
"learning_rate": 4.0360046457607435e-05,
"loss": 0.5982,
"step": 525
},
{
"epoch": 0.8244514106583072,
"grad_norm": 0.3910703785001249,
"learning_rate": 4.0331010452961675e-05,
"loss": 0.6444,
"step": 526
},
{
"epoch": 0.8260188087774295,
"grad_norm": 0.3291591101691133,
"learning_rate": 4.0301974448315915e-05,
"loss": 0.6468,
"step": 527
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.324394430360334,
"learning_rate": 4.0272938443670155e-05,
"loss": 0.5634,
"step": 528
},
{
"epoch": 0.829153605015674,
"grad_norm": 0.4009621202082367,
"learning_rate": 4.0243902439024395e-05,
"loss": 0.6345,
"step": 529
},
{
"epoch": 0.8307210031347962,
"grad_norm": 0.2993379733946394,
"learning_rate": 4.0214866434378635e-05,
"loss": 0.5812,
"step": 530
},
{
"epoch": 0.8322884012539185,
"grad_norm": 0.3303554807746441,
"learning_rate": 4.018583042973287e-05,
"loss": 0.5792,
"step": 531
},
{
"epoch": 0.8338557993730408,
"grad_norm": 0.2607385771730796,
"learning_rate": 4.015679442508711e-05,
"loss": 0.5822,
"step": 532
},
{
"epoch": 0.835423197492163,
"grad_norm": 0.29784499328331143,
"learning_rate": 4.012775842044135e-05,
"loss": 0.6396,
"step": 533
},
{
"epoch": 0.8369905956112853,
"grad_norm": 0.277519931171073,
"learning_rate": 4.009872241579559e-05,
"loss": 0.5706,
"step": 534
},
{
"epoch": 0.8385579937304075,
"grad_norm": 0.3408744124104284,
"learning_rate": 4.006968641114983e-05,
"loss": 0.6543,
"step": 535
},
{
"epoch": 0.8401253918495298,
"grad_norm": 0.253445487292907,
"learning_rate": 4.004065040650407e-05,
"loss": 0.6002,
"step": 536
},
{
"epoch": 0.841692789968652,
"grad_norm": 0.275752059476966,
"learning_rate": 4.00116144018583e-05,
"loss": 0.6634,
"step": 537
},
{
"epoch": 0.8432601880877743,
"grad_norm": 0.3213094016509197,
"learning_rate": 3.998257839721254e-05,
"loss": 0.6263,
"step": 538
},
{
"epoch": 0.8448275862068966,
"grad_norm": 0.2590536256592508,
"learning_rate": 3.995354239256678e-05,
"loss": 0.5999,
"step": 539
},
{
"epoch": 0.8463949843260188,
"grad_norm": 0.2847861644395836,
"learning_rate": 3.992450638792102e-05,
"loss": 0.5971,
"step": 540
},
{
"epoch": 0.8479623824451411,
"grad_norm": 0.28000092567278617,
"learning_rate": 3.989547038327526e-05,
"loss": 0.6351,
"step": 541
},
{
"epoch": 0.8495297805642633,
"grad_norm": 0.2626764547439551,
"learning_rate": 3.98664343786295e-05,
"loss": 0.5873,
"step": 542
},
{
"epoch": 0.8510971786833855,
"grad_norm": 0.24680654844931224,
"learning_rate": 3.983739837398374e-05,
"loss": 0.5728,
"step": 543
},
{
"epoch": 0.8526645768025078,
"grad_norm": 0.28440552715618767,
"learning_rate": 3.980836236933798e-05,
"loss": 0.6578,
"step": 544
},
{
"epoch": 0.85423197492163,
"grad_norm": 0.25672977989435464,
"learning_rate": 3.977932636469222e-05,
"loss": 0.6251,
"step": 545
},
{
"epoch": 0.8557993730407524,
"grad_norm": 0.24960788410794055,
"learning_rate": 3.975029036004646e-05,
"loss": 0.5637,
"step": 546
},
{
"epoch": 0.8573667711598746,
"grad_norm": 0.2810467150062857,
"learning_rate": 3.97212543554007e-05,
"loss": 0.6283,
"step": 547
},
{
"epoch": 0.8589341692789969,
"grad_norm": 0.4130385782799843,
"learning_rate": 3.969221835075494e-05,
"loss": 0.5886,
"step": 548
},
{
"epoch": 0.8605015673981191,
"grad_norm": 2.8763759176337196,
"learning_rate": 3.9663182346109176e-05,
"loss": 0.5442,
"step": 549
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.39094561767695846,
"learning_rate": 3.9634146341463416e-05,
"loss": 0.6112,
"step": 550
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.29328490203742436,
"learning_rate": 3.9605110336817656e-05,
"loss": 0.7033,
"step": 551
},
{
"epoch": 0.8652037617554859,
"grad_norm": 0.33256505466027014,
"learning_rate": 3.9576074332171896e-05,
"loss": 0.6226,
"step": 552
},
{
"epoch": 0.8667711598746082,
"grad_norm": 0.3436734709625679,
"learning_rate": 3.9547038327526136e-05,
"loss": 0.5955,
"step": 553
},
{
"epoch": 0.8683385579937304,
"grad_norm": 0.2988002772553571,
"learning_rate": 3.951800232288037e-05,
"loss": 0.6844,
"step": 554
},
{
"epoch": 0.8699059561128527,
"grad_norm": 0.3943979880949569,
"learning_rate": 3.948896631823461e-05,
"loss": 0.6475,
"step": 555
},
{
"epoch": 0.8714733542319749,
"grad_norm": 0.27177234282892526,
"learning_rate": 3.945993031358885e-05,
"loss": 0.5958,
"step": 556
},
{
"epoch": 0.8730407523510971,
"grad_norm": 0.3570434423423023,
"learning_rate": 3.943089430894309e-05,
"loss": 0.6098,
"step": 557
},
{
"epoch": 0.8746081504702194,
"grad_norm": 0.32488019197000817,
"learning_rate": 3.940185830429733e-05,
"loss": 0.6096,
"step": 558
},
{
"epoch": 0.8761755485893417,
"grad_norm": 0.34002422668776316,
"learning_rate": 3.937282229965157e-05,
"loss": 0.6773,
"step": 559
},
{
"epoch": 0.877742946708464,
"grad_norm": 0.32402960320703117,
"learning_rate": 3.934378629500581e-05,
"loss": 0.653,
"step": 560
},
{
"epoch": 0.8793103448275862,
"grad_norm": 0.25172675859554433,
"learning_rate": 3.931475029036005e-05,
"loss": 0.5916,
"step": 561
},
{
"epoch": 0.8808777429467085,
"grad_norm": 0.2933596102808772,
"learning_rate": 3.928571428571429e-05,
"loss": 0.5889,
"step": 562
},
{
"epoch": 0.8824451410658307,
"grad_norm": 0.24616214549052226,
"learning_rate": 3.925667828106852e-05,
"loss": 0.6169,
"step": 563
},
{
"epoch": 0.8840125391849529,
"grad_norm": 0.23526321173003545,
"learning_rate": 3.922764227642276e-05,
"loss": 0.5732,
"step": 564
},
{
"epoch": 0.8855799373040752,
"grad_norm": 0.2668797446344972,
"learning_rate": 3.9198606271777003e-05,
"loss": 0.5955,
"step": 565
},
{
"epoch": 0.8871473354231975,
"grad_norm": 0.2680692771508721,
"learning_rate": 3.9169570267131244e-05,
"loss": 0.6012,
"step": 566
},
{
"epoch": 0.8887147335423198,
"grad_norm": 0.2711362379236926,
"learning_rate": 3.9140534262485484e-05,
"loss": 0.6482,
"step": 567
},
{
"epoch": 0.890282131661442,
"grad_norm": 0.25108962374727634,
"learning_rate": 3.9111498257839724e-05,
"loss": 0.5822,
"step": 568
},
{
"epoch": 0.8918495297805643,
"grad_norm": 1.9796436404605717,
"learning_rate": 3.9082462253193964e-05,
"loss": 0.643,
"step": 569
},
{
"epoch": 0.8934169278996865,
"grad_norm": 0.26624589190726405,
"learning_rate": 3.9053426248548204e-05,
"loss": 0.6015,
"step": 570
},
{
"epoch": 0.8949843260188087,
"grad_norm": 0.29608938532394574,
"learning_rate": 3.9024390243902444e-05,
"loss": 0.6673,
"step": 571
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.2316171633303246,
"learning_rate": 3.8995354239256684e-05,
"loss": 0.5115,
"step": 572
},
{
"epoch": 0.8981191222570533,
"grad_norm": 0.280461262526486,
"learning_rate": 3.8966318234610924e-05,
"loss": 0.6094,
"step": 573
},
{
"epoch": 0.8996865203761756,
"grad_norm": 0.24759894152489048,
"learning_rate": 3.8937282229965164e-05,
"loss": 0.5757,
"step": 574
},
{
"epoch": 0.9012539184952978,
"grad_norm": 0.2693472659881042,
"learning_rate": 3.89082462253194e-05,
"loss": 0.6202,
"step": 575
},
{
"epoch": 0.9028213166144201,
"grad_norm": 0.2576112594704447,
"learning_rate": 3.887921022067364e-05,
"loss": 0.5795,
"step": 576
},
{
"epoch": 0.9043887147335423,
"grad_norm": 0.2658989434569696,
"learning_rate": 3.885017421602787e-05,
"loss": 0.5569,
"step": 577
},
{
"epoch": 0.9059561128526645,
"grad_norm": 0.2671974775503844,
"learning_rate": 3.882113821138211e-05,
"loss": 0.5988,
"step": 578
},
{
"epoch": 0.9075235109717869,
"grad_norm": 0.23802841020275206,
"learning_rate": 3.879210220673635e-05,
"loss": 0.5742,
"step": 579
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.2954924658750665,
"learning_rate": 3.876306620209059e-05,
"loss": 0.6228,
"step": 580
},
{
"epoch": 0.9106583072100314,
"grad_norm": 0.2442008942694883,
"learning_rate": 3.873403019744483e-05,
"loss": 0.5045,
"step": 581
},
{
"epoch": 0.9122257053291536,
"grad_norm": 0.2749575837119481,
"learning_rate": 3.870499419279907e-05,
"loss": 0.6773,
"step": 582
},
{
"epoch": 0.9137931034482759,
"grad_norm": 0.24493104400253252,
"learning_rate": 3.867595818815331e-05,
"loss": 0.6108,
"step": 583
},
{
"epoch": 0.9153605015673981,
"grad_norm": 6.675947859360387,
"learning_rate": 3.864692218350755e-05,
"loss": 0.8806,
"step": 584
},
{
"epoch": 0.9169278996865203,
"grad_norm": 0.325004773399676,
"learning_rate": 3.861788617886179e-05,
"loss": 0.6119,
"step": 585
},
{
"epoch": 0.9184952978056427,
"grad_norm": 0.2633675906382314,
"learning_rate": 3.858885017421603e-05,
"loss": 0.6308,
"step": 586
},
{
"epoch": 0.9200626959247649,
"grad_norm": 0.2869162064036577,
"learning_rate": 3.855981416957027e-05,
"loss": 0.7126,
"step": 587
},
{
"epoch": 0.9216300940438872,
"grad_norm": 0.2496607017974609,
"learning_rate": 3.8530778164924505e-05,
"loss": 0.5659,
"step": 588
},
{
"epoch": 0.9231974921630094,
"grad_norm": 0.29687970398231883,
"learning_rate": 3.8501742160278745e-05,
"loss": 0.6265,
"step": 589
},
{
"epoch": 0.9247648902821317,
"grad_norm": 0.27387503472940183,
"learning_rate": 3.8472706155632985e-05,
"loss": 0.6767,
"step": 590
},
{
"epoch": 0.9263322884012539,
"grad_norm": 0.28121043393395223,
"learning_rate": 3.8443670150987225e-05,
"loss": 0.5467,
"step": 591
},
{
"epoch": 0.9278996865203761,
"grad_norm": 0.2755014094291153,
"learning_rate": 3.8414634146341465e-05,
"loss": 0.7101,
"step": 592
},
{
"epoch": 0.9294670846394985,
"grad_norm": 0.26082176352654723,
"learning_rate": 3.8385598141695705e-05,
"loss": 0.5847,
"step": 593
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.3376531837145628,
"learning_rate": 3.8356562137049945e-05,
"loss": 0.6403,
"step": 594
},
{
"epoch": 0.932601880877743,
"grad_norm": 0.24591625709322615,
"learning_rate": 3.8327526132404185e-05,
"loss": 0.4997,
"step": 595
},
{
"epoch": 0.9341692789968652,
"grad_norm": 0.35037053405771423,
"learning_rate": 3.8298490127758425e-05,
"loss": 0.6225,
"step": 596
},
{
"epoch": 0.9357366771159875,
"grad_norm": 0.31868506158663296,
"learning_rate": 3.8269454123112665e-05,
"loss": 0.6352,
"step": 597
},
{
"epoch": 0.9373040752351097,
"grad_norm": 0.26842709406562054,
"learning_rate": 3.8240418118466905e-05,
"loss": 0.5546,
"step": 598
},
{
"epoch": 0.9388714733542319,
"grad_norm": 0.3233794255592773,
"learning_rate": 3.8211382113821145e-05,
"loss": 0.6162,
"step": 599
},
{
"epoch": 0.9404388714733543,
"grad_norm": 0.2682663468971888,
"learning_rate": 3.818234610917538e-05,
"loss": 0.6324,
"step": 600
},
{
"epoch": 0.9420062695924765,
"grad_norm": 0.31530960766810895,
"learning_rate": 3.815331010452962e-05,
"loss": 0.5598,
"step": 601
},
{
"epoch": 0.9435736677115988,
"grad_norm": 0.27134964750043344,
"learning_rate": 3.812427409988385e-05,
"loss": 0.6012,
"step": 602
},
{
"epoch": 0.945141065830721,
"grad_norm": 0.784862905334818,
"learning_rate": 3.809523809523809e-05,
"loss": 0.6048,
"step": 603
},
{
"epoch": 0.9467084639498433,
"grad_norm": 0.39331570935882215,
"learning_rate": 3.806620209059233e-05,
"loss": 0.6335,
"step": 604
},
{
"epoch": 0.9482758620689655,
"grad_norm": 0.35506302724513417,
"learning_rate": 3.803716608594657e-05,
"loss": 0.5705,
"step": 605
},
{
"epoch": 0.9498432601880877,
"grad_norm": 0.3161007133370568,
"learning_rate": 3.800813008130081e-05,
"loss": 0.5026,
"step": 606
},
{
"epoch": 0.95141065830721,
"grad_norm": 0.3217367407572658,
"learning_rate": 3.797909407665505e-05,
"loss": 0.6871,
"step": 607
},
{
"epoch": 0.9529780564263323,
"grad_norm": 0.2714045364113006,
"learning_rate": 3.795005807200929e-05,
"loss": 0.5375,
"step": 608
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.3075629649929039,
"learning_rate": 3.792102206736353e-05,
"loss": 0.5717,
"step": 609
},
{
"epoch": 0.9561128526645768,
"grad_norm": 0.29607463693744995,
"learning_rate": 3.789198606271777e-05,
"loss": 0.615,
"step": 610
},
{
"epoch": 0.957680250783699,
"grad_norm": 1.138647248424991,
"learning_rate": 3.786295005807201e-05,
"loss": 0.5637,
"step": 611
},
{
"epoch": 0.9592476489028213,
"grad_norm": 0.3608552434080648,
"learning_rate": 3.783391405342625e-05,
"loss": 0.5753,
"step": 612
},
{
"epoch": 0.9608150470219435,
"grad_norm": 0.3148386708828119,
"learning_rate": 3.780487804878049e-05,
"loss": 0.6003,
"step": 613
},
{
"epoch": 0.9623824451410659,
"grad_norm": 0.3753116862190355,
"learning_rate": 3.7775842044134726e-05,
"loss": 0.6369,
"step": 614
},
{
"epoch": 0.9639498432601881,
"grad_norm": 0.31700516805123413,
"learning_rate": 3.7746806039488966e-05,
"loss": 0.6866,
"step": 615
},
{
"epoch": 0.9655172413793104,
"grad_norm": 2.0995189290604985,
"learning_rate": 3.7717770034843206e-05,
"loss": 0.6472,
"step": 616
},
{
"epoch": 0.9670846394984326,
"grad_norm": 0.3844715525665575,
"learning_rate": 3.7688734030197446e-05,
"loss": 0.5914,
"step": 617
},
{
"epoch": 0.9686520376175548,
"grad_norm": 0.31438270575774696,
"learning_rate": 3.7659698025551686e-05,
"loss": 0.6066,
"step": 618
},
{
"epoch": 0.9702194357366771,
"grad_norm": 0.4037190791196795,
"learning_rate": 3.7630662020905927e-05,
"loss": 0.6052,
"step": 619
},
{
"epoch": 0.9717868338557993,
"grad_norm": 0.3385231024291831,
"learning_rate": 3.760162601626017e-05,
"loss": 0.6414,
"step": 620
},
{
"epoch": 0.9733542319749217,
"grad_norm": 0.28597230584593725,
"learning_rate": 3.757259001161441e-05,
"loss": 0.5706,
"step": 621
},
{
"epoch": 0.9749216300940439,
"grad_norm": 0.46658971890000184,
"learning_rate": 3.754355400696865e-05,
"loss": 0.6485,
"step": 622
},
{
"epoch": 0.9764890282131662,
"grad_norm": 0.30333556071751167,
"learning_rate": 3.751451800232289e-05,
"loss": 0.6331,
"step": 623
},
{
"epoch": 0.9780564263322884,
"grad_norm": 0.3781873694405637,
"learning_rate": 3.748548199767712e-05,
"loss": 0.5819,
"step": 624
},
{
"epoch": 0.9796238244514106,
"grad_norm": 0.3826811112287205,
"learning_rate": 3.745644599303136e-05,
"loss": 0.6619,
"step": 625
},
{
"epoch": 0.9811912225705329,
"grad_norm": 0.3812073339738345,
"learning_rate": 3.74274099883856e-05,
"loss": 0.595,
"step": 626
},
{
"epoch": 0.9827586206896551,
"grad_norm": 0.37832661428643916,
"learning_rate": 3.739837398373984e-05,
"loss": 0.559,
"step": 627
},
{
"epoch": 0.9843260188087775,
"grad_norm": 0.3087349933592067,
"learning_rate": 3.7369337979094074e-05,
"loss": 0.5885,
"step": 628
},
{
"epoch": 0.9858934169278997,
"grad_norm": 0.4384387353534518,
"learning_rate": 3.7340301974448314e-05,
"loss": 0.5878,
"step": 629
},
{
"epoch": 0.987460815047022,
"grad_norm": 0.32361404497337287,
"learning_rate": 3.7311265969802554e-05,
"loss": 0.5755,
"step": 630
},
{
"epoch": 0.9890282131661442,
"grad_norm": 0.27196153521488753,
"learning_rate": 3.7282229965156794e-05,
"loss": 0.5744,
"step": 631
},
{
"epoch": 0.9905956112852664,
"grad_norm": 4.401710050657245,
"learning_rate": 3.7253193960511034e-05,
"loss": 0.649,
"step": 632
},
{
"epoch": 0.9921630094043887,
"grad_norm": 0.8176579291364673,
"learning_rate": 3.7224157955865274e-05,
"loss": 0.614,
"step": 633
},
{
"epoch": 0.9937304075235109,
"grad_norm": 0.39962259762706176,
"learning_rate": 3.7195121951219514e-05,
"loss": 0.6311,
"step": 634
},
{
"epoch": 0.9952978056426333,
"grad_norm": 0.2754466163360415,
"learning_rate": 3.7166085946573754e-05,
"loss": 0.5739,
"step": 635
},
{
"epoch": 0.9968652037617555,
"grad_norm": 0.420562033788279,
"learning_rate": 3.7137049941927994e-05,
"loss": 0.5448,
"step": 636
},
{
"epoch": 0.9984326018808778,
"grad_norm": 0.380641025504617,
"learning_rate": 3.7108013937282234e-05,
"loss": 0.5894,
"step": 637
},
{
"epoch": 1.0,
"grad_norm": 0.2625107029711592,
"learning_rate": 3.7078977932636474e-05,
"loss": 0.5826,
"step": 638
},
{
"epoch": 1.0015673981191222,
"grad_norm": 2.565890824407344,
"learning_rate": 3.7049941927990714e-05,
"loss": 0.6951,
"step": 639
},
{
"epoch": 1.0031347962382444,
"grad_norm": 0.5058315561875574,
"learning_rate": 3.702090592334495e-05,
"loss": 0.5098,
"step": 640
},
{
"epoch": 1.0047021943573669,
"grad_norm": 0.36299194105180255,
"learning_rate": 3.699186991869919e-05,
"loss": 0.5064,
"step": 641
},
{
"epoch": 1.006269592476489,
"grad_norm": 0.26612480460286786,
"learning_rate": 3.696283391405343e-05,
"loss": 0.4776,
"step": 642
},
{
"epoch": 1.0078369905956113,
"grad_norm": 0.36141227421973615,
"learning_rate": 3.693379790940767e-05,
"loss": 0.5319,
"step": 643
},
{
"epoch": 1.0094043887147335,
"grad_norm": 0.6537922020713349,
"learning_rate": 3.690476190476191e-05,
"loss": 0.5352,
"step": 644
},
{
"epoch": 1.0109717868338557,
"grad_norm": 0.2686274576488932,
"learning_rate": 3.687572590011615e-05,
"loss": 0.425,
"step": 645
},
{
"epoch": 1.0125391849529781,
"grad_norm": 0.28075826221106237,
"learning_rate": 3.684668989547039e-05,
"loss": 0.546,
"step": 646
},
{
"epoch": 1.0141065830721003,
"grad_norm": 0.31649064608236477,
"learning_rate": 3.681765389082462e-05,
"loss": 0.5951,
"step": 647
},
{
"epoch": 1.0156739811912225,
"grad_norm": 0.2753302390929797,
"learning_rate": 3.678861788617886e-05,
"loss": 0.5312,
"step": 648
},
{
"epoch": 1.0172413793103448,
"grad_norm": 0.2965560574052741,
"learning_rate": 3.67595818815331e-05,
"loss": 0.5742,
"step": 649
},
{
"epoch": 1.0188087774294672,
"grad_norm": 0.25234140967114527,
"learning_rate": 3.673054587688734e-05,
"loss": 0.4718,
"step": 650
},
{
"epoch": 1.0203761755485894,
"grad_norm": 5.3477275080799895,
"learning_rate": 3.670150987224158e-05,
"loss": 1.2506,
"step": 651
},
{
"epoch": 1.0219435736677116,
"grad_norm": 0.28500729603664166,
"learning_rate": 3.667247386759582e-05,
"loss": 0.5175,
"step": 652
},
{
"epoch": 1.0235109717868338,
"grad_norm": 0.3270735050832799,
"learning_rate": 3.664343786295006e-05,
"loss": 0.5316,
"step": 653
},
{
"epoch": 1.025078369905956,
"grad_norm": 0.2728343543342984,
"learning_rate": 3.6614401858304295e-05,
"loss": 0.5446,
"step": 654
},
{
"epoch": 1.0266457680250785,
"grad_norm": 9.351766852087337,
"learning_rate": 3.6585365853658535e-05,
"loss": 0.6369,
"step": 655
},
{
"epoch": 1.0282131661442007,
"grad_norm": 0.33008605746493763,
"learning_rate": 3.6556329849012775e-05,
"loss": 0.5254,
"step": 656
},
{
"epoch": 1.0297805642633229,
"grad_norm": 0.5842383156688505,
"learning_rate": 3.6527293844367015e-05,
"loss": 0.4722,
"step": 657
},
{
"epoch": 1.031347962382445,
"grad_norm": 0.29504929189459006,
"learning_rate": 3.6498257839721255e-05,
"loss": 0.5454,
"step": 658
},
{
"epoch": 1.0329153605015673,
"grad_norm": 0.3279376529082841,
"learning_rate": 3.6469221835075495e-05,
"loss": 0.5714,
"step": 659
},
{
"epoch": 1.0344827586206897,
"grad_norm": 1.7733480273454432,
"learning_rate": 3.6440185830429736e-05,
"loss": 0.536,
"step": 660
},
{
"epoch": 1.036050156739812,
"grad_norm": 0.33187273257246247,
"learning_rate": 3.6411149825783976e-05,
"loss": 0.5137,
"step": 661
},
{
"epoch": 1.0376175548589341,
"grad_norm": 0.278001743894782,
"learning_rate": 3.6382113821138216e-05,
"loss": 0.484,
"step": 662
},
{
"epoch": 1.0391849529780564,
"grad_norm": 0.2892799387321007,
"learning_rate": 3.6353077816492456e-05,
"loss": 0.5133,
"step": 663
},
{
"epoch": 1.0407523510971788,
"grad_norm": 9.367375247520428,
"learning_rate": 3.6324041811846696e-05,
"loss": 0.6638,
"step": 664
},
{
"epoch": 1.042319749216301,
"grad_norm": 0.3678841060355219,
"learning_rate": 3.629500580720093e-05,
"loss": 0.4696,
"step": 665
},
{
"epoch": 1.0438871473354232,
"grad_norm": 0.3175147315117553,
"learning_rate": 3.626596980255517e-05,
"loss": 0.5393,
"step": 666
},
{
"epoch": 1.0454545454545454,
"grad_norm": 2.2237118834946097,
"learning_rate": 3.623693379790941e-05,
"loss": 0.6393,
"step": 667
},
{
"epoch": 1.0470219435736676,
"grad_norm": 0.48129044547431815,
"learning_rate": 3.620789779326365e-05,
"loss": 0.5454,
"step": 668
},
{
"epoch": 1.04858934169279,
"grad_norm": 0.2748590169465167,
"learning_rate": 3.617886178861789e-05,
"loss": 0.5005,
"step": 669
},
{
"epoch": 1.0501567398119123,
"grad_norm": 1.1271165549558277,
"learning_rate": 3.614982578397213e-05,
"loss": 0.5645,
"step": 670
},
{
"epoch": 1.0517241379310345,
"grad_norm": 1.0079737624860692,
"learning_rate": 3.612078977932636e-05,
"loss": 0.5656,
"step": 671
},
{
"epoch": 1.0532915360501567,
"grad_norm": 0.30352368626588316,
"learning_rate": 3.60917537746806e-05,
"loss": 0.5401,
"step": 672
},
{
"epoch": 1.054858934169279,
"grad_norm": 0.3791618805110646,
"learning_rate": 3.606271777003484e-05,
"loss": 0.4685,
"step": 673
},
{
"epoch": 1.0564263322884013,
"grad_norm": 0.32052695609860665,
"learning_rate": 3.603368176538908e-05,
"loss": 0.5025,
"step": 674
},
{
"epoch": 1.0579937304075235,
"grad_norm": 0.27725756642336513,
"learning_rate": 3.600464576074332e-05,
"loss": 0.4667,
"step": 675
},
{
"epoch": 1.0595611285266457,
"grad_norm": 0.33704771555808155,
"learning_rate": 3.597560975609756e-05,
"loss": 0.4665,
"step": 676
},
{
"epoch": 1.061128526645768,
"grad_norm": 0.2990761618719968,
"learning_rate": 3.59465737514518e-05,
"loss": 0.454,
"step": 677
},
{
"epoch": 1.0626959247648902,
"grad_norm": 0.29667892694529974,
"learning_rate": 3.591753774680604e-05,
"loss": 0.5062,
"step": 678
},
{
"epoch": 1.0642633228840126,
"grad_norm": 0.36743837216902236,
"learning_rate": 3.5888501742160277e-05,
"loss": 0.6078,
"step": 679
},
{
"epoch": 1.0658307210031348,
"grad_norm": 0.2735305124189987,
"learning_rate": 3.585946573751452e-05,
"loss": 0.5239,
"step": 680
},
{
"epoch": 1.067398119122257,
"grad_norm": 0.3720090749833859,
"learning_rate": 3.583042973286876e-05,
"loss": 0.559,
"step": 681
},
{
"epoch": 1.0689655172413792,
"grad_norm": 0.2916623068043165,
"learning_rate": 3.5801393728223e-05,
"loss": 0.4719,
"step": 682
},
{
"epoch": 1.0705329153605017,
"grad_norm": 0.305455111476216,
"learning_rate": 3.577235772357724e-05,
"loss": 0.457,
"step": 683
},
{
"epoch": 1.0721003134796239,
"grad_norm": 0.3557941980226378,
"learning_rate": 3.574332171893148e-05,
"loss": 0.5245,
"step": 684
},
{
"epoch": 1.073667711598746,
"grad_norm": 0.26752996163911935,
"learning_rate": 3.571428571428572e-05,
"loss": 0.5182,
"step": 685
},
{
"epoch": 1.0752351097178683,
"grad_norm": 0.29840651484021097,
"learning_rate": 3.568524970963996e-05,
"loss": 0.4932,
"step": 686
},
{
"epoch": 1.0768025078369905,
"grad_norm": 0.619382458379002,
"learning_rate": 3.56562137049942e-05,
"loss": 0.5084,
"step": 687
},
{
"epoch": 1.078369905956113,
"grad_norm": 0.2842552716137054,
"learning_rate": 3.562717770034844e-05,
"loss": 0.5044,
"step": 688
},
{
"epoch": 1.0799373040752351,
"grad_norm": 0.339365102773381,
"learning_rate": 3.559814169570268e-05,
"loss": 0.5352,
"step": 689
},
{
"epoch": 1.0815047021943573,
"grad_norm": 0.2839757917613132,
"learning_rate": 3.556910569105692e-05,
"loss": 0.4893,
"step": 690
},
{
"epoch": 1.0830721003134796,
"grad_norm": 0.2535247365431648,
"learning_rate": 3.554006968641115e-05,
"loss": 0.4691,
"step": 691
},
{
"epoch": 1.084639498432602,
"grad_norm": 0.3336765707664701,
"learning_rate": 3.551103368176539e-05,
"loss": 0.5342,
"step": 692
},
{
"epoch": 1.0862068965517242,
"grad_norm": 0.2873625020220418,
"learning_rate": 3.548199767711963e-05,
"loss": 0.4942,
"step": 693
},
{
"epoch": 1.0877742946708464,
"grad_norm": 0.47717281276923385,
"learning_rate": 3.5452961672473864e-05,
"loss": 0.4306,
"step": 694
},
{
"epoch": 1.0893416927899686,
"grad_norm": 0.33750218951121175,
"learning_rate": 3.5423925667828104e-05,
"loss": 0.4836,
"step": 695
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.3216754361909907,
"learning_rate": 3.5394889663182344e-05,
"loss": 0.4853,
"step": 696
},
{
"epoch": 1.0924764890282133,
"grad_norm": 0.2873840618765212,
"learning_rate": 3.5365853658536584e-05,
"loss": 0.4934,
"step": 697
},
{
"epoch": 1.0940438871473355,
"grad_norm": 0.25338427057219687,
"learning_rate": 3.5336817653890824e-05,
"loss": 0.4557,
"step": 698
},
{
"epoch": 1.0956112852664577,
"grad_norm": 0.3606988138862604,
"learning_rate": 3.5307781649245064e-05,
"loss": 0.5376,
"step": 699
},
{
"epoch": 1.09717868338558,
"grad_norm": 0.29128151634717325,
"learning_rate": 3.5278745644599304e-05,
"loss": 0.5341,
"step": 700
},
{
"epoch": 1.098746081504702,
"grad_norm": 0.27737913790339913,
"learning_rate": 3.5249709639953545e-05,
"loss": 0.5404,
"step": 701
},
{
"epoch": 1.1003134796238245,
"grad_norm": 0.26879166089439477,
"learning_rate": 3.5220673635307785e-05,
"loss": 0.531,
"step": 702
},
{
"epoch": 1.1018808777429467,
"grad_norm": 0.24434702577850823,
"learning_rate": 3.5191637630662025e-05,
"loss": 0.505,
"step": 703
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.2731434317743878,
"learning_rate": 3.5162601626016265e-05,
"loss": 0.5048,
"step": 704
},
{
"epoch": 1.1050156739811912,
"grad_norm": 0.27158430157697744,
"learning_rate": 3.51335656213705e-05,
"loss": 0.51,
"step": 705
},
{
"epoch": 1.1065830721003134,
"grad_norm": 0.23077978974242477,
"learning_rate": 3.510452961672474e-05,
"loss": 0.4667,
"step": 706
},
{
"epoch": 1.1081504702194358,
"grad_norm": 0.29582508601843976,
"learning_rate": 3.507549361207898e-05,
"loss": 0.5602,
"step": 707
},
{
"epoch": 1.109717868338558,
"grad_norm": 0.26296864746916393,
"learning_rate": 3.504645760743322e-05,
"loss": 0.515,
"step": 708
},
{
"epoch": 1.1112852664576802,
"grad_norm": 8.325526483069714,
"learning_rate": 3.501742160278746e-05,
"loss": 0.7602,
"step": 709
},
{
"epoch": 1.1128526645768024,
"grad_norm": 0.2951305903755263,
"learning_rate": 3.49883855981417e-05,
"loss": 0.4739,
"step": 710
},
{
"epoch": 1.1144200626959249,
"grad_norm": 0.5816126713767027,
"learning_rate": 3.495934959349594e-05,
"loss": 0.534,
"step": 711
},
{
"epoch": 1.115987460815047,
"grad_norm": 0.2341884661774227,
"learning_rate": 3.493031358885018e-05,
"loss": 0.5107,
"step": 712
},
{
"epoch": 1.1175548589341693,
"grad_norm": 0.27770448655854674,
"learning_rate": 3.490127758420442e-05,
"loss": 0.554,
"step": 713
},
{
"epoch": 1.1191222570532915,
"grad_norm": 0.22994942990367415,
"learning_rate": 3.487224157955866e-05,
"loss": 0.5134,
"step": 714
},
{
"epoch": 1.1206896551724137,
"grad_norm": 0.26347149348528826,
"learning_rate": 3.48432055749129e-05,
"loss": 0.595,
"step": 715
},
{
"epoch": 1.1222570532915361,
"grad_norm": 0.25211160481147693,
"learning_rate": 3.481416957026714e-05,
"loss": 0.4718,
"step": 716
},
{
"epoch": 1.1238244514106583,
"grad_norm": 0.27630629544114166,
"learning_rate": 3.478513356562137e-05,
"loss": 0.5155,
"step": 717
},
{
"epoch": 1.1253918495297806,
"grad_norm": 0.274030589355416,
"learning_rate": 3.475609756097561e-05,
"loss": 0.5288,
"step": 718
},
{
"epoch": 1.1269592476489028,
"grad_norm": 0.25262409515205664,
"learning_rate": 3.4727061556329845e-05,
"loss": 0.5494,
"step": 719
},
{
"epoch": 1.1285266457680252,
"grad_norm": 0.2323072522889267,
"learning_rate": 3.4698025551684086e-05,
"loss": 0.5002,
"step": 720
},
{
"epoch": 1.1300940438871474,
"grad_norm": 0.2748893174632439,
"learning_rate": 3.4668989547038326e-05,
"loss": 0.5468,
"step": 721
},
{
"epoch": 1.1316614420062696,
"grad_norm": 0.2859399098541969,
"learning_rate": 3.4639953542392566e-05,
"loss": 0.5069,
"step": 722
},
{
"epoch": 1.1332288401253918,
"grad_norm": 0.23170483445325368,
"learning_rate": 3.4610917537746806e-05,
"loss": 0.4575,
"step": 723
},
{
"epoch": 1.134796238244514,
"grad_norm": 0.27077330498660174,
"learning_rate": 3.4581881533101046e-05,
"loss": 0.4958,
"step": 724
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.26685924718062803,
"learning_rate": 3.4552845528455286e-05,
"loss": 0.4939,
"step": 725
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.42568221921781024,
"learning_rate": 3.4523809523809526e-05,
"loss": 0.5283,
"step": 726
},
{
"epoch": 1.1394984326018809,
"grad_norm": 0.3814437079258895,
"learning_rate": 3.4494773519163766e-05,
"loss": 0.5193,
"step": 727
},
{
"epoch": 1.141065830721003,
"grad_norm": 0.3695759611538198,
"learning_rate": 3.4465737514518006e-05,
"loss": 0.4783,
"step": 728
},
{
"epoch": 1.1426332288401253,
"grad_norm": 0.248078975702304,
"learning_rate": 3.4436701509872246e-05,
"loss": 0.4957,
"step": 729
},
{
"epoch": 1.1442006269592477,
"grad_norm": 1.2253680800321918,
"learning_rate": 3.4407665505226486e-05,
"loss": 0.4547,
"step": 730
},
{
"epoch": 1.14576802507837,
"grad_norm": 0.24385109304986666,
"learning_rate": 3.437862950058072e-05,
"loss": 0.5286,
"step": 731
},
{
"epoch": 1.1473354231974922,
"grad_norm": 0.27150175582281727,
"learning_rate": 3.434959349593496e-05,
"loss": 0.5259,
"step": 732
},
{
"epoch": 1.1489028213166144,
"grad_norm": 0.24471317548151245,
"learning_rate": 3.43205574912892e-05,
"loss": 0.5453,
"step": 733
},
{
"epoch": 1.1504702194357366,
"grad_norm": 0.5115501430384393,
"learning_rate": 3.429152148664344e-05,
"loss": 0.4818,
"step": 734
},
{
"epoch": 1.152037617554859,
"grad_norm": 0.24038903932910174,
"learning_rate": 3.426248548199768e-05,
"loss": 0.4905,
"step": 735
},
{
"epoch": 1.1536050156739812,
"grad_norm": 0.2535433798080968,
"learning_rate": 3.423344947735192e-05,
"loss": 0.4607,
"step": 736
},
{
"epoch": 1.1551724137931034,
"grad_norm": 0.2417709785580235,
"learning_rate": 3.420441347270616e-05,
"loss": 0.5127,
"step": 737
},
{
"epoch": 1.1567398119122256,
"grad_norm": 0.24506504724128927,
"learning_rate": 3.41753774680604e-05,
"loss": 0.513,
"step": 738
},
{
"epoch": 1.158307210031348,
"grad_norm": 0.25459634572567486,
"learning_rate": 3.414634146341464e-05,
"loss": 0.4823,
"step": 739
},
{
"epoch": 1.1598746081504703,
"grad_norm": 0.2566270576721975,
"learning_rate": 3.411730545876887e-05,
"loss": 0.5303,
"step": 740
},
{
"epoch": 1.1614420062695925,
"grad_norm": 0.2440209321234056,
"learning_rate": 3.4088269454123113e-05,
"loss": 0.5611,
"step": 741
},
{
"epoch": 1.1630094043887147,
"grad_norm": 0.24569274606495187,
"learning_rate": 3.4059233449477354e-05,
"loss": 0.4926,
"step": 742
},
{
"epoch": 1.164576802507837,
"grad_norm": 0.2623749515311546,
"learning_rate": 3.4030197444831594e-05,
"loss": 0.5253,
"step": 743
},
{
"epoch": 1.1661442006269593,
"grad_norm": 0.22972719412121864,
"learning_rate": 3.4001161440185834e-05,
"loss": 0.4877,
"step": 744
},
{
"epoch": 1.1677115987460815,
"grad_norm": 0.2553158778999516,
"learning_rate": 3.397212543554007e-05,
"loss": 0.5203,
"step": 745
},
{
"epoch": 1.1692789968652038,
"grad_norm": 0.2525325976352295,
"learning_rate": 3.394308943089431e-05,
"loss": 0.5013,
"step": 746
},
{
"epoch": 1.170846394984326,
"grad_norm": 0.25750044613897305,
"learning_rate": 3.391405342624855e-05,
"loss": 0.5406,
"step": 747
},
{
"epoch": 1.1724137931034484,
"grad_norm": 0.26022986029207307,
"learning_rate": 3.388501742160279e-05,
"loss": 0.5453,
"step": 748
},
{
"epoch": 1.1739811912225706,
"grad_norm": 0.23228864834246377,
"learning_rate": 3.385598141695703e-05,
"loss": 0.4886,
"step": 749
},
{
"epoch": 1.1755485893416928,
"grad_norm": 0.2748125387192771,
"learning_rate": 3.382694541231127e-05,
"loss": 0.4686,
"step": 750
},
{
"epoch": 1.177115987460815,
"grad_norm": 0.22887237950896241,
"learning_rate": 3.379790940766551e-05,
"loss": 0.4471,
"step": 751
},
{
"epoch": 1.1786833855799372,
"grad_norm": 0.25324571844135907,
"learning_rate": 3.376887340301975e-05,
"loss": 0.4471,
"step": 752
},
{
"epoch": 1.1802507836990594,
"grad_norm": 0.2599648547038783,
"learning_rate": 3.373983739837399e-05,
"loss": 0.4891,
"step": 753
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.2354342167546053,
"learning_rate": 3.371080139372823e-05,
"loss": 0.4797,
"step": 754
},
{
"epoch": 1.183385579937304,
"grad_norm": 0.26503087742794007,
"learning_rate": 3.368176538908247e-05,
"loss": 0.4616,
"step": 755
},
{
"epoch": 1.1849529780564263,
"grad_norm": 0.3008107855572518,
"learning_rate": 3.36527293844367e-05,
"loss": 0.5193,
"step": 756
},
{
"epoch": 1.1865203761755485,
"grad_norm": 0.216934931123227,
"learning_rate": 3.362369337979094e-05,
"loss": 0.4705,
"step": 757
},
{
"epoch": 1.188087774294671,
"grad_norm": 0.24925044790073694,
"learning_rate": 3.359465737514518e-05,
"loss": 0.478,
"step": 758
},
{
"epoch": 1.1896551724137931,
"grad_norm": 0.617288140816794,
"learning_rate": 3.356562137049942e-05,
"loss": 0.4717,
"step": 759
},
{
"epoch": 1.1912225705329154,
"grad_norm": 0.25504016467196094,
"learning_rate": 3.353658536585366e-05,
"loss": 0.5591,
"step": 760
},
{
"epoch": 1.1927899686520376,
"grad_norm": 0.2725877312642507,
"learning_rate": 3.35075493612079e-05,
"loss": 0.5072,
"step": 761
},
{
"epoch": 1.1943573667711598,
"grad_norm": 0.273850633322404,
"learning_rate": 3.347851335656214e-05,
"loss": 0.5555,
"step": 762
},
{
"epoch": 1.1959247648902822,
"grad_norm": 0.27091192577770185,
"learning_rate": 3.344947735191638e-05,
"loss": 0.5055,
"step": 763
},
{
"epoch": 1.1974921630094044,
"grad_norm": 0.261774874430834,
"learning_rate": 3.3420441347270615e-05,
"loss": 0.4327,
"step": 764
},
{
"epoch": 1.1990595611285266,
"grad_norm": 0.25626653632805213,
"learning_rate": 3.3391405342624855e-05,
"loss": 0.4999,
"step": 765
},
{
"epoch": 1.2006269592476488,
"grad_norm": 0.24673050676440197,
"learning_rate": 3.3362369337979095e-05,
"loss": 0.5375,
"step": 766
},
{
"epoch": 1.2021943573667713,
"grad_norm": 0.21863003705043296,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.4623,
"step": 767
},
{
"epoch": 1.2037617554858935,
"grad_norm": 0.25297012323134327,
"learning_rate": 3.3304297328687575e-05,
"loss": 0.5059,
"step": 768
},
{
"epoch": 1.2053291536050157,
"grad_norm": 0.2578708362207169,
"learning_rate": 3.3275261324041815e-05,
"loss": 0.5274,
"step": 769
},
{
"epoch": 1.206896551724138,
"grad_norm": 0.26513453318143076,
"learning_rate": 3.324622531939605e-05,
"loss": 0.4576,
"step": 770
},
{
"epoch": 1.20846394984326,
"grad_norm": 0.2515823886911137,
"learning_rate": 3.321718931475029e-05,
"loss": 0.5229,
"step": 771
},
{
"epoch": 1.2100313479623825,
"grad_norm": 0.2594310131111704,
"learning_rate": 3.318815331010453e-05,
"loss": 0.5043,
"step": 772
},
{
"epoch": 1.2115987460815048,
"grad_norm": 0.30816430976127845,
"learning_rate": 3.315911730545877e-05,
"loss": 0.5928,
"step": 773
},
{
"epoch": 1.213166144200627,
"grad_norm": 0.26910179304075865,
"learning_rate": 3.313008130081301e-05,
"loss": 0.6111,
"step": 774
},
{
"epoch": 1.2147335423197492,
"grad_norm": 0.24720209316322703,
"learning_rate": 3.310104529616725e-05,
"loss": 0.5049,
"step": 775
},
{
"epoch": 1.2163009404388714,
"grad_norm": 0.26123777640470797,
"learning_rate": 3.307200929152149e-05,
"loss": 0.5098,
"step": 776
},
{
"epoch": 1.2178683385579938,
"grad_norm": 0.2366640676620058,
"learning_rate": 3.304297328687573e-05,
"loss": 0.4817,
"step": 777
},
{
"epoch": 1.219435736677116,
"grad_norm": 0.241715496399803,
"learning_rate": 3.301393728222997e-05,
"loss": 0.5386,
"step": 778
},
{
"epoch": 1.2210031347962382,
"grad_norm": 0.2653585333329661,
"learning_rate": 3.298490127758421e-05,
"loss": 0.5391,
"step": 779
},
{
"epoch": 1.2225705329153604,
"grad_norm": 0.23694023304895823,
"learning_rate": 3.295586527293845e-05,
"loss": 0.5104,
"step": 780
},
{
"epoch": 1.2241379310344827,
"grad_norm": 0.2374383183531619,
"learning_rate": 3.292682926829269e-05,
"loss": 0.5058,
"step": 781
},
{
"epoch": 1.225705329153605,
"grad_norm": 0.25543499309057116,
"learning_rate": 3.289779326364692e-05,
"loss": 0.5196,
"step": 782
},
{
"epoch": 1.2272727272727273,
"grad_norm": 0.237572782641992,
"learning_rate": 3.286875725900116e-05,
"loss": 0.5116,
"step": 783
},
{
"epoch": 1.2288401253918495,
"grad_norm": 0.22183229943006116,
"learning_rate": 3.28397212543554e-05,
"loss": 0.496,
"step": 784
},
{
"epoch": 1.2304075235109717,
"grad_norm": 0.2782237212176746,
"learning_rate": 3.281068524970964e-05,
"loss": 0.5197,
"step": 785
},
{
"epoch": 1.2319749216300941,
"grad_norm": 0.23309458340669306,
"learning_rate": 3.278164924506388e-05,
"loss": 0.5408,
"step": 786
},
{
"epoch": 1.2335423197492164,
"grad_norm": 0.2627203468656966,
"learning_rate": 3.2752613240418116e-05,
"loss": 0.5711,
"step": 787
},
{
"epoch": 1.2351097178683386,
"grad_norm": 0.2508848145069765,
"learning_rate": 3.2723577235772356e-05,
"loss": 0.5069,
"step": 788
},
{
"epoch": 1.2366771159874608,
"grad_norm": 0.8056883245480172,
"learning_rate": 3.2694541231126596e-05,
"loss": 0.5935,
"step": 789
},
{
"epoch": 1.238244514106583,
"grad_norm": 0.2517444320916277,
"learning_rate": 3.2665505226480836e-05,
"loss": 0.4501,
"step": 790
},
{
"epoch": 1.2398119122257054,
"grad_norm": 0.25865496476495753,
"learning_rate": 3.2636469221835076e-05,
"loss": 0.5082,
"step": 791
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.2483466321206937,
"learning_rate": 3.2607433217189316e-05,
"loss": 0.5077,
"step": 792
},
{
"epoch": 1.2429467084639498,
"grad_norm": 0.28092332370577106,
"learning_rate": 3.2578397212543556e-05,
"loss": 0.5546,
"step": 793
},
{
"epoch": 1.244514106583072,
"grad_norm": 0.25663618227628227,
"learning_rate": 3.2549361207897796e-05,
"loss": 0.5159,
"step": 794
},
{
"epoch": 1.2460815047021945,
"grad_norm": 0.24285184694968323,
"learning_rate": 3.2520325203252037e-05,
"loss": 0.4975,
"step": 795
},
{
"epoch": 1.2476489028213167,
"grad_norm": 0.29110129808203644,
"learning_rate": 3.249128919860627e-05,
"loss": 0.5172,
"step": 796
},
{
"epoch": 1.249216300940439,
"grad_norm": 0.2525946390157005,
"learning_rate": 3.246225319396051e-05,
"loss": 0.5347,
"step": 797
},
{
"epoch": 1.250783699059561,
"grad_norm": 0.2760384032397708,
"learning_rate": 3.243321718931475e-05,
"loss": 0.532,
"step": 798
},
{
"epoch": 1.2523510971786833,
"grad_norm": 0.26851010283486904,
"learning_rate": 3.240418118466899e-05,
"loss": 0.5532,
"step": 799
},
{
"epoch": 1.2539184952978055,
"grad_norm": 0.24976062258817883,
"learning_rate": 3.237514518002323e-05,
"loss": 0.4613,
"step": 800
},
{
"epoch": 1.255485893416928,
"grad_norm": 0.2500586085443441,
"learning_rate": 3.234610917537747e-05,
"loss": 0.494,
"step": 801
},
{
"epoch": 1.2570532915360502,
"grad_norm": 0.24849983580055438,
"learning_rate": 3.231707317073171e-05,
"loss": 0.4928,
"step": 802
},
{
"epoch": 1.2586206896551724,
"grad_norm": 0.26576268625770716,
"learning_rate": 3.228803716608595e-05,
"loss": 0.5273,
"step": 803
},
{
"epoch": 1.2601880877742948,
"grad_norm": 0.24083122595529285,
"learning_rate": 3.225900116144019e-05,
"loss": 0.4874,
"step": 804
},
{
"epoch": 1.261755485893417,
"grad_norm": 0.2623570065043791,
"learning_rate": 3.222996515679443e-05,
"loss": 0.4932,
"step": 805
},
{
"epoch": 1.2633228840125392,
"grad_norm": 0.2488846109252028,
"learning_rate": 3.220092915214867e-05,
"loss": 0.492,
"step": 806
},
{
"epoch": 1.2648902821316614,
"grad_norm": 0.2396730611914216,
"learning_rate": 3.217189314750291e-05,
"loss": 0.4764,
"step": 807
},
{
"epoch": 1.2664576802507836,
"grad_norm": 0.23800962365784828,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.4958,
"step": 808
},
{
"epoch": 1.2680250783699059,
"grad_norm": 0.24307219549917802,
"learning_rate": 3.2113821138211384e-05,
"loss": 0.4747,
"step": 809
},
{
"epoch": 1.2695924764890283,
"grad_norm": 0.23763604063988492,
"learning_rate": 3.208478513356562e-05,
"loss": 0.5042,
"step": 810
},
{
"epoch": 1.2711598746081505,
"grad_norm": 0.23071393279066849,
"learning_rate": 3.205574912891986e-05,
"loss": 0.5265,
"step": 811
},
{
"epoch": 1.2727272727272727,
"grad_norm": 3.290254327390922,
"learning_rate": 3.20267131242741e-05,
"loss": 0.6164,
"step": 812
},
{
"epoch": 1.274294670846395,
"grad_norm": 0.25898020864956506,
"learning_rate": 3.199767711962834e-05,
"loss": 0.4828,
"step": 813
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.27322982393829587,
"learning_rate": 3.196864111498258e-05,
"loss": 0.5108,
"step": 814
},
{
"epoch": 1.2774294670846396,
"grad_norm": 0.23185690635124984,
"learning_rate": 3.193960511033682e-05,
"loss": 0.5244,
"step": 815
},
{
"epoch": 1.2789968652037618,
"grad_norm": 0.24366972257117528,
"learning_rate": 3.191056910569106e-05,
"loss": 0.4681,
"step": 816
},
{
"epoch": 1.280564263322884,
"grad_norm": 0.2639485948442474,
"learning_rate": 3.18815331010453e-05,
"loss": 0.5738,
"step": 817
},
{
"epoch": 1.2821316614420062,
"grad_norm": 0.22301584115328593,
"learning_rate": 3.185249709639954e-05,
"loss": 0.4299,
"step": 818
},
{
"epoch": 1.2836990595611284,
"grad_norm": 0.2609320021272644,
"learning_rate": 3.182346109175378e-05,
"loss": 0.5481,
"step": 819
},
{
"epoch": 1.2852664576802508,
"grad_norm": 1.4808002528259674,
"learning_rate": 3.179442508710802e-05,
"loss": 0.5358,
"step": 820
},
{
"epoch": 1.286833855799373,
"grad_norm": 0.24951242874121393,
"learning_rate": 3.176538908246225e-05,
"loss": 0.5091,
"step": 821
},
{
"epoch": 1.2884012539184952,
"grad_norm": 0.28589402240149986,
"learning_rate": 3.173635307781649e-05,
"loss": 0.5045,
"step": 822
},
{
"epoch": 1.2899686520376177,
"grad_norm": 0.28475643499481873,
"learning_rate": 3.170731707317073e-05,
"loss": 0.5339,
"step": 823
},
{
"epoch": 1.29153605015674,
"grad_norm": 0.2423827183674981,
"learning_rate": 3.167828106852497e-05,
"loss": 0.4862,
"step": 824
},
{
"epoch": 1.293103448275862,
"grad_norm": 0.29974877236476233,
"learning_rate": 3.164924506387921e-05,
"loss": 0.5003,
"step": 825
},
{
"epoch": 1.2946708463949843,
"grad_norm": 0.31080515892157534,
"learning_rate": 3.162020905923345e-05,
"loss": 0.4858,
"step": 826
},
{
"epoch": 1.2962382445141065,
"grad_norm": 0.24776066570548555,
"learning_rate": 3.159117305458769e-05,
"loss": 0.5283,
"step": 827
},
{
"epoch": 1.2978056426332287,
"grad_norm": 0.2356554929259651,
"learning_rate": 3.156213704994193e-05,
"loss": 0.5119,
"step": 828
},
{
"epoch": 1.2993730407523512,
"grad_norm": 0.2797932312781817,
"learning_rate": 3.153310104529617e-05,
"loss": 0.4871,
"step": 829
},
{
"epoch": 1.3009404388714734,
"grad_norm": 0.2625736182079629,
"learning_rate": 3.150406504065041e-05,
"loss": 0.5524,
"step": 830
},
{
"epoch": 1.3025078369905956,
"grad_norm": 0.22843575111051379,
"learning_rate": 3.147502903600465e-05,
"loss": 0.4363,
"step": 831
},
{
"epoch": 1.3040752351097178,
"grad_norm": 0.23734431731059252,
"learning_rate": 3.144599303135889e-05,
"loss": 0.4791,
"step": 832
},
{
"epoch": 1.3056426332288402,
"grad_norm": 0.23144636212605293,
"learning_rate": 3.1416957026713125e-05,
"loss": 0.4629,
"step": 833
},
{
"epoch": 1.3072100313479624,
"grad_norm": 0.2313782249242891,
"learning_rate": 3.1387921022067365e-05,
"loss": 0.5113,
"step": 834
},
{
"epoch": 1.3087774294670846,
"grad_norm": 0.2707647881025698,
"learning_rate": 3.13588850174216e-05,
"loss": 0.5546,
"step": 835
},
{
"epoch": 1.3103448275862069,
"grad_norm": 0.24871068767643345,
"learning_rate": 3.132984901277584e-05,
"loss": 0.4805,
"step": 836
},
{
"epoch": 1.311912225705329,
"grad_norm": 0.24147768192399438,
"learning_rate": 3.130081300813008e-05,
"loss": 0.573,
"step": 837
},
{
"epoch": 1.3134796238244515,
"grad_norm": 0.2298247817132597,
"learning_rate": 3.127177700348432e-05,
"loss": 0.4558,
"step": 838
},
{
"epoch": 1.3150470219435737,
"grad_norm": 0.23747380101851623,
"learning_rate": 3.124274099883856e-05,
"loss": 0.5083,
"step": 839
},
{
"epoch": 1.316614420062696,
"grad_norm": 0.23341393024795143,
"learning_rate": 3.12137049941928e-05,
"loss": 0.5043,
"step": 840
},
{
"epoch": 1.3181818181818181,
"grad_norm": 0.22585235614033436,
"learning_rate": 3.118466898954704e-05,
"loss": 0.5192,
"step": 841
},
{
"epoch": 1.3197492163009406,
"grad_norm": 0.2453987283374587,
"learning_rate": 3.115563298490128e-05,
"loss": 0.5039,
"step": 842
},
{
"epoch": 1.3213166144200628,
"grad_norm": 0.23098532011163436,
"learning_rate": 3.112659698025552e-05,
"loss": 0.4456,
"step": 843
},
{
"epoch": 1.322884012539185,
"grad_norm": 0.24064250805638637,
"learning_rate": 3.109756097560976e-05,
"loss": 0.4619,
"step": 844
},
{
"epoch": 1.3244514106583072,
"grad_norm": 0.23993107767498045,
"learning_rate": 3.1068524970964e-05,
"loss": 0.5377,
"step": 845
},
{
"epoch": 1.3260188087774294,
"grad_norm": 0.23520073207715425,
"learning_rate": 3.103948896631824e-05,
"loss": 0.4992,
"step": 846
},
{
"epoch": 1.3275862068965516,
"grad_norm": 0.2382278723993846,
"learning_rate": 3.101045296167247e-05,
"loss": 0.4666,
"step": 847
},
{
"epoch": 1.329153605015674,
"grad_norm": 0.23199774837952214,
"learning_rate": 3.098141695702671e-05,
"loss": 0.4611,
"step": 848
},
{
"epoch": 1.3307210031347962,
"grad_norm": 0.23910556588638346,
"learning_rate": 3.095238095238095e-05,
"loss": 0.4681,
"step": 849
},
{
"epoch": 1.3322884012539185,
"grad_norm": 0.23123211095737917,
"learning_rate": 3.092334494773519e-05,
"loss": 0.499,
"step": 850
},
{
"epoch": 1.3338557993730409,
"grad_norm": 0.23248405038329087,
"learning_rate": 3.089430894308943e-05,
"loss": 0.4901,
"step": 851
},
{
"epoch": 1.335423197492163,
"grad_norm": 0.2524231539106836,
"learning_rate": 3.086527293844367e-05,
"loss": 0.4976,
"step": 852
},
{
"epoch": 1.3369905956112853,
"grad_norm": 12.08054308666848,
"learning_rate": 3.083623693379791e-05,
"loss": 0.755,
"step": 853
},
{
"epoch": 1.3385579937304075,
"grad_norm": 0.2293978279723572,
"learning_rate": 3.080720092915215e-05,
"loss": 0.511,
"step": 854
},
{
"epoch": 1.3401253918495297,
"grad_norm": 0.24569981062076934,
"learning_rate": 3.077816492450639e-05,
"loss": 0.5039,
"step": 855
},
{
"epoch": 1.341692789968652,
"grad_norm": 0.2234706802903541,
"learning_rate": 3.074912891986063e-05,
"loss": 0.5035,
"step": 856
},
{
"epoch": 1.3432601880877744,
"grad_norm": 0.2613836416764672,
"learning_rate": 3.072009291521487e-05,
"loss": 0.5627,
"step": 857
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.28438824066677365,
"learning_rate": 3.069105691056911e-05,
"loss": 0.5203,
"step": 858
},
{
"epoch": 1.3463949843260188,
"grad_norm": 0.25299556718603405,
"learning_rate": 3.066202090592335e-05,
"loss": 0.4505,
"step": 859
},
{
"epoch": 1.347962382445141,
"grad_norm": 0.35621123974136176,
"learning_rate": 3.063298490127759e-05,
"loss": 0.4404,
"step": 860
},
{
"epoch": 1.3495297805642634,
"grad_norm": 0.25720227399330625,
"learning_rate": 3.060394889663182e-05,
"loss": 0.4663,
"step": 861
},
{
"epoch": 1.3510971786833856,
"grad_norm": 0.22671395001555406,
"learning_rate": 3.057491289198606e-05,
"loss": 0.4792,
"step": 862
},
{
"epoch": 1.3526645768025078,
"grad_norm": 0.2736247072453033,
"learning_rate": 3.05458768873403e-05,
"loss": 0.5204,
"step": 863
},
{
"epoch": 1.35423197492163,
"grad_norm": 0.25323260452282204,
"learning_rate": 3.051684088269454e-05,
"loss": 0.5053,
"step": 864
},
{
"epoch": 1.3557993730407523,
"grad_norm": 0.23247819516966606,
"learning_rate": 3.048780487804878e-05,
"loss": 0.5035,
"step": 865
},
{
"epoch": 1.3573667711598745,
"grad_norm": 0.23105048560463196,
"learning_rate": 3.045876887340302e-05,
"loss": 0.461,
"step": 866
},
{
"epoch": 1.358934169278997,
"grad_norm": 0.27796513263233874,
"learning_rate": 3.042973286875726e-05,
"loss": 0.5849,
"step": 867
},
{
"epoch": 1.3605015673981191,
"grad_norm": 0.24814585179417864,
"learning_rate": 3.04006968641115e-05,
"loss": 0.5442,
"step": 868
},
{
"epoch": 1.3620689655172413,
"grad_norm": 0.22246261082605454,
"learning_rate": 3.037166085946574e-05,
"loss": 0.4618,
"step": 869
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.24227566409906076,
"learning_rate": 3.0342624854819977e-05,
"loss": 0.5256,
"step": 870
},
{
"epoch": 1.365203761755486,
"grad_norm": 0.28860736529708464,
"learning_rate": 3.0313588850174217e-05,
"loss": 0.5109,
"step": 871
},
{
"epoch": 1.3667711598746082,
"grad_norm": 0.22904740564213752,
"learning_rate": 3.0284552845528458e-05,
"loss": 0.4782,
"step": 872
},
{
"epoch": 1.3683385579937304,
"grad_norm": 0.2410311228577051,
"learning_rate": 3.0255516840882698e-05,
"loss": 0.5213,
"step": 873
},
{
"epoch": 1.3699059561128526,
"grad_norm": 0.22436954495379727,
"learning_rate": 3.0226480836236938e-05,
"loss": 0.4528,
"step": 874
},
{
"epoch": 1.3714733542319748,
"grad_norm": 0.7050744310262533,
"learning_rate": 3.0197444831591178e-05,
"loss": 0.4877,
"step": 875
},
{
"epoch": 1.3730407523510972,
"grad_norm": 0.8297543091461121,
"learning_rate": 3.0168408826945414e-05,
"loss": 0.4602,
"step": 876
},
{
"epoch": 1.3746081504702194,
"grad_norm": 0.29236298759521184,
"learning_rate": 3.0139372822299655e-05,
"loss": 0.5375,
"step": 877
},
{
"epoch": 1.3761755485893417,
"grad_norm": 0.2403388762337624,
"learning_rate": 3.0110336817653895e-05,
"loss": 0.4555,
"step": 878
},
{
"epoch": 1.3777429467084639,
"grad_norm": 0.2518317046954741,
"learning_rate": 3.0081300813008135e-05,
"loss": 0.5199,
"step": 879
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.2745268738788722,
"learning_rate": 3.0052264808362368e-05,
"loss": 0.5062,
"step": 880
},
{
"epoch": 1.3808777429467085,
"grad_norm": 0.2348029266806505,
"learning_rate": 3.0023228803716608e-05,
"loss": 0.4567,
"step": 881
},
{
"epoch": 1.3824451410658307,
"grad_norm": 0.24324796508446844,
"learning_rate": 2.9994192799070848e-05,
"loss": 0.4805,
"step": 882
},
{
"epoch": 1.384012539184953,
"grad_norm": 0.23431633364257168,
"learning_rate": 2.9965156794425088e-05,
"loss": 0.5299,
"step": 883
},
{
"epoch": 1.3855799373040751,
"grad_norm": 0.2549508361785958,
"learning_rate": 2.9936120789779325e-05,
"loss": 0.5418,
"step": 884
},
{
"epoch": 1.3871473354231976,
"grad_norm": 0.21765575468194998,
"learning_rate": 2.9907084785133565e-05,
"loss": 0.4582,
"step": 885
},
{
"epoch": 1.3887147335423198,
"grad_norm": 0.24262359751394671,
"learning_rate": 2.9878048780487805e-05,
"loss": 0.4941,
"step": 886
},
{
"epoch": 1.390282131661442,
"grad_norm": 0.2348357018718934,
"learning_rate": 2.9849012775842045e-05,
"loss": 0.5275,
"step": 887
},
{
"epoch": 1.3918495297805642,
"grad_norm": 0.3638755948842071,
"learning_rate": 2.9819976771196285e-05,
"loss": 0.457,
"step": 888
},
{
"epoch": 1.3934169278996866,
"grad_norm": 0.22787150918911697,
"learning_rate": 2.9790940766550525e-05,
"loss": 0.4966,
"step": 889
},
{
"epoch": 1.3949843260188088,
"grad_norm": 0.2594052193331762,
"learning_rate": 2.9761904761904762e-05,
"loss": 0.5482,
"step": 890
},
{
"epoch": 1.396551724137931,
"grad_norm": 0.2674185835252388,
"learning_rate": 2.9732868757259002e-05,
"loss": 0.564,
"step": 891
},
{
"epoch": 1.3981191222570533,
"grad_norm": 0.266443573054244,
"learning_rate": 2.9703832752613242e-05,
"loss": 0.548,
"step": 892
},
{
"epoch": 1.3996865203761755,
"grad_norm": 4.099932475677585,
"learning_rate": 2.9674796747967482e-05,
"loss": 0.5328,
"step": 893
},
{
"epoch": 1.4012539184952977,
"grad_norm": 0.29582956843654296,
"learning_rate": 2.9645760743321722e-05,
"loss": 0.5868,
"step": 894
},
{
"epoch": 1.40282131661442,
"grad_norm": 0.24284793803302607,
"learning_rate": 2.9616724738675962e-05,
"loss": 0.4307,
"step": 895
},
{
"epoch": 1.4043887147335423,
"grad_norm": 0.24485000592038864,
"learning_rate": 2.95876887340302e-05,
"loss": 0.5783,
"step": 896
},
{
"epoch": 1.4059561128526645,
"grad_norm": 0.3057413184497551,
"learning_rate": 2.955865272938444e-05,
"loss": 0.5064,
"step": 897
},
{
"epoch": 1.407523510971787,
"grad_norm": 0.27234034846739114,
"learning_rate": 2.952961672473868e-05,
"loss": 0.5036,
"step": 898
},
{
"epoch": 1.4090909090909092,
"grad_norm": 0.23809075950790443,
"learning_rate": 2.950058072009292e-05,
"loss": 0.5302,
"step": 899
},
{
"epoch": 1.4106583072100314,
"grad_norm": 0.23478636377242862,
"learning_rate": 2.947154471544716e-05,
"loss": 0.3946,
"step": 900
},
{
"epoch": 1.4122257053291536,
"grad_norm": 0.26680732366372806,
"learning_rate": 2.9442508710801396e-05,
"loss": 0.4567,
"step": 901
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.2744946246471577,
"learning_rate": 2.9413472706155636e-05,
"loss": 0.5026,
"step": 902
},
{
"epoch": 1.415360501567398,
"grad_norm": 0.23183096115569757,
"learning_rate": 2.9384436701509873e-05,
"loss": 0.5179,
"step": 903
},
{
"epoch": 1.4169278996865204,
"grad_norm": 0.25146841188216656,
"learning_rate": 2.935540069686411e-05,
"loss": 0.4533,
"step": 904
},
{
"epoch": 1.4184952978056427,
"grad_norm": 1.0692771068431026,
"learning_rate": 2.932636469221835e-05,
"loss": 0.5092,
"step": 905
},
{
"epoch": 1.4200626959247649,
"grad_norm": 0.2545939539782089,
"learning_rate": 2.929732868757259e-05,
"loss": 0.5143,
"step": 906
},
{
"epoch": 1.421630094043887,
"grad_norm": 0.2672794389743849,
"learning_rate": 2.926829268292683e-05,
"loss": 0.4859,
"step": 907
},
{
"epoch": 1.4231974921630095,
"grad_norm": 0.2513982824899657,
"learning_rate": 2.923925667828107e-05,
"loss": 0.4915,
"step": 908
},
{
"epoch": 1.4247648902821317,
"grad_norm": 0.26661859380934416,
"learning_rate": 2.9210220673635306e-05,
"loss": 0.536,
"step": 909
},
{
"epoch": 1.426332288401254,
"grad_norm": 0.2855334725056522,
"learning_rate": 2.9181184668989546e-05,
"loss": 0.5619,
"step": 910
},
{
"epoch": 1.4278996865203761,
"grad_norm": 0.24995264330261305,
"learning_rate": 2.9152148664343786e-05,
"loss": 0.5694,
"step": 911
},
{
"epoch": 1.4294670846394983,
"grad_norm": 0.2545340874539848,
"learning_rate": 2.9123112659698026e-05,
"loss": 0.5177,
"step": 912
},
{
"epoch": 1.4310344827586206,
"grad_norm": 0.2772197854500256,
"learning_rate": 2.9094076655052267e-05,
"loss": 0.4674,
"step": 913
},
{
"epoch": 1.432601880877743,
"grad_norm": 0.26488079979428186,
"learning_rate": 2.9065040650406507e-05,
"loss": 0.6185,
"step": 914
},
{
"epoch": 1.4341692789968652,
"grad_norm": 0.2133244920607121,
"learning_rate": 2.9036004645760743e-05,
"loss": 0.4445,
"step": 915
},
{
"epoch": 1.4357366771159874,
"grad_norm": 0.2938166933142752,
"learning_rate": 2.9006968641114983e-05,
"loss": 0.5176,
"step": 916
},
{
"epoch": 1.4373040752351098,
"grad_norm": 0.45339160811970514,
"learning_rate": 2.8977932636469223e-05,
"loss": 0.4756,
"step": 917
},
{
"epoch": 1.438871473354232,
"grad_norm": 0.2744290192184883,
"learning_rate": 2.8948896631823464e-05,
"loss": 0.5425,
"step": 918
},
{
"epoch": 1.4404388714733543,
"grad_norm": 0.27448494145047875,
"learning_rate": 2.8919860627177704e-05,
"loss": 0.5283,
"step": 919
},
{
"epoch": 1.4420062695924765,
"grad_norm": 0.24209362117240368,
"learning_rate": 2.8890824622531944e-05,
"loss": 0.5246,
"step": 920
},
{
"epoch": 1.4435736677115987,
"grad_norm": 0.2793593177901661,
"learning_rate": 2.886178861788618e-05,
"loss": 0.4649,
"step": 921
},
{
"epoch": 1.4451410658307209,
"grad_norm": 0.24351531292486417,
"learning_rate": 2.883275261324042e-05,
"loss": 0.4526,
"step": 922
},
{
"epoch": 1.4467084639498433,
"grad_norm": 0.22008043689761916,
"learning_rate": 2.880371660859466e-05,
"loss": 0.4745,
"step": 923
},
{
"epoch": 1.4482758620689655,
"grad_norm": 1.4147758049139687,
"learning_rate": 2.87746806039489e-05,
"loss": 0.4985,
"step": 924
},
{
"epoch": 1.4498432601880877,
"grad_norm": 0.2553802644061815,
"learning_rate": 2.874564459930314e-05,
"loss": 0.501,
"step": 925
},
{
"epoch": 1.4514106583072102,
"grad_norm": 0.288626781858355,
"learning_rate": 2.871660859465738e-05,
"loss": 0.575,
"step": 926
},
{
"epoch": 1.4529780564263324,
"grad_norm": 0.24383612036076524,
"learning_rate": 2.8687572590011614e-05,
"loss": 0.5001,
"step": 927
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.2665033755302161,
"learning_rate": 2.8658536585365854e-05,
"loss": 0.4875,
"step": 928
},
{
"epoch": 1.4561128526645768,
"grad_norm": 0.3638955735681137,
"learning_rate": 2.862950058072009e-05,
"loss": 0.4859,
"step": 929
},
{
"epoch": 1.457680250783699,
"grad_norm": 0.3255641464585736,
"learning_rate": 2.860046457607433e-05,
"loss": 0.4673,
"step": 930
},
{
"epoch": 1.4592476489028212,
"grad_norm": 0.2601012073297471,
"learning_rate": 2.857142857142857e-05,
"loss": 0.5501,
"step": 931
},
{
"epoch": 1.4608150470219436,
"grad_norm": 0.2750647603260477,
"learning_rate": 2.854239256678281e-05,
"loss": 0.5373,
"step": 932
},
{
"epoch": 1.4623824451410659,
"grad_norm": 0.2458312736665801,
"learning_rate": 2.851335656213705e-05,
"loss": 0.4486,
"step": 933
},
{
"epoch": 1.463949843260188,
"grad_norm": 0.34020925339018193,
"learning_rate": 2.848432055749129e-05,
"loss": 0.5321,
"step": 934
},
{
"epoch": 1.4655172413793103,
"grad_norm": 0.25979937231200845,
"learning_rate": 2.8455284552845528e-05,
"loss": 0.5149,
"step": 935
},
{
"epoch": 1.4670846394984327,
"grad_norm": 0.27179619897185475,
"learning_rate": 2.8426248548199768e-05,
"loss": 0.5354,
"step": 936
},
{
"epoch": 1.468652037617555,
"grad_norm": 0.2342259899166794,
"learning_rate": 2.8397212543554008e-05,
"loss": 0.5001,
"step": 937
},
{
"epoch": 1.4702194357366771,
"grad_norm": 0.294680780516466,
"learning_rate": 2.8368176538908248e-05,
"loss": 0.5296,
"step": 938
},
{
"epoch": 1.4717868338557993,
"grad_norm": 0.2603552658646867,
"learning_rate": 2.8339140534262488e-05,
"loss": 0.5534,
"step": 939
},
{
"epoch": 1.4733542319749215,
"grad_norm": 0.24239422867447952,
"learning_rate": 2.8310104529616728e-05,
"loss": 0.5123,
"step": 940
},
{
"epoch": 1.4749216300940438,
"grad_norm": 0.2510741802352688,
"learning_rate": 2.8281068524970965e-05,
"loss": 0.5398,
"step": 941
},
{
"epoch": 1.4764890282131662,
"grad_norm": 0.26263090783265963,
"learning_rate": 2.8252032520325205e-05,
"loss": 0.4813,
"step": 942
},
{
"epoch": 1.4780564263322884,
"grad_norm": 0.23025578517037956,
"learning_rate": 2.8222996515679445e-05,
"loss": 0.536,
"step": 943
},
{
"epoch": 1.4796238244514106,
"grad_norm": 0.2463266661125675,
"learning_rate": 2.8193960511033685e-05,
"loss": 0.4859,
"step": 944
},
{
"epoch": 1.481191222570533,
"grad_norm": 0.261248311552456,
"learning_rate": 2.8164924506387925e-05,
"loss": 0.5312,
"step": 945
},
{
"epoch": 1.4827586206896552,
"grad_norm": 0.24634390813650336,
"learning_rate": 2.8135888501742165e-05,
"loss": 0.5337,
"step": 946
},
{
"epoch": 1.4843260188087775,
"grad_norm": 0.24994745765206153,
"learning_rate": 2.8106852497096402e-05,
"loss": 0.5117,
"step": 947
},
{
"epoch": 1.4858934169278997,
"grad_norm": 0.2487569456497344,
"learning_rate": 2.8077816492450642e-05,
"loss": 0.4891,
"step": 948
},
{
"epoch": 1.4874608150470219,
"grad_norm": 0.24133491244025632,
"learning_rate": 2.8048780487804882e-05,
"loss": 0.4488,
"step": 949
},
{
"epoch": 1.489028213166144,
"grad_norm": 0.22684545638700815,
"learning_rate": 2.8019744483159115e-05,
"loss": 0.5045,
"step": 950
},
{
"epoch": 1.4905956112852665,
"grad_norm": 0.22487337574272886,
"learning_rate": 2.7990708478513355e-05,
"loss": 0.4718,
"step": 951
},
{
"epoch": 1.4921630094043887,
"grad_norm": 0.23972456485927374,
"learning_rate": 2.7961672473867595e-05,
"loss": 0.4473,
"step": 952
},
{
"epoch": 1.493730407523511,
"grad_norm": 0.2235643364228768,
"learning_rate": 2.7932636469221835e-05,
"loss": 0.5029,
"step": 953
},
{
"epoch": 1.4952978056426331,
"grad_norm": 0.23646518841624745,
"learning_rate": 2.7903600464576076e-05,
"loss": 0.4391,
"step": 954
},
{
"epoch": 1.4968652037617556,
"grad_norm": 0.23398657104258674,
"learning_rate": 2.7874564459930312e-05,
"loss": 0.459,
"step": 955
},
{
"epoch": 1.4984326018808778,
"grad_norm": 0.8664674486026802,
"learning_rate": 2.7845528455284552e-05,
"loss": 0.5328,
"step": 956
},
{
"epoch": 1.5,
"grad_norm": 0.2492598222638702,
"learning_rate": 2.7816492450638792e-05,
"loss": 0.4895,
"step": 957
},
{
"epoch": 1.5015673981191222,
"grad_norm": 0.25671501660386625,
"learning_rate": 2.7787456445993032e-05,
"loss": 0.5901,
"step": 958
},
{
"epoch": 1.5031347962382444,
"grad_norm": 0.21955115713408843,
"learning_rate": 2.7758420441347272e-05,
"loss": 0.4569,
"step": 959
},
{
"epoch": 1.5047021943573666,
"grad_norm": 0.23037682185197492,
"learning_rate": 2.7729384436701513e-05,
"loss": 0.482,
"step": 960
},
{
"epoch": 1.506269592476489,
"grad_norm": 0.24642583807221835,
"learning_rate": 2.770034843205575e-05,
"loss": 0.5032,
"step": 961
},
{
"epoch": 1.5078369905956113,
"grad_norm": 0.22514048951604781,
"learning_rate": 2.767131242740999e-05,
"loss": 0.4712,
"step": 962
},
{
"epoch": 1.5094043887147337,
"grad_norm": 0.211521228556795,
"learning_rate": 2.764227642276423e-05,
"loss": 0.4365,
"step": 963
},
{
"epoch": 1.510971786833856,
"grad_norm": 0.23395689531595548,
"learning_rate": 2.761324041811847e-05,
"loss": 0.5707,
"step": 964
},
{
"epoch": 1.5125391849529781,
"grad_norm": 1.1848485234255033,
"learning_rate": 2.758420441347271e-05,
"loss": 0.5425,
"step": 965
},
{
"epoch": 1.5141065830721003,
"grad_norm": 0.22440328609142943,
"learning_rate": 2.755516840882695e-05,
"loss": 0.4812,
"step": 966
},
{
"epoch": 1.5156739811912225,
"grad_norm": 0.24149395181436178,
"learning_rate": 2.7526132404181186e-05,
"loss": 0.5115,
"step": 967
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.5940438601925457,
"learning_rate": 2.7497096399535426e-05,
"loss": 0.5613,
"step": 968
},
{
"epoch": 1.518808777429467,
"grad_norm": 0.24752339759670966,
"learning_rate": 2.7468060394889666e-05,
"loss": 0.4822,
"step": 969
},
{
"epoch": 1.5203761755485894,
"grad_norm": 0.2505796267643419,
"learning_rate": 2.7439024390243906e-05,
"loss": 0.5125,
"step": 970
},
{
"epoch": 1.5219435736677116,
"grad_norm": 0.24199834483107444,
"learning_rate": 2.7409988385598147e-05,
"loss": 0.5071,
"step": 971
},
{
"epoch": 1.5235109717868338,
"grad_norm": 0.22601612527917792,
"learning_rate": 2.7380952380952383e-05,
"loss": 0.469,
"step": 972
},
{
"epoch": 1.5250783699059562,
"grad_norm": 0.23686287953416563,
"learning_rate": 2.735191637630662e-05,
"loss": 0.4611,
"step": 973
},
{
"epoch": 1.5266457680250785,
"grad_norm": 0.24776566150323362,
"learning_rate": 2.732288037166086e-05,
"loss": 0.4744,
"step": 974
},
{
"epoch": 1.5282131661442007,
"grad_norm": 0.2395698980847038,
"learning_rate": 2.7293844367015097e-05,
"loss": 0.556,
"step": 975
},
{
"epoch": 1.5297805642633229,
"grad_norm": 0.232298319814481,
"learning_rate": 2.7264808362369337e-05,
"loss": 0.5132,
"step": 976
},
{
"epoch": 1.531347962382445,
"grad_norm": 0.24270945666779017,
"learning_rate": 2.7235772357723577e-05,
"loss": 0.5312,
"step": 977
},
{
"epoch": 1.5329153605015673,
"grad_norm": 0.2145682122689975,
"learning_rate": 2.7206736353077817e-05,
"loss": 0.4269,
"step": 978
},
{
"epoch": 1.5344827586206895,
"grad_norm": 0.2530160167168309,
"learning_rate": 2.7177700348432057e-05,
"loss": 0.5418,
"step": 979
},
{
"epoch": 1.536050156739812,
"grad_norm": 0.23034772471079762,
"learning_rate": 2.7148664343786294e-05,
"loss": 0.4623,
"step": 980
},
{
"epoch": 1.5376175548589341,
"grad_norm": 0.2259099820809604,
"learning_rate": 2.7119628339140534e-05,
"loss": 0.4744,
"step": 981
},
{
"epoch": 1.5391849529780566,
"grad_norm": 0.2513770362617985,
"learning_rate": 2.7090592334494774e-05,
"loss": 0.5677,
"step": 982
},
{
"epoch": 1.5407523510971788,
"grad_norm": 0.3060928870902104,
"learning_rate": 2.7061556329849014e-05,
"loss": 0.4511,
"step": 983
},
{
"epoch": 1.542319749216301,
"grad_norm": 0.24206146043816232,
"learning_rate": 2.7032520325203254e-05,
"loss": 0.4902,
"step": 984
},
{
"epoch": 1.5438871473354232,
"grad_norm": 0.255918132948632,
"learning_rate": 2.7003484320557494e-05,
"loss": 0.5508,
"step": 985
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.23172684676930663,
"learning_rate": 2.697444831591173e-05,
"loss": 0.4778,
"step": 986
},
{
"epoch": 1.5470219435736676,
"grad_norm": 0.267438131197053,
"learning_rate": 2.694541231126597e-05,
"loss": 0.5172,
"step": 987
},
{
"epoch": 1.5485893416927898,
"grad_norm": 0.2611068753181423,
"learning_rate": 2.691637630662021e-05,
"loss": 0.5224,
"step": 988
},
{
"epoch": 1.5501567398119123,
"grad_norm": 0.2189056335286451,
"learning_rate": 2.688734030197445e-05,
"loss": 0.4554,
"step": 989
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.25095981661654204,
"learning_rate": 2.685830429732869e-05,
"loss": 0.5094,
"step": 990
},
{
"epoch": 1.5532915360501567,
"grad_norm": 0.2529710518626457,
"learning_rate": 2.682926829268293e-05,
"loss": 0.4755,
"step": 991
},
{
"epoch": 1.5548589341692791,
"grad_norm": 0.26733878801403144,
"learning_rate": 2.6800232288037168e-05,
"loss": 0.5774,
"step": 992
},
{
"epoch": 1.5564263322884013,
"grad_norm": 0.24189193866404185,
"learning_rate": 2.6771196283391408e-05,
"loss": 0.4849,
"step": 993
},
{
"epoch": 1.5579937304075235,
"grad_norm": 0.2758337508036839,
"learning_rate": 2.6742160278745648e-05,
"loss": 0.549,
"step": 994
},
{
"epoch": 1.5595611285266457,
"grad_norm": 0.22012820037849942,
"learning_rate": 2.6713124274099888e-05,
"loss": 0.4521,
"step": 995
},
{
"epoch": 1.561128526645768,
"grad_norm": 0.7120312063538552,
"learning_rate": 2.668408826945412e-05,
"loss": 0.5021,
"step": 996
},
{
"epoch": 1.5626959247648902,
"grad_norm": 0.24732619881205703,
"learning_rate": 2.665505226480836e-05,
"loss": 0.4807,
"step": 997
},
{
"epoch": 1.5642633228840124,
"grad_norm": 0.23290484277182008,
"learning_rate": 2.66260162601626e-05,
"loss": 0.4874,
"step": 998
},
{
"epoch": 1.5658307210031348,
"grad_norm": 0.24638986943932606,
"learning_rate": 2.659698025551684e-05,
"loss": 0.4947,
"step": 999
},
{
"epoch": 1.567398119122257,
"grad_norm": 0.24704275746845988,
"learning_rate": 2.6567944250871078e-05,
"loss": 0.5173,
"step": 1000
},
{
"epoch": 1.5689655172413794,
"grad_norm": 0.22080685681729587,
"learning_rate": 2.6538908246225318e-05,
"loss": 0.4675,
"step": 1001
},
{
"epoch": 1.5705329153605017,
"grad_norm": 0.21505875858381654,
"learning_rate": 2.6509872241579558e-05,
"loss": 0.4463,
"step": 1002
},
{
"epoch": 1.5721003134796239,
"grad_norm": 0.2310410688599683,
"learning_rate": 2.6480836236933798e-05,
"loss": 0.4743,
"step": 1003
},
{
"epoch": 1.573667711598746,
"grad_norm": 0.22699304001850337,
"learning_rate": 2.645180023228804e-05,
"loss": 0.4842,
"step": 1004
},
{
"epoch": 1.5752351097178683,
"grad_norm": 0.26335153506676084,
"learning_rate": 2.642276422764228e-05,
"loss": 0.5349,
"step": 1005
},
{
"epoch": 1.5768025078369905,
"grad_norm": 0.2382628784271775,
"learning_rate": 2.6393728222996515e-05,
"loss": 0.5106,
"step": 1006
},
{
"epoch": 1.5783699059561127,
"grad_norm": 0.22962899820062,
"learning_rate": 2.6364692218350755e-05,
"loss": 0.5175,
"step": 1007
},
{
"epoch": 1.5799373040752351,
"grad_norm": 0.2381938264351059,
"learning_rate": 2.6335656213704995e-05,
"loss": 0.481,
"step": 1008
},
{
"epoch": 1.5815047021943573,
"grad_norm": 0.24433064809946514,
"learning_rate": 2.6306620209059235e-05,
"loss": 0.5098,
"step": 1009
},
{
"epoch": 1.5830721003134798,
"grad_norm": 0.24451007384051987,
"learning_rate": 2.6277584204413475e-05,
"loss": 0.528,
"step": 1010
},
{
"epoch": 1.584639498432602,
"grad_norm": 0.222932716759317,
"learning_rate": 2.6248548199767715e-05,
"loss": 0.4784,
"step": 1011
},
{
"epoch": 1.5862068965517242,
"grad_norm": 0.303739318623454,
"learning_rate": 2.6219512195121952e-05,
"loss": 0.5632,
"step": 1012
},
{
"epoch": 1.5877742946708464,
"grad_norm": 0.22759644714428764,
"learning_rate": 2.6190476190476192e-05,
"loss": 0.4457,
"step": 1013
},
{
"epoch": 1.5893416927899686,
"grad_norm": 0.23695783275426968,
"learning_rate": 2.6161440185830432e-05,
"loss": 0.5616,
"step": 1014
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.21709644753828625,
"learning_rate": 2.6132404181184672e-05,
"loss": 0.4366,
"step": 1015
},
{
"epoch": 1.592476489028213,
"grad_norm": 0.22918773085044242,
"learning_rate": 2.6103368176538912e-05,
"loss": 0.4532,
"step": 1016
},
{
"epoch": 1.5940438871473355,
"grad_norm": 0.23933868764475408,
"learning_rate": 2.6074332171893152e-05,
"loss": 0.534,
"step": 1017
},
{
"epoch": 1.5956112852664577,
"grad_norm": 0.2621117241918842,
"learning_rate": 2.604529616724739e-05,
"loss": 0.5847,
"step": 1018
},
{
"epoch": 1.59717868338558,
"grad_norm": 0.284204307879969,
"learning_rate": 2.601626016260163e-05,
"loss": 0.5767,
"step": 1019
},
{
"epoch": 1.5987460815047023,
"grad_norm": 0.2317981311148715,
"learning_rate": 2.5987224157955863e-05,
"loss": 0.5236,
"step": 1020
},
{
"epoch": 1.6003134796238245,
"grad_norm": 0.24514374605170278,
"learning_rate": 2.5958188153310103e-05,
"loss": 0.5197,
"step": 1021
},
{
"epoch": 1.6018808777429467,
"grad_norm": 0.2606412315768581,
"learning_rate": 2.5929152148664343e-05,
"loss": 0.5784,
"step": 1022
},
{
"epoch": 1.603448275862069,
"grad_norm": 0.481782267695353,
"learning_rate": 2.5900116144018583e-05,
"loss": 0.5111,
"step": 1023
},
{
"epoch": 1.6050156739811912,
"grad_norm": 0.22067202742490097,
"learning_rate": 2.5871080139372823e-05,
"loss": 0.4371,
"step": 1024
},
{
"epoch": 1.6065830721003134,
"grad_norm": 0.29491718947893636,
"learning_rate": 2.5842044134727063e-05,
"loss": 0.5459,
"step": 1025
},
{
"epoch": 1.6081504702194356,
"grad_norm": 0.2423184759626812,
"learning_rate": 2.58130081300813e-05,
"loss": 0.488,
"step": 1026
},
{
"epoch": 1.609717868338558,
"grad_norm": 0.2731295116513863,
"learning_rate": 2.578397212543554e-05,
"loss": 0.5274,
"step": 1027
},
{
"epoch": 1.6112852664576802,
"grad_norm": 0.23640513676798183,
"learning_rate": 2.575493612078978e-05,
"loss": 0.4559,
"step": 1028
},
{
"epoch": 1.6128526645768027,
"grad_norm": 0.23583724775172812,
"learning_rate": 2.572590011614402e-05,
"loss": 0.4864,
"step": 1029
},
{
"epoch": 1.6144200626959249,
"grad_norm": 0.2447451391101188,
"learning_rate": 2.569686411149826e-05,
"loss": 0.5187,
"step": 1030
},
{
"epoch": 1.615987460815047,
"grad_norm": 0.24421149206796003,
"learning_rate": 2.56678281068525e-05,
"loss": 0.529,
"step": 1031
},
{
"epoch": 1.6175548589341693,
"grad_norm": 0.26522637202760924,
"learning_rate": 2.5638792102206737e-05,
"loss": 0.5173,
"step": 1032
},
{
"epoch": 1.6191222570532915,
"grad_norm": 0.2312017400299983,
"learning_rate": 2.5609756097560977e-05,
"loss": 0.4751,
"step": 1033
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.24140062008707316,
"learning_rate": 2.5580720092915217e-05,
"loss": 0.5267,
"step": 1034
},
{
"epoch": 1.622257053291536,
"grad_norm": 0.23380038741654843,
"learning_rate": 2.5551684088269457e-05,
"loss": 0.4349,
"step": 1035
},
{
"epoch": 1.6238244514106583,
"grad_norm": 0.22810482044061892,
"learning_rate": 2.5522648083623697e-05,
"loss": 0.4274,
"step": 1036
},
{
"epoch": 1.6253918495297806,
"grad_norm": 0.25274552991682897,
"learning_rate": 2.5493612078977937e-05,
"loss": 0.504,
"step": 1037
},
{
"epoch": 1.626959247648903,
"grad_norm": 0.22793633688225792,
"learning_rate": 2.5464576074332174e-05,
"loss": 0.4975,
"step": 1038
},
{
"epoch": 1.6285266457680252,
"grad_norm": 0.27149983750441337,
"learning_rate": 2.5435540069686414e-05,
"loss": 0.5459,
"step": 1039
},
{
"epoch": 1.6300940438871474,
"grad_norm": 0.29901697425805907,
"learning_rate": 2.5406504065040654e-05,
"loss": 0.5845,
"step": 1040
},
{
"epoch": 1.6316614420062696,
"grad_norm": 0.22493420456523933,
"learning_rate": 2.5377468060394894e-05,
"loss": 0.4722,
"step": 1041
},
{
"epoch": 1.6332288401253918,
"grad_norm": 0.22545516547435465,
"learning_rate": 2.5348432055749134e-05,
"loss": 0.429,
"step": 1042
},
{
"epoch": 1.634796238244514,
"grad_norm": 0.24255652571687983,
"learning_rate": 2.5319396051103367e-05,
"loss": 0.5021,
"step": 1043
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.24905848645290804,
"learning_rate": 2.5290360046457607e-05,
"loss": 0.5156,
"step": 1044
},
{
"epoch": 1.6379310344827587,
"grad_norm": 0.23785717493738542,
"learning_rate": 2.5261324041811847e-05,
"loss": 0.5262,
"step": 1045
},
{
"epoch": 1.6394984326018809,
"grad_norm": 0.22045712100139658,
"learning_rate": 2.5232288037166084e-05,
"loss": 0.461,
"step": 1046
},
{
"epoch": 1.641065830721003,
"grad_norm": 0.21698127609165585,
"learning_rate": 2.5203252032520324e-05,
"loss": 0.4393,
"step": 1047
},
{
"epoch": 1.6426332288401255,
"grad_norm": 0.2455979232651054,
"learning_rate": 2.5174216027874564e-05,
"loss": 0.4684,
"step": 1048
},
{
"epoch": 1.6442006269592477,
"grad_norm": 0.20571338871050623,
"learning_rate": 2.5145180023228804e-05,
"loss": 0.4547,
"step": 1049
},
{
"epoch": 1.64576802507837,
"grad_norm": 0.22064210812512006,
"learning_rate": 2.5116144018583044e-05,
"loss": 0.4629,
"step": 1050
},
{
"epoch": 1.6473354231974922,
"grad_norm": 0.23094965403956225,
"learning_rate": 2.5087108013937284e-05,
"loss": 0.5328,
"step": 1051
},
{
"epoch": 1.6489028213166144,
"grad_norm": 0.2253739478134092,
"learning_rate": 2.505807200929152e-05,
"loss": 0.4661,
"step": 1052
},
{
"epoch": 1.6504702194357366,
"grad_norm": 0.22409490624386982,
"learning_rate": 2.502903600464576e-05,
"loss": 0.4689,
"step": 1053
},
{
"epoch": 1.6520376175548588,
"grad_norm": 0.22050911911304252,
"learning_rate": 2.5e-05,
"loss": 0.4724,
"step": 1054
},
{
"epoch": 1.6536050156739812,
"grad_norm": 0.8863375028508823,
"learning_rate": 2.497096399535424e-05,
"loss": 0.4757,
"step": 1055
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.23502433490000255,
"learning_rate": 2.494192799070848e-05,
"loss": 0.4945,
"step": 1056
},
{
"epoch": 1.6567398119122259,
"grad_norm": 0.25873101733346393,
"learning_rate": 2.4912891986062718e-05,
"loss": 0.4849,
"step": 1057
},
{
"epoch": 1.658307210031348,
"grad_norm": 0.24805168633623373,
"learning_rate": 2.4883855981416958e-05,
"loss": 0.5141,
"step": 1058
},
{
"epoch": 1.6598746081504703,
"grad_norm": 0.2847798863946505,
"learning_rate": 2.4854819976771198e-05,
"loss": 0.4839,
"step": 1059
},
{
"epoch": 1.6614420062695925,
"grad_norm": 0.33001122815159334,
"learning_rate": 2.4825783972125435e-05,
"loss": 0.4648,
"step": 1060
},
{
"epoch": 1.6630094043887147,
"grad_norm": 0.2842747426985423,
"learning_rate": 2.4796747967479675e-05,
"loss": 0.5806,
"step": 1061
},
{
"epoch": 1.664576802507837,
"grad_norm": 0.23445161122553101,
"learning_rate": 2.4767711962833915e-05,
"loss": 0.4792,
"step": 1062
},
{
"epoch": 1.6661442006269591,
"grad_norm": 0.2803901297137085,
"learning_rate": 2.4738675958188155e-05,
"loss": 0.5201,
"step": 1063
},
{
"epoch": 1.6677115987460815,
"grad_norm": 0.3012138945115301,
"learning_rate": 2.4709639953542392e-05,
"loss": 0.5338,
"step": 1064
},
{
"epoch": 1.6692789968652038,
"grad_norm": 0.28190575057033707,
"learning_rate": 2.4680603948896632e-05,
"loss": 0.4996,
"step": 1065
},
{
"epoch": 1.670846394984326,
"grad_norm": 0.2425343536871759,
"learning_rate": 2.4651567944250872e-05,
"loss": 0.4815,
"step": 1066
},
{
"epoch": 1.6724137931034484,
"grad_norm": 0.25649035845586127,
"learning_rate": 2.4622531939605112e-05,
"loss": 0.4071,
"step": 1067
},
{
"epoch": 1.6739811912225706,
"grad_norm": 0.28368159296396156,
"learning_rate": 2.4593495934959352e-05,
"loss": 0.4546,
"step": 1068
},
{
"epoch": 1.6755485893416928,
"grad_norm": 0.253043124019978,
"learning_rate": 2.4564459930313592e-05,
"loss": 0.4823,
"step": 1069
},
{
"epoch": 1.677115987460815,
"grad_norm": 0.2554671068303853,
"learning_rate": 2.453542392566783e-05,
"loss": 0.4787,
"step": 1070
},
{
"epoch": 1.6786833855799372,
"grad_norm": 0.26705994310816705,
"learning_rate": 2.450638792102207e-05,
"loss": 0.552,
"step": 1071
},
{
"epoch": 1.6802507836990594,
"grad_norm": 0.2707104326962692,
"learning_rate": 2.4477351916376306e-05,
"loss": 0.4425,
"step": 1072
},
{
"epoch": 1.6818181818181817,
"grad_norm": 0.28525123886196896,
"learning_rate": 2.4448315911730546e-05,
"loss": 0.5535,
"step": 1073
},
{
"epoch": 1.683385579937304,
"grad_norm": 0.24385736791846557,
"learning_rate": 2.4419279907084786e-05,
"loss": 0.522,
"step": 1074
},
{
"epoch": 1.6849529780564263,
"grad_norm": 0.23575183482764786,
"learning_rate": 2.4390243902439026e-05,
"loss": 0.4612,
"step": 1075
},
{
"epoch": 1.6865203761755487,
"grad_norm": 0.24126220934032092,
"learning_rate": 2.4361207897793266e-05,
"loss": 0.5001,
"step": 1076
},
{
"epoch": 1.688087774294671,
"grad_norm": 0.24140196329167837,
"learning_rate": 2.4332171893147502e-05,
"loss": 0.4683,
"step": 1077
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.22674721600521686,
"learning_rate": 2.4303135888501743e-05,
"loss": 0.4253,
"step": 1078
},
{
"epoch": 1.6912225705329154,
"grad_norm": 0.24341491533409365,
"learning_rate": 2.4274099883855983e-05,
"loss": 0.4847,
"step": 1079
},
{
"epoch": 1.6927899686520376,
"grad_norm": 0.25066284939561445,
"learning_rate": 2.4245063879210223e-05,
"loss": 0.5419,
"step": 1080
},
{
"epoch": 1.6943573667711598,
"grad_norm": 0.26097075856509055,
"learning_rate": 2.4216027874564463e-05,
"loss": 0.5191,
"step": 1081
},
{
"epoch": 1.695924764890282,
"grad_norm": 0.24363987496872533,
"learning_rate": 2.4186991869918703e-05,
"loss": 0.5259,
"step": 1082
},
{
"epoch": 1.6974921630094044,
"grad_norm": 0.23369596420712785,
"learning_rate": 2.415795586527294e-05,
"loss": 0.481,
"step": 1083
},
{
"epoch": 1.6990595611285266,
"grad_norm": 0.2323687873395579,
"learning_rate": 2.4128919860627176e-05,
"loss": 0.4489,
"step": 1084
},
{
"epoch": 1.700626959247649,
"grad_norm": 0.22838015688559432,
"learning_rate": 2.4099883855981416e-05,
"loss": 0.4555,
"step": 1085
},
{
"epoch": 1.7021943573667713,
"grad_norm": 0.26237394192307323,
"learning_rate": 2.4070847851335656e-05,
"loss": 0.5186,
"step": 1086
},
{
"epoch": 1.7037617554858935,
"grad_norm": 0.22151192989912424,
"learning_rate": 2.4041811846689896e-05,
"loss": 0.4809,
"step": 1087
},
{
"epoch": 1.7053291536050157,
"grad_norm": 0.660514415524591,
"learning_rate": 2.4012775842044136e-05,
"loss": 0.5458,
"step": 1088
},
{
"epoch": 1.706896551724138,
"grad_norm": 0.2906130283055915,
"learning_rate": 2.3983739837398377e-05,
"loss": 0.4753,
"step": 1089
},
{
"epoch": 1.70846394984326,
"grad_norm": 0.24292340265286547,
"learning_rate": 2.3954703832752613e-05,
"loss": 0.5056,
"step": 1090
},
{
"epoch": 1.7100313479623823,
"grad_norm": 0.2371807636361452,
"learning_rate": 2.3925667828106853e-05,
"loss": 0.4843,
"step": 1091
},
{
"epoch": 1.7115987460815048,
"grad_norm": 0.2274677312745349,
"learning_rate": 2.3896631823461093e-05,
"loss": 0.5107,
"step": 1092
},
{
"epoch": 1.713166144200627,
"grad_norm": 0.23837783475351587,
"learning_rate": 2.3867595818815333e-05,
"loss": 0.417,
"step": 1093
},
{
"epoch": 1.7147335423197492,
"grad_norm": 0.22662759335933572,
"learning_rate": 2.3838559814169573e-05,
"loss": 0.4584,
"step": 1094
},
{
"epoch": 1.7163009404388716,
"grad_norm": 0.23520708178556912,
"learning_rate": 2.380952380952381e-05,
"loss": 0.5506,
"step": 1095
},
{
"epoch": 1.7178683385579938,
"grad_norm": 0.21841075884869252,
"learning_rate": 2.378048780487805e-05,
"loss": 0.422,
"step": 1096
},
{
"epoch": 1.719435736677116,
"grad_norm": 0.23665017051098475,
"learning_rate": 2.3751451800232287e-05,
"loss": 0.4719,
"step": 1097
},
{
"epoch": 1.7210031347962382,
"grad_norm": 0.21874257931458896,
"learning_rate": 2.3722415795586527e-05,
"loss": 0.4787,
"step": 1098
},
{
"epoch": 1.7225705329153604,
"grad_norm": 0.21315631175224356,
"learning_rate": 2.3693379790940767e-05,
"loss": 0.494,
"step": 1099
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.236969149334701,
"learning_rate": 2.3664343786295007e-05,
"loss": 0.5415,
"step": 1100
},
{
"epoch": 1.7257053291536049,
"grad_norm": 5.476483313885248,
"learning_rate": 2.3635307781649247e-05,
"loss": 0.5977,
"step": 1101
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.25449008707327114,
"learning_rate": 2.3606271777003487e-05,
"loss": 0.5681,
"step": 1102
},
{
"epoch": 1.7288401253918495,
"grad_norm": 0.238162393677874,
"learning_rate": 2.3577235772357724e-05,
"loss": 0.4998,
"step": 1103
},
{
"epoch": 1.730407523510972,
"grad_norm": 0.22309921656373896,
"learning_rate": 2.3548199767711964e-05,
"loss": 0.4815,
"step": 1104
},
{
"epoch": 1.7319749216300941,
"grad_norm": 0.2279419567246597,
"learning_rate": 2.3519163763066204e-05,
"loss": 0.4864,
"step": 1105
},
{
"epoch": 1.7335423197492164,
"grad_norm": 0.23701890111767238,
"learning_rate": 2.3490127758420444e-05,
"loss": 0.5858,
"step": 1106
},
{
"epoch": 1.7351097178683386,
"grad_norm": 0.23520231993274252,
"learning_rate": 2.346109175377468e-05,
"loss": 0.5321,
"step": 1107
},
{
"epoch": 1.7366771159874608,
"grad_norm": 0.2153425104403882,
"learning_rate": 2.343205574912892e-05,
"loss": 0.4702,
"step": 1108
},
{
"epoch": 1.738244514106583,
"grad_norm": 0.22568744855829484,
"learning_rate": 2.340301974448316e-05,
"loss": 0.4587,
"step": 1109
},
{
"epoch": 1.7398119122257052,
"grad_norm": 0.4060356955888284,
"learning_rate": 2.3373983739837398e-05,
"loss": 0.4909,
"step": 1110
},
{
"epoch": 1.7413793103448276,
"grad_norm": 0.23096373962260938,
"learning_rate": 2.3344947735191638e-05,
"loss": 0.5281,
"step": 1111
},
{
"epoch": 1.7429467084639498,
"grad_norm": 0.22646619018586336,
"learning_rate": 2.3315911730545878e-05,
"loss": 0.461,
"step": 1112
},
{
"epoch": 1.7445141065830723,
"grad_norm": 0.2274170441331143,
"learning_rate": 2.3286875725900118e-05,
"loss": 0.4631,
"step": 1113
},
{
"epoch": 1.7460815047021945,
"grad_norm": 0.2316827754185088,
"learning_rate": 2.3257839721254358e-05,
"loss": 0.4572,
"step": 1114
},
{
"epoch": 1.7476489028213167,
"grad_norm": 0.22496662904685621,
"learning_rate": 2.3228803716608598e-05,
"loss": 0.5437,
"step": 1115
},
{
"epoch": 1.749216300940439,
"grad_norm": 0.23484411677413483,
"learning_rate": 2.3199767711962835e-05,
"loss": 0.5297,
"step": 1116
},
{
"epoch": 1.750783699059561,
"grad_norm": 0.2711746091349386,
"learning_rate": 2.3170731707317075e-05,
"loss": 0.5257,
"step": 1117
},
{
"epoch": 1.7523510971786833,
"grad_norm": 0.22267088045586972,
"learning_rate": 2.314169570267131e-05,
"loss": 0.4632,
"step": 1118
},
{
"epoch": 1.7539184952978055,
"grad_norm": 0.2667781504623845,
"learning_rate": 2.311265969802555e-05,
"loss": 0.5521,
"step": 1119
},
{
"epoch": 1.7554858934169277,
"grad_norm": 0.268411891388332,
"learning_rate": 2.308362369337979e-05,
"loss": 0.5014,
"step": 1120
},
{
"epoch": 1.7570532915360502,
"grad_norm": 0.4847328830733795,
"learning_rate": 2.305458768873403e-05,
"loss": 0.5516,
"step": 1121
},
{
"epoch": 1.7586206896551724,
"grad_norm": 0.22724175653308198,
"learning_rate": 2.3025551684088272e-05,
"loss": 0.4024,
"step": 1122
},
{
"epoch": 1.7601880877742948,
"grad_norm": 0.24192834556612366,
"learning_rate": 2.299651567944251e-05,
"loss": 0.4852,
"step": 1123
},
{
"epoch": 1.761755485893417,
"grad_norm": 0.25994724317635165,
"learning_rate": 2.296747967479675e-05,
"loss": 0.4812,
"step": 1124
},
{
"epoch": 1.7633228840125392,
"grad_norm": 0.2255353927081471,
"learning_rate": 2.293844367015099e-05,
"loss": 0.4623,
"step": 1125
},
{
"epoch": 1.7648902821316614,
"grad_norm": 0.20666755987154617,
"learning_rate": 2.290940766550523e-05,
"loss": 0.4195,
"step": 1126
},
{
"epoch": 1.7664576802507836,
"grad_norm": 0.215143584526643,
"learning_rate": 2.288037166085947e-05,
"loss": 0.4604,
"step": 1127
},
{
"epoch": 1.7680250783699059,
"grad_norm": 0.32553023872170117,
"learning_rate": 2.285133565621371e-05,
"loss": 0.5515,
"step": 1128
},
{
"epoch": 1.769592476489028,
"grad_norm": 0.23542887108235994,
"learning_rate": 2.2822299651567945e-05,
"loss": 0.5171,
"step": 1129
},
{
"epoch": 1.7711598746081505,
"grad_norm": 0.2270176679656863,
"learning_rate": 2.2793263646922182e-05,
"loss": 0.5156,
"step": 1130
},
{
"epoch": 1.7727272727272727,
"grad_norm": 0.22856116764825032,
"learning_rate": 2.2764227642276422e-05,
"loss": 0.4856,
"step": 1131
},
{
"epoch": 1.7742946708463951,
"grad_norm": 0.23419298057146742,
"learning_rate": 2.2735191637630662e-05,
"loss": 0.4894,
"step": 1132
},
{
"epoch": 1.7758620689655173,
"grad_norm": 0.2570698533824927,
"learning_rate": 2.2706155632984902e-05,
"loss": 0.4803,
"step": 1133
},
{
"epoch": 1.7774294670846396,
"grad_norm": 0.2060290427966179,
"learning_rate": 2.2677119628339142e-05,
"loss": 0.4709,
"step": 1134
},
{
"epoch": 1.7789968652037618,
"grad_norm": 0.2635479890491283,
"learning_rate": 2.264808362369338e-05,
"loss": 0.4661,
"step": 1135
},
{
"epoch": 1.780564263322884,
"grad_norm": 0.23135685126067712,
"learning_rate": 2.261904761904762e-05,
"loss": 0.4779,
"step": 1136
},
{
"epoch": 1.7821316614420062,
"grad_norm": 0.2275072846603574,
"learning_rate": 2.259001161440186e-05,
"loss": 0.4953,
"step": 1137
},
{
"epoch": 1.7836990595611284,
"grad_norm": 0.23022081171550932,
"learning_rate": 2.25609756097561e-05,
"loss": 0.5304,
"step": 1138
},
{
"epoch": 1.7852664576802508,
"grad_norm": 0.22264320727623352,
"learning_rate": 2.253193960511034e-05,
"loss": 0.5411,
"step": 1139
},
{
"epoch": 1.786833855799373,
"grad_norm": 0.24538828670218804,
"learning_rate": 2.250290360046458e-05,
"loss": 0.4604,
"step": 1140
},
{
"epoch": 1.7884012539184952,
"grad_norm": 0.2164288610016194,
"learning_rate": 2.2473867595818816e-05,
"loss": 0.4664,
"step": 1141
},
{
"epoch": 1.7899686520376177,
"grad_norm": 0.2221982337137165,
"learning_rate": 2.2444831591173053e-05,
"loss": 0.5251,
"step": 1142
},
{
"epoch": 1.79153605015674,
"grad_norm": 0.2549702806338198,
"learning_rate": 2.2415795586527293e-05,
"loss": 0.5617,
"step": 1143
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.2237735171181845,
"learning_rate": 2.2386759581881533e-05,
"loss": 0.482,
"step": 1144
},
{
"epoch": 1.7946708463949843,
"grad_norm": 0.22000457400572768,
"learning_rate": 2.2357723577235773e-05,
"loss": 0.5144,
"step": 1145
},
{
"epoch": 1.7962382445141065,
"grad_norm": 0.2550140479062846,
"learning_rate": 2.2328687572590013e-05,
"loss": 0.5068,
"step": 1146
},
{
"epoch": 1.7978056426332287,
"grad_norm": 0.22629830084734834,
"learning_rate": 2.2299651567944253e-05,
"loss": 0.5015,
"step": 1147
},
{
"epoch": 1.799373040752351,
"grad_norm": 0.20659264790468268,
"learning_rate": 2.227061556329849e-05,
"loss": 0.4572,
"step": 1148
},
{
"epoch": 1.8009404388714734,
"grad_norm": 0.21731066838694219,
"learning_rate": 2.224157955865273e-05,
"loss": 0.4294,
"step": 1149
},
{
"epoch": 1.8025078369905956,
"grad_norm": 0.24914083277563334,
"learning_rate": 2.221254355400697e-05,
"loss": 0.4498,
"step": 1150
},
{
"epoch": 1.804075235109718,
"grad_norm": 0.2611334282485535,
"learning_rate": 2.218350754936121e-05,
"loss": 0.4975,
"step": 1151
},
{
"epoch": 1.8056426332288402,
"grad_norm": 0.6877424373550951,
"learning_rate": 2.215447154471545e-05,
"loss": 0.4603,
"step": 1152
},
{
"epoch": 1.8072100313479624,
"grad_norm": 0.24378359860376236,
"learning_rate": 2.2125435540069687e-05,
"loss": 0.4865,
"step": 1153
},
{
"epoch": 1.8087774294670846,
"grad_norm": 0.24066446740209685,
"learning_rate": 2.2096399535423927e-05,
"loss": 0.4782,
"step": 1154
},
{
"epoch": 1.8103448275862069,
"grad_norm": 0.24525576537795962,
"learning_rate": 2.2067363530778164e-05,
"loss": 0.5475,
"step": 1155
},
{
"epoch": 1.811912225705329,
"grad_norm": 0.23453739504886767,
"learning_rate": 2.2038327526132404e-05,
"loss": 0.497,
"step": 1156
},
{
"epoch": 1.8134796238244513,
"grad_norm": 0.2589012119474807,
"learning_rate": 2.2009291521486644e-05,
"loss": 0.5147,
"step": 1157
},
{
"epoch": 1.8150470219435737,
"grad_norm": 0.24871485768454185,
"learning_rate": 2.1980255516840884e-05,
"loss": 0.583,
"step": 1158
},
{
"epoch": 1.816614420062696,
"grad_norm": 0.22178546182505196,
"learning_rate": 2.1951219512195124e-05,
"loss": 0.4884,
"step": 1159
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.24861603133039026,
"learning_rate": 2.1922183507549364e-05,
"loss": 0.4703,
"step": 1160
},
{
"epoch": 1.8197492163009406,
"grad_norm": 0.22399238525725776,
"learning_rate": 2.18931475029036e-05,
"loss": 0.5013,
"step": 1161
},
{
"epoch": 1.8213166144200628,
"grad_norm": 0.240371327144141,
"learning_rate": 2.186411149825784e-05,
"loss": 0.5168,
"step": 1162
},
{
"epoch": 1.822884012539185,
"grad_norm": 2.2977917662246465,
"learning_rate": 2.183507549361208e-05,
"loss": 0.4185,
"step": 1163
},
{
"epoch": 1.8244514106583072,
"grad_norm": 0.29091359278838014,
"learning_rate": 2.180603948896632e-05,
"loss": 0.5153,
"step": 1164
},
{
"epoch": 1.8260188087774294,
"grad_norm": 0.2304095743992187,
"learning_rate": 2.1777003484320557e-05,
"loss": 0.4585,
"step": 1165
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.2315370567943016,
"learning_rate": 2.1747967479674798e-05,
"loss": 0.4932,
"step": 1166
},
{
"epoch": 1.829153605015674,
"grad_norm": 0.23789362896342212,
"learning_rate": 2.1718931475029038e-05,
"loss": 0.5115,
"step": 1167
},
{
"epoch": 1.8307210031347962,
"grad_norm": 0.22521280603595298,
"learning_rate": 2.1689895470383274e-05,
"loss": 0.4306,
"step": 1168
},
{
"epoch": 1.8322884012539185,
"grad_norm": 0.2812382961207363,
"learning_rate": 2.1660859465737514e-05,
"loss": 0.4415,
"step": 1169
},
{
"epoch": 1.8338557993730409,
"grad_norm": 0.22883795265762283,
"learning_rate": 2.1631823461091754e-05,
"loss": 0.5411,
"step": 1170
},
{
"epoch": 1.835423197492163,
"grad_norm": 0.23514952491911614,
"learning_rate": 2.1602787456445995e-05,
"loss": 0.5229,
"step": 1171
},
{
"epoch": 1.8369905956112853,
"grad_norm": 0.21239773574923726,
"learning_rate": 2.1573751451800235e-05,
"loss": 0.4511,
"step": 1172
},
{
"epoch": 1.8385579937304075,
"grad_norm": 0.25781348675320187,
"learning_rate": 2.1544715447154475e-05,
"loss": 0.5458,
"step": 1173
},
{
"epoch": 1.8401253918495297,
"grad_norm": 0.2790653871731939,
"learning_rate": 2.151567944250871e-05,
"loss": 0.4925,
"step": 1174
},
{
"epoch": 1.841692789968652,
"grad_norm": 73.63313627173757,
"learning_rate": 2.148664343786295e-05,
"loss": 0.5504,
"step": 1175
},
{
"epoch": 1.8432601880877741,
"grad_norm": 0.26583320868263,
"learning_rate": 2.145760743321719e-05,
"loss": 0.4497,
"step": 1176
},
{
"epoch": 1.8448275862068966,
"grad_norm": 0.28126535220103216,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.5,
"step": 1177
},
{
"epoch": 1.8463949843260188,
"grad_norm": 0.2614906080740505,
"learning_rate": 2.1399535423925668e-05,
"loss": 0.5632,
"step": 1178
},
{
"epoch": 1.8479623824451412,
"grad_norm": 0.4387669955918315,
"learning_rate": 2.1370499419279908e-05,
"loss": 0.522,
"step": 1179
},
{
"epoch": 1.8495297805642634,
"grad_norm": 0.26843157187057287,
"learning_rate": 2.134146341463415e-05,
"loss": 0.542,
"step": 1180
},
{
"epoch": 1.8510971786833856,
"grad_norm": 0.24641642175487333,
"learning_rate": 2.1312427409988385e-05,
"loss": 0.4984,
"step": 1181
},
{
"epoch": 1.8526645768025078,
"grad_norm": 0.2276129428332038,
"learning_rate": 2.1283391405342625e-05,
"loss": 0.4515,
"step": 1182
},
{
"epoch": 1.85423197492163,
"grad_norm": 0.27169035348357196,
"learning_rate": 2.1254355400696865e-05,
"loss": 0.4795,
"step": 1183
},
{
"epoch": 1.8557993730407523,
"grad_norm": 0.2568781859247091,
"learning_rate": 2.1225319396051105e-05,
"loss": 0.4621,
"step": 1184
},
{
"epoch": 1.8573667711598745,
"grad_norm": 0.2421175646090106,
"learning_rate": 2.1196283391405345e-05,
"loss": 0.4936,
"step": 1185
},
{
"epoch": 1.858934169278997,
"grad_norm": 0.5631842377189394,
"learning_rate": 2.1167247386759585e-05,
"loss": 0.4757,
"step": 1186
},
{
"epoch": 1.8605015673981191,
"grad_norm": 0.23253418987568247,
"learning_rate": 2.1138211382113822e-05,
"loss": 0.4808,
"step": 1187
},
{
"epoch": 1.8620689655172413,
"grad_norm": 0.23594172841908417,
"learning_rate": 2.110917537746806e-05,
"loss": 0.4866,
"step": 1188
},
{
"epoch": 1.8636363636363638,
"grad_norm": 0.21716378914648898,
"learning_rate": 2.10801393728223e-05,
"loss": 0.4803,
"step": 1189
},
{
"epoch": 1.865203761755486,
"grad_norm": 0.2210319957289926,
"learning_rate": 2.105110336817654e-05,
"loss": 0.4668,
"step": 1190
},
{
"epoch": 1.8667711598746082,
"grad_norm": 0.2536648493262184,
"learning_rate": 2.102206736353078e-05,
"loss": 0.5056,
"step": 1191
},
{
"epoch": 1.8683385579937304,
"grad_norm": 0.5300198848721773,
"learning_rate": 2.099303135888502e-05,
"loss": 0.5171,
"step": 1192
},
{
"epoch": 1.8699059561128526,
"grad_norm": 0.31178632018260877,
"learning_rate": 2.096399535423926e-05,
"loss": 0.5238,
"step": 1193
},
{
"epoch": 1.8714733542319748,
"grad_norm": 0.22255992010883155,
"learning_rate": 2.0934959349593496e-05,
"loss": 0.4728,
"step": 1194
},
{
"epoch": 1.873040752351097,
"grad_norm": 0.2169046656161229,
"learning_rate": 2.0905923344947736e-05,
"loss": 0.4998,
"step": 1195
},
{
"epoch": 1.8746081504702194,
"grad_norm": 0.22809741813887727,
"learning_rate": 2.0876887340301976e-05,
"loss": 0.5196,
"step": 1196
},
{
"epoch": 1.8761755485893417,
"grad_norm": 0.21032719278527157,
"learning_rate": 2.0847851335656216e-05,
"loss": 0.468,
"step": 1197
},
{
"epoch": 1.877742946708464,
"grad_norm": 0.22046656375498908,
"learning_rate": 2.0818815331010456e-05,
"loss": 0.5068,
"step": 1198
},
{
"epoch": 1.8793103448275863,
"grad_norm": 0.20303647001547295,
"learning_rate": 2.0789779326364696e-05,
"loss": 0.4549,
"step": 1199
},
{
"epoch": 1.8808777429467085,
"grad_norm": 0.2405965411908753,
"learning_rate": 2.0760743321718933e-05,
"loss": 0.5258,
"step": 1200
},
{
"epoch": 1.8824451410658307,
"grad_norm": 0.21273721334355186,
"learning_rate": 2.073170731707317e-05,
"loss": 0.4778,
"step": 1201
},
{
"epoch": 1.884012539184953,
"grad_norm": 0.23297237618642222,
"learning_rate": 2.070267131242741e-05,
"loss": 0.4005,
"step": 1202
},
{
"epoch": 1.8855799373040751,
"grad_norm": 0.2588185487551403,
"learning_rate": 2.067363530778165e-05,
"loss": 0.5083,
"step": 1203
},
{
"epoch": 1.8871473354231973,
"grad_norm": 0.22876949839154342,
"learning_rate": 2.064459930313589e-05,
"loss": 0.4833,
"step": 1204
},
{
"epoch": 1.8887147335423198,
"grad_norm": 0.24192121105715353,
"learning_rate": 2.061556329849013e-05,
"loss": 0.4518,
"step": 1205
},
{
"epoch": 1.890282131661442,
"grad_norm": 0.2568595065207791,
"learning_rate": 2.058652729384437e-05,
"loss": 0.5874,
"step": 1206
},
{
"epoch": 1.8918495297805644,
"grad_norm": 0.22087114698004034,
"learning_rate": 2.0557491289198607e-05,
"loss": 0.4621,
"step": 1207
},
{
"epoch": 1.8934169278996866,
"grad_norm": 0.22199780179572154,
"learning_rate": 2.0528455284552847e-05,
"loss": 0.476,
"step": 1208
},
{
"epoch": 1.8949843260188088,
"grad_norm": 0.2123849740405547,
"learning_rate": 2.0499419279907087e-05,
"loss": 0.4873,
"step": 1209
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.21271856542041845,
"learning_rate": 2.0470383275261327e-05,
"loss": 0.4644,
"step": 1210
},
{
"epoch": 1.8981191222570533,
"grad_norm": 0.41170730296873687,
"learning_rate": 2.0441347270615567e-05,
"loss": 0.4999,
"step": 1211
},
{
"epoch": 1.8996865203761755,
"grad_norm": 0.22435135609653784,
"learning_rate": 2.0412311265969803e-05,
"loss": 0.4459,
"step": 1212
},
{
"epoch": 1.9012539184952977,
"grad_norm": 0.22206233385883878,
"learning_rate": 2.038327526132404e-05,
"loss": 0.4846,
"step": 1213
},
{
"epoch": 1.90282131661442,
"grad_norm": 0.647340354792101,
"learning_rate": 2.035423925667828e-05,
"loss": 0.5457,
"step": 1214
},
{
"epoch": 1.9043887147335423,
"grad_norm": 0.22632144456777614,
"learning_rate": 2.032520325203252e-05,
"loss": 0.4532,
"step": 1215
},
{
"epoch": 1.9059561128526645,
"grad_norm": 0.2148676358478946,
"learning_rate": 2.029616724738676e-05,
"loss": 0.4397,
"step": 1216
},
{
"epoch": 1.907523510971787,
"grad_norm": 0.2279505716760199,
"learning_rate": 2.0267131242741e-05,
"loss": 0.4729,
"step": 1217
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.2462598908452099,
"learning_rate": 2.023809523809524e-05,
"loss": 0.4701,
"step": 1218
},
{
"epoch": 1.9106583072100314,
"grad_norm": 0.22866370260341298,
"learning_rate": 2.0209059233449477e-05,
"loss": 0.4515,
"step": 1219
},
{
"epoch": 1.9122257053291536,
"grad_norm": 0.25017576490435445,
"learning_rate": 2.0180023228803717e-05,
"loss": 0.5354,
"step": 1220
},
{
"epoch": 1.9137931034482758,
"grad_norm": 0.24624147462495452,
"learning_rate": 2.0150987224157957e-05,
"loss": 0.478,
"step": 1221
},
{
"epoch": 1.915360501567398,
"grad_norm": 0.23273648589917753,
"learning_rate": 2.0121951219512197e-05,
"loss": 0.4919,
"step": 1222
},
{
"epoch": 1.9169278996865202,
"grad_norm": 0.2223932673784375,
"learning_rate": 2.0092915214866434e-05,
"loss": 0.455,
"step": 1223
},
{
"epoch": 1.9184952978056427,
"grad_norm": 0.23333402086655927,
"learning_rate": 2.0063879210220674e-05,
"loss": 0.4927,
"step": 1224
},
{
"epoch": 1.9200626959247649,
"grad_norm": 0.2225531258788467,
"learning_rate": 2.0034843205574914e-05,
"loss": 0.4854,
"step": 1225
},
{
"epoch": 1.9216300940438873,
"grad_norm": 0.2328840055197953,
"learning_rate": 2.000580720092915e-05,
"loss": 0.5171,
"step": 1226
},
{
"epoch": 1.9231974921630095,
"grad_norm": 3.2305798998868913,
"learning_rate": 1.997677119628339e-05,
"loss": 0.4895,
"step": 1227
},
{
"epoch": 1.9247648902821317,
"grad_norm": 0.26905382603682465,
"learning_rate": 1.994773519163763e-05,
"loss": 0.4805,
"step": 1228
},
{
"epoch": 1.926332288401254,
"grad_norm": 0.26640212844075145,
"learning_rate": 1.991869918699187e-05,
"loss": 0.5401,
"step": 1229
},
{
"epoch": 1.9278996865203761,
"grad_norm": 0.22965989681451268,
"learning_rate": 1.988966318234611e-05,
"loss": 0.4607,
"step": 1230
},
{
"epoch": 1.9294670846394983,
"grad_norm": 0.2615346290827532,
"learning_rate": 1.986062717770035e-05,
"loss": 0.5009,
"step": 1231
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.2569904031386111,
"learning_rate": 1.9831591173054588e-05,
"loss": 0.4863,
"step": 1232
},
{
"epoch": 1.932601880877743,
"grad_norm": 0.22104642417343234,
"learning_rate": 1.9802555168408828e-05,
"loss": 0.4785,
"step": 1233
},
{
"epoch": 1.9341692789968652,
"grad_norm": 0.26672053407122986,
"learning_rate": 1.9773519163763068e-05,
"loss": 0.533,
"step": 1234
},
{
"epoch": 1.9357366771159876,
"grad_norm": 0.269244824039732,
"learning_rate": 1.9744483159117305e-05,
"loss": 0.4691,
"step": 1235
},
{
"epoch": 1.9373040752351098,
"grad_norm": 0.20524069308338913,
"learning_rate": 1.9715447154471545e-05,
"loss": 0.4577,
"step": 1236
},
{
"epoch": 1.938871473354232,
"grad_norm": 0.23408123510006631,
"learning_rate": 1.9686411149825785e-05,
"loss": 0.5276,
"step": 1237
},
{
"epoch": 1.9404388714733543,
"grad_norm": 0.23185904094082752,
"learning_rate": 1.9657375145180025e-05,
"loss": 0.4435,
"step": 1238
},
{
"epoch": 1.9420062695924765,
"grad_norm": 0.2615308565062291,
"learning_rate": 1.962833914053426e-05,
"loss": 0.5415,
"step": 1239
},
{
"epoch": 1.9435736677115987,
"grad_norm": 0.20974747096008906,
"learning_rate": 1.9599303135888502e-05,
"loss": 0.4388,
"step": 1240
},
{
"epoch": 1.9451410658307209,
"grad_norm": 0.22515375847872027,
"learning_rate": 1.9570267131242742e-05,
"loss": 0.4613,
"step": 1241
},
{
"epoch": 1.9467084639498433,
"grad_norm": 0.22156570088360883,
"learning_rate": 1.9541231126596982e-05,
"loss": 0.4498,
"step": 1242
},
{
"epoch": 1.9482758620689655,
"grad_norm": 0.23964376071459034,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.5264,
"step": 1243
},
{
"epoch": 1.9498432601880877,
"grad_norm": 0.2564459119411411,
"learning_rate": 1.9483159117305462e-05,
"loss": 0.5334,
"step": 1244
},
{
"epoch": 1.9514106583072102,
"grad_norm": 0.21706597378834933,
"learning_rate": 1.94541231126597e-05,
"loss": 0.4593,
"step": 1245
},
{
"epoch": 1.9529780564263324,
"grad_norm": 0.20423505957617663,
"learning_rate": 1.9425087108013935e-05,
"loss": 0.4387,
"step": 1246
},
{
"epoch": 1.9545454545454546,
"grad_norm": 0.24873508576536182,
"learning_rate": 1.9396051103368175e-05,
"loss": 0.5375,
"step": 1247
},
{
"epoch": 1.9561128526645768,
"grad_norm": 0.21842448768828446,
"learning_rate": 1.9367015098722416e-05,
"loss": 0.4298,
"step": 1248
},
{
"epoch": 1.957680250783699,
"grad_norm": 0.2438507660324614,
"learning_rate": 1.9337979094076656e-05,
"loss": 0.4904,
"step": 1249
},
{
"epoch": 1.9592476489028212,
"grad_norm": 0.2489627372527784,
"learning_rate": 1.9308943089430896e-05,
"loss": 0.5187,
"step": 1250
},
{
"epoch": 1.9608150470219434,
"grad_norm": 0.21323082053928757,
"learning_rate": 1.9279907084785136e-05,
"loss": 0.4109,
"step": 1251
},
{
"epoch": 1.9623824451410659,
"grad_norm": 0.23506126091942822,
"learning_rate": 1.9250871080139372e-05,
"loss": 0.4742,
"step": 1252
},
{
"epoch": 1.963949843260188,
"grad_norm": 0.2485729783565428,
"learning_rate": 1.9221835075493612e-05,
"loss": 0.5074,
"step": 1253
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.2158705347813762,
"learning_rate": 1.9192799070847853e-05,
"loss": 0.4719,
"step": 1254
},
{
"epoch": 1.9670846394984327,
"grad_norm": 0.2259721263741004,
"learning_rate": 1.9163763066202093e-05,
"loss": 0.4792,
"step": 1255
},
{
"epoch": 1.968652037617555,
"grad_norm": 0.2416035358051937,
"learning_rate": 1.9134727061556333e-05,
"loss": 0.5307,
"step": 1256
},
{
"epoch": 1.9702194357366771,
"grad_norm": 0.2308027515662428,
"learning_rate": 1.9105691056910573e-05,
"loss": 0.4833,
"step": 1257
},
{
"epoch": 1.9717868338557993,
"grad_norm": 0.2096261981529678,
"learning_rate": 1.907665505226481e-05,
"loss": 0.4358,
"step": 1258
},
{
"epoch": 1.9733542319749215,
"grad_norm": 0.22027862749951732,
"learning_rate": 1.9047619047619046e-05,
"loss": 0.4841,
"step": 1259
},
{
"epoch": 1.9749216300940438,
"grad_norm": 0.2257333424911397,
"learning_rate": 1.9018583042973286e-05,
"loss": 0.5174,
"step": 1260
},
{
"epoch": 1.9764890282131662,
"grad_norm": 0.23462187576584245,
"learning_rate": 1.8989547038327526e-05,
"loss": 0.4803,
"step": 1261
},
{
"epoch": 1.9780564263322884,
"grad_norm": 0.21733752195762368,
"learning_rate": 1.8960511033681766e-05,
"loss": 0.477,
"step": 1262
},
{
"epoch": 1.9796238244514106,
"grad_norm": 0.21234651227886653,
"learning_rate": 1.8931475029036006e-05,
"loss": 0.4823,
"step": 1263
},
{
"epoch": 1.981191222570533,
"grad_norm": 0.21405663213550588,
"learning_rate": 1.8902439024390246e-05,
"loss": 0.5145,
"step": 1264
},
{
"epoch": 1.9827586206896552,
"grad_norm": 0.23850275775502766,
"learning_rate": 1.8873403019744483e-05,
"loss": 0.5162,
"step": 1265
},
{
"epoch": 1.9843260188087775,
"grad_norm": 0.22411365158212962,
"learning_rate": 1.8844367015098723e-05,
"loss": 0.4527,
"step": 1266
},
{
"epoch": 1.9858934169278997,
"grad_norm": 0.22070578483588693,
"learning_rate": 1.8815331010452963e-05,
"loss": 0.4696,
"step": 1267
},
{
"epoch": 1.9874608150470219,
"grad_norm": 0.22979006613195319,
"learning_rate": 1.8786295005807203e-05,
"loss": 0.5274,
"step": 1268
},
{
"epoch": 1.989028213166144,
"grad_norm": 0.22441712393193283,
"learning_rate": 1.8757259001161443e-05,
"loss": 0.5119,
"step": 1269
},
{
"epoch": 1.9905956112852663,
"grad_norm": 0.2050029592472821,
"learning_rate": 1.872822299651568e-05,
"loss": 0.4329,
"step": 1270
},
{
"epoch": 1.9921630094043887,
"grad_norm": 0.2488940841675231,
"learning_rate": 1.869918699186992e-05,
"loss": 0.4899,
"step": 1271
},
{
"epoch": 1.993730407523511,
"grad_norm": 0.25741065031366145,
"learning_rate": 1.8670150987224157e-05,
"loss": 0.5253,
"step": 1272
},
{
"epoch": 1.9952978056426334,
"grad_norm": 0.22187618579780174,
"learning_rate": 1.8641114982578397e-05,
"loss": 0.4512,
"step": 1273
},
{
"epoch": 1.9968652037617556,
"grad_norm": 0.23279633275794878,
"learning_rate": 1.8612078977932637e-05,
"loss": 0.4702,
"step": 1274
},
{
"epoch": 1.9984326018808778,
"grad_norm": 0.22649784762099365,
"learning_rate": 1.8583042973286877e-05,
"loss": 0.4716,
"step": 1275
},
{
"epoch": 2.0,
"grad_norm": 0.2317478558030449,
"learning_rate": 1.8554006968641117e-05,
"loss": 0.5175,
"step": 1276
},
{
"epoch": 2.001567398119122,
"grad_norm": 0.3016045034889876,
"learning_rate": 1.8524970963995357e-05,
"loss": 0.384,
"step": 1277
},
{
"epoch": 2.0031347962382444,
"grad_norm": 0.2711185443725285,
"learning_rate": 1.8495934959349594e-05,
"loss": 0.3726,
"step": 1278
},
{
"epoch": 2.0047021943573666,
"grad_norm": 0.23942210419820809,
"learning_rate": 1.8466898954703834e-05,
"loss": 0.3956,
"step": 1279
},
{
"epoch": 2.006269592476489,
"grad_norm": 0.42162440302527865,
"learning_rate": 1.8437862950058074e-05,
"loss": 0.3457,
"step": 1280
},
{
"epoch": 2.0078369905956115,
"grad_norm": 0.324699099553601,
"learning_rate": 1.840882694541231e-05,
"loss": 0.4279,
"step": 1281
},
{
"epoch": 2.0094043887147337,
"grad_norm": 0.26388724687003484,
"learning_rate": 1.837979094076655e-05,
"loss": 0.3932,
"step": 1282
},
{
"epoch": 2.010971786833856,
"grad_norm": 0.3458173349704004,
"learning_rate": 1.835075493612079e-05,
"loss": 0.3872,
"step": 1283
},
{
"epoch": 2.012539184952978,
"grad_norm": 0.25732253188127396,
"learning_rate": 1.832171893147503e-05,
"loss": 0.4038,
"step": 1284
},
{
"epoch": 2.0141065830721003,
"grad_norm": 0.23070774871532593,
"learning_rate": 1.8292682926829268e-05,
"loss": 0.3843,
"step": 1285
},
{
"epoch": 2.0156739811912225,
"grad_norm": 0.23458913207935939,
"learning_rate": 1.8263646922183508e-05,
"loss": 0.3491,
"step": 1286
},
{
"epoch": 2.0172413793103448,
"grad_norm": 0.2853555853299823,
"learning_rate": 1.8234610917537748e-05,
"loss": 0.3787,
"step": 1287
},
{
"epoch": 2.018808777429467,
"grad_norm": 0.28580352616705207,
"learning_rate": 1.8205574912891988e-05,
"loss": 0.4004,
"step": 1288
},
{
"epoch": 2.020376175548589,
"grad_norm": 0.20379340673256727,
"learning_rate": 1.8176538908246228e-05,
"loss": 0.3272,
"step": 1289
},
{
"epoch": 2.0219435736677114,
"grad_norm": 0.24418605859808137,
"learning_rate": 1.8147502903600465e-05,
"loss": 0.4295,
"step": 1290
},
{
"epoch": 2.023510971786834,
"grad_norm": 0.2530150279810617,
"learning_rate": 1.8118466898954705e-05,
"loss": 0.432,
"step": 1291
},
{
"epoch": 2.0250783699059562,
"grad_norm": 0.2665080922302213,
"learning_rate": 1.8089430894308945e-05,
"loss": 0.4229,
"step": 1292
},
{
"epoch": 2.0266457680250785,
"grad_norm": 0.25358101006797734,
"learning_rate": 1.806039488966318e-05,
"loss": 0.4418,
"step": 1293
},
{
"epoch": 2.0282131661442007,
"grad_norm": 0.21486829858060913,
"learning_rate": 1.803135888501742e-05,
"loss": 0.3338,
"step": 1294
},
{
"epoch": 2.029780564263323,
"grad_norm": 0.24862357434228044,
"learning_rate": 1.800232288037166e-05,
"loss": 0.4177,
"step": 1295
},
{
"epoch": 2.031347962382445,
"grad_norm": 0.23142571424773956,
"learning_rate": 1.79732868757259e-05,
"loss": 0.3434,
"step": 1296
},
{
"epoch": 2.0329153605015673,
"grad_norm": 0.2287535918671729,
"learning_rate": 1.7944250871080138e-05,
"loss": 0.3571,
"step": 1297
},
{
"epoch": 2.0344827586206895,
"grad_norm": 0.23466323647369347,
"learning_rate": 1.791521486643438e-05,
"loss": 0.3797,
"step": 1298
},
{
"epoch": 2.0360501567398117,
"grad_norm": 0.24794830492955913,
"learning_rate": 1.788617886178862e-05,
"loss": 0.3802,
"step": 1299
},
{
"epoch": 2.0376175548589344,
"grad_norm": 0.2250482789476333,
"learning_rate": 1.785714285714286e-05,
"loss": 0.3567,
"step": 1300
},
{
"epoch": 2.0391849529780566,
"grad_norm": 0.23421187520103157,
"learning_rate": 1.78281068524971e-05,
"loss": 0.3686,
"step": 1301
},
{
"epoch": 2.040752351097179,
"grad_norm": 0.2172460090520996,
"learning_rate": 1.779907084785134e-05,
"loss": 0.3387,
"step": 1302
},
{
"epoch": 2.042319749216301,
"grad_norm": 0.2490352944229209,
"learning_rate": 1.7770034843205575e-05,
"loss": 0.4367,
"step": 1303
},
{
"epoch": 2.043887147335423,
"grad_norm": 0.23390167233772038,
"learning_rate": 1.7740998838559815e-05,
"loss": 0.3933,
"step": 1304
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.2154428611648203,
"learning_rate": 1.7711962833914052e-05,
"loss": 0.3721,
"step": 1305
},
{
"epoch": 2.0470219435736676,
"grad_norm": 0.22488509981890156,
"learning_rate": 1.7682926829268292e-05,
"loss": 0.387,
"step": 1306
},
{
"epoch": 2.04858934169279,
"grad_norm": 0.21358047011672282,
"learning_rate": 1.7653890824622532e-05,
"loss": 0.3648,
"step": 1307
},
{
"epoch": 2.050156739811912,
"grad_norm": 0.22740125505414938,
"learning_rate": 1.7624854819976772e-05,
"loss": 0.3826,
"step": 1308
},
{
"epoch": 2.0517241379310347,
"grad_norm": 0.21740843474033014,
"learning_rate": 1.7595818815331012e-05,
"loss": 0.3773,
"step": 1309
},
{
"epoch": 2.053291536050157,
"grad_norm": 0.21683910592540398,
"learning_rate": 1.756678281068525e-05,
"loss": 0.3947,
"step": 1310
},
{
"epoch": 2.054858934169279,
"grad_norm": 0.21926290300033208,
"learning_rate": 1.753774680603949e-05,
"loss": 0.3985,
"step": 1311
},
{
"epoch": 2.0564263322884013,
"grad_norm": 0.20628548039224892,
"learning_rate": 1.750871080139373e-05,
"loss": 0.3267,
"step": 1312
},
{
"epoch": 2.0579937304075235,
"grad_norm": 0.21807031654222211,
"learning_rate": 1.747967479674797e-05,
"loss": 0.3846,
"step": 1313
},
{
"epoch": 2.0595611285266457,
"grad_norm": 0.23289791050365893,
"learning_rate": 1.745063879210221e-05,
"loss": 0.3852,
"step": 1314
},
{
"epoch": 2.061128526645768,
"grad_norm": 0.20754354484155585,
"learning_rate": 1.742160278745645e-05,
"loss": 0.3736,
"step": 1315
},
{
"epoch": 2.06269592476489,
"grad_norm": 0.21439136351504162,
"learning_rate": 1.7392566782810686e-05,
"loss": 0.3397,
"step": 1316
},
{
"epoch": 2.0642633228840124,
"grad_norm": 0.2124333048441544,
"learning_rate": 1.7363530778164923e-05,
"loss": 0.3812,
"step": 1317
},
{
"epoch": 2.0658307210031346,
"grad_norm": 0.20977135216146575,
"learning_rate": 1.7334494773519163e-05,
"loss": 0.3735,
"step": 1318
},
{
"epoch": 2.0673981191222572,
"grad_norm": 0.20938024105144634,
"learning_rate": 1.7305458768873403e-05,
"loss": 0.3856,
"step": 1319
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.24319746633809733,
"learning_rate": 1.7276422764227643e-05,
"loss": 0.3942,
"step": 1320
},
{
"epoch": 2.0705329153605017,
"grad_norm": 0.8303946989617832,
"learning_rate": 1.7247386759581883e-05,
"loss": 0.4225,
"step": 1321
},
{
"epoch": 2.072100313479624,
"grad_norm": 0.24250383892343036,
"learning_rate": 1.7218350754936123e-05,
"loss": 0.4264,
"step": 1322
},
{
"epoch": 2.073667711598746,
"grad_norm": 0.23139217170194482,
"learning_rate": 1.718931475029036e-05,
"loss": 0.4094,
"step": 1323
},
{
"epoch": 2.0752351097178683,
"grad_norm": 0.22545789405555944,
"learning_rate": 1.71602787456446e-05,
"loss": 0.3971,
"step": 1324
},
{
"epoch": 2.0768025078369905,
"grad_norm": 0.22005590503233582,
"learning_rate": 1.713124274099884e-05,
"loss": 0.371,
"step": 1325
},
{
"epoch": 2.0783699059561127,
"grad_norm": 0.23325673375969508,
"learning_rate": 1.710220673635308e-05,
"loss": 0.4207,
"step": 1326
},
{
"epoch": 2.079937304075235,
"grad_norm": 0.20485960132084632,
"learning_rate": 1.707317073170732e-05,
"loss": 0.3456,
"step": 1327
},
{
"epoch": 2.0815047021943576,
"grad_norm": 0.23422691192153022,
"learning_rate": 1.7044134727061557e-05,
"loss": 0.3723,
"step": 1328
},
{
"epoch": 2.08307210031348,
"grad_norm": 2.9762353128412657,
"learning_rate": 1.7015098722415797e-05,
"loss": 0.501,
"step": 1329
},
{
"epoch": 2.084639498432602,
"grad_norm": 0.22859491911019164,
"learning_rate": 1.6986062717770033e-05,
"loss": 0.3968,
"step": 1330
},
{
"epoch": 2.086206896551724,
"grad_norm": 0.23641886675349144,
"learning_rate": 1.6957026713124274e-05,
"loss": 0.4534,
"step": 1331
},
{
"epoch": 2.0877742946708464,
"grad_norm": 0.2182135146013649,
"learning_rate": 1.6927990708478514e-05,
"loss": 0.3857,
"step": 1332
},
{
"epoch": 2.0893416927899686,
"grad_norm": 0.24838234195793804,
"learning_rate": 1.6898954703832754e-05,
"loss": 0.4287,
"step": 1333
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.24883328442577682,
"learning_rate": 1.6869918699186994e-05,
"loss": 0.4074,
"step": 1334
},
{
"epoch": 2.092476489028213,
"grad_norm": 0.2344030005778799,
"learning_rate": 1.6840882694541234e-05,
"loss": 0.4799,
"step": 1335
},
{
"epoch": 2.0940438871473352,
"grad_norm": 0.8512500701497243,
"learning_rate": 1.681184668989547e-05,
"loss": 0.3724,
"step": 1336
},
{
"epoch": 2.0956112852664575,
"grad_norm": 0.24472625207593882,
"learning_rate": 1.678281068524971e-05,
"loss": 0.3743,
"step": 1337
},
{
"epoch": 2.09717868338558,
"grad_norm": 0.2505491094410723,
"learning_rate": 1.675377468060395e-05,
"loss": 0.4147,
"step": 1338
},
{
"epoch": 2.0987460815047023,
"grad_norm": 0.23479580679436338,
"learning_rate": 1.672473867595819e-05,
"loss": 0.3863,
"step": 1339
},
{
"epoch": 2.1003134796238245,
"grad_norm": 0.217781486439528,
"learning_rate": 1.6695702671312427e-05,
"loss": 0.3926,
"step": 1340
},
{
"epoch": 2.1018808777429467,
"grad_norm": 0.23862009912162474,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.4086,
"step": 1341
},
{
"epoch": 2.103448275862069,
"grad_norm": 0.2532258840880795,
"learning_rate": 1.6637630662020908e-05,
"loss": 0.4384,
"step": 1342
},
{
"epoch": 2.105015673981191,
"grad_norm": 0.2225280258448,
"learning_rate": 1.6608594657375144e-05,
"loss": 0.3538,
"step": 1343
},
{
"epoch": 2.1065830721003134,
"grad_norm": 0.21092090442675202,
"learning_rate": 1.6579558652729384e-05,
"loss": 0.3326,
"step": 1344
},
{
"epoch": 2.1081504702194356,
"grad_norm": 0.23878170093813125,
"learning_rate": 1.6550522648083624e-05,
"loss": 0.4024,
"step": 1345
},
{
"epoch": 2.109717868338558,
"grad_norm": 0.22963870977380504,
"learning_rate": 1.6521486643437864e-05,
"loss": 0.4177,
"step": 1346
},
{
"epoch": 2.1112852664576804,
"grad_norm": 0.20852208370046096,
"learning_rate": 1.6492450638792104e-05,
"loss": 0.3532,
"step": 1347
},
{
"epoch": 2.1128526645768027,
"grad_norm": 0.2529913416392096,
"learning_rate": 1.6463414634146345e-05,
"loss": 0.4417,
"step": 1348
},
{
"epoch": 2.114420062695925,
"grad_norm": 0.23496901888733637,
"learning_rate": 1.643437862950058e-05,
"loss": 0.4428,
"step": 1349
},
{
"epoch": 2.115987460815047,
"grad_norm": 0.22345907270073861,
"learning_rate": 1.640534262485482e-05,
"loss": 0.3883,
"step": 1350
},
{
"epoch": 2.1175548589341693,
"grad_norm": 0.24484274954376423,
"learning_rate": 1.6376306620209058e-05,
"loss": 0.4359,
"step": 1351
},
{
"epoch": 2.1191222570532915,
"grad_norm": 0.23332173515496304,
"learning_rate": 1.6347270615563298e-05,
"loss": 0.4179,
"step": 1352
},
{
"epoch": 2.1206896551724137,
"grad_norm": 0.21120082971778073,
"learning_rate": 1.6318234610917538e-05,
"loss": 0.3876,
"step": 1353
},
{
"epoch": 2.122257053291536,
"grad_norm": 0.2376601833516237,
"learning_rate": 1.6289198606271778e-05,
"loss": 0.4251,
"step": 1354
},
{
"epoch": 2.123824451410658,
"grad_norm": 0.21785800563044125,
"learning_rate": 1.6260162601626018e-05,
"loss": 0.3883,
"step": 1355
},
{
"epoch": 2.1253918495297803,
"grad_norm": 0.2272856318659006,
"learning_rate": 1.6231126596980255e-05,
"loss": 0.4225,
"step": 1356
},
{
"epoch": 2.126959247648903,
"grad_norm": 0.2114177016136048,
"learning_rate": 1.6202090592334495e-05,
"loss": 0.3523,
"step": 1357
},
{
"epoch": 2.128526645768025,
"grad_norm": 0.2462813037767963,
"learning_rate": 1.6173054587688735e-05,
"loss": 0.404,
"step": 1358
},
{
"epoch": 2.1300940438871474,
"grad_norm": 0.8377488298637206,
"learning_rate": 1.6144018583042975e-05,
"loss": 0.3785,
"step": 1359
},
{
"epoch": 2.1316614420062696,
"grad_norm": 0.22167829267674488,
"learning_rate": 1.6114982578397215e-05,
"loss": 0.414,
"step": 1360
},
{
"epoch": 2.133228840125392,
"grad_norm": 0.22492613724191302,
"learning_rate": 1.6085946573751455e-05,
"loss": 0.3867,
"step": 1361
},
{
"epoch": 2.134796238244514,
"grad_norm": 0.22457605099737993,
"learning_rate": 1.6056910569105692e-05,
"loss": 0.3875,
"step": 1362
},
{
"epoch": 2.1363636363636362,
"grad_norm": 0.44633193580614966,
"learning_rate": 1.602787456445993e-05,
"loss": 0.38,
"step": 1363
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.22877099444643206,
"learning_rate": 1.599883855981417e-05,
"loss": 0.4011,
"step": 1364
},
{
"epoch": 2.139498432601881,
"grad_norm": 0.20582694363707654,
"learning_rate": 1.596980255516841e-05,
"loss": 0.3543,
"step": 1365
},
{
"epoch": 2.1410658307210033,
"grad_norm": 0.21798850291233066,
"learning_rate": 1.594076655052265e-05,
"loss": 0.3977,
"step": 1366
},
{
"epoch": 2.1426332288401255,
"grad_norm": 0.22086324898085452,
"learning_rate": 1.591173054587689e-05,
"loss": 0.3612,
"step": 1367
},
{
"epoch": 2.1442006269592477,
"grad_norm": 0.24785683844581236,
"learning_rate": 1.5882694541231126e-05,
"loss": 0.3774,
"step": 1368
},
{
"epoch": 2.14576802507837,
"grad_norm": 0.22696638059320995,
"learning_rate": 1.5853658536585366e-05,
"loss": 0.408,
"step": 1369
},
{
"epoch": 2.147335423197492,
"grad_norm": 0.21974673180582338,
"learning_rate": 1.5824622531939606e-05,
"loss": 0.4011,
"step": 1370
},
{
"epoch": 2.1489028213166144,
"grad_norm": 0.22148034806051028,
"learning_rate": 1.5795586527293846e-05,
"loss": 0.3919,
"step": 1371
},
{
"epoch": 2.1504702194357366,
"grad_norm": 0.2235713552939153,
"learning_rate": 1.5766550522648086e-05,
"loss": 0.4088,
"step": 1372
},
{
"epoch": 2.152037617554859,
"grad_norm": 0.22509615959799562,
"learning_rate": 1.5737514518002326e-05,
"loss": 0.4263,
"step": 1373
},
{
"epoch": 2.153605015673981,
"grad_norm": 0.22816501354397425,
"learning_rate": 1.5708478513356563e-05,
"loss": 0.3983,
"step": 1374
},
{
"epoch": 2.1551724137931036,
"grad_norm": 0.5492205412659418,
"learning_rate": 1.56794425087108e-05,
"loss": 0.4016,
"step": 1375
},
{
"epoch": 2.156739811912226,
"grad_norm": 0.21415098608373004,
"learning_rate": 1.565040650406504e-05,
"loss": 0.3807,
"step": 1376
},
{
"epoch": 2.158307210031348,
"grad_norm": 0.22590572041898857,
"learning_rate": 1.562137049941928e-05,
"loss": 0.3099,
"step": 1377
},
{
"epoch": 2.1598746081504703,
"grad_norm": 0.2645317880069889,
"learning_rate": 1.559233449477352e-05,
"loss": 0.4095,
"step": 1378
},
{
"epoch": 2.1614420062695925,
"grad_norm": 0.2194024821211268,
"learning_rate": 1.556329849012776e-05,
"loss": 0.375,
"step": 1379
},
{
"epoch": 2.1630094043887147,
"grad_norm": 0.23247567551586235,
"learning_rate": 1.5534262485482e-05,
"loss": 0.4101,
"step": 1380
},
{
"epoch": 2.164576802507837,
"grad_norm": 0.24150388515454585,
"learning_rate": 1.5505226480836236e-05,
"loss": 0.4038,
"step": 1381
},
{
"epoch": 2.166144200626959,
"grad_norm": 0.24172899321954042,
"learning_rate": 1.5476190476190476e-05,
"loss": 0.4172,
"step": 1382
},
{
"epoch": 2.1677115987460813,
"grad_norm": 0.2245351005863513,
"learning_rate": 1.5447154471544717e-05,
"loss": 0.3811,
"step": 1383
},
{
"epoch": 2.169278996865204,
"grad_norm": 0.22650596997599107,
"learning_rate": 1.5418118466898957e-05,
"loss": 0.4042,
"step": 1384
},
{
"epoch": 2.170846394984326,
"grad_norm": 0.23956402713575353,
"learning_rate": 1.5389082462253197e-05,
"loss": 0.3634,
"step": 1385
},
{
"epoch": 2.1724137931034484,
"grad_norm": 0.22917118263392167,
"learning_rate": 1.5360046457607433e-05,
"loss": 0.3919,
"step": 1386
},
{
"epoch": 2.1739811912225706,
"grad_norm": 0.238359365856219,
"learning_rate": 1.5331010452961673e-05,
"loss": 0.3759,
"step": 1387
},
{
"epoch": 2.175548589341693,
"grad_norm": 0.43205569118939474,
"learning_rate": 1.530197444831591e-05,
"loss": 0.3941,
"step": 1388
},
{
"epoch": 2.177115987460815,
"grad_norm": 0.22692498970401898,
"learning_rate": 1.527293844367015e-05,
"loss": 0.3811,
"step": 1389
},
{
"epoch": 2.1786833855799372,
"grad_norm": 0.24650492914985356,
"learning_rate": 1.524390243902439e-05,
"loss": 0.4169,
"step": 1390
},
{
"epoch": 2.1802507836990594,
"grad_norm": 0.22255153576517683,
"learning_rate": 1.521486643437863e-05,
"loss": 0.3952,
"step": 1391
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.24135671648798906,
"learning_rate": 1.518583042973287e-05,
"loss": 0.4175,
"step": 1392
},
{
"epoch": 2.183385579937304,
"grad_norm": 0.23011369578359675,
"learning_rate": 1.5156794425087109e-05,
"loss": 0.3832,
"step": 1393
},
{
"epoch": 2.1849529780564265,
"grad_norm": 0.22057305499360844,
"learning_rate": 1.5127758420441349e-05,
"loss": 0.3899,
"step": 1394
},
{
"epoch": 2.1865203761755487,
"grad_norm": 0.2168974279559207,
"learning_rate": 1.5098722415795589e-05,
"loss": 0.3645,
"step": 1395
},
{
"epoch": 2.188087774294671,
"grad_norm": 0.2203201838867848,
"learning_rate": 1.5069686411149827e-05,
"loss": 0.3874,
"step": 1396
},
{
"epoch": 2.189655172413793,
"grad_norm": 0.23197445966836772,
"learning_rate": 1.5040650406504067e-05,
"loss": 0.3547,
"step": 1397
},
{
"epoch": 2.1912225705329154,
"grad_norm": 0.22813198289911552,
"learning_rate": 1.5011614401858304e-05,
"loss": 0.4289,
"step": 1398
},
{
"epoch": 2.1927899686520376,
"grad_norm": 0.2013311016000081,
"learning_rate": 1.4982578397212544e-05,
"loss": 0.3401,
"step": 1399
},
{
"epoch": 2.19435736677116,
"grad_norm": 4.160715442568691,
"learning_rate": 1.4953542392566782e-05,
"loss": 0.8723,
"step": 1400
},
{
"epoch": 2.195924764890282,
"grad_norm": 0.21857333268112392,
"learning_rate": 1.4924506387921023e-05,
"loss": 0.363,
"step": 1401
},
{
"epoch": 2.197492163009404,
"grad_norm": 0.23839460231672255,
"learning_rate": 1.4895470383275263e-05,
"loss": 0.419,
"step": 1402
},
{
"epoch": 2.199059561128527,
"grad_norm": 0.21964605599409082,
"learning_rate": 1.4866434378629501e-05,
"loss": 0.3885,
"step": 1403
},
{
"epoch": 2.200626959247649,
"grad_norm": 0.22549874517822516,
"learning_rate": 1.4837398373983741e-05,
"loss": 0.4081,
"step": 1404
},
{
"epoch": 2.2021943573667713,
"grad_norm": 0.21879521222550022,
"learning_rate": 1.4808362369337981e-05,
"loss": 0.3977,
"step": 1405
},
{
"epoch": 2.2037617554858935,
"grad_norm": 0.23204578894741143,
"learning_rate": 1.477932636469222e-05,
"loss": 0.3864,
"step": 1406
},
{
"epoch": 2.2053291536050157,
"grad_norm": 0.2054986195477762,
"learning_rate": 1.475029036004646e-05,
"loss": 0.3924,
"step": 1407
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.22513057473716772,
"learning_rate": 1.4721254355400698e-05,
"loss": 0.355,
"step": 1408
},
{
"epoch": 2.20846394984326,
"grad_norm": 0.21973034989676304,
"learning_rate": 1.4692218350754936e-05,
"loss": 0.4144,
"step": 1409
},
{
"epoch": 2.2100313479623823,
"grad_norm": 0.21506979286886613,
"learning_rate": 1.4663182346109175e-05,
"loss": 0.3725,
"step": 1410
},
{
"epoch": 2.2115987460815045,
"grad_norm": 0.2515060684062142,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.4543,
"step": 1411
},
{
"epoch": 2.2131661442006267,
"grad_norm": 0.2216155199987655,
"learning_rate": 1.4605110336817653e-05,
"loss": 0.3646,
"step": 1412
},
{
"epoch": 2.2147335423197494,
"grad_norm": 0.2089117853510474,
"learning_rate": 1.4576074332171893e-05,
"loss": 0.3693,
"step": 1413
},
{
"epoch": 2.2163009404388716,
"grad_norm": 0.2068450856617191,
"learning_rate": 1.4547038327526133e-05,
"loss": 0.3444,
"step": 1414
},
{
"epoch": 2.217868338557994,
"grad_norm": 0.22808563521700204,
"learning_rate": 1.4518002322880372e-05,
"loss": 0.3921,
"step": 1415
},
{
"epoch": 2.219435736677116,
"grad_norm": 0.27100902071682864,
"learning_rate": 1.4488966318234612e-05,
"loss": 0.4019,
"step": 1416
},
{
"epoch": 2.2210031347962382,
"grad_norm": 0.22799831903413328,
"learning_rate": 1.4459930313588852e-05,
"loss": 0.3767,
"step": 1417
},
{
"epoch": 2.2225705329153604,
"grad_norm": 0.22407048366756852,
"learning_rate": 1.443089430894309e-05,
"loss": 0.3831,
"step": 1418
},
{
"epoch": 2.2241379310344827,
"grad_norm": 0.21011161472412213,
"learning_rate": 1.440185830429733e-05,
"loss": 0.3653,
"step": 1419
},
{
"epoch": 2.225705329153605,
"grad_norm": 0.24637895585861108,
"learning_rate": 1.437282229965157e-05,
"loss": 0.3549,
"step": 1420
},
{
"epoch": 2.227272727272727,
"grad_norm": 0.2154819828077039,
"learning_rate": 1.4343786295005807e-05,
"loss": 0.3837,
"step": 1421
},
{
"epoch": 2.2288401253918497,
"grad_norm": 0.22751290019301884,
"learning_rate": 1.4314750290360045e-05,
"loss": 0.4304,
"step": 1422
},
{
"epoch": 2.230407523510972,
"grad_norm": 0.2044348399845428,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.3274,
"step": 1423
},
{
"epoch": 2.231974921630094,
"grad_norm": 0.2517282219454231,
"learning_rate": 1.4256678281068526e-05,
"loss": 0.407,
"step": 1424
},
{
"epoch": 2.2335423197492164,
"grad_norm": 0.22800517799350375,
"learning_rate": 1.4227642276422764e-05,
"loss": 0.3959,
"step": 1425
},
{
"epoch": 2.2351097178683386,
"grad_norm": 0.20982593953194267,
"learning_rate": 1.4198606271777004e-05,
"loss": 0.3702,
"step": 1426
},
{
"epoch": 2.2366771159874608,
"grad_norm": 0.2242309626466426,
"learning_rate": 1.4169570267131244e-05,
"loss": 0.3991,
"step": 1427
},
{
"epoch": 2.238244514106583,
"grad_norm": 0.2172374363673727,
"learning_rate": 1.4140534262485482e-05,
"loss": 0.3559,
"step": 1428
},
{
"epoch": 2.239811912225705,
"grad_norm": 0.2142791761032023,
"learning_rate": 1.4111498257839722e-05,
"loss": 0.3541,
"step": 1429
},
{
"epoch": 2.2413793103448274,
"grad_norm": 0.21109384864320618,
"learning_rate": 1.4082462253193963e-05,
"loss": 0.3884,
"step": 1430
},
{
"epoch": 2.2429467084639496,
"grad_norm": 0.20636900040679224,
"learning_rate": 1.4053426248548201e-05,
"loss": 0.3638,
"step": 1431
},
{
"epoch": 2.2445141065830723,
"grad_norm": 0.21496758001213104,
"learning_rate": 1.4024390243902441e-05,
"loss": 0.3844,
"step": 1432
},
{
"epoch": 2.2460815047021945,
"grad_norm": 0.212867824522857,
"learning_rate": 1.3995354239256678e-05,
"loss": 0.3913,
"step": 1433
},
{
"epoch": 2.2476489028213167,
"grad_norm": 0.20692452235979328,
"learning_rate": 1.3966318234610918e-05,
"loss": 0.3719,
"step": 1434
},
{
"epoch": 2.249216300940439,
"grad_norm": 0.2148958199216298,
"learning_rate": 1.3937282229965156e-05,
"loss": 0.396,
"step": 1435
},
{
"epoch": 2.250783699059561,
"grad_norm": 0.21822554652002332,
"learning_rate": 1.3908246225319396e-05,
"loss": 0.4076,
"step": 1436
},
{
"epoch": 2.2523510971786833,
"grad_norm": 0.19894432927019218,
"learning_rate": 1.3879210220673636e-05,
"loss": 0.3354,
"step": 1437
},
{
"epoch": 2.2539184952978055,
"grad_norm": 0.20442591798696103,
"learning_rate": 1.3850174216027875e-05,
"loss": 0.3747,
"step": 1438
},
{
"epoch": 2.2554858934169277,
"grad_norm": 0.216383136692838,
"learning_rate": 1.3821138211382115e-05,
"loss": 0.3919,
"step": 1439
},
{
"epoch": 2.2570532915360504,
"grad_norm": 0.22840492930437667,
"learning_rate": 1.3792102206736355e-05,
"loss": 0.4363,
"step": 1440
},
{
"epoch": 2.2586206896551726,
"grad_norm": 0.2169086106011938,
"learning_rate": 1.3763066202090593e-05,
"loss": 0.3836,
"step": 1441
},
{
"epoch": 2.260188087774295,
"grad_norm": 0.2221688631736411,
"learning_rate": 1.3734030197444833e-05,
"loss": 0.3924,
"step": 1442
},
{
"epoch": 2.261755485893417,
"grad_norm": 0.22914264333534756,
"learning_rate": 1.3704994192799073e-05,
"loss": 0.4289,
"step": 1443
},
{
"epoch": 2.2633228840125392,
"grad_norm": 0.2176699860538497,
"learning_rate": 1.367595818815331e-05,
"loss": 0.404,
"step": 1444
},
{
"epoch": 2.2648902821316614,
"grad_norm": 0.24098051034817927,
"learning_rate": 1.3646922183507548e-05,
"loss": 0.4558,
"step": 1445
},
{
"epoch": 2.2664576802507836,
"grad_norm": 0.2514640857375969,
"learning_rate": 1.3617886178861788e-05,
"loss": 0.4444,
"step": 1446
},
{
"epoch": 2.268025078369906,
"grad_norm": 0.231319607616134,
"learning_rate": 1.3588850174216028e-05,
"loss": 0.3889,
"step": 1447
},
{
"epoch": 2.269592476489028,
"grad_norm": 0.2196749149078622,
"learning_rate": 1.3559814169570267e-05,
"loss": 0.4363,
"step": 1448
},
{
"epoch": 2.2711598746081503,
"grad_norm": 10.116323526920379,
"learning_rate": 1.3530778164924507e-05,
"loss": 0.761,
"step": 1449
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.2645323906253032,
"learning_rate": 1.3501742160278747e-05,
"loss": 0.3623,
"step": 1450
},
{
"epoch": 2.274294670846395,
"grad_norm": 0.21632961189313282,
"learning_rate": 1.3472706155632985e-05,
"loss": 0.4269,
"step": 1451
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.21706597585175708,
"learning_rate": 1.3443670150987225e-05,
"loss": 0.4237,
"step": 1452
},
{
"epoch": 2.2774294670846396,
"grad_norm": 0.2133092044457383,
"learning_rate": 1.3414634146341466e-05,
"loss": 0.3969,
"step": 1453
},
{
"epoch": 2.2789968652037618,
"grad_norm": 0.21392990606679235,
"learning_rate": 1.3385598141695704e-05,
"loss": 0.3909,
"step": 1454
},
{
"epoch": 2.280564263322884,
"grad_norm": 0.20892279009457249,
"learning_rate": 1.3356562137049944e-05,
"loss": 0.3592,
"step": 1455
},
{
"epoch": 2.282131661442006,
"grad_norm": 0.2158558425081699,
"learning_rate": 1.332752613240418e-05,
"loss": 0.3979,
"step": 1456
},
{
"epoch": 2.2836990595611284,
"grad_norm": 0.19469040233248036,
"learning_rate": 1.329849012775842e-05,
"loss": 0.3708,
"step": 1457
},
{
"epoch": 2.2852664576802506,
"grad_norm": 0.24278911745974094,
"learning_rate": 1.3269454123112659e-05,
"loss": 0.4159,
"step": 1458
},
{
"epoch": 2.2868338557993733,
"grad_norm": 0.21993724597664988,
"learning_rate": 1.3240418118466899e-05,
"loss": 0.3931,
"step": 1459
},
{
"epoch": 2.2884012539184955,
"grad_norm": 0.21485516203046573,
"learning_rate": 1.321138211382114e-05,
"loss": 0.3914,
"step": 1460
},
{
"epoch": 2.2899686520376177,
"grad_norm": 0.22317769942524632,
"learning_rate": 1.3182346109175378e-05,
"loss": 0.389,
"step": 1461
},
{
"epoch": 2.29153605015674,
"grad_norm": 0.23108999247499296,
"learning_rate": 1.3153310104529618e-05,
"loss": 0.4014,
"step": 1462
},
{
"epoch": 2.293103448275862,
"grad_norm": 0.2086228267188771,
"learning_rate": 1.3124274099883858e-05,
"loss": 0.3774,
"step": 1463
},
{
"epoch": 2.2946708463949843,
"grad_norm": 0.2123228940575844,
"learning_rate": 1.3095238095238096e-05,
"loss": 0.3965,
"step": 1464
},
{
"epoch": 2.2962382445141065,
"grad_norm": 0.2629868360711429,
"learning_rate": 1.3066202090592336e-05,
"loss": 0.3833,
"step": 1465
},
{
"epoch": 2.2978056426332287,
"grad_norm": 0.9458196776333948,
"learning_rate": 1.3037166085946576e-05,
"loss": 0.4334,
"step": 1466
},
{
"epoch": 2.299373040752351,
"grad_norm": 0.23914656103630766,
"learning_rate": 1.3008130081300815e-05,
"loss": 0.4154,
"step": 1467
},
{
"epoch": 2.300940438871473,
"grad_norm": 0.20074496467961164,
"learning_rate": 1.2979094076655051e-05,
"loss": 0.3588,
"step": 1468
},
{
"epoch": 2.302507836990596,
"grad_norm": 0.21180871427305603,
"learning_rate": 1.2950058072009291e-05,
"loss": 0.348,
"step": 1469
},
{
"epoch": 2.304075235109718,
"grad_norm": 0.21022766411007135,
"learning_rate": 1.2921022067363531e-05,
"loss": 0.3786,
"step": 1470
},
{
"epoch": 2.30564263322884,
"grad_norm": 0.20968284264242637,
"learning_rate": 1.289198606271777e-05,
"loss": 0.3569,
"step": 1471
},
{
"epoch": 2.3072100313479624,
"grad_norm": 0.2181197537952486,
"learning_rate": 1.286295005807201e-05,
"loss": 0.3949,
"step": 1472
},
{
"epoch": 2.3087774294670846,
"grad_norm": 0.2211333342749832,
"learning_rate": 1.283391405342625e-05,
"loss": 0.3692,
"step": 1473
},
{
"epoch": 2.310344827586207,
"grad_norm": 0.2244156525690313,
"learning_rate": 1.2804878048780488e-05,
"loss": 0.4258,
"step": 1474
},
{
"epoch": 2.311912225705329,
"grad_norm": 0.22757523458898943,
"learning_rate": 1.2775842044134728e-05,
"loss": 0.4038,
"step": 1475
},
{
"epoch": 2.3134796238244513,
"grad_norm": 0.20530673237688743,
"learning_rate": 1.2746806039488968e-05,
"loss": 0.3796,
"step": 1476
},
{
"epoch": 2.3150470219435735,
"grad_norm": 0.24341242681189784,
"learning_rate": 1.2717770034843207e-05,
"loss": 0.4227,
"step": 1477
},
{
"epoch": 2.316614420062696,
"grad_norm": 1.2715177292130853,
"learning_rate": 1.2688734030197447e-05,
"loss": 0.4121,
"step": 1478
},
{
"epoch": 2.3181818181818183,
"grad_norm": 0.22390913021849132,
"learning_rate": 1.2659698025551684e-05,
"loss": 0.4466,
"step": 1479
},
{
"epoch": 2.3197492163009406,
"grad_norm": 0.21064165851942057,
"learning_rate": 1.2630662020905924e-05,
"loss": 0.3639,
"step": 1480
},
{
"epoch": 2.3213166144200628,
"grad_norm": 0.20919739723875302,
"learning_rate": 1.2601626016260162e-05,
"loss": 0.4176,
"step": 1481
},
{
"epoch": 2.322884012539185,
"grad_norm": 0.22109216989622962,
"learning_rate": 1.2572590011614402e-05,
"loss": 0.4027,
"step": 1482
},
{
"epoch": 2.324451410658307,
"grad_norm": 0.203708227588505,
"learning_rate": 1.2543554006968642e-05,
"loss": 0.3548,
"step": 1483
},
{
"epoch": 2.3260188087774294,
"grad_norm": 0.21176608040281908,
"learning_rate": 1.251451800232288e-05,
"loss": 0.3813,
"step": 1484
},
{
"epoch": 2.3275862068965516,
"grad_norm": 0.21440279682061467,
"learning_rate": 1.248548199767712e-05,
"loss": 0.4237,
"step": 1485
},
{
"epoch": 2.329153605015674,
"grad_norm": 0.20832624073253747,
"learning_rate": 1.2456445993031359e-05,
"loss": 0.3921,
"step": 1486
},
{
"epoch": 2.330721003134796,
"grad_norm": 0.19574291167428767,
"learning_rate": 1.2427409988385599e-05,
"loss": 0.3747,
"step": 1487
},
{
"epoch": 2.3322884012539187,
"grad_norm": 0.2287018792830052,
"learning_rate": 1.2398373983739837e-05,
"loss": 0.4061,
"step": 1488
},
{
"epoch": 2.333855799373041,
"grad_norm": 0.22064533132837721,
"learning_rate": 1.2369337979094078e-05,
"loss": 0.3978,
"step": 1489
},
{
"epoch": 2.335423197492163,
"grad_norm": 0.20939278391278404,
"learning_rate": 1.2340301974448316e-05,
"loss": 0.4064,
"step": 1490
},
{
"epoch": 2.3369905956112853,
"grad_norm": 0.23941007023877459,
"learning_rate": 1.2311265969802556e-05,
"loss": 0.4234,
"step": 1491
},
{
"epoch": 2.3385579937304075,
"grad_norm": 0.22685478597847814,
"learning_rate": 1.2282229965156796e-05,
"loss": 0.3886,
"step": 1492
},
{
"epoch": 2.3401253918495297,
"grad_norm": 0.21262820038757507,
"learning_rate": 1.2253193960511034e-05,
"loss": 0.4028,
"step": 1493
},
{
"epoch": 2.341692789968652,
"grad_norm": 0.22172895740367574,
"learning_rate": 1.2224157955865273e-05,
"loss": 0.3533,
"step": 1494
},
{
"epoch": 2.343260188087774,
"grad_norm": 0.2981150843731816,
"learning_rate": 1.2195121951219513e-05,
"loss": 0.4152,
"step": 1495
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.19517500529752402,
"learning_rate": 1.2166085946573751e-05,
"loss": 0.3502,
"step": 1496
},
{
"epoch": 2.346394984326019,
"grad_norm": 0.22000357117823627,
"learning_rate": 1.2137049941927991e-05,
"loss": 0.3917,
"step": 1497
},
{
"epoch": 2.347962382445141,
"grad_norm": 0.21308856035892038,
"learning_rate": 1.2108013937282231e-05,
"loss": 0.3916,
"step": 1498
},
{
"epoch": 2.3495297805642634,
"grad_norm": 0.23772901971659796,
"learning_rate": 1.207897793263647e-05,
"loss": 0.3955,
"step": 1499
},
{
"epoch": 2.3510971786833856,
"grad_norm": 0.22220197958091942,
"learning_rate": 1.2049941927990708e-05,
"loss": 0.3964,
"step": 1500
},
{
"epoch": 2.352664576802508,
"grad_norm": 0.2013754527479702,
"learning_rate": 1.2020905923344948e-05,
"loss": 0.3442,
"step": 1501
},
{
"epoch": 2.35423197492163,
"grad_norm": 0.2310794461854635,
"learning_rate": 1.1991869918699188e-05,
"loss": 0.4158,
"step": 1502
},
{
"epoch": 2.3557993730407523,
"grad_norm": 0.22933884225175488,
"learning_rate": 1.1962833914053427e-05,
"loss": 0.4259,
"step": 1503
},
{
"epoch": 2.3573667711598745,
"grad_norm": 0.2194401970727593,
"learning_rate": 1.1933797909407667e-05,
"loss": 0.3918,
"step": 1504
},
{
"epoch": 2.3589341692789967,
"grad_norm": 0.2132221089918718,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.4187,
"step": 1505
},
{
"epoch": 2.360501567398119,
"grad_norm": 0.21304924513291806,
"learning_rate": 1.1875725900116143e-05,
"loss": 0.4011,
"step": 1506
},
{
"epoch": 2.3620689655172415,
"grad_norm": 0.22819185297220917,
"learning_rate": 1.1846689895470384e-05,
"loss": 0.4356,
"step": 1507
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.22701019324236366,
"learning_rate": 1.1817653890824624e-05,
"loss": 0.4004,
"step": 1508
},
{
"epoch": 2.365203761755486,
"grad_norm": 0.21610071120989868,
"learning_rate": 1.1788617886178862e-05,
"loss": 0.3918,
"step": 1509
},
{
"epoch": 2.366771159874608,
"grad_norm": 0.21673730952348153,
"learning_rate": 1.1759581881533102e-05,
"loss": 0.4122,
"step": 1510
},
{
"epoch": 2.3683385579937304,
"grad_norm": 0.22676262866146163,
"learning_rate": 1.173054587688734e-05,
"loss": 0.4081,
"step": 1511
},
{
"epoch": 2.3699059561128526,
"grad_norm": 0.2366186215493259,
"learning_rate": 1.170150987224158e-05,
"loss": 0.4174,
"step": 1512
},
{
"epoch": 2.371473354231975,
"grad_norm": 0.2052724792135809,
"learning_rate": 1.1672473867595819e-05,
"loss": 0.347,
"step": 1513
},
{
"epoch": 2.373040752351097,
"grad_norm": 0.1957325426779778,
"learning_rate": 1.1643437862950059e-05,
"loss": 0.3694,
"step": 1514
},
{
"epoch": 2.3746081504702197,
"grad_norm": 0.21488563072811945,
"learning_rate": 1.1614401858304299e-05,
"loss": 0.4178,
"step": 1515
},
{
"epoch": 2.376175548589342,
"grad_norm": 0.21407219288100632,
"learning_rate": 1.1585365853658537e-05,
"loss": 0.369,
"step": 1516
},
{
"epoch": 2.377742946708464,
"grad_norm": 0.2265077652479755,
"learning_rate": 1.1556329849012776e-05,
"loss": 0.3568,
"step": 1517
},
{
"epoch": 2.3793103448275863,
"grad_norm": 0.21527238612211164,
"learning_rate": 1.1527293844367016e-05,
"loss": 0.4008,
"step": 1518
},
{
"epoch": 2.3808777429467085,
"grad_norm": 0.2070100258025272,
"learning_rate": 1.1498257839721254e-05,
"loss": 0.3797,
"step": 1519
},
{
"epoch": 2.3824451410658307,
"grad_norm": 0.20518058199829728,
"learning_rate": 1.1469221835075494e-05,
"loss": 0.3425,
"step": 1520
},
{
"epoch": 2.384012539184953,
"grad_norm": 0.22744663250743066,
"learning_rate": 1.1440185830429734e-05,
"loss": 0.3807,
"step": 1521
},
{
"epoch": 2.385579937304075,
"grad_norm": 0.22717564244750985,
"learning_rate": 1.1411149825783973e-05,
"loss": 0.4239,
"step": 1522
},
{
"epoch": 2.3871473354231973,
"grad_norm": 0.20609894966416528,
"learning_rate": 1.1382113821138211e-05,
"loss": 0.3661,
"step": 1523
},
{
"epoch": 2.3887147335423196,
"grad_norm": 1.3445208947591385,
"learning_rate": 1.1353077816492451e-05,
"loss": 0.5277,
"step": 1524
},
{
"epoch": 2.3902821316614418,
"grad_norm": 0.21978672194086818,
"learning_rate": 1.132404181184669e-05,
"loss": 0.3469,
"step": 1525
},
{
"epoch": 2.3918495297805644,
"grad_norm": 0.27594512634679386,
"learning_rate": 1.129500580720093e-05,
"loss": 0.4301,
"step": 1526
},
{
"epoch": 2.3934169278996866,
"grad_norm": 0.23365818060399277,
"learning_rate": 1.126596980255517e-05,
"loss": 0.3924,
"step": 1527
},
{
"epoch": 2.394984326018809,
"grad_norm": 0.21330483059082833,
"learning_rate": 1.1236933797909408e-05,
"loss": 0.3881,
"step": 1528
},
{
"epoch": 2.396551724137931,
"grad_norm": 0.25035693538237025,
"learning_rate": 1.1207897793263646e-05,
"loss": 0.4321,
"step": 1529
},
{
"epoch": 2.3981191222570533,
"grad_norm": 0.2358401134095024,
"learning_rate": 1.1178861788617887e-05,
"loss": 0.3993,
"step": 1530
},
{
"epoch": 2.3996865203761755,
"grad_norm": 0.2238531655908333,
"learning_rate": 1.1149825783972127e-05,
"loss": 0.3908,
"step": 1531
},
{
"epoch": 2.4012539184952977,
"grad_norm": 0.4385606439037899,
"learning_rate": 1.1120789779326365e-05,
"loss": 0.3972,
"step": 1532
},
{
"epoch": 2.40282131661442,
"grad_norm": 0.21720733208379828,
"learning_rate": 1.1091753774680605e-05,
"loss": 0.4108,
"step": 1533
},
{
"epoch": 2.4043887147335425,
"grad_norm": 0.20700367084093768,
"learning_rate": 1.1062717770034843e-05,
"loss": 0.3773,
"step": 1534
},
{
"epoch": 2.4059561128526648,
"grad_norm": 0.21627600291746932,
"learning_rate": 1.1033681765389082e-05,
"loss": 0.4184,
"step": 1535
},
{
"epoch": 2.407523510971787,
"grad_norm": 0.2128428226133079,
"learning_rate": 1.1004645760743322e-05,
"loss": 0.371,
"step": 1536
},
{
"epoch": 2.409090909090909,
"grad_norm": 0.21268468432804466,
"learning_rate": 1.0975609756097562e-05,
"loss": 0.381,
"step": 1537
},
{
"epoch": 2.4106583072100314,
"grad_norm": 0.22547533141346843,
"learning_rate": 1.09465737514518e-05,
"loss": 0.3952,
"step": 1538
},
{
"epoch": 2.4122257053291536,
"grad_norm": 0.21435607332525308,
"learning_rate": 1.091753774680604e-05,
"loss": 0.3964,
"step": 1539
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.20924848244931302,
"learning_rate": 1.0888501742160279e-05,
"loss": 0.3897,
"step": 1540
},
{
"epoch": 2.415360501567398,
"grad_norm": 0.22964918681938828,
"learning_rate": 1.0859465737514519e-05,
"loss": 0.4089,
"step": 1541
},
{
"epoch": 2.41692789968652,
"grad_norm": 0.21166058096607523,
"learning_rate": 1.0830429732868757e-05,
"loss": 0.3692,
"step": 1542
},
{
"epoch": 2.4184952978056424,
"grad_norm": 0.20041133254345728,
"learning_rate": 1.0801393728222997e-05,
"loss": 0.3339,
"step": 1543
},
{
"epoch": 2.420062695924765,
"grad_norm": 0.21875742055932645,
"learning_rate": 1.0772357723577237e-05,
"loss": 0.3619,
"step": 1544
},
{
"epoch": 2.4216300940438873,
"grad_norm": 0.2209486626342778,
"learning_rate": 1.0743321718931476e-05,
"loss": 0.406,
"step": 1545
},
{
"epoch": 2.4231974921630095,
"grad_norm": 0.20803313221440278,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.3838,
"step": 1546
},
{
"epoch": 2.4247648902821317,
"grad_norm": 0.22049620523018323,
"learning_rate": 1.0685249709639954e-05,
"loss": 0.4434,
"step": 1547
},
{
"epoch": 2.426332288401254,
"grad_norm": 0.21527628415849295,
"learning_rate": 1.0656213704994193e-05,
"loss": 0.3573,
"step": 1548
},
{
"epoch": 2.427899686520376,
"grad_norm": 0.2256713837483707,
"learning_rate": 1.0627177700348433e-05,
"loss": 0.4398,
"step": 1549
},
{
"epoch": 2.4294670846394983,
"grad_norm": 0.2032472677991991,
"learning_rate": 1.0598141695702673e-05,
"loss": 0.374,
"step": 1550
},
{
"epoch": 2.4310344827586206,
"grad_norm": 0.21417695016466265,
"learning_rate": 1.0569105691056911e-05,
"loss": 0.3852,
"step": 1551
},
{
"epoch": 2.4326018808777428,
"grad_norm": 0.22198390643370178,
"learning_rate": 1.054006968641115e-05,
"loss": 0.3946,
"step": 1552
},
{
"epoch": 2.4341692789968654,
"grad_norm": 0.2265235788987626,
"learning_rate": 1.051103368176539e-05,
"loss": 0.4042,
"step": 1553
},
{
"epoch": 2.4357366771159876,
"grad_norm": 0.20903209196213013,
"learning_rate": 1.048199767711963e-05,
"loss": 0.3442,
"step": 1554
},
{
"epoch": 2.43730407523511,
"grad_norm": 0.20741452118724962,
"learning_rate": 1.0452961672473868e-05,
"loss": 0.3801,
"step": 1555
},
{
"epoch": 2.438871473354232,
"grad_norm": 0.2292753772408627,
"learning_rate": 1.0423925667828108e-05,
"loss": 0.4435,
"step": 1556
},
{
"epoch": 2.4404388714733543,
"grad_norm": 0.21314444424538911,
"learning_rate": 1.0394889663182348e-05,
"loss": 0.3844,
"step": 1557
},
{
"epoch": 2.4420062695924765,
"grad_norm": 0.2459830918097845,
"learning_rate": 1.0365853658536585e-05,
"loss": 0.4135,
"step": 1558
},
{
"epoch": 2.4435736677115987,
"grad_norm": 0.21462615099706644,
"learning_rate": 1.0336817653890825e-05,
"loss": 0.3987,
"step": 1559
},
{
"epoch": 2.445141065830721,
"grad_norm": 0.21939387382262845,
"learning_rate": 1.0307781649245065e-05,
"loss": 0.3886,
"step": 1560
},
{
"epoch": 2.446708463949843,
"grad_norm": 0.2160691339005593,
"learning_rate": 1.0278745644599303e-05,
"loss": 0.3877,
"step": 1561
},
{
"epoch": 2.4482758620689653,
"grad_norm": 0.19815458444458361,
"learning_rate": 1.0249709639953543e-05,
"loss": 0.3286,
"step": 1562
},
{
"epoch": 2.449843260188088,
"grad_norm": 0.20839663019915286,
"learning_rate": 1.0220673635307783e-05,
"loss": 0.3717,
"step": 1563
},
{
"epoch": 2.45141065830721,
"grad_norm": 0.20253407113408933,
"learning_rate": 1.019163763066202e-05,
"loss": 0.3845,
"step": 1564
},
{
"epoch": 2.4529780564263324,
"grad_norm": 0.21853161560462833,
"learning_rate": 1.016260162601626e-05,
"loss": 0.4022,
"step": 1565
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.2113907039554158,
"learning_rate": 1.01335656213705e-05,
"loss": 0.3798,
"step": 1566
},
{
"epoch": 2.456112852664577,
"grad_norm": 0.21678839080662807,
"learning_rate": 1.0104529616724739e-05,
"loss": 0.3931,
"step": 1567
},
{
"epoch": 2.457680250783699,
"grad_norm": 0.20618679845465682,
"learning_rate": 1.0075493612078979e-05,
"loss": 0.3829,
"step": 1568
},
{
"epoch": 2.459247648902821,
"grad_norm": 0.21142300872987968,
"learning_rate": 1.0046457607433217e-05,
"loss": 0.3729,
"step": 1569
},
{
"epoch": 2.4608150470219434,
"grad_norm": 0.20077014673456417,
"learning_rate": 1.0017421602787457e-05,
"loss": 0.3454,
"step": 1570
},
{
"epoch": 2.462382445141066,
"grad_norm": 0.21141216302808705,
"learning_rate": 9.988385598141695e-06,
"loss": 0.4014,
"step": 1571
},
{
"epoch": 2.4639498432601883,
"grad_norm": 0.20229369099062197,
"learning_rate": 9.959349593495936e-06,
"loss": 0.354,
"step": 1572
},
{
"epoch": 2.4655172413793105,
"grad_norm": 0.20407821610566276,
"learning_rate": 9.930313588850176e-06,
"loss": 0.3855,
"step": 1573
},
{
"epoch": 2.4670846394984327,
"grad_norm": 0.22904487908524798,
"learning_rate": 9.901277584204414e-06,
"loss": 0.4435,
"step": 1574
},
{
"epoch": 2.468652037617555,
"grad_norm": 0.2121359993760671,
"learning_rate": 9.872241579558652e-06,
"loss": 0.4028,
"step": 1575
},
{
"epoch": 2.470219435736677,
"grad_norm": 0.20553789862502297,
"learning_rate": 9.843205574912892e-06,
"loss": 0.3608,
"step": 1576
},
{
"epoch": 2.4717868338557993,
"grad_norm": 0.21604686454643038,
"learning_rate": 9.81416957026713e-06,
"loss": 0.3758,
"step": 1577
},
{
"epoch": 2.4733542319749215,
"grad_norm": 0.21982526705440078,
"learning_rate": 9.785133565621371e-06,
"loss": 0.3952,
"step": 1578
},
{
"epoch": 2.4749216300940438,
"grad_norm": 0.2102600198148821,
"learning_rate": 9.756097560975611e-06,
"loss": 0.3908,
"step": 1579
},
{
"epoch": 2.476489028213166,
"grad_norm": 0.18844887131964042,
"learning_rate": 9.72706155632985e-06,
"loss": 0.3296,
"step": 1580
},
{
"epoch": 2.478056426332288,
"grad_norm": 0.2350934212107331,
"learning_rate": 9.698025551684088e-06,
"loss": 0.4487,
"step": 1581
},
{
"epoch": 2.479623824451411,
"grad_norm": 0.20982273887921915,
"learning_rate": 9.668989547038328e-06,
"loss": 0.3737,
"step": 1582
},
{
"epoch": 2.481191222570533,
"grad_norm": 0.21427390614969788,
"learning_rate": 9.639953542392568e-06,
"loss": 0.4233,
"step": 1583
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.20203657400430713,
"learning_rate": 9.610917537746806e-06,
"loss": 0.3715,
"step": 1584
},
{
"epoch": 2.4843260188087775,
"grad_norm": 0.22645614488968127,
"learning_rate": 9.581881533101046e-06,
"loss": 0.4424,
"step": 1585
},
{
"epoch": 2.4858934169278997,
"grad_norm": 0.22809626394927282,
"learning_rate": 9.552845528455286e-06,
"loss": 0.3977,
"step": 1586
},
{
"epoch": 2.487460815047022,
"grad_norm": 0.22544345767286836,
"learning_rate": 9.523809523809523e-06,
"loss": 0.418,
"step": 1587
},
{
"epoch": 2.489028213166144,
"grad_norm": 0.20088369057157787,
"learning_rate": 9.494773519163763e-06,
"loss": 0.3782,
"step": 1588
},
{
"epoch": 2.4905956112852663,
"grad_norm": 0.22115247771537433,
"learning_rate": 9.465737514518003e-06,
"loss": 0.3729,
"step": 1589
},
{
"epoch": 2.492163009404389,
"grad_norm": 0.22569680505732442,
"learning_rate": 9.436701509872242e-06,
"loss": 0.3639,
"step": 1590
},
{
"epoch": 2.493730407523511,
"grad_norm": 0.22693675699600682,
"learning_rate": 9.407665505226482e-06,
"loss": 0.3973,
"step": 1591
},
{
"epoch": 2.4952978056426334,
"grad_norm": 0.20787141807588558,
"learning_rate": 9.378629500580722e-06,
"loss": 0.3576,
"step": 1592
},
{
"epoch": 2.4968652037617556,
"grad_norm": 0.22033051199911208,
"learning_rate": 9.34959349593496e-06,
"loss": 0.4053,
"step": 1593
},
{
"epoch": 2.498432601880878,
"grad_norm": 0.2272842689775136,
"learning_rate": 9.320557491289198e-06,
"loss": 0.4566,
"step": 1594
},
{
"epoch": 2.5,
"grad_norm": 0.2229265154394172,
"learning_rate": 9.291521486643439e-06,
"loss": 0.4201,
"step": 1595
},
{
"epoch": 2.501567398119122,
"grad_norm": 0.24791697968728427,
"learning_rate": 9.262485481997679e-06,
"loss": 0.4446,
"step": 1596
},
{
"epoch": 2.5031347962382444,
"grad_norm": 0.2225558704258798,
"learning_rate": 9.233449477351917e-06,
"loss": 0.3814,
"step": 1597
},
{
"epoch": 2.5047021943573666,
"grad_norm": 0.41481709583278137,
"learning_rate": 9.204413472706155e-06,
"loss": 0.4391,
"step": 1598
},
{
"epoch": 2.506269592476489,
"grad_norm": 0.3437252478492092,
"learning_rate": 9.175377468060395e-06,
"loss": 0.3286,
"step": 1599
},
{
"epoch": 2.507836990595611,
"grad_norm": 0.2135466460261532,
"learning_rate": 9.146341463414634e-06,
"loss": 0.3678,
"step": 1600
},
{
"epoch": 2.5094043887147337,
"grad_norm": 0.2385356694035386,
"learning_rate": 9.117305458768874e-06,
"loss": 0.3639,
"step": 1601
},
{
"epoch": 2.510971786833856,
"grad_norm": 0.2272372284514338,
"learning_rate": 9.088269454123114e-06,
"loss": 0.4015,
"step": 1602
},
{
"epoch": 2.512539184952978,
"grad_norm": 0.2155353656644725,
"learning_rate": 9.059233449477352e-06,
"loss": 0.3627,
"step": 1603
},
{
"epoch": 2.5141065830721003,
"grad_norm": 0.20487587257813644,
"learning_rate": 9.03019744483159e-06,
"loss": 0.3189,
"step": 1604
},
{
"epoch": 2.5156739811912225,
"grad_norm": 0.2424803869657363,
"learning_rate": 9.00116144018583e-06,
"loss": 0.4073,
"step": 1605
},
{
"epoch": 2.5172413793103448,
"grad_norm": 0.22213448199091101,
"learning_rate": 8.972125435540069e-06,
"loss": 0.4442,
"step": 1606
},
{
"epoch": 2.518808777429467,
"grad_norm": 0.20321998236233935,
"learning_rate": 8.94308943089431e-06,
"loss": 0.3637,
"step": 1607
},
{
"epoch": 2.5203761755485896,
"grad_norm": 0.21177797108086452,
"learning_rate": 8.91405342624855e-06,
"loss": 0.3855,
"step": 1608
},
{
"epoch": 2.521943573667712,
"grad_norm": 0.22407910997690136,
"learning_rate": 8.885017421602788e-06,
"loss": 0.414,
"step": 1609
},
{
"epoch": 2.523510971786834,
"grad_norm": 0.21147745127156367,
"learning_rate": 8.855981416957026e-06,
"loss": 0.4118,
"step": 1610
},
{
"epoch": 2.5250783699059562,
"grad_norm": 0.21491285222638729,
"learning_rate": 8.826945412311266e-06,
"loss": 0.3783,
"step": 1611
},
{
"epoch": 2.5266457680250785,
"grad_norm": 0.2127974426305754,
"learning_rate": 8.797909407665506e-06,
"loss": 0.3664,
"step": 1612
},
{
"epoch": 2.5282131661442007,
"grad_norm": 0.20147626534147428,
"learning_rate": 8.768873403019745e-06,
"loss": 0.3634,
"step": 1613
},
{
"epoch": 2.529780564263323,
"grad_norm": 0.20581343178162737,
"learning_rate": 8.739837398373985e-06,
"loss": 0.3658,
"step": 1614
},
{
"epoch": 2.531347962382445,
"grad_norm": 0.2122240639077212,
"learning_rate": 8.710801393728225e-06,
"loss": 0.3926,
"step": 1615
},
{
"epoch": 2.5329153605015673,
"grad_norm": 0.2121641229196656,
"learning_rate": 8.681765389082461e-06,
"loss": 0.3764,
"step": 1616
},
{
"epoch": 2.5344827586206895,
"grad_norm": 0.2034018470379647,
"learning_rate": 8.652729384436701e-06,
"loss": 0.3905,
"step": 1617
},
{
"epoch": 2.5360501567398117,
"grad_norm": 0.22353543644127133,
"learning_rate": 8.623693379790942e-06,
"loss": 0.4489,
"step": 1618
},
{
"epoch": 2.537617554858934,
"grad_norm": 0.1998783096138713,
"learning_rate": 8.59465737514518e-06,
"loss": 0.373,
"step": 1619
},
{
"epoch": 2.5391849529780566,
"grad_norm": 0.21679886714043442,
"learning_rate": 8.56562137049942e-06,
"loss": 0.4189,
"step": 1620
},
{
"epoch": 2.540752351097179,
"grad_norm": 0.2152370016269805,
"learning_rate": 8.53658536585366e-06,
"loss": 0.408,
"step": 1621
},
{
"epoch": 2.542319749216301,
"grad_norm": 0.21748723620348373,
"learning_rate": 8.507549361207898e-06,
"loss": 0.3756,
"step": 1622
},
{
"epoch": 2.543887147335423,
"grad_norm": 0.21011703923615516,
"learning_rate": 8.478513356562137e-06,
"loss": 0.4123,
"step": 1623
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.21063303103125317,
"learning_rate": 8.449477351916377e-06,
"loss": 0.3952,
"step": 1624
},
{
"epoch": 2.5470219435736676,
"grad_norm": 0.20322263602435564,
"learning_rate": 8.420441347270617e-06,
"loss": 0.3933,
"step": 1625
},
{
"epoch": 2.54858934169279,
"grad_norm": 0.2312411133289649,
"learning_rate": 8.391405342624855e-06,
"loss": 0.4237,
"step": 1626
},
{
"epoch": 2.5501567398119125,
"grad_norm": 0.20709792262496676,
"learning_rate": 8.362369337979095e-06,
"loss": 0.3806,
"step": 1627
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.20720515723432348,
"learning_rate": 8.333333333333334e-06,
"loss": 0.3924,
"step": 1628
},
{
"epoch": 2.553291536050157,
"grad_norm": 0.19878491898006162,
"learning_rate": 8.304297328687572e-06,
"loss": 0.3784,
"step": 1629
},
{
"epoch": 2.554858934169279,
"grad_norm": 0.2111152127052182,
"learning_rate": 8.275261324041812e-06,
"loss": 0.4267,
"step": 1630
},
{
"epoch": 2.5564263322884013,
"grad_norm": 0.21991172082255483,
"learning_rate": 8.246225319396052e-06,
"loss": 0.3969,
"step": 1631
},
{
"epoch": 2.5579937304075235,
"grad_norm": 0.2144979368667917,
"learning_rate": 8.21718931475029e-06,
"loss": 0.3277,
"step": 1632
},
{
"epoch": 2.5595611285266457,
"grad_norm": 0.21515546232710703,
"learning_rate": 8.188153310104529e-06,
"loss": 0.4096,
"step": 1633
},
{
"epoch": 2.561128526645768,
"grad_norm": 0.20613703274392328,
"learning_rate": 8.159117305458769e-06,
"loss": 0.3831,
"step": 1634
},
{
"epoch": 2.56269592476489,
"grad_norm": 0.20769051289074864,
"learning_rate": 8.130081300813009e-06,
"loss": 0.3811,
"step": 1635
},
{
"epoch": 2.5642633228840124,
"grad_norm": 0.20372601339537103,
"learning_rate": 8.101045296167248e-06,
"loss": 0.3641,
"step": 1636
},
{
"epoch": 2.5658307210031346,
"grad_norm": 0.2190887664342815,
"learning_rate": 8.072009291521488e-06,
"loss": 0.3714,
"step": 1637
},
{
"epoch": 2.567398119122257,
"grad_norm": 0.21304087367098237,
"learning_rate": 8.042973286875728e-06,
"loss": 0.3661,
"step": 1638
},
{
"epoch": 2.5689655172413794,
"grad_norm": 0.2193984821386709,
"learning_rate": 8.013937282229964e-06,
"loss": 0.3825,
"step": 1639
},
{
"epoch": 2.5705329153605017,
"grad_norm": 0.20648638553470125,
"learning_rate": 7.984901277584204e-06,
"loss": 0.3699,
"step": 1640
},
{
"epoch": 2.572100313479624,
"grad_norm": 0.22256750342804663,
"learning_rate": 7.955865272938444e-06,
"loss": 0.4313,
"step": 1641
},
{
"epoch": 2.573667711598746,
"grad_norm": 0.205903782872554,
"learning_rate": 7.926829268292683e-06,
"loss": 0.3598,
"step": 1642
},
{
"epoch": 2.5752351097178683,
"grad_norm": 0.22616217473886757,
"learning_rate": 7.897793263646923e-06,
"loss": 0.4423,
"step": 1643
},
{
"epoch": 2.5768025078369905,
"grad_norm": 0.2100859377490203,
"learning_rate": 7.868757259001163e-06,
"loss": 0.3796,
"step": 1644
},
{
"epoch": 2.5783699059561127,
"grad_norm": 0.21931860003947296,
"learning_rate": 7.8397212543554e-06,
"loss": 0.4112,
"step": 1645
},
{
"epoch": 2.5799373040752354,
"grad_norm": 0.21725530231920775,
"learning_rate": 7.81068524970964e-06,
"loss": 0.403,
"step": 1646
},
{
"epoch": 2.5815047021943576,
"grad_norm": 0.2175133660470492,
"learning_rate": 7.78164924506388e-06,
"loss": 0.3661,
"step": 1647
},
{
"epoch": 2.58307210031348,
"grad_norm": 0.22248591678375004,
"learning_rate": 7.752613240418118e-06,
"loss": 0.398,
"step": 1648
},
{
"epoch": 2.584639498432602,
"grad_norm": 0.2092112224535233,
"learning_rate": 7.723577235772358e-06,
"loss": 0.3771,
"step": 1649
},
{
"epoch": 2.586206896551724,
"grad_norm": 0.2306781359965977,
"learning_rate": 7.694541231126598e-06,
"loss": 0.4166,
"step": 1650
},
{
"epoch": 2.5877742946708464,
"grad_norm": 0.22512493337849498,
"learning_rate": 7.665505226480837e-06,
"loss": 0.363,
"step": 1651
},
{
"epoch": 2.5893416927899686,
"grad_norm": 0.20188811710337703,
"learning_rate": 7.636469221835075e-06,
"loss": 0.3888,
"step": 1652
},
{
"epoch": 2.590909090909091,
"grad_norm": 0.2103630193947898,
"learning_rate": 7.607433217189315e-06,
"loss": 0.4187,
"step": 1653
},
{
"epoch": 2.592476489028213,
"grad_norm": 0.22882716634085085,
"learning_rate": 7.578397212543554e-06,
"loss": 0.41,
"step": 1654
},
{
"epoch": 2.5940438871473352,
"grad_norm": 0.21035968486104123,
"learning_rate": 7.5493612078977944e-06,
"loss": 0.3948,
"step": 1655
},
{
"epoch": 2.5956112852664575,
"grad_norm": 0.2085309233043066,
"learning_rate": 7.520325203252034e-06,
"loss": 0.3638,
"step": 1656
},
{
"epoch": 2.5971786833855797,
"grad_norm": 0.21427280843104632,
"learning_rate": 7.491289198606272e-06,
"loss": 0.4081,
"step": 1657
},
{
"epoch": 2.5987460815047023,
"grad_norm": 0.20721356740937916,
"learning_rate": 7.462253193960511e-06,
"loss": 0.4004,
"step": 1658
},
{
"epoch": 2.6003134796238245,
"grad_norm": 0.19740579955275428,
"learning_rate": 7.4332171893147505e-06,
"loss": 0.3626,
"step": 1659
},
{
"epoch": 2.6018808777429467,
"grad_norm": 0.22575529515102039,
"learning_rate": 7.4041811846689906e-06,
"loss": 0.4028,
"step": 1660
},
{
"epoch": 2.603448275862069,
"grad_norm": 0.2321346346216216,
"learning_rate": 7.37514518002323e-06,
"loss": 0.382,
"step": 1661
},
{
"epoch": 2.605015673981191,
"grad_norm": 0.20574155215377873,
"learning_rate": 7.346109175377468e-06,
"loss": 0.3739,
"step": 1662
},
{
"epoch": 2.6065830721003134,
"grad_norm": 0.2112443858827014,
"learning_rate": 7.317073170731707e-06,
"loss": 0.3894,
"step": 1663
},
{
"epoch": 2.6081504702194356,
"grad_norm": 0.1986834247505152,
"learning_rate": 7.288037166085947e-06,
"loss": 0.3551,
"step": 1664
},
{
"epoch": 2.6097178683385582,
"grad_norm": 0.22074841538683354,
"learning_rate": 7.259001161440186e-06,
"loss": 0.4494,
"step": 1665
},
{
"epoch": 2.6112852664576804,
"grad_norm": 0.22008142832771252,
"learning_rate": 7.229965156794426e-06,
"loss": 0.3878,
"step": 1666
},
{
"epoch": 2.6128526645768027,
"grad_norm": 0.22255353131062552,
"learning_rate": 7.200929152148665e-06,
"loss": 0.4184,
"step": 1667
},
{
"epoch": 2.614420062695925,
"grad_norm": 0.20950561748612373,
"learning_rate": 7.1718931475029035e-06,
"loss": 0.4027,
"step": 1668
},
{
"epoch": 2.615987460815047,
"grad_norm": 0.20160499132354823,
"learning_rate": 7.142857142857143e-06,
"loss": 0.3715,
"step": 1669
},
{
"epoch": 2.6175548589341693,
"grad_norm": 0.22290482221163432,
"learning_rate": 7.113821138211382e-06,
"loss": 0.3919,
"step": 1670
},
{
"epoch": 2.6191222570532915,
"grad_norm": 0.19367831973104394,
"learning_rate": 7.084785133565622e-06,
"loss": 0.3422,
"step": 1671
},
{
"epoch": 2.6206896551724137,
"grad_norm": 14.521002864220776,
"learning_rate": 7.055749128919861e-06,
"loss": 0.6121,
"step": 1672
},
{
"epoch": 2.622257053291536,
"grad_norm": 0.2114597590190911,
"learning_rate": 7.0267131242741005e-06,
"loss": 0.3796,
"step": 1673
},
{
"epoch": 2.623824451410658,
"grad_norm": 0.2154569984883554,
"learning_rate": 6.997677119628339e-06,
"loss": 0.3642,
"step": 1674
},
{
"epoch": 2.6253918495297803,
"grad_norm": 0.21495379186865748,
"learning_rate": 6.968641114982578e-06,
"loss": 0.4193,
"step": 1675
},
{
"epoch": 2.626959247648903,
"grad_norm": 0.19653206945216414,
"learning_rate": 6.939605110336818e-06,
"loss": 0.3548,
"step": 1676
},
{
"epoch": 2.628526645768025,
"grad_norm": 0.21206337582569879,
"learning_rate": 6.910569105691057e-06,
"loss": 0.4046,
"step": 1677
},
{
"epoch": 2.6300940438871474,
"grad_norm": 0.2073320545004377,
"learning_rate": 6.8815331010452966e-06,
"loss": 0.403,
"step": 1678
},
{
"epoch": 2.6316614420062696,
"grad_norm": 0.20678515605220799,
"learning_rate": 6.852497096399537e-06,
"loss": 0.4083,
"step": 1679
},
{
"epoch": 2.633228840125392,
"grad_norm": 0.20358033733926217,
"learning_rate": 6.823461091753774e-06,
"loss": 0.3908,
"step": 1680
},
{
"epoch": 2.634796238244514,
"grad_norm": 0.20641005234929408,
"learning_rate": 6.794425087108014e-06,
"loss": 0.3611,
"step": 1681
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.2044181038234271,
"learning_rate": 6.7653890824622535e-06,
"loss": 0.3749,
"step": 1682
},
{
"epoch": 2.637931034482759,
"grad_norm": 0.2095165622887021,
"learning_rate": 6.736353077816493e-06,
"loss": 0.4116,
"step": 1683
},
{
"epoch": 2.639498432601881,
"grad_norm": 0.2024758112337613,
"learning_rate": 6.707317073170733e-06,
"loss": 0.3649,
"step": 1684
},
{
"epoch": 2.6410658307210033,
"grad_norm": 0.2105334094319248,
"learning_rate": 6.678281068524972e-06,
"loss": 0.4247,
"step": 1685
},
{
"epoch": 2.6426332288401255,
"grad_norm": 0.19913304984732097,
"learning_rate": 6.64924506387921e-06,
"loss": 0.3959,
"step": 1686
},
{
"epoch": 2.6442006269592477,
"grad_norm": 0.2449696285929931,
"learning_rate": 6.6202090592334496e-06,
"loss": 0.4777,
"step": 1687
},
{
"epoch": 2.64576802507837,
"grad_norm": 0.20520316722057552,
"learning_rate": 6.591173054587689e-06,
"loss": 0.3264,
"step": 1688
},
{
"epoch": 2.647335423197492,
"grad_norm": 0.33623608082984235,
"learning_rate": 6.562137049941929e-06,
"loss": 0.4017,
"step": 1689
},
{
"epoch": 2.6489028213166144,
"grad_norm": 0.2097355925178851,
"learning_rate": 6.533101045296168e-06,
"loss": 0.362,
"step": 1690
},
{
"epoch": 2.6504702194357366,
"grad_norm": 0.2056569733867475,
"learning_rate": 6.504065040650407e-06,
"loss": 0.3604,
"step": 1691
},
{
"epoch": 2.652037617554859,
"grad_norm": 0.21079307144820503,
"learning_rate": 6.475029036004646e-06,
"loss": 0.3892,
"step": 1692
},
{
"epoch": 2.653605015673981,
"grad_norm": 0.2085042943121247,
"learning_rate": 6.445993031358885e-06,
"loss": 0.3485,
"step": 1693
},
{
"epoch": 2.655172413793103,
"grad_norm": 0.21193491216231328,
"learning_rate": 6.416957026713125e-06,
"loss": 0.4012,
"step": 1694
},
{
"epoch": 2.656739811912226,
"grad_norm": 0.2135804385851231,
"learning_rate": 6.387921022067364e-06,
"loss": 0.4312,
"step": 1695
},
{
"epoch": 2.658307210031348,
"grad_norm": 0.19962804635531128,
"learning_rate": 6.3588850174216034e-06,
"loss": 0.3912,
"step": 1696
},
{
"epoch": 2.6598746081504703,
"grad_norm": 0.2115936017877178,
"learning_rate": 6.329849012775842e-06,
"loss": 0.368,
"step": 1697
},
{
"epoch": 2.6614420062695925,
"grad_norm": 0.1990124209560855,
"learning_rate": 6.300813008130081e-06,
"loss": 0.332,
"step": 1698
},
{
"epoch": 2.6630094043887147,
"grad_norm": 0.21760501054529097,
"learning_rate": 6.271777003484321e-06,
"loss": 0.4226,
"step": 1699
},
{
"epoch": 2.664576802507837,
"grad_norm": 0.21345640666592042,
"learning_rate": 6.24274099883856e-06,
"loss": 0.3986,
"step": 1700
},
{
"epoch": 2.666144200626959,
"grad_norm": 0.21408008127605904,
"learning_rate": 6.2137049941927995e-06,
"loss": 0.3898,
"step": 1701
},
{
"epoch": 2.6677115987460818,
"grad_norm": 0.19571786194676302,
"learning_rate": 6.184668989547039e-06,
"loss": 0.3834,
"step": 1702
},
{
"epoch": 2.669278996865204,
"grad_norm": 0.2404347919263504,
"learning_rate": 6.155632984901278e-06,
"loss": 0.3686,
"step": 1703
},
{
"epoch": 2.670846394984326,
"grad_norm": 0.19306192036008385,
"learning_rate": 6.126596980255517e-06,
"loss": 0.3463,
"step": 1704
},
{
"epoch": 2.6724137931034484,
"grad_norm": 0.20606767150047262,
"learning_rate": 6.0975609756097564e-06,
"loss": 0.4075,
"step": 1705
},
{
"epoch": 2.6739811912225706,
"grad_norm": 0.18891991394678573,
"learning_rate": 6.068524970963996e-06,
"loss": 0.312,
"step": 1706
},
{
"epoch": 2.675548589341693,
"grad_norm": 0.19745441951266315,
"learning_rate": 6.039488966318235e-06,
"loss": 0.3762,
"step": 1707
},
{
"epoch": 2.677115987460815,
"grad_norm": 0.20457798426351834,
"learning_rate": 6.010452961672474e-06,
"loss": 0.3902,
"step": 1708
},
{
"epoch": 2.6786833855799372,
"grad_norm": 0.21744744675684946,
"learning_rate": 5.981416957026713e-06,
"loss": 0.4179,
"step": 1709
},
{
"epoch": 2.6802507836990594,
"grad_norm": 0.22179264901118365,
"learning_rate": 5.9523809523809525e-06,
"loss": 0.4456,
"step": 1710
},
{
"epoch": 2.6818181818181817,
"grad_norm": 0.22398457993669907,
"learning_rate": 5.923344947735192e-06,
"loss": 0.4342,
"step": 1711
},
{
"epoch": 2.683385579937304,
"grad_norm": 0.19247868413922312,
"learning_rate": 5.894308943089431e-06,
"loss": 0.3613,
"step": 1712
},
{
"epoch": 2.684952978056426,
"grad_norm": 0.2067919477094333,
"learning_rate": 5.86527293844367e-06,
"loss": 0.3873,
"step": 1713
},
{
"epoch": 2.6865203761755487,
"grad_norm": 0.21419907046573164,
"learning_rate": 5.8362369337979094e-06,
"loss": 0.3902,
"step": 1714
},
{
"epoch": 2.688087774294671,
"grad_norm": 0.19612176449653587,
"learning_rate": 5.8072009291521495e-06,
"loss": 0.3792,
"step": 1715
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.49095378833202863,
"learning_rate": 5.778164924506388e-06,
"loss": 0.3859,
"step": 1716
},
{
"epoch": 2.6912225705329154,
"grad_norm": 0.2227348200420127,
"learning_rate": 5.749128919860627e-06,
"loss": 0.3867,
"step": 1717
},
{
"epoch": 2.6927899686520376,
"grad_norm": 0.19585522328846408,
"learning_rate": 5.720092915214867e-06,
"loss": 0.3355,
"step": 1718
},
{
"epoch": 2.69435736677116,
"grad_norm": 0.20199517006438408,
"learning_rate": 5.6910569105691056e-06,
"loss": 0.4078,
"step": 1719
},
{
"epoch": 2.695924764890282,
"grad_norm": 0.19771833153385085,
"learning_rate": 5.662020905923345e-06,
"loss": 0.3505,
"step": 1720
},
{
"epoch": 2.6974921630094046,
"grad_norm": 0.22307410913829476,
"learning_rate": 5.632984901277585e-06,
"loss": 0.3619,
"step": 1721
},
{
"epoch": 2.699059561128527,
"grad_norm": 0.21409084886806337,
"learning_rate": 5.603948896631823e-06,
"loss": 0.3828,
"step": 1722
},
{
"epoch": 2.700626959247649,
"grad_norm": 0.2072758349811982,
"learning_rate": 5.574912891986063e-06,
"loss": 0.4075,
"step": 1723
},
{
"epoch": 2.7021943573667713,
"grad_norm": 0.19524225744406618,
"learning_rate": 5.5458768873403025e-06,
"loss": 0.3308,
"step": 1724
},
{
"epoch": 2.7037617554858935,
"grad_norm": 0.20757669752373695,
"learning_rate": 5.516840882694541e-06,
"loss": 0.3817,
"step": 1725
},
{
"epoch": 2.7053291536050157,
"grad_norm": 0.2064983676970277,
"learning_rate": 5.487804878048781e-06,
"loss": 0.383,
"step": 1726
},
{
"epoch": 2.706896551724138,
"grad_norm": 0.1950014547110373,
"learning_rate": 5.45876887340302e-06,
"loss": 0.3275,
"step": 1727
},
{
"epoch": 2.70846394984326,
"grad_norm": 0.19668168589229096,
"learning_rate": 5.429732868757259e-06,
"loss": 0.3624,
"step": 1728
},
{
"epoch": 2.7100313479623823,
"grad_norm": 0.20232690647362545,
"learning_rate": 5.400696864111499e-06,
"loss": 0.389,
"step": 1729
},
{
"epoch": 2.7115987460815045,
"grad_norm": 0.20755669227409304,
"learning_rate": 5.371660859465738e-06,
"loss": 0.3693,
"step": 1730
},
{
"epoch": 2.7131661442006267,
"grad_norm": 0.2356726333764161,
"learning_rate": 5.342624854819977e-06,
"loss": 0.4718,
"step": 1731
},
{
"epoch": 2.714733542319749,
"grad_norm": 0.20024726320199737,
"learning_rate": 5.313588850174216e-06,
"loss": 0.3575,
"step": 1732
},
{
"epoch": 2.7163009404388716,
"grad_norm": 0.2394776121018088,
"learning_rate": 5.2845528455284555e-06,
"loss": 0.3835,
"step": 1733
},
{
"epoch": 2.717868338557994,
"grad_norm": 0.21115570133380895,
"learning_rate": 5.255516840882695e-06,
"loss": 0.4002,
"step": 1734
},
{
"epoch": 2.719435736677116,
"grad_norm": 0.20486426192579069,
"learning_rate": 5.226480836236934e-06,
"loss": 0.3922,
"step": 1735
},
{
"epoch": 2.7210031347962382,
"grad_norm": 0.209519958590161,
"learning_rate": 5.197444831591174e-06,
"loss": 0.3882,
"step": 1736
},
{
"epoch": 2.7225705329153604,
"grad_norm": 0.2070078855417866,
"learning_rate": 5.168408826945412e-06,
"loss": 0.401,
"step": 1737
},
{
"epoch": 2.7241379310344827,
"grad_norm": 0.20672755932102393,
"learning_rate": 5.139372822299652e-06,
"loss": 0.4035,
"step": 1738
},
{
"epoch": 2.725705329153605,
"grad_norm": 0.20273120738095546,
"learning_rate": 5.110336817653892e-06,
"loss": 0.339,
"step": 1739
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.19175030816935623,
"learning_rate": 5.08130081300813e-06,
"loss": 0.3656,
"step": 1740
},
{
"epoch": 2.7288401253918497,
"grad_norm": 0.21945795289985917,
"learning_rate": 5.052264808362369e-06,
"loss": 0.4213,
"step": 1741
},
{
"epoch": 2.730407523510972,
"grad_norm": 0.2192450471953969,
"learning_rate": 5.0232288037166085e-06,
"loss": 0.4151,
"step": 1742
},
{
"epoch": 2.731974921630094,
"grad_norm": 0.21093715334737953,
"learning_rate": 4.994192799070848e-06,
"loss": 0.3854,
"step": 1743
},
{
"epoch": 2.7335423197492164,
"grad_norm": 0.21164384122194882,
"learning_rate": 4.965156794425088e-06,
"loss": 0.4191,
"step": 1744
},
{
"epoch": 2.7351097178683386,
"grad_norm": 0.21774640567006973,
"learning_rate": 4.936120789779326e-06,
"loss": 0.3873,
"step": 1745
},
{
"epoch": 2.7366771159874608,
"grad_norm": 0.1991276883144391,
"learning_rate": 4.907084785133565e-06,
"loss": 0.3849,
"step": 1746
},
{
"epoch": 2.738244514106583,
"grad_norm": 0.19616917303935325,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.3652,
"step": 1747
},
{
"epoch": 2.739811912225705,
"grad_norm": 0.19572468344513516,
"learning_rate": 4.849012775842044e-06,
"loss": 0.3622,
"step": 1748
},
{
"epoch": 2.7413793103448274,
"grad_norm": 0.21459214926999276,
"learning_rate": 4.819976771196284e-06,
"loss": 0.3962,
"step": 1749
},
{
"epoch": 2.7429467084639496,
"grad_norm": 0.1951929602273108,
"learning_rate": 4.790940766550523e-06,
"loss": 0.3606,
"step": 1750
},
{
"epoch": 2.7445141065830723,
"grad_norm": 0.194670660720483,
"learning_rate": 4.7619047619047615e-06,
"loss": 0.3571,
"step": 1751
},
{
"epoch": 2.7460815047021945,
"grad_norm": 0.20049847492684636,
"learning_rate": 4.732868757259002e-06,
"loss": 0.3732,
"step": 1752
},
{
"epoch": 2.7476489028213167,
"grad_norm": 0.2137556582318085,
"learning_rate": 4.703832752613241e-06,
"loss": 0.4015,
"step": 1753
},
{
"epoch": 2.749216300940439,
"grad_norm": 0.1999449375547595,
"learning_rate": 4.67479674796748e-06,
"loss": 0.3803,
"step": 1754
},
{
"epoch": 2.750783699059561,
"grad_norm": 0.21179293404696928,
"learning_rate": 4.645760743321719e-06,
"loss": 0.4187,
"step": 1755
},
{
"epoch": 2.7523510971786833,
"grad_norm": 0.22040112898501196,
"learning_rate": 4.6167247386759585e-06,
"loss": 0.3929,
"step": 1756
},
{
"epoch": 2.7539184952978055,
"grad_norm": 0.20366115131487297,
"learning_rate": 4.587688734030198e-06,
"loss": 0.3731,
"step": 1757
},
{
"epoch": 2.7554858934169277,
"grad_norm": 0.21594685169117797,
"learning_rate": 4.558652729384437e-06,
"loss": 0.406,
"step": 1758
},
{
"epoch": 2.7570532915360504,
"grad_norm": 0.2196651784671209,
"learning_rate": 4.529616724738676e-06,
"loss": 0.4049,
"step": 1759
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.22602250823880535,
"learning_rate": 4.500580720092915e-06,
"loss": 0.3815,
"step": 1760
},
{
"epoch": 2.760188087774295,
"grad_norm": 0.19192238512212834,
"learning_rate": 4.471544715447155e-06,
"loss": 0.3522,
"step": 1761
},
{
"epoch": 2.761755485893417,
"grad_norm": 0.21376670218244564,
"learning_rate": 4.442508710801394e-06,
"loss": 0.4202,
"step": 1762
},
{
"epoch": 2.7633228840125392,
"grad_norm": 0.2038193708020861,
"learning_rate": 4.413472706155633e-06,
"loss": 0.3983,
"step": 1763
},
{
"epoch": 2.7648902821316614,
"grad_norm": 0.19726254303161112,
"learning_rate": 4.384436701509872e-06,
"loss": 0.3313,
"step": 1764
},
{
"epoch": 2.7664576802507836,
"grad_norm": 0.1957770125141079,
"learning_rate": 4.355400696864112e-06,
"loss": 0.3616,
"step": 1765
},
{
"epoch": 2.768025078369906,
"grad_norm": 0.21717914194980095,
"learning_rate": 4.326364692218351e-06,
"loss": 0.3941,
"step": 1766
},
{
"epoch": 2.769592476489028,
"grad_norm": 0.19808646580832612,
"learning_rate": 4.29732868757259e-06,
"loss": 0.3591,
"step": 1767
},
{
"epoch": 2.7711598746081503,
"grad_norm": 0.2188870474662506,
"learning_rate": 4.26829268292683e-06,
"loss": 0.427,
"step": 1768
},
{
"epoch": 2.7727272727272725,
"grad_norm": 0.21019678094185829,
"learning_rate": 4.239256678281068e-06,
"loss": 0.3993,
"step": 1769
},
{
"epoch": 2.774294670846395,
"grad_norm": 0.2022061884564159,
"learning_rate": 4.2102206736353085e-06,
"loss": 0.4045,
"step": 1770
},
{
"epoch": 2.7758620689655173,
"grad_norm": 0.2171905465100344,
"learning_rate": 4.181184668989548e-06,
"loss": 0.3901,
"step": 1771
},
{
"epoch": 2.7774294670846396,
"grad_norm": 0.21214128003980268,
"learning_rate": 4.152148664343786e-06,
"loss": 0.3989,
"step": 1772
},
{
"epoch": 2.7789968652037618,
"grad_norm": 0.21240246743061694,
"learning_rate": 4.123112659698026e-06,
"loss": 0.3949,
"step": 1773
},
{
"epoch": 2.780564263322884,
"grad_norm": 0.20694610611379463,
"learning_rate": 4.0940766550522645e-06,
"loss": 0.382,
"step": 1774
},
{
"epoch": 2.782131661442006,
"grad_norm": 0.20732869909038223,
"learning_rate": 4.0650406504065046e-06,
"loss": 0.4165,
"step": 1775
},
{
"epoch": 2.7836990595611284,
"grad_norm": 0.20284810119361177,
"learning_rate": 4.036004645760744e-06,
"loss": 0.3904,
"step": 1776
},
{
"epoch": 2.785266457680251,
"grad_norm": 0.2223709753913864,
"learning_rate": 4.006968641114982e-06,
"loss": 0.4287,
"step": 1777
},
{
"epoch": 2.7868338557993733,
"grad_norm": 0.20801689897478376,
"learning_rate": 3.977932636469222e-06,
"loss": 0.3901,
"step": 1778
},
{
"epoch": 2.7884012539184955,
"grad_norm": 0.20310704016443154,
"learning_rate": 3.9488966318234615e-06,
"loss": 0.3924,
"step": 1779
},
{
"epoch": 2.7899686520376177,
"grad_norm": 0.20326295568370042,
"learning_rate": 3.9198606271777e-06,
"loss": 0.3795,
"step": 1780
},
{
"epoch": 2.79153605015674,
"grad_norm": 0.20728505849751913,
"learning_rate": 3.89082462253194e-06,
"loss": 0.3457,
"step": 1781
},
{
"epoch": 2.793103448275862,
"grad_norm": 0.21451534532389158,
"learning_rate": 3.861788617886179e-06,
"loss": 0.3852,
"step": 1782
},
{
"epoch": 2.7946708463949843,
"grad_norm": 0.21498389783229865,
"learning_rate": 3.832752613240418e-06,
"loss": 0.3771,
"step": 1783
},
{
"epoch": 2.7962382445141065,
"grad_norm": 0.2087509574580468,
"learning_rate": 3.8037166085946576e-06,
"loss": 0.4267,
"step": 1784
},
{
"epoch": 2.7978056426332287,
"grad_norm": 0.20348085860307633,
"learning_rate": 3.7746806039488972e-06,
"loss": 0.3753,
"step": 1785
},
{
"epoch": 2.799373040752351,
"grad_norm": 0.20371293195533918,
"learning_rate": 3.745644599303136e-06,
"loss": 0.3784,
"step": 1786
},
{
"epoch": 2.800940438871473,
"grad_norm": 0.21695914404639635,
"learning_rate": 3.7166085946573752e-06,
"loss": 0.4116,
"step": 1787
},
{
"epoch": 2.8025078369905954,
"grad_norm": 9.529116000347809,
"learning_rate": 3.687572590011615e-06,
"loss": 0.3964,
"step": 1788
},
{
"epoch": 2.804075235109718,
"grad_norm": 0.20915480470723066,
"learning_rate": 3.6585365853658537e-06,
"loss": 0.3742,
"step": 1789
},
{
"epoch": 2.80564263322884,
"grad_norm": 0.20968626359950274,
"learning_rate": 3.629500580720093e-06,
"loss": 0.4054,
"step": 1790
},
{
"epoch": 2.8072100313479624,
"grad_norm": 0.2031757149183511,
"learning_rate": 3.6004645760743326e-06,
"loss": 0.4161,
"step": 1791
},
{
"epoch": 2.8087774294670846,
"grad_norm": 0.2075261562127685,
"learning_rate": 3.5714285714285714e-06,
"loss": 0.4128,
"step": 1792
},
{
"epoch": 2.810344827586207,
"grad_norm": 0.1985371706905934,
"learning_rate": 3.542392566782811e-06,
"loss": 0.3975,
"step": 1793
},
{
"epoch": 2.811912225705329,
"grad_norm": 0.21229983119993262,
"learning_rate": 3.5133565621370502e-06,
"loss": 0.3845,
"step": 1794
},
{
"epoch": 2.8134796238244513,
"grad_norm": 0.19207828589537732,
"learning_rate": 3.484320557491289e-06,
"loss": 0.3566,
"step": 1795
},
{
"epoch": 2.815047021943574,
"grad_norm": 0.18192959154108446,
"learning_rate": 3.4552845528455287e-06,
"loss": 0.3385,
"step": 1796
},
{
"epoch": 2.816614420062696,
"grad_norm": 0.21042133171494554,
"learning_rate": 3.4262485481997683e-06,
"loss": 0.3985,
"step": 1797
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.2168463316693494,
"learning_rate": 3.397212543554007e-06,
"loss": 0.4487,
"step": 1798
},
{
"epoch": 2.8197492163009406,
"grad_norm": 0.19775989375757375,
"learning_rate": 3.3681765389082463e-06,
"loss": 0.3839,
"step": 1799
},
{
"epoch": 2.8213166144200628,
"grad_norm": 0.22462393842798062,
"learning_rate": 3.339140534262486e-06,
"loss": 0.3535,
"step": 1800
},
{
"epoch": 2.822884012539185,
"grad_norm": 0.20080908191024524,
"learning_rate": 3.3101045296167248e-06,
"loss": 0.3822,
"step": 1801
},
{
"epoch": 2.824451410658307,
"grad_norm": 0.21332812686606004,
"learning_rate": 3.2810685249709644e-06,
"loss": 0.4176,
"step": 1802
},
{
"epoch": 2.8260188087774294,
"grad_norm": 0.22166294685359927,
"learning_rate": 3.2520325203252037e-06,
"loss": 0.4606,
"step": 1803
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.19053451183770714,
"learning_rate": 3.2229965156794425e-06,
"loss": 0.3584,
"step": 1804
},
{
"epoch": 2.829153605015674,
"grad_norm": 0.6773174467750256,
"learning_rate": 3.193960511033682e-06,
"loss": 0.4176,
"step": 1805
},
{
"epoch": 2.830721003134796,
"grad_norm": 0.19632621705233055,
"learning_rate": 3.164924506387921e-06,
"loss": 0.368,
"step": 1806
},
{
"epoch": 2.8322884012539182,
"grad_norm": 0.2106626450621571,
"learning_rate": 3.1358885017421605e-06,
"loss": 0.4197,
"step": 1807
},
{
"epoch": 2.833855799373041,
"grad_norm": 0.19753533561454786,
"learning_rate": 3.1068524970963998e-06,
"loss": 0.4029,
"step": 1808
},
{
"epoch": 2.835423197492163,
"grad_norm": 0.17980534734691658,
"learning_rate": 3.077816492450639e-06,
"loss": 0.3235,
"step": 1809
},
{
"epoch": 2.8369905956112853,
"grad_norm": 0.21571222295079373,
"learning_rate": 3.0487804878048782e-06,
"loss": 0.4351,
"step": 1810
},
{
"epoch": 2.8385579937304075,
"grad_norm": 0.19588241795970035,
"learning_rate": 3.0197444831591174e-06,
"loss": 0.3614,
"step": 1811
},
{
"epoch": 2.8401253918495297,
"grad_norm": 0.21160952859412838,
"learning_rate": 2.9907084785133567e-06,
"loss": 0.4277,
"step": 1812
},
{
"epoch": 2.841692789968652,
"grad_norm": 0.22539238998076463,
"learning_rate": 2.961672473867596e-06,
"loss": 0.3645,
"step": 1813
},
{
"epoch": 2.843260188087774,
"grad_norm": 0.19990709108986202,
"learning_rate": 2.932636469221835e-06,
"loss": 0.3879,
"step": 1814
},
{
"epoch": 2.844827586206897,
"grad_norm": 0.20579132599609237,
"learning_rate": 2.9036004645760748e-06,
"loss": 0.4049,
"step": 1815
},
{
"epoch": 2.846394984326019,
"grad_norm": 0.20931424380168814,
"learning_rate": 2.8745644599303136e-06,
"loss": 0.3962,
"step": 1816
},
{
"epoch": 2.847962382445141,
"grad_norm": 0.19065620200244152,
"learning_rate": 2.8455284552845528e-06,
"loss": 0.3607,
"step": 1817
},
{
"epoch": 2.8495297805642634,
"grad_norm": 0.20225167024977028,
"learning_rate": 2.8164924506387924e-06,
"loss": 0.3864,
"step": 1818
},
{
"epoch": 2.8510971786833856,
"grad_norm": 0.22096115503142363,
"learning_rate": 2.7874564459930316e-06,
"loss": 0.4005,
"step": 1819
},
{
"epoch": 2.852664576802508,
"grad_norm": 0.2005356042386704,
"learning_rate": 2.7584204413472704e-06,
"loss": 0.4079,
"step": 1820
},
{
"epoch": 2.85423197492163,
"grad_norm": 0.1885645048135854,
"learning_rate": 2.72938443670151e-06,
"loss": 0.3503,
"step": 1821
},
{
"epoch": 2.8557993730407523,
"grad_norm": 0.20752292054627078,
"learning_rate": 2.7003484320557493e-06,
"loss": 0.4104,
"step": 1822
},
{
"epoch": 2.8573667711598745,
"grad_norm": 0.2143259950768639,
"learning_rate": 2.6713124274099885e-06,
"loss": 0.4216,
"step": 1823
},
{
"epoch": 2.8589341692789967,
"grad_norm": 0.19842896211685657,
"learning_rate": 2.6422764227642278e-06,
"loss": 0.3704,
"step": 1824
},
{
"epoch": 2.860501567398119,
"grad_norm": 0.2155505782528911,
"learning_rate": 2.613240418118467e-06,
"loss": 0.4189,
"step": 1825
},
{
"epoch": 2.862068965517241,
"grad_norm": 0.19902209551145886,
"learning_rate": 2.584204413472706e-06,
"loss": 0.3937,
"step": 1826
},
{
"epoch": 2.8636363636363638,
"grad_norm": 0.19719214947975014,
"learning_rate": 2.555168408826946e-06,
"loss": 0.3578,
"step": 1827
},
{
"epoch": 2.865203761755486,
"grad_norm": 0.19747198892365053,
"learning_rate": 2.5261324041811846e-06,
"loss": 0.3821,
"step": 1828
},
{
"epoch": 2.866771159874608,
"grad_norm": 0.21482024799858845,
"learning_rate": 2.497096399535424e-06,
"loss": 0.4251,
"step": 1829
},
{
"epoch": 2.8683385579937304,
"grad_norm": 0.19785877712831804,
"learning_rate": 2.468060394889663e-06,
"loss": 0.3507,
"step": 1830
},
{
"epoch": 2.8699059561128526,
"grad_norm": 0.20486139054266103,
"learning_rate": 2.4390243902439027e-06,
"loss": 0.3971,
"step": 1831
},
{
"epoch": 2.871473354231975,
"grad_norm": 0.19085136908402417,
"learning_rate": 2.409988385598142e-06,
"loss": 0.3311,
"step": 1832
},
{
"epoch": 2.873040752351097,
"grad_norm": 0.22062265310334978,
"learning_rate": 2.3809523809523808e-06,
"loss": 0.4115,
"step": 1833
},
{
"epoch": 2.8746081504702197,
"grad_norm": 0.20166606888869645,
"learning_rate": 2.3519163763066204e-06,
"loss": 0.3699,
"step": 1834
},
{
"epoch": 2.876175548589342,
"grad_norm": 0.30385777696238614,
"learning_rate": 2.3228803716608596e-06,
"loss": 0.4581,
"step": 1835
},
{
"epoch": 2.877742946708464,
"grad_norm": 0.20336945964744443,
"learning_rate": 2.293844367015099e-06,
"loss": 0.391,
"step": 1836
},
{
"epoch": 2.8793103448275863,
"grad_norm": 0.20664782879939947,
"learning_rate": 2.264808362369338e-06,
"loss": 0.3835,
"step": 1837
},
{
"epoch": 2.8808777429467085,
"grad_norm": 0.20420945408754165,
"learning_rate": 2.2357723577235773e-06,
"loss": 0.4036,
"step": 1838
},
{
"epoch": 2.8824451410658307,
"grad_norm": 0.20735499147980596,
"learning_rate": 2.2067363530778165e-06,
"loss": 0.4109,
"step": 1839
},
{
"epoch": 2.884012539184953,
"grad_norm": 0.22409872240671644,
"learning_rate": 2.177700348432056e-06,
"loss": 0.4118,
"step": 1840
},
{
"epoch": 2.885579937304075,
"grad_norm": 0.21121824160450584,
"learning_rate": 2.148664343786295e-06,
"loss": 0.4292,
"step": 1841
},
{
"epoch": 2.8871473354231973,
"grad_norm": 0.20184449506962784,
"learning_rate": 2.119628339140534e-06,
"loss": 0.3779,
"step": 1842
},
{
"epoch": 2.8887147335423196,
"grad_norm": 0.21548054771209063,
"learning_rate": 2.090592334494774e-06,
"loss": 0.4003,
"step": 1843
},
{
"epoch": 2.8902821316614418,
"grad_norm": 0.20314931943941122,
"learning_rate": 2.061556329849013e-06,
"loss": 0.4265,
"step": 1844
},
{
"epoch": 2.8918495297805644,
"grad_norm": 0.20461604088332855,
"learning_rate": 2.0325203252032523e-06,
"loss": 0.3926,
"step": 1845
},
{
"epoch": 2.8934169278996866,
"grad_norm": 0.2055159207876563,
"learning_rate": 2.003484320557491e-06,
"loss": 0.3949,
"step": 1846
},
{
"epoch": 2.894984326018809,
"grad_norm": 0.21079414450775316,
"learning_rate": 1.9744483159117307e-06,
"loss": 0.4144,
"step": 1847
},
{
"epoch": 2.896551724137931,
"grad_norm": 0.19854439235277346,
"learning_rate": 1.94541231126597e-06,
"loss": 0.3716,
"step": 1848
},
{
"epoch": 2.8981191222570533,
"grad_norm": 0.2071566347582639,
"learning_rate": 1.916376306620209e-06,
"loss": 0.3677,
"step": 1849
},
{
"epoch": 2.8996865203761755,
"grad_norm": 0.1984484914425988,
"learning_rate": 1.8873403019744486e-06,
"loss": 0.3736,
"step": 1850
},
{
"epoch": 2.9012539184952977,
"grad_norm": 0.18491599113883173,
"learning_rate": 1.8583042973286876e-06,
"loss": 0.3275,
"step": 1851
},
{
"epoch": 2.9028213166144203,
"grad_norm": 0.2133162078015566,
"learning_rate": 1.8292682926829268e-06,
"loss": 0.4023,
"step": 1852
},
{
"epoch": 2.9043887147335425,
"grad_norm": 0.1846345669459825,
"learning_rate": 1.8002322880371663e-06,
"loss": 0.3225,
"step": 1853
},
{
"epoch": 2.9059561128526648,
"grad_norm": 0.19840845743687782,
"learning_rate": 1.7711962833914055e-06,
"loss": 0.3984,
"step": 1854
},
{
"epoch": 2.907523510971787,
"grad_norm": 0.1894330899822013,
"learning_rate": 1.7421602787456445e-06,
"loss": 0.3661,
"step": 1855
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.20288431449701422,
"learning_rate": 1.7131242740998842e-06,
"loss": 0.404,
"step": 1856
},
{
"epoch": 2.9106583072100314,
"grad_norm": 0.1857813416926803,
"learning_rate": 1.6840882694541232e-06,
"loss": 0.3218,
"step": 1857
},
{
"epoch": 2.9122257053291536,
"grad_norm": 0.19698045586439447,
"learning_rate": 1.6550522648083624e-06,
"loss": 0.3675,
"step": 1858
},
{
"epoch": 2.913793103448276,
"grad_norm": 0.2003773188397701,
"learning_rate": 1.6260162601626018e-06,
"loss": 0.402,
"step": 1859
},
{
"epoch": 2.915360501567398,
"grad_norm": 0.2298411177027167,
"learning_rate": 1.596980255516841e-06,
"loss": 0.5056,
"step": 1860
},
{
"epoch": 2.91692789968652,
"grad_norm": 0.1985266330239211,
"learning_rate": 1.5679442508710803e-06,
"loss": 0.3737,
"step": 1861
},
{
"epoch": 2.9184952978056424,
"grad_norm": 0.19567097639084194,
"learning_rate": 1.5389082462253195e-06,
"loss": 0.3633,
"step": 1862
},
{
"epoch": 2.9200626959247646,
"grad_norm": 0.19927527835383982,
"learning_rate": 1.5098722415795587e-06,
"loss": 0.3981,
"step": 1863
},
{
"epoch": 2.9216300940438873,
"grad_norm": 0.20897402287219788,
"learning_rate": 1.480836236933798e-06,
"loss": 0.4064,
"step": 1864
},
{
"epoch": 2.9231974921630095,
"grad_norm": 0.20264056591140836,
"learning_rate": 1.4518002322880374e-06,
"loss": 0.408,
"step": 1865
},
{
"epoch": 2.9247648902821317,
"grad_norm": 0.20494685221890308,
"learning_rate": 1.4227642276422764e-06,
"loss": 0.3661,
"step": 1866
},
{
"epoch": 2.926332288401254,
"grad_norm": 0.20124161508371585,
"learning_rate": 1.3937282229965158e-06,
"loss": 0.3691,
"step": 1867
},
{
"epoch": 2.927899686520376,
"grad_norm": 0.21420830329614984,
"learning_rate": 1.364692218350755e-06,
"loss": 0.4518,
"step": 1868
},
{
"epoch": 2.9294670846394983,
"grad_norm": 0.19985049791302342,
"learning_rate": 1.3356562137049943e-06,
"loss": 0.3783,
"step": 1869
},
{
"epoch": 2.9310344827586206,
"grad_norm": 0.19363458093524055,
"learning_rate": 1.3066202090592335e-06,
"loss": 0.3714,
"step": 1870
},
{
"epoch": 2.932601880877743,
"grad_norm": 0.19940111751604894,
"learning_rate": 1.277584204413473e-06,
"loss": 0.3946,
"step": 1871
},
{
"epoch": 2.9341692789968654,
"grad_norm": 0.20813739840363824,
"learning_rate": 1.248548199767712e-06,
"loss": 0.4278,
"step": 1872
},
{
"epoch": 2.9357366771159876,
"grad_norm": 0.1976614952538811,
"learning_rate": 1.2195121951219514e-06,
"loss": 0.3918,
"step": 1873
},
{
"epoch": 2.93730407523511,
"grad_norm": 0.1961210123305466,
"learning_rate": 1.1904761904761904e-06,
"loss": 0.3797,
"step": 1874
},
{
"epoch": 2.938871473354232,
"grad_norm": 0.20283260675121106,
"learning_rate": 1.1614401858304298e-06,
"loss": 0.3999,
"step": 1875
},
{
"epoch": 2.9404388714733543,
"grad_norm": 0.1956020418487346,
"learning_rate": 1.132404181184669e-06,
"loss": 0.3644,
"step": 1876
},
{
"epoch": 2.9420062695924765,
"grad_norm": 0.25730083515425867,
"learning_rate": 1.1033681765389083e-06,
"loss": 0.3635,
"step": 1877
},
{
"epoch": 2.9435736677115987,
"grad_norm": 0.19854458107404122,
"learning_rate": 1.0743321718931475e-06,
"loss": 0.3695,
"step": 1878
},
{
"epoch": 2.945141065830721,
"grad_norm": 0.1930495280939154,
"learning_rate": 1.045296167247387e-06,
"loss": 0.3858,
"step": 1879
},
{
"epoch": 2.946708463949843,
"grad_norm": 0.2005722465743604,
"learning_rate": 1.0162601626016261e-06,
"loss": 0.3756,
"step": 1880
},
{
"epoch": 2.9482758620689653,
"grad_norm": 0.19379947763595515,
"learning_rate": 9.872241579558654e-07,
"loss": 0.3746,
"step": 1881
},
{
"epoch": 2.9498432601880875,
"grad_norm": 0.191807851400299,
"learning_rate": 9.581881533101046e-07,
"loss": 0.3425,
"step": 1882
},
{
"epoch": 2.95141065830721,
"grad_norm": 0.21160783500449798,
"learning_rate": 9.291521486643438e-07,
"loss": 0.4282,
"step": 1883
},
{
"epoch": 2.9529780564263324,
"grad_norm": 0.2038808502070182,
"learning_rate": 9.001161440185831e-07,
"loss": 0.4127,
"step": 1884
},
{
"epoch": 2.9545454545454546,
"grad_norm": 0.20935048354928004,
"learning_rate": 8.710801393728223e-07,
"loss": 0.4048,
"step": 1885
},
{
"epoch": 2.956112852664577,
"grad_norm": 0.750704778511612,
"learning_rate": 8.420441347270616e-07,
"loss": 0.3836,
"step": 1886
},
{
"epoch": 2.957680250783699,
"grad_norm": 0.20110071837556542,
"learning_rate": 8.130081300813009e-07,
"loss": 0.371,
"step": 1887
},
{
"epoch": 2.959247648902821,
"grad_norm": 0.19618321946450118,
"learning_rate": 7.839721254355401e-07,
"loss": 0.3785,
"step": 1888
},
{
"epoch": 2.9608150470219434,
"grad_norm": 0.21084638489085192,
"learning_rate": 7.549361207897794e-07,
"loss": 0.4277,
"step": 1889
},
{
"epoch": 2.962382445141066,
"grad_norm": 0.31431000294740724,
"learning_rate": 7.259001161440187e-07,
"loss": 0.4224,
"step": 1890
},
{
"epoch": 2.9639498432601883,
"grad_norm": 0.19566033726981788,
"learning_rate": 6.968641114982579e-07,
"loss": 0.3887,
"step": 1891
},
{
"epoch": 2.9655172413793105,
"grad_norm": 0.18659512115217813,
"learning_rate": 6.678281068524971e-07,
"loss": 0.3579,
"step": 1892
},
{
"epoch": 2.9670846394984327,
"grad_norm": 0.19590170828199036,
"learning_rate": 6.387921022067365e-07,
"loss": 0.3676,
"step": 1893
},
{
"epoch": 2.968652037617555,
"grad_norm": 0.19264853131187518,
"learning_rate": 6.097560975609757e-07,
"loss": 0.4001,
"step": 1894
},
{
"epoch": 2.970219435736677,
"grad_norm": 0.21104911069921803,
"learning_rate": 5.807200929152149e-07,
"loss": 0.4481,
"step": 1895
},
{
"epoch": 2.9717868338557993,
"grad_norm": 0.1903212559399273,
"learning_rate": 5.516840882694541e-07,
"loss": 0.3623,
"step": 1896
},
{
"epoch": 2.9733542319749215,
"grad_norm": 0.20611735383425056,
"learning_rate": 5.226480836236935e-07,
"loss": 0.4004,
"step": 1897
},
{
"epoch": 2.9749216300940438,
"grad_norm": 0.19924235508793547,
"learning_rate": 4.936120789779327e-07,
"loss": 0.4043,
"step": 1898
},
{
"epoch": 2.976489028213166,
"grad_norm": 0.2033352758712455,
"learning_rate": 4.645760743321719e-07,
"loss": 0.4133,
"step": 1899
},
{
"epoch": 2.978056426332288,
"grad_norm": 0.20250832849206352,
"learning_rate": 4.3554006968641113e-07,
"loss": 0.3801,
"step": 1900
},
{
"epoch": 2.9796238244514104,
"grad_norm": 0.1917654496469268,
"learning_rate": 4.0650406504065046e-07,
"loss": 0.365,
"step": 1901
},
{
"epoch": 2.981191222570533,
"grad_norm": 0.20367695241858128,
"learning_rate": 3.774680603948897e-07,
"loss": 0.4294,
"step": 1902
},
{
"epoch": 2.9827586206896552,
"grad_norm": 0.19973202085650638,
"learning_rate": 3.4843205574912896e-07,
"loss": 0.388,
"step": 1903
},
{
"epoch": 2.9843260188087775,
"grad_norm": 0.2143680856915744,
"learning_rate": 3.1939605110336823e-07,
"loss": 0.4534,
"step": 1904
},
{
"epoch": 2.9858934169278997,
"grad_norm": 0.1875777246107858,
"learning_rate": 2.9036004645760745e-07,
"loss": 0.3513,
"step": 1905
},
{
"epoch": 2.987460815047022,
"grad_norm": 0.19521507452041267,
"learning_rate": 2.6132404181184673e-07,
"loss": 0.3788,
"step": 1906
},
{
"epoch": 2.989028213166144,
"grad_norm": 0.20215805229725287,
"learning_rate": 2.3228803716608595e-07,
"loss": 0.4046,
"step": 1907
},
{
"epoch": 2.9905956112852663,
"grad_norm": 0.194705376383754,
"learning_rate": 2.0325203252032523e-07,
"loss": 0.3824,
"step": 1908
},
{
"epoch": 2.992163009404389,
"grad_norm": 0.2157590107041691,
"learning_rate": 1.7421602787456448e-07,
"loss": 0.403,
"step": 1909
},
{
"epoch": 2.993730407523511,
"grad_norm": 0.19717238145755608,
"learning_rate": 1.4518002322880373e-07,
"loss": 0.3736,
"step": 1910
},
{
"epoch": 2.9952978056426334,
"grad_norm": 0.2043648416305194,
"learning_rate": 1.1614401858304298e-07,
"loss": 0.3944,
"step": 1911
},
{
"epoch": 2.9968652037617556,
"grad_norm": 0.19589493053214369,
"learning_rate": 8.710801393728224e-08,
"loss": 0.377,
"step": 1912
},
{
"epoch": 2.998432601880878,
"grad_norm": 0.1955973192107657,
"learning_rate": 5.807200929152149e-08,
"loss": 0.3777,
"step": 1913
},
{
"epoch": 3.0,
"grad_norm": 0.19783144889827486,
"learning_rate": 2.9036004645760744e-08,
"loss": 0.3452,
"step": 1914
},
{
"epoch": 3.0,
"step": 1914,
"total_flos": 2.1284944775642874e+19,
"train_loss": 0.6043848266733602,
"train_runtime": 65497.0395,
"train_samples_per_second": 0.467,
"train_steps_per_second": 0.029
}
],
"logging_steps": 1,
"max_steps": 1914,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1284944775642874e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}