trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1329,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007524454477050414,
"grad_norm": 270.3504638671875,
"learning_rate": 0.0,
"loss": 7.3906,
"step": 1
},
{
"epoch": 0.0015048908954100827,
"grad_norm": 218.03387451171875,
"learning_rate": 1.4925373134328358e-07,
"loss": 6.1914,
"step": 2
},
{
"epoch": 0.002257336343115124,
"grad_norm": 234.69158935546875,
"learning_rate": 2.9850746268656716e-07,
"loss": 6.3906,
"step": 3
},
{
"epoch": 0.0030097817908201654,
"grad_norm": 184.603759765625,
"learning_rate": 4.4776119402985074e-07,
"loss": 5.8496,
"step": 4
},
{
"epoch": 0.003762227238525207,
"grad_norm": 217.74620056152344,
"learning_rate": 5.970149253731343e-07,
"loss": 6.4297,
"step": 5
},
{
"epoch": 0.004514672686230248,
"grad_norm": 234.3373260498047,
"learning_rate": 7.462686567164179e-07,
"loss": 6.1973,
"step": 6
},
{
"epoch": 0.005267118133935289,
"grad_norm": 225.37820434570312,
"learning_rate": 8.955223880597015e-07,
"loss": 6.8613,
"step": 7
},
{
"epoch": 0.006019563581640331,
"grad_norm": 236.18878173828125,
"learning_rate": 1.044776119402985e-06,
"loss": 6.3555,
"step": 8
},
{
"epoch": 0.006772009029345372,
"grad_norm": 189.50917053222656,
"learning_rate": 1.1940298507462686e-06,
"loss": 5.1914,
"step": 9
},
{
"epoch": 0.007524454477050414,
"grad_norm": 183.52633666992188,
"learning_rate": 1.3432835820895524e-06,
"loss": 4.8828,
"step": 10
},
{
"epoch": 0.008276899924755455,
"grad_norm": 205.61083984375,
"learning_rate": 1.4925373134328358e-06,
"loss": 6.0234,
"step": 11
},
{
"epoch": 0.009029345372460496,
"grad_norm": 153.64642333984375,
"learning_rate": 1.6417910447761196e-06,
"loss": 5.627,
"step": 12
},
{
"epoch": 0.009781790820165538,
"grad_norm": 153.3954315185547,
"learning_rate": 1.791044776119403e-06,
"loss": 4.7754,
"step": 13
},
{
"epoch": 0.010534236267870579,
"grad_norm": 173.182373046875,
"learning_rate": 1.9402985074626867e-06,
"loss": 5.5039,
"step": 14
},
{
"epoch": 0.011286681715575621,
"grad_norm": 159.0615692138672,
"learning_rate": 2.08955223880597e-06,
"loss": 4.877,
"step": 15
},
{
"epoch": 0.012039127163280662,
"grad_norm": 131.60826110839844,
"learning_rate": 2.238805970149254e-06,
"loss": 4.5352,
"step": 16
},
{
"epoch": 0.012791572610985704,
"grad_norm": 75.3213119506836,
"learning_rate": 2.3880597014925373e-06,
"loss": 4.6309,
"step": 17
},
{
"epoch": 0.013544018058690745,
"grad_norm": 55.645530700683594,
"learning_rate": 2.537313432835821e-06,
"loss": 4.0176,
"step": 18
},
{
"epoch": 0.014296463506395787,
"grad_norm": 33.673458099365234,
"learning_rate": 2.686567164179105e-06,
"loss": 3.9746,
"step": 19
},
{
"epoch": 0.015048908954100828,
"grad_norm": 27.180540084838867,
"learning_rate": 2.835820895522388e-06,
"loss": 4.4297,
"step": 20
},
{
"epoch": 0.01580135440180587,
"grad_norm": 21.726226806640625,
"learning_rate": 2.9850746268656716e-06,
"loss": 3.875,
"step": 21
},
{
"epoch": 0.01655379984951091,
"grad_norm": 110.95691680908203,
"learning_rate": 3.1343283582089558e-06,
"loss": 4.4336,
"step": 22
},
{
"epoch": 0.01730624529721595,
"grad_norm": 231.21478271484375,
"learning_rate": 3.283582089552239e-06,
"loss": 5.5293,
"step": 23
},
{
"epoch": 0.01805869074492099,
"grad_norm": 290.27117919921875,
"learning_rate": 3.4328358208955225e-06,
"loss": 6.1211,
"step": 24
},
{
"epoch": 0.018811136192626036,
"grad_norm": 259.8589172363281,
"learning_rate": 3.582089552238806e-06,
"loss": 6.1562,
"step": 25
},
{
"epoch": 0.019563581640331076,
"grad_norm": 284.6676940917969,
"learning_rate": 3.73134328358209e-06,
"loss": 6.0078,
"step": 26
},
{
"epoch": 0.020316027088036117,
"grad_norm": 307.04949951171875,
"learning_rate": 3.8805970149253735e-06,
"loss": 6.2539,
"step": 27
},
{
"epoch": 0.021068472535741158,
"grad_norm": 238.06138610839844,
"learning_rate": 4.029850746268657e-06,
"loss": 5.5859,
"step": 28
},
{
"epoch": 0.0218209179834462,
"grad_norm": 218.7796630859375,
"learning_rate": 4.17910447761194e-06,
"loss": 5.3477,
"step": 29
},
{
"epoch": 0.022573363431151242,
"grad_norm": 225.3642578125,
"learning_rate": 4.3283582089552236e-06,
"loss": 5.0352,
"step": 30
},
{
"epoch": 0.023325808878856283,
"grad_norm": 187.6018524169922,
"learning_rate": 4.477611940298508e-06,
"loss": 4.7285,
"step": 31
},
{
"epoch": 0.024078254326561323,
"grad_norm": 113.01893615722656,
"learning_rate": 4.626865671641791e-06,
"loss": 4.0684,
"step": 32
},
{
"epoch": 0.024830699774266364,
"grad_norm": 69.8572769165039,
"learning_rate": 4.7761194029850745e-06,
"loss": 4.2344,
"step": 33
},
{
"epoch": 0.025583145221971408,
"grad_norm": 33.99457931518555,
"learning_rate": 4.925373134328359e-06,
"loss": 3.9727,
"step": 34
},
{
"epoch": 0.02633559066967645,
"grad_norm": 18.248146057128906,
"learning_rate": 5.074626865671642e-06,
"loss": 3.8906,
"step": 35
},
{
"epoch": 0.02708803611738149,
"grad_norm": 83.59455108642578,
"learning_rate": 5.2238805970149255e-06,
"loss": 4.1074,
"step": 36
},
{
"epoch": 0.02784048156508653,
"grad_norm": 45.39522171020508,
"learning_rate": 5.37313432835821e-06,
"loss": 3.959,
"step": 37
},
{
"epoch": 0.028592927012791574,
"grad_norm": 75.98773956298828,
"learning_rate": 5.522388059701493e-06,
"loss": 4.2012,
"step": 38
},
{
"epoch": 0.029345372460496615,
"grad_norm": 78.67504119873047,
"learning_rate": 5.671641791044776e-06,
"loss": 3.8828,
"step": 39
},
{
"epoch": 0.030097817908201655,
"grad_norm": 73.8619613647461,
"learning_rate": 5.820895522388061e-06,
"loss": 3.8926,
"step": 40
},
{
"epoch": 0.030850263355906696,
"grad_norm": 58.589107513427734,
"learning_rate": 5.970149253731343e-06,
"loss": 3.7617,
"step": 41
},
{
"epoch": 0.03160270880361174,
"grad_norm": 49.83146286010742,
"learning_rate": 6.119402985074627e-06,
"loss": 3.8926,
"step": 42
},
{
"epoch": 0.03235515425131678,
"grad_norm": 26.51479721069336,
"learning_rate": 6.2686567164179116e-06,
"loss": 3.3848,
"step": 43
},
{
"epoch": 0.03310759969902182,
"grad_norm": 17.305334091186523,
"learning_rate": 6.417910447761194e-06,
"loss": 3.291,
"step": 44
},
{
"epoch": 0.033860045146726865,
"grad_norm": 22.08051872253418,
"learning_rate": 6.567164179104478e-06,
"loss": 3.4844,
"step": 45
},
{
"epoch": 0.0346124905944319,
"grad_norm": 43.44215393066406,
"learning_rate": 6.7164179104477625e-06,
"loss": 3.3047,
"step": 46
},
{
"epoch": 0.035364936042136946,
"grad_norm": 39.65483474731445,
"learning_rate": 6.865671641791045e-06,
"loss": 3.8867,
"step": 47
},
{
"epoch": 0.03611738148984198,
"grad_norm": 37.06460952758789,
"learning_rate": 7.014925373134329e-06,
"loss": 3.6523,
"step": 48
},
{
"epoch": 0.03686982693754703,
"grad_norm": 29.9962158203125,
"learning_rate": 7.164179104477612e-06,
"loss": 3.2207,
"step": 49
},
{
"epoch": 0.03762227238525207,
"grad_norm": 32.21905517578125,
"learning_rate": 7.313432835820896e-06,
"loss": 3.791,
"step": 50
},
{
"epoch": 0.03837471783295711,
"grad_norm": 26.621665954589844,
"learning_rate": 7.46268656716418e-06,
"loss": 3.2559,
"step": 51
},
{
"epoch": 0.03912716328066215,
"grad_norm": 26.82464599609375,
"learning_rate": 7.611940298507463e-06,
"loss": 3.832,
"step": 52
},
{
"epoch": 0.0398796087283672,
"grad_norm": 27.126306533813477,
"learning_rate": 7.761194029850747e-06,
"loss": 3.541,
"step": 53
},
{
"epoch": 0.040632054176072234,
"grad_norm": 29.40413475036621,
"learning_rate": 7.91044776119403e-06,
"loss": 3.666,
"step": 54
},
{
"epoch": 0.04138449962377728,
"grad_norm": 24.618732452392578,
"learning_rate": 8.059701492537314e-06,
"loss": 3.1562,
"step": 55
},
{
"epoch": 0.042136945071482315,
"grad_norm": 25.24435806274414,
"learning_rate": 8.208955223880599e-06,
"loss": 2.9307,
"step": 56
},
{
"epoch": 0.04288939051918736,
"grad_norm": 21.84393310546875,
"learning_rate": 8.35820895522388e-06,
"loss": 4.084,
"step": 57
},
{
"epoch": 0.0436418359668924,
"grad_norm": 20.979150772094727,
"learning_rate": 8.507462686567165e-06,
"loss": 3.1816,
"step": 58
},
{
"epoch": 0.04439428141459744,
"grad_norm": 23.046876907348633,
"learning_rate": 8.656716417910447e-06,
"loss": 3.8184,
"step": 59
},
{
"epoch": 0.045146726862302484,
"grad_norm": 27.49558448791504,
"learning_rate": 8.805970149253732e-06,
"loss": 3.4219,
"step": 60
},
{
"epoch": 0.04589917231000752,
"grad_norm": 21.256746292114258,
"learning_rate": 8.955223880597016e-06,
"loss": 3.0449,
"step": 61
},
{
"epoch": 0.046651617757712566,
"grad_norm": 25.865859985351562,
"learning_rate": 9.104477611940299e-06,
"loss": 3.2988,
"step": 62
},
{
"epoch": 0.04740406320541761,
"grad_norm": 45.46930694580078,
"learning_rate": 9.253731343283582e-06,
"loss": 2.583,
"step": 63
},
{
"epoch": 0.04815650865312265,
"grad_norm": 41.94594955444336,
"learning_rate": 9.402985074626867e-06,
"loss": 3.5938,
"step": 64
},
{
"epoch": 0.04890895410082769,
"grad_norm": 32.41459274291992,
"learning_rate": 9.552238805970149e-06,
"loss": 3.373,
"step": 65
},
{
"epoch": 0.04966139954853273,
"grad_norm": 31.194244384765625,
"learning_rate": 9.701492537313434e-06,
"loss": 3.377,
"step": 66
},
{
"epoch": 0.05041384499623777,
"grad_norm": 24.94992446899414,
"learning_rate": 9.850746268656717e-06,
"loss": 3.3174,
"step": 67
},
{
"epoch": 0.051166290443942816,
"grad_norm": 31.37045669555664,
"learning_rate": 1e-05,
"loss": 3.0703,
"step": 68
},
{
"epoch": 0.05191873589164785,
"grad_norm": 38.56293869018555,
"learning_rate": 9.992076069730588e-06,
"loss": 3.4922,
"step": 69
},
{
"epoch": 0.0526711813393529,
"grad_norm": 21.66668701171875,
"learning_rate": 9.984152139461173e-06,
"loss": 2.7578,
"step": 70
},
{
"epoch": 0.05342362678705794,
"grad_norm": 26.6785831451416,
"learning_rate": 9.97622820919176e-06,
"loss": 3.0439,
"step": 71
},
{
"epoch": 0.05417607223476298,
"grad_norm": 66.90357208251953,
"learning_rate": 9.968304278922346e-06,
"loss": 3.4922,
"step": 72
},
{
"epoch": 0.05492851768246802,
"grad_norm": 17.229740142822266,
"learning_rate": 9.960380348652933e-06,
"loss": 3.1445,
"step": 73
},
{
"epoch": 0.05568096313017306,
"grad_norm": 34.178218841552734,
"learning_rate": 9.95245641838352e-06,
"loss": 2.6406,
"step": 74
},
{
"epoch": 0.056433408577878104,
"grad_norm": 21.619211196899414,
"learning_rate": 9.944532488114107e-06,
"loss": 3.0439,
"step": 75
},
{
"epoch": 0.05718585402558315,
"grad_norm": 35.69949722290039,
"learning_rate": 9.936608557844692e-06,
"loss": 3.2402,
"step": 76
},
{
"epoch": 0.057938299473288185,
"grad_norm": 68.4375,
"learning_rate": 9.928684627575277e-06,
"loss": 3.5332,
"step": 77
},
{
"epoch": 0.05869074492099323,
"grad_norm": 22.187849044799805,
"learning_rate": 9.920760697305864e-06,
"loss": 2.9551,
"step": 78
},
{
"epoch": 0.059443190368698266,
"grad_norm": 22.072538375854492,
"learning_rate": 9.912836767036451e-06,
"loss": 2.7285,
"step": 79
},
{
"epoch": 0.06019563581640331,
"grad_norm": 32.30579376220703,
"learning_rate": 9.904912836767039e-06,
"loss": 2.8945,
"step": 80
},
{
"epoch": 0.060948081264108354,
"grad_norm": 24.991010665893555,
"learning_rate": 9.896988906497624e-06,
"loss": 3.1191,
"step": 81
},
{
"epoch": 0.06170052671181339,
"grad_norm": 52.45732498168945,
"learning_rate": 9.88906497622821e-06,
"loss": 3.5137,
"step": 82
},
{
"epoch": 0.062452972159518436,
"grad_norm": 26.008697509765625,
"learning_rate": 9.881141045958796e-06,
"loss": 3.2324,
"step": 83
},
{
"epoch": 0.06320541760722348,
"grad_norm": 38.75912094116211,
"learning_rate": 9.873217115689383e-06,
"loss": 3.2129,
"step": 84
},
{
"epoch": 0.06395786305492852,
"grad_norm": 31.209091186523438,
"learning_rate": 9.86529318541997e-06,
"loss": 3.1328,
"step": 85
},
{
"epoch": 0.06471030850263355,
"grad_norm": 34.91722106933594,
"learning_rate": 9.857369255150556e-06,
"loss": 3.0449,
"step": 86
},
{
"epoch": 0.0654627539503386,
"grad_norm": 26.631141662597656,
"learning_rate": 9.849445324881141e-06,
"loss": 3.332,
"step": 87
},
{
"epoch": 0.06621519939804364,
"grad_norm": 18.19337272644043,
"learning_rate": 9.841521394611728e-06,
"loss": 3.0322,
"step": 88
},
{
"epoch": 0.06696764484574869,
"grad_norm": 55.028099060058594,
"learning_rate": 9.833597464342315e-06,
"loss": 3.4453,
"step": 89
},
{
"epoch": 0.06772009029345373,
"grad_norm": 35.84956741333008,
"learning_rate": 9.825673534072902e-06,
"loss": 2.8672,
"step": 90
},
{
"epoch": 0.06847253574115876,
"grad_norm": 29.46920394897461,
"learning_rate": 9.817749603803487e-06,
"loss": 2.6416,
"step": 91
},
{
"epoch": 0.0692249811888638,
"grad_norm": 42.81882095336914,
"learning_rate": 9.809825673534073e-06,
"loss": 2.8066,
"step": 92
},
{
"epoch": 0.06997742663656885,
"grad_norm": 20.653282165527344,
"learning_rate": 9.80190174326466e-06,
"loss": 3.0195,
"step": 93
},
{
"epoch": 0.07072987208427389,
"grad_norm": 37.75685501098633,
"learning_rate": 9.793977812995247e-06,
"loss": 2.8027,
"step": 94
},
{
"epoch": 0.07148231753197894,
"grad_norm": 30.57415199279785,
"learning_rate": 9.786053882725834e-06,
"loss": 3.6602,
"step": 95
},
{
"epoch": 0.07223476297968397,
"grad_norm": 31.511478424072266,
"learning_rate": 9.77812995245642e-06,
"loss": 3.248,
"step": 96
},
{
"epoch": 0.07298720842738901,
"grad_norm": 53.98818588256836,
"learning_rate": 9.770206022187005e-06,
"loss": 3.4531,
"step": 97
},
{
"epoch": 0.07373965387509406,
"grad_norm": 22.481534957885742,
"learning_rate": 9.762282091917592e-06,
"loss": 2.6758,
"step": 98
},
{
"epoch": 0.0744920993227991,
"grad_norm": 22.52367401123047,
"learning_rate": 9.754358161648179e-06,
"loss": 3.2891,
"step": 99
},
{
"epoch": 0.07524454477050414,
"grad_norm": 24.657718658447266,
"learning_rate": 9.746434231378766e-06,
"loss": 2.7432,
"step": 100
},
{
"epoch": 0.07599699021820917,
"grad_norm": 27.113811492919922,
"learning_rate": 9.738510301109351e-06,
"loss": 3.5898,
"step": 101
},
{
"epoch": 0.07674943566591422,
"grad_norm": 31.955333709716797,
"learning_rate": 9.730586370839936e-06,
"loss": 2.7695,
"step": 102
},
{
"epoch": 0.07750188111361926,
"grad_norm": 32.23259735107422,
"learning_rate": 9.722662440570524e-06,
"loss": 3.043,
"step": 103
},
{
"epoch": 0.0782543265613243,
"grad_norm": 31.208330154418945,
"learning_rate": 9.71473851030111e-06,
"loss": 2.8613,
"step": 104
},
{
"epoch": 0.07900677200902935,
"grad_norm": 18.11272621154785,
"learning_rate": 9.706814580031696e-06,
"loss": 2.8672,
"step": 105
},
{
"epoch": 0.0797592174567344,
"grad_norm": 32.13460159301758,
"learning_rate": 9.698890649762283e-06,
"loss": 3.2012,
"step": 106
},
{
"epoch": 0.08051166290443942,
"grad_norm": 26.03815269470215,
"learning_rate": 9.69096671949287e-06,
"loss": 2.8633,
"step": 107
},
{
"epoch": 0.08126410835214447,
"grad_norm": 30.48563575744629,
"learning_rate": 9.683042789223455e-06,
"loss": 3.1289,
"step": 108
},
{
"epoch": 0.08201655379984951,
"grad_norm": 33.55179977416992,
"learning_rate": 9.675118858954042e-06,
"loss": 3.002,
"step": 109
},
{
"epoch": 0.08276899924755456,
"grad_norm": 37.811912536621094,
"learning_rate": 9.667194928684628e-06,
"loss": 2.6934,
"step": 110
},
{
"epoch": 0.0835214446952596,
"grad_norm": 24.619897842407227,
"learning_rate": 9.659270998415215e-06,
"loss": 3.1836,
"step": 111
},
{
"epoch": 0.08427389014296463,
"grad_norm": 54.72816848754883,
"learning_rate": 9.651347068145802e-06,
"loss": 3.5566,
"step": 112
},
{
"epoch": 0.08502633559066967,
"grad_norm": 36.67848205566406,
"learning_rate": 9.643423137876387e-06,
"loss": 2.627,
"step": 113
},
{
"epoch": 0.08577878103837472,
"grad_norm": 22.27309799194336,
"learning_rate": 9.635499207606974e-06,
"loss": 3.2832,
"step": 114
},
{
"epoch": 0.08653122648607976,
"grad_norm": 30.337501525878906,
"learning_rate": 9.62757527733756e-06,
"loss": 3.2109,
"step": 115
},
{
"epoch": 0.0872836719337848,
"grad_norm": 34.67364501953125,
"learning_rate": 9.619651347068147e-06,
"loss": 2.9326,
"step": 116
},
{
"epoch": 0.08803611738148984,
"grad_norm": 19.17691421508789,
"learning_rate": 9.611727416798734e-06,
"loss": 3.1348,
"step": 117
},
{
"epoch": 0.08878856282919488,
"grad_norm": 29.645797729492188,
"learning_rate": 9.603803486529319e-06,
"loss": 3.3652,
"step": 118
},
{
"epoch": 0.08954100827689992,
"grad_norm": 26.387907028198242,
"learning_rate": 9.595879556259906e-06,
"loss": 3.2578,
"step": 119
},
{
"epoch": 0.09029345372460497,
"grad_norm": 20.296672821044922,
"learning_rate": 9.587955625990491e-06,
"loss": 3.0078,
"step": 120
},
{
"epoch": 0.09104589917231001,
"grad_norm": 18.01280975341797,
"learning_rate": 9.580031695721078e-06,
"loss": 3.5566,
"step": 121
},
{
"epoch": 0.09179834462001504,
"grad_norm": 39.96554183959961,
"learning_rate": 9.572107765451665e-06,
"loss": 3.1553,
"step": 122
},
{
"epoch": 0.09255079006772009,
"grad_norm": 24.43425941467285,
"learning_rate": 9.56418383518225e-06,
"loss": 3.459,
"step": 123
},
{
"epoch": 0.09330323551542513,
"grad_norm": 47.12384033203125,
"learning_rate": 9.556259904912838e-06,
"loss": 3.0674,
"step": 124
},
{
"epoch": 0.09405568096313018,
"grad_norm": 34.72853469848633,
"learning_rate": 9.548335974643423e-06,
"loss": 3.0586,
"step": 125
},
{
"epoch": 0.09480812641083522,
"grad_norm": 23.229869842529297,
"learning_rate": 9.54041204437401e-06,
"loss": 2.8223,
"step": 126
},
{
"epoch": 0.09556057185854025,
"grad_norm": 20.17858123779297,
"learning_rate": 9.532488114104597e-06,
"loss": 2.8359,
"step": 127
},
{
"epoch": 0.0963130173062453,
"grad_norm": 33.083961486816406,
"learning_rate": 9.524564183835183e-06,
"loss": 3.0605,
"step": 128
},
{
"epoch": 0.09706546275395034,
"grad_norm": 50.06746292114258,
"learning_rate": 9.51664025356577e-06,
"loss": 2.5381,
"step": 129
},
{
"epoch": 0.09781790820165538,
"grad_norm": 22.124317169189453,
"learning_rate": 9.508716323296355e-06,
"loss": 2.9277,
"step": 130
},
{
"epoch": 0.09857035364936043,
"grad_norm": 62.62016677856445,
"learning_rate": 9.500792393026942e-06,
"loss": 2.6914,
"step": 131
},
{
"epoch": 0.09932279909706546,
"grad_norm": 49.8092155456543,
"learning_rate": 9.492868462757529e-06,
"loss": 3.1816,
"step": 132
},
{
"epoch": 0.1000752445447705,
"grad_norm": 26.982786178588867,
"learning_rate": 9.484944532488114e-06,
"loss": 2.7666,
"step": 133
},
{
"epoch": 0.10082768999247554,
"grad_norm": 26.919538497924805,
"learning_rate": 9.477020602218701e-06,
"loss": 2.6143,
"step": 134
},
{
"epoch": 0.10158013544018059,
"grad_norm": 32.252845764160156,
"learning_rate": 9.469096671949287e-06,
"loss": 2.8164,
"step": 135
},
{
"epoch": 0.10233258088788563,
"grad_norm": 48.72047424316406,
"learning_rate": 9.461172741679874e-06,
"loss": 3.2793,
"step": 136
},
{
"epoch": 0.10308502633559068,
"grad_norm": 70.15787506103516,
"learning_rate": 9.45324881141046e-06,
"loss": 2.9873,
"step": 137
},
{
"epoch": 0.1038374717832957,
"grad_norm": 88.84703826904297,
"learning_rate": 9.445324881141046e-06,
"loss": 3.2109,
"step": 138
},
{
"epoch": 0.10458991723100075,
"grad_norm": 74.57728576660156,
"learning_rate": 9.437400950871633e-06,
"loss": 3.5938,
"step": 139
},
{
"epoch": 0.1053423626787058,
"grad_norm": 31.566608428955078,
"learning_rate": 9.429477020602219e-06,
"loss": 3.2354,
"step": 140
},
{
"epoch": 0.10609480812641084,
"grad_norm": 21.33376121520996,
"learning_rate": 9.421553090332806e-06,
"loss": 2.5771,
"step": 141
},
{
"epoch": 0.10684725357411588,
"grad_norm": 35.084815979003906,
"learning_rate": 9.413629160063393e-06,
"loss": 2.8975,
"step": 142
},
{
"epoch": 0.10759969902182091,
"grad_norm": 55.037899017333984,
"learning_rate": 9.405705229793978e-06,
"loss": 3.3027,
"step": 143
},
{
"epoch": 0.10835214446952596,
"grad_norm": 89.90687561035156,
"learning_rate": 9.397781299524565e-06,
"loss": 3.0615,
"step": 144
},
{
"epoch": 0.109104589917231,
"grad_norm": 45.729373931884766,
"learning_rate": 9.38985736925515e-06,
"loss": 3.0762,
"step": 145
},
{
"epoch": 0.10985703536493605,
"grad_norm": 29.83327865600586,
"learning_rate": 9.381933438985737e-06,
"loss": 2.8154,
"step": 146
},
{
"epoch": 0.11060948081264109,
"grad_norm": 26.802101135253906,
"learning_rate": 9.374009508716324e-06,
"loss": 3.0469,
"step": 147
},
{
"epoch": 0.11136192626034612,
"grad_norm": 49.10624313354492,
"learning_rate": 9.366085578446912e-06,
"loss": 3.0547,
"step": 148
},
{
"epoch": 0.11211437170805116,
"grad_norm": 73.40638732910156,
"learning_rate": 9.358161648177497e-06,
"loss": 3.8574,
"step": 149
},
{
"epoch": 0.11286681715575621,
"grad_norm": 18.41849136352539,
"learning_rate": 9.350237717908082e-06,
"loss": 2.5781,
"step": 150
},
{
"epoch": 0.11361926260346125,
"grad_norm": 36.31776428222656,
"learning_rate": 9.34231378763867e-06,
"loss": 2.9971,
"step": 151
},
{
"epoch": 0.1143717080511663,
"grad_norm": 25.525556564331055,
"learning_rate": 9.334389857369256e-06,
"loss": 3.2266,
"step": 152
},
{
"epoch": 0.11512415349887133,
"grad_norm": 29.12396812438965,
"learning_rate": 9.326465927099843e-06,
"loss": 3.3887,
"step": 153
},
{
"epoch": 0.11587659894657637,
"grad_norm": 21.789138793945312,
"learning_rate": 9.318541996830429e-06,
"loss": 2.8047,
"step": 154
},
{
"epoch": 0.11662904439428141,
"grad_norm": 26.248098373413086,
"learning_rate": 9.310618066561014e-06,
"loss": 3.2852,
"step": 155
},
{
"epoch": 0.11738148984198646,
"grad_norm": 18.92224884033203,
"learning_rate": 9.302694136291601e-06,
"loss": 2.667,
"step": 156
},
{
"epoch": 0.1181339352896915,
"grad_norm": 32.231597900390625,
"learning_rate": 9.294770206022188e-06,
"loss": 2.9531,
"step": 157
},
{
"epoch": 0.11888638073739653,
"grad_norm": 33.85251235961914,
"learning_rate": 9.286846275752775e-06,
"loss": 3.0371,
"step": 158
},
{
"epoch": 0.11963882618510158,
"grad_norm": 21.563522338867188,
"learning_rate": 9.27892234548336e-06,
"loss": 2.8262,
"step": 159
},
{
"epoch": 0.12039127163280662,
"grad_norm": 18.7381649017334,
"learning_rate": 9.270998415213946e-06,
"loss": 2.9717,
"step": 160
},
{
"epoch": 0.12114371708051166,
"grad_norm": 22.355424880981445,
"learning_rate": 9.263074484944533e-06,
"loss": 2.8105,
"step": 161
},
{
"epoch": 0.12189616252821671,
"grad_norm": 41.8394660949707,
"learning_rate": 9.25515055467512e-06,
"loss": 2.9395,
"step": 162
},
{
"epoch": 0.12264860797592174,
"grad_norm": 31.401140213012695,
"learning_rate": 9.247226624405707e-06,
"loss": 3.1904,
"step": 163
},
{
"epoch": 0.12340105342362678,
"grad_norm": 29.943819046020508,
"learning_rate": 9.239302694136292e-06,
"loss": 2.7578,
"step": 164
},
{
"epoch": 0.12415349887133183,
"grad_norm": 26.2047119140625,
"learning_rate": 9.231378763866878e-06,
"loss": 3.0879,
"step": 165
},
{
"epoch": 0.12490594431903687,
"grad_norm": 24.09654998779297,
"learning_rate": 9.223454833597465e-06,
"loss": 3.0625,
"step": 166
},
{
"epoch": 0.1256583897667419,
"grad_norm": 20.292509078979492,
"learning_rate": 9.215530903328052e-06,
"loss": 2.6992,
"step": 167
},
{
"epoch": 0.12641083521444696,
"grad_norm": 24.645313262939453,
"learning_rate": 9.207606973058639e-06,
"loss": 2.8486,
"step": 168
},
{
"epoch": 0.127163280662152,
"grad_norm": 47.42299270629883,
"learning_rate": 9.199683042789224e-06,
"loss": 3.0645,
"step": 169
},
{
"epoch": 0.12791572610985705,
"grad_norm": 31.85733413696289,
"learning_rate": 9.19175911251981e-06,
"loss": 3.2109,
"step": 170
},
{
"epoch": 0.12866817155756208,
"grad_norm": 24.819351196289062,
"learning_rate": 9.183835182250396e-06,
"loss": 2.9268,
"step": 171
},
{
"epoch": 0.1294206170052671,
"grad_norm": 32.172393798828125,
"learning_rate": 9.175911251980984e-06,
"loss": 2.7207,
"step": 172
},
{
"epoch": 0.13017306245297217,
"grad_norm": 28.043930053710938,
"learning_rate": 9.16798732171157e-06,
"loss": 2.9805,
"step": 173
},
{
"epoch": 0.1309255079006772,
"grad_norm": 85.42208862304688,
"learning_rate": 9.160063391442156e-06,
"loss": 3.0645,
"step": 174
},
{
"epoch": 0.13167795334838225,
"grad_norm": 45.047210693359375,
"learning_rate": 9.152139461172741e-06,
"loss": 2.8105,
"step": 175
},
{
"epoch": 0.13243039879608728,
"grad_norm": 24.070430755615234,
"learning_rate": 9.144215530903328e-06,
"loss": 3.1055,
"step": 176
},
{
"epoch": 0.13318284424379231,
"grad_norm": 21.807907104492188,
"learning_rate": 9.136291600633915e-06,
"loss": 3.1523,
"step": 177
},
{
"epoch": 0.13393528969149737,
"grad_norm": 28.564992904663086,
"learning_rate": 9.128367670364502e-06,
"loss": 2.9219,
"step": 178
},
{
"epoch": 0.1346877351392024,
"grad_norm": 21.0492000579834,
"learning_rate": 9.120443740095088e-06,
"loss": 3.4619,
"step": 179
},
{
"epoch": 0.13544018058690746,
"grad_norm": 60.33738708496094,
"learning_rate": 9.112519809825675e-06,
"loss": 2.9395,
"step": 180
},
{
"epoch": 0.1361926260346125,
"grad_norm": 17.952964782714844,
"learning_rate": 9.10459587955626e-06,
"loss": 2.4043,
"step": 181
},
{
"epoch": 0.13694507148231752,
"grad_norm": 35.40900802612305,
"learning_rate": 9.096671949286847e-06,
"loss": 2.709,
"step": 182
},
{
"epoch": 0.13769751693002258,
"grad_norm": 18.852981567382812,
"learning_rate": 9.088748019017434e-06,
"loss": 2.8086,
"step": 183
},
{
"epoch": 0.1384499623777276,
"grad_norm": 29.301170349121094,
"learning_rate": 9.08082408874802e-06,
"loss": 3.6641,
"step": 184
},
{
"epoch": 0.13920240782543267,
"grad_norm": 46.67580032348633,
"learning_rate": 9.072900158478607e-06,
"loss": 2.8477,
"step": 185
},
{
"epoch": 0.1399548532731377,
"grad_norm": 28.98253631591797,
"learning_rate": 9.064976228209192e-06,
"loss": 3.0342,
"step": 186
},
{
"epoch": 0.14070729872084273,
"grad_norm": 25.457124710083008,
"learning_rate": 9.057052297939779e-06,
"loss": 2.6748,
"step": 187
},
{
"epoch": 0.14145974416854779,
"grad_norm": 26.476932525634766,
"learning_rate": 9.049128367670366e-06,
"loss": 3.0654,
"step": 188
},
{
"epoch": 0.14221218961625282,
"grad_norm": 22.27900505065918,
"learning_rate": 9.041204437400951e-06,
"loss": 3.1953,
"step": 189
},
{
"epoch": 0.14296463506395787,
"grad_norm": 38.6421012878418,
"learning_rate": 9.033280507131538e-06,
"loss": 3.3906,
"step": 190
},
{
"epoch": 0.1437170805116629,
"grad_norm": 46.56447982788086,
"learning_rate": 9.025356576862124e-06,
"loss": 2.8262,
"step": 191
},
{
"epoch": 0.14446952595936793,
"grad_norm": 27.99505043029785,
"learning_rate": 9.01743264659271e-06,
"loss": 2.7803,
"step": 192
},
{
"epoch": 0.145221971407073,
"grad_norm": 20.122390747070312,
"learning_rate": 9.009508716323298e-06,
"loss": 2.1504,
"step": 193
},
{
"epoch": 0.14597441685477802,
"grad_norm": 34.87712478637695,
"learning_rate": 9.001584786053883e-06,
"loss": 2.6416,
"step": 194
},
{
"epoch": 0.14672686230248308,
"grad_norm": 31.978273391723633,
"learning_rate": 8.99366085578447e-06,
"loss": 3.1445,
"step": 195
},
{
"epoch": 0.1474793077501881,
"grad_norm": 25.95554542541504,
"learning_rate": 8.985736925515056e-06,
"loss": 2.918,
"step": 196
},
{
"epoch": 0.14823175319789314,
"grad_norm": 63.560035705566406,
"learning_rate": 8.977812995245643e-06,
"loss": 2.5674,
"step": 197
},
{
"epoch": 0.1489841986455982,
"grad_norm": 50.40849304199219,
"learning_rate": 8.96988906497623e-06,
"loss": 3.4824,
"step": 198
},
{
"epoch": 0.14973664409330323,
"grad_norm": 36.795013427734375,
"learning_rate": 8.961965134706815e-06,
"loss": 3.3955,
"step": 199
},
{
"epoch": 0.1504890895410083,
"grad_norm": 23.839466094970703,
"learning_rate": 8.954041204437402e-06,
"loss": 2.9502,
"step": 200
},
{
"epoch": 0.15124153498871332,
"grad_norm": 33.121131896972656,
"learning_rate": 8.946117274167987e-06,
"loss": 2.749,
"step": 201
},
{
"epoch": 0.15199398043641835,
"grad_norm": 16.309022903442383,
"learning_rate": 8.938193343898574e-06,
"loss": 2.5889,
"step": 202
},
{
"epoch": 0.1527464258841234,
"grad_norm": 23.139263153076172,
"learning_rate": 8.930269413629161e-06,
"loss": 2.9033,
"step": 203
},
{
"epoch": 0.15349887133182843,
"grad_norm": 26.247356414794922,
"learning_rate": 8.922345483359747e-06,
"loss": 2.7109,
"step": 204
},
{
"epoch": 0.1542513167795335,
"grad_norm": 55.275264739990234,
"learning_rate": 8.914421553090334e-06,
"loss": 3.2109,
"step": 205
},
{
"epoch": 0.15500376222723852,
"grad_norm": 37.078006744384766,
"learning_rate": 8.90649762282092e-06,
"loss": 3.3486,
"step": 206
},
{
"epoch": 0.15575620767494355,
"grad_norm": 40.6878662109375,
"learning_rate": 8.898573692551506e-06,
"loss": 3.3584,
"step": 207
},
{
"epoch": 0.1565086531226486,
"grad_norm": 19.50431251525879,
"learning_rate": 8.890649762282093e-06,
"loss": 3.2998,
"step": 208
},
{
"epoch": 0.15726109857035364,
"grad_norm": 21.160484313964844,
"learning_rate": 8.882725832012679e-06,
"loss": 2.8994,
"step": 209
},
{
"epoch": 0.1580135440180587,
"grad_norm": 28.999177932739258,
"learning_rate": 8.874801901743266e-06,
"loss": 2.4697,
"step": 210
},
{
"epoch": 0.15876598946576373,
"grad_norm": 61.649654388427734,
"learning_rate": 8.866877971473851e-06,
"loss": 3.0391,
"step": 211
},
{
"epoch": 0.1595184349134688,
"grad_norm": 43.92689514160156,
"learning_rate": 8.858954041204438e-06,
"loss": 3.0234,
"step": 212
},
{
"epoch": 0.16027088036117382,
"grad_norm": 49.55857467651367,
"learning_rate": 8.851030110935025e-06,
"loss": 2.8916,
"step": 213
},
{
"epoch": 0.16102332580887885,
"grad_norm": 31.43752670288086,
"learning_rate": 8.84310618066561e-06,
"loss": 3.0225,
"step": 214
},
{
"epoch": 0.1617757712565839,
"grad_norm": 39.49406051635742,
"learning_rate": 8.835182250396197e-06,
"loss": 2.6123,
"step": 215
},
{
"epoch": 0.16252821670428894,
"grad_norm": 39.66098403930664,
"learning_rate": 8.827258320126783e-06,
"loss": 3.0879,
"step": 216
},
{
"epoch": 0.163280662151994,
"grad_norm": 23.214488983154297,
"learning_rate": 8.81933438985737e-06,
"loss": 2.9238,
"step": 217
},
{
"epoch": 0.16403310759969902,
"grad_norm": 23.282730102539062,
"learning_rate": 8.811410459587957e-06,
"loss": 3.0547,
"step": 218
},
{
"epoch": 0.16478555304740405,
"grad_norm": 27.417238235473633,
"learning_rate": 8.803486529318542e-06,
"loss": 2.6113,
"step": 219
},
{
"epoch": 0.1655379984951091,
"grad_norm": 19.980924606323242,
"learning_rate": 8.79556259904913e-06,
"loss": 2.3564,
"step": 220
},
{
"epoch": 0.16629044394281414,
"grad_norm": 18.440303802490234,
"learning_rate": 8.787638668779716e-06,
"loss": 2.3672,
"step": 221
},
{
"epoch": 0.1670428893905192,
"grad_norm": 27.1029052734375,
"learning_rate": 8.779714738510302e-06,
"loss": 2.6631,
"step": 222
},
{
"epoch": 0.16779533483822423,
"grad_norm": 25.787336349487305,
"learning_rate": 8.771790808240889e-06,
"loss": 2.6289,
"step": 223
},
{
"epoch": 0.16854778028592926,
"grad_norm": 31.063365936279297,
"learning_rate": 8.763866877971474e-06,
"loss": 3.0146,
"step": 224
},
{
"epoch": 0.16930022573363432,
"grad_norm": 39.28546905517578,
"learning_rate": 8.755942947702061e-06,
"loss": 3.3584,
"step": 225
},
{
"epoch": 0.17005267118133935,
"grad_norm": 41.93635940551758,
"learning_rate": 8.748019017432648e-06,
"loss": 2.5762,
"step": 226
},
{
"epoch": 0.1708051166290444,
"grad_norm": 26.295791625976562,
"learning_rate": 8.740095087163233e-06,
"loss": 3.0781,
"step": 227
},
{
"epoch": 0.17155756207674944,
"grad_norm": 32.962379455566406,
"learning_rate": 8.73217115689382e-06,
"loss": 2.5918,
"step": 228
},
{
"epoch": 0.17231000752445447,
"grad_norm": 29.288211822509766,
"learning_rate": 8.724247226624406e-06,
"loss": 3.043,
"step": 229
},
{
"epoch": 0.17306245297215953,
"grad_norm": 23.087108612060547,
"learning_rate": 8.716323296354993e-06,
"loss": 2.9277,
"step": 230
},
{
"epoch": 0.17381489841986456,
"grad_norm": 57.002716064453125,
"learning_rate": 8.70839936608558e-06,
"loss": 2.9668,
"step": 231
},
{
"epoch": 0.1745673438675696,
"grad_norm": 21.86600685119629,
"learning_rate": 8.700475435816165e-06,
"loss": 2.627,
"step": 232
},
{
"epoch": 0.17531978931527464,
"grad_norm": 19.007726669311523,
"learning_rate": 8.692551505546752e-06,
"loss": 2.8184,
"step": 233
},
{
"epoch": 0.17607223476297967,
"grad_norm": 22.28285026550293,
"learning_rate": 8.684627575277338e-06,
"loss": 2.8965,
"step": 234
},
{
"epoch": 0.17682468021068473,
"grad_norm": 43.95874786376953,
"learning_rate": 8.676703645007925e-06,
"loss": 2.7998,
"step": 235
},
{
"epoch": 0.17757712565838976,
"grad_norm": 21.108388900756836,
"learning_rate": 8.668779714738512e-06,
"loss": 2.7344,
"step": 236
},
{
"epoch": 0.17832957110609482,
"grad_norm": 21.255399703979492,
"learning_rate": 8.660855784469097e-06,
"loss": 3.0166,
"step": 237
},
{
"epoch": 0.17908201655379985,
"grad_norm": 38.1331672668457,
"learning_rate": 8.652931854199684e-06,
"loss": 2.5342,
"step": 238
},
{
"epoch": 0.17983446200150488,
"grad_norm": 27.367652893066406,
"learning_rate": 8.64500792393027e-06,
"loss": 3.0059,
"step": 239
},
{
"epoch": 0.18058690744920994,
"grad_norm": 21.36908531188965,
"learning_rate": 8.637083993660857e-06,
"loss": 2.5986,
"step": 240
},
{
"epoch": 0.18133935289691497,
"grad_norm": 24.965232849121094,
"learning_rate": 8.629160063391444e-06,
"loss": 2.7422,
"step": 241
},
{
"epoch": 0.18209179834462003,
"grad_norm": 26.240676879882812,
"learning_rate": 8.621236133122029e-06,
"loss": 2.7207,
"step": 242
},
{
"epoch": 0.18284424379232506,
"grad_norm": 25.53062629699707,
"learning_rate": 8.613312202852616e-06,
"loss": 2.9189,
"step": 243
},
{
"epoch": 0.1835966892400301,
"grad_norm": 30.517423629760742,
"learning_rate": 8.605388272583201e-06,
"loss": 3.0566,
"step": 244
},
{
"epoch": 0.18434913468773514,
"grad_norm": 25.685810089111328,
"learning_rate": 8.597464342313788e-06,
"loss": 2.3613,
"step": 245
},
{
"epoch": 0.18510158013544017,
"grad_norm": 25.89740562438965,
"learning_rate": 8.589540412044375e-06,
"loss": 3.126,
"step": 246
},
{
"epoch": 0.18585402558314523,
"grad_norm": 18.76670265197754,
"learning_rate": 8.58161648177496e-06,
"loss": 2.4902,
"step": 247
},
{
"epoch": 0.18660647103085026,
"grad_norm": 27.958297729492188,
"learning_rate": 8.573692551505548e-06,
"loss": 2.7461,
"step": 248
},
{
"epoch": 0.1873589164785553,
"grad_norm": 21.461612701416016,
"learning_rate": 8.565768621236133e-06,
"loss": 2.9189,
"step": 249
},
{
"epoch": 0.18811136192626035,
"grad_norm": 23.18748664855957,
"learning_rate": 8.55784469096672e-06,
"loss": 2.3467,
"step": 250
},
{
"epoch": 0.18886380737396538,
"grad_norm": 50.05548858642578,
"learning_rate": 8.549920760697307e-06,
"loss": 2.7305,
"step": 251
},
{
"epoch": 0.18961625282167044,
"grad_norm": 38.1525764465332,
"learning_rate": 8.541996830427893e-06,
"loss": 3.5957,
"step": 252
},
{
"epoch": 0.19036869826937547,
"grad_norm": 41.369293212890625,
"learning_rate": 8.53407290015848e-06,
"loss": 2.4688,
"step": 253
},
{
"epoch": 0.1911211437170805,
"grad_norm": 30.51654815673828,
"learning_rate": 8.526148969889065e-06,
"loss": 2.9824,
"step": 254
},
{
"epoch": 0.19187358916478556,
"grad_norm": 50.003944396972656,
"learning_rate": 8.518225039619652e-06,
"loss": 2.8887,
"step": 255
},
{
"epoch": 0.1926260346124906,
"grad_norm": 22.59164810180664,
"learning_rate": 8.510301109350239e-06,
"loss": 2.4131,
"step": 256
},
{
"epoch": 0.19337848006019565,
"grad_norm": 21.20647621154785,
"learning_rate": 8.502377179080824e-06,
"loss": 2.5166,
"step": 257
},
{
"epoch": 0.19413092550790068,
"grad_norm": 20.6810302734375,
"learning_rate": 8.494453248811411e-06,
"loss": 2.5273,
"step": 258
},
{
"epoch": 0.1948833709556057,
"grad_norm": 27.598730087280273,
"learning_rate": 8.486529318541997e-06,
"loss": 2.6357,
"step": 259
},
{
"epoch": 0.19563581640331076,
"grad_norm": 28.025638580322266,
"learning_rate": 8.478605388272584e-06,
"loss": 3.1553,
"step": 260
},
{
"epoch": 0.1963882618510158,
"grad_norm": 25.879112243652344,
"learning_rate": 8.47068145800317e-06,
"loss": 3.0566,
"step": 261
},
{
"epoch": 0.19714070729872085,
"grad_norm": 37.4360466003418,
"learning_rate": 8.462757527733758e-06,
"loss": 3.5303,
"step": 262
},
{
"epoch": 0.19789315274642588,
"grad_norm": 27.776281356811523,
"learning_rate": 8.454833597464343e-06,
"loss": 2.7139,
"step": 263
},
{
"epoch": 0.1986455981941309,
"grad_norm": 30.19846534729004,
"learning_rate": 8.446909667194929e-06,
"loss": 2.8203,
"step": 264
},
{
"epoch": 0.19939804364183597,
"grad_norm": 35.52974319458008,
"learning_rate": 8.438985736925516e-06,
"loss": 3.0059,
"step": 265
},
{
"epoch": 0.200150489089541,
"grad_norm": 18.416040420532227,
"learning_rate": 8.431061806656103e-06,
"loss": 2.8809,
"step": 266
},
{
"epoch": 0.20090293453724606,
"grad_norm": 24.434906005859375,
"learning_rate": 8.42313787638669e-06,
"loss": 2.9688,
"step": 267
},
{
"epoch": 0.2016553799849511,
"grad_norm": 17.731821060180664,
"learning_rate": 8.415213946117275e-06,
"loss": 2.7617,
"step": 268
},
{
"epoch": 0.20240782543265612,
"grad_norm": 49.564125061035156,
"learning_rate": 8.40729001584786e-06,
"loss": 3.2773,
"step": 269
},
{
"epoch": 0.20316027088036118,
"grad_norm": 23.3712215423584,
"learning_rate": 8.399366085578447e-06,
"loss": 3.2969,
"step": 270
},
{
"epoch": 0.2039127163280662,
"grad_norm": 18.392166137695312,
"learning_rate": 8.391442155309034e-06,
"loss": 2.6846,
"step": 271
},
{
"epoch": 0.20466516177577126,
"grad_norm": 17.793779373168945,
"learning_rate": 8.383518225039621e-06,
"loss": 2.6895,
"step": 272
},
{
"epoch": 0.2054176072234763,
"grad_norm": 27.635812759399414,
"learning_rate": 8.375594294770207e-06,
"loss": 2.751,
"step": 273
},
{
"epoch": 0.20617005267118135,
"grad_norm": 52.76430130004883,
"learning_rate": 8.367670364500792e-06,
"loss": 2.8457,
"step": 274
},
{
"epoch": 0.20692249811888638,
"grad_norm": 23.622615814208984,
"learning_rate": 8.35974643423138e-06,
"loss": 2.8574,
"step": 275
},
{
"epoch": 0.2076749435665914,
"grad_norm": 33.52585983276367,
"learning_rate": 8.351822503961966e-06,
"loss": 2.9033,
"step": 276
},
{
"epoch": 0.20842738901429647,
"grad_norm": 50.93476867675781,
"learning_rate": 8.343898573692553e-06,
"loss": 2.75,
"step": 277
},
{
"epoch": 0.2091798344620015,
"grad_norm": 20.75809097290039,
"learning_rate": 8.335974643423139e-06,
"loss": 2.3896,
"step": 278
},
{
"epoch": 0.20993227990970656,
"grad_norm": 25.250638961791992,
"learning_rate": 8.328050713153724e-06,
"loss": 2.7451,
"step": 279
},
{
"epoch": 0.2106847253574116,
"grad_norm": 26.620756149291992,
"learning_rate": 8.320126782884311e-06,
"loss": 2.7588,
"step": 280
},
{
"epoch": 0.21143717080511662,
"grad_norm": 21.96516990661621,
"learning_rate": 8.312202852614898e-06,
"loss": 2.9248,
"step": 281
},
{
"epoch": 0.21218961625282168,
"grad_norm": 26.421092987060547,
"learning_rate": 8.304278922345485e-06,
"loss": 2.8486,
"step": 282
},
{
"epoch": 0.2129420617005267,
"grad_norm": 20.29400062561035,
"learning_rate": 8.29635499207607e-06,
"loss": 2.9727,
"step": 283
},
{
"epoch": 0.21369450714823177,
"grad_norm": 29.198728561401367,
"learning_rate": 8.288431061806656e-06,
"loss": 2.4951,
"step": 284
},
{
"epoch": 0.2144469525959368,
"grad_norm": 23.257631301879883,
"learning_rate": 8.280507131537243e-06,
"loss": 2.9082,
"step": 285
},
{
"epoch": 0.21519939804364183,
"grad_norm": 51.7125358581543,
"learning_rate": 8.27258320126783e-06,
"loss": 2.5234,
"step": 286
},
{
"epoch": 0.21595184349134688,
"grad_norm": 24.36754035949707,
"learning_rate": 8.264659270998417e-06,
"loss": 2.6406,
"step": 287
},
{
"epoch": 0.21670428893905191,
"grad_norm": 30.53868865966797,
"learning_rate": 8.256735340729002e-06,
"loss": 2.7607,
"step": 288
},
{
"epoch": 0.21745673438675697,
"grad_norm": 27.01729393005371,
"learning_rate": 8.24881141045959e-06,
"loss": 2.8223,
"step": 289
},
{
"epoch": 0.218209179834462,
"grad_norm": 24.644309997558594,
"learning_rate": 8.240887480190175e-06,
"loss": 2.75,
"step": 290
},
{
"epoch": 0.21896162528216703,
"grad_norm": 45.585166931152344,
"learning_rate": 8.232963549920762e-06,
"loss": 2.4844,
"step": 291
},
{
"epoch": 0.2197140707298721,
"grad_norm": 24.803770065307617,
"learning_rate": 8.225039619651349e-06,
"loss": 2.8115,
"step": 292
},
{
"epoch": 0.22046651617757712,
"grad_norm": 29.119102478027344,
"learning_rate": 8.217115689381934e-06,
"loss": 3.543,
"step": 293
},
{
"epoch": 0.22121896162528218,
"grad_norm": 35.074031829833984,
"learning_rate": 8.209191759112521e-06,
"loss": 3.1289,
"step": 294
},
{
"epoch": 0.2219714070729872,
"grad_norm": 34.88431167602539,
"learning_rate": 8.201267828843106e-06,
"loss": 2.9209,
"step": 295
},
{
"epoch": 0.22272385252069224,
"grad_norm": 36.36684799194336,
"learning_rate": 8.193343898573693e-06,
"loss": 2.8428,
"step": 296
},
{
"epoch": 0.2234762979683973,
"grad_norm": 40.56970977783203,
"learning_rate": 8.18541996830428e-06,
"loss": 2.5723,
"step": 297
},
{
"epoch": 0.22422874341610233,
"grad_norm": 32.04544448852539,
"learning_rate": 8.177496038034866e-06,
"loss": 3.0215,
"step": 298
},
{
"epoch": 0.22498118886380739,
"grad_norm": 21.52565574645996,
"learning_rate": 8.169572107765453e-06,
"loss": 3.4941,
"step": 299
},
{
"epoch": 0.22573363431151242,
"grad_norm": 17.581174850463867,
"learning_rate": 8.161648177496038e-06,
"loss": 2.7656,
"step": 300
},
{
"epoch": 0.22648607975921745,
"grad_norm": 17.084651947021484,
"learning_rate": 8.153724247226625e-06,
"loss": 2.3311,
"step": 301
},
{
"epoch": 0.2272385252069225,
"grad_norm": 20.111968994140625,
"learning_rate": 8.145800316957212e-06,
"loss": 2.8135,
"step": 302
},
{
"epoch": 0.22799097065462753,
"grad_norm": 38.68403625488281,
"learning_rate": 8.137876386687798e-06,
"loss": 2.9668,
"step": 303
},
{
"epoch": 0.2287434161023326,
"grad_norm": 63.05813980102539,
"learning_rate": 8.129952456418385e-06,
"loss": 3.1807,
"step": 304
},
{
"epoch": 0.22949586155003762,
"grad_norm": 26.78461265563965,
"learning_rate": 8.12202852614897e-06,
"loss": 3.1641,
"step": 305
},
{
"epoch": 0.23024830699774265,
"grad_norm": 22.460067749023438,
"learning_rate": 8.114104595879557e-06,
"loss": 2.8037,
"step": 306
},
{
"epoch": 0.2310007524454477,
"grad_norm": 33.41761779785156,
"learning_rate": 8.106180665610144e-06,
"loss": 3.3008,
"step": 307
},
{
"epoch": 0.23175319789315274,
"grad_norm": 30.717609405517578,
"learning_rate": 8.09825673534073e-06,
"loss": 2.7041,
"step": 308
},
{
"epoch": 0.2325056433408578,
"grad_norm": 64.30577850341797,
"learning_rate": 8.090332805071317e-06,
"loss": 3.1162,
"step": 309
},
{
"epoch": 0.23325808878856283,
"grad_norm": 35.22840118408203,
"learning_rate": 8.082408874801902e-06,
"loss": 2.6377,
"step": 310
},
{
"epoch": 0.23401053423626786,
"grad_norm": 20.862028121948242,
"learning_rate": 8.074484944532489e-06,
"loss": 2.7969,
"step": 311
},
{
"epoch": 0.23476297968397292,
"grad_norm": 26.325332641601562,
"learning_rate": 8.066561014263076e-06,
"loss": 2.6836,
"step": 312
},
{
"epoch": 0.23551542513167795,
"grad_norm": 50.82712173461914,
"learning_rate": 8.058637083993661e-06,
"loss": 3.0352,
"step": 313
},
{
"epoch": 0.236267870579383,
"grad_norm": 32.116920471191406,
"learning_rate": 8.050713153724248e-06,
"loss": 3.0156,
"step": 314
},
{
"epoch": 0.23702031602708803,
"grad_norm": 18.274662017822266,
"learning_rate": 8.042789223454834e-06,
"loss": 2.791,
"step": 315
},
{
"epoch": 0.23777276147479307,
"grad_norm": 30.45490837097168,
"learning_rate": 8.03486529318542e-06,
"loss": 2.8223,
"step": 316
},
{
"epoch": 0.23852520692249812,
"grad_norm": 34.29349899291992,
"learning_rate": 8.026941362916006e-06,
"loss": 2.8623,
"step": 317
},
{
"epoch": 0.23927765237020315,
"grad_norm": 23.390230178833008,
"learning_rate": 8.019017432646593e-06,
"loss": 2.8398,
"step": 318
},
{
"epoch": 0.2400300978179082,
"grad_norm": 39.82279968261719,
"learning_rate": 8.01109350237718e-06,
"loss": 3.3867,
"step": 319
},
{
"epoch": 0.24078254326561324,
"grad_norm": 16.220998764038086,
"learning_rate": 8.003169572107765e-06,
"loss": 3.0234,
"step": 320
},
{
"epoch": 0.24153498871331827,
"grad_norm": 19.393198013305664,
"learning_rate": 7.995245641838353e-06,
"loss": 3.1367,
"step": 321
},
{
"epoch": 0.24228743416102333,
"grad_norm": 20.260129928588867,
"learning_rate": 7.987321711568938e-06,
"loss": 2.5,
"step": 322
},
{
"epoch": 0.24303987960872836,
"grad_norm": 26.83426284790039,
"learning_rate": 7.979397781299525e-06,
"loss": 2.7949,
"step": 323
},
{
"epoch": 0.24379232505643342,
"grad_norm": 16.979801177978516,
"learning_rate": 7.971473851030112e-06,
"loss": 2.748,
"step": 324
},
{
"epoch": 0.24454477050413845,
"grad_norm": 32.53853988647461,
"learning_rate": 7.963549920760697e-06,
"loss": 2.6592,
"step": 325
},
{
"epoch": 0.24529721595184348,
"grad_norm": 35.54403305053711,
"learning_rate": 7.955625990491284e-06,
"loss": 2.7207,
"step": 326
},
{
"epoch": 0.24604966139954854,
"grad_norm": 26.685930252075195,
"learning_rate": 7.94770206022187e-06,
"loss": 2.7061,
"step": 327
},
{
"epoch": 0.24680210684725357,
"grad_norm": 25.83123207092285,
"learning_rate": 7.939778129952457e-06,
"loss": 3.1514,
"step": 328
},
{
"epoch": 0.24755455229495862,
"grad_norm": 20.83842658996582,
"learning_rate": 7.931854199683044e-06,
"loss": 2.4766,
"step": 329
},
{
"epoch": 0.24830699774266365,
"grad_norm": 25.924402236938477,
"learning_rate": 7.92393026941363e-06,
"loss": 2.8398,
"step": 330
},
{
"epoch": 0.24905944319036868,
"grad_norm": 31.948139190673828,
"learning_rate": 7.916006339144216e-06,
"loss": 3.0576,
"step": 331
},
{
"epoch": 0.24981188863807374,
"grad_norm": 22.187664031982422,
"learning_rate": 7.908082408874802e-06,
"loss": 3.3047,
"step": 332
},
{
"epoch": 0.2505643340857788,
"grad_norm": 34.52567672729492,
"learning_rate": 7.900158478605389e-06,
"loss": 3.168,
"step": 333
},
{
"epoch": 0.2513167795334838,
"grad_norm": 21.3934326171875,
"learning_rate": 7.892234548335976e-06,
"loss": 3.041,
"step": 334
},
{
"epoch": 0.2520692249811889,
"grad_norm": 42.445343017578125,
"learning_rate": 7.884310618066563e-06,
"loss": 2.6445,
"step": 335
},
{
"epoch": 0.2528216704288939,
"grad_norm": 36.86784744262695,
"learning_rate": 7.876386687797148e-06,
"loss": 2.7383,
"step": 336
},
{
"epoch": 0.25357411587659895,
"grad_norm": 56.01811981201172,
"learning_rate": 7.868462757527733e-06,
"loss": 2.6211,
"step": 337
},
{
"epoch": 0.254326561324304,
"grad_norm": 23.526220321655273,
"learning_rate": 7.86053882725832e-06,
"loss": 2.6914,
"step": 338
},
{
"epoch": 0.255079006772009,
"grad_norm": 23.751649856567383,
"learning_rate": 7.852614896988907e-06,
"loss": 2.9531,
"step": 339
},
{
"epoch": 0.2558314522197141,
"grad_norm": 16.780838012695312,
"learning_rate": 7.844690966719494e-06,
"loss": 2.6992,
"step": 340
},
{
"epoch": 0.2565838976674191,
"grad_norm": 31.69736671447754,
"learning_rate": 7.83676703645008e-06,
"loss": 3.0254,
"step": 341
},
{
"epoch": 0.25733634311512416,
"grad_norm": 26.73095703125,
"learning_rate": 7.828843106180665e-06,
"loss": 2.3555,
"step": 342
},
{
"epoch": 0.2580887885628292,
"grad_norm": 19.62910270690918,
"learning_rate": 7.820919175911252e-06,
"loss": 2.6621,
"step": 343
},
{
"epoch": 0.2588412340105342,
"grad_norm": 25.195018768310547,
"learning_rate": 7.81299524564184e-06,
"loss": 2.5732,
"step": 344
},
{
"epoch": 0.2595936794582393,
"grad_norm": 28.618427276611328,
"learning_rate": 7.805071315372426e-06,
"loss": 2.8193,
"step": 345
},
{
"epoch": 0.26034612490594433,
"grad_norm": 35.68719482421875,
"learning_rate": 7.797147385103012e-06,
"loss": 2.916,
"step": 346
},
{
"epoch": 0.26109857035364936,
"grad_norm": 28.501699447631836,
"learning_rate": 7.789223454833597e-06,
"loss": 2.541,
"step": 347
},
{
"epoch": 0.2618510158013544,
"grad_norm": 24.121973037719727,
"learning_rate": 7.781299524564184e-06,
"loss": 2.7383,
"step": 348
},
{
"epoch": 0.2626034612490594,
"grad_norm": 27.293880462646484,
"learning_rate": 7.773375594294771e-06,
"loss": 2.8281,
"step": 349
},
{
"epoch": 0.2633559066967645,
"grad_norm": 26.175559997558594,
"learning_rate": 7.765451664025358e-06,
"loss": 2.9297,
"step": 350
},
{
"epoch": 0.26410835214446954,
"grad_norm": 46.64474868774414,
"learning_rate": 7.757527733755943e-06,
"loss": 2.8604,
"step": 351
},
{
"epoch": 0.26486079759217457,
"grad_norm": 25.304244995117188,
"learning_rate": 7.749603803486529e-06,
"loss": 3.1445,
"step": 352
},
{
"epoch": 0.2656132430398796,
"grad_norm": 24.73731803894043,
"learning_rate": 7.741679873217116e-06,
"loss": 2.4854,
"step": 353
},
{
"epoch": 0.26636568848758463,
"grad_norm": 23.45802116394043,
"learning_rate": 7.733755942947703e-06,
"loss": 2.5186,
"step": 354
},
{
"epoch": 0.2671181339352897,
"grad_norm": 56.9620361328125,
"learning_rate": 7.72583201267829e-06,
"loss": 2.8076,
"step": 355
},
{
"epoch": 0.26787057938299474,
"grad_norm": 33.860206604003906,
"learning_rate": 7.717908082408875e-06,
"loss": 2.6973,
"step": 356
},
{
"epoch": 0.2686230248306998,
"grad_norm": 45.37279510498047,
"learning_rate": 7.70998415213946e-06,
"loss": 2.8223,
"step": 357
},
{
"epoch": 0.2693754702784048,
"grad_norm": 18.210763931274414,
"learning_rate": 7.702060221870048e-06,
"loss": 2.4668,
"step": 358
},
{
"epoch": 0.27012791572610984,
"grad_norm": 25.83952522277832,
"learning_rate": 7.694136291600635e-06,
"loss": 2.7012,
"step": 359
},
{
"epoch": 0.2708803611738149,
"grad_norm": 27.17302894592285,
"learning_rate": 7.686212361331222e-06,
"loss": 2.4834,
"step": 360
},
{
"epoch": 0.27163280662151995,
"grad_norm": 20.181354522705078,
"learning_rate": 7.678288431061807e-06,
"loss": 3.4531,
"step": 361
},
{
"epoch": 0.272385252069225,
"grad_norm": 40.30481719970703,
"learning_rate": 7.670364500792394e-06,
"loss": 2.9521,
"step": 362
},
{
"epoch": 0.27313769751693,
"grad_norm": 53.8978271484375,
"learning_rate": 7.66244057052298e-06,
"loss": 2.3486,
"step": 363
},
{
"epoch": 0.27389014296463504,
"grad_norm": 28.822010040283203,
"learning_rate": 7.654516640253566e-06,
"loss": 3.0176,
"step": 364
},
{
"epoch": 0.2746425884123401,
"grad_norm": 30.344985961914062,
"learning_rate": 7.646592709984154e-06,
"loss": 2.6895,
"step": 365
},
{
"epoch": 0.27539503386004516,
"grad_norm": 31.335912704467773,
"learning_rate": 7.638668779714739e-06,
"loss": 2.5693,
"step": 366
},
{
"epoch": 0.2761474793077502,
"grad_norm": 29.214174270629883,
"learning_rate": 7.630744849445326e-06,
"loss": 3.4844,
"step": 367
},
{
"epoch": 0.2768999247554552,
"grad_norm": 31.165367126464844,
"learning_rate": 7.622820919175912e-06,
"loss": 2.9199,
"step": 368
},
{
"epoch": 0.27765237020316025,
"grad_norm": 31.91497230529785,
"learning_rate": 7.614896988906498e-06,
"loss": 3.2559,
"step": 369
},
{
"epoch": 0.27840481565086533,
"grad_norm": 17.61781120300293,
"learning_rate": 7.606973058637085e-06,
"loss": 2.332,
"step": 370
},
{
"epoch": 0.27915726109857036,
"grad_norm": 22.988460540771484,
"learning_rate": 7.5990491283676715e-06,
"loss": 2.3535,
"step": 371
},
{
"epoch": 0.2799097065462754,
"grad_norm": 22.7020263671875,
"learning_rate": 7.591125198098257e-06,
"loss": 2.9844,
"step": 372
},
{
"epoch": 0.2806621519939804,
"grad_norm": 21.630903244018555,
"learning_rate": 7.583201267828844e-06,
"loss": 2.5059,
"step": 373
},
{
"epoch": 0.28141459744168545,
"grad_norm": 17.710723876953125,
"learning_rate": 7.57527733755943e-06,
"loss": 2.8945,
"step": 374
},
{
"epoch": 0.28216704288939054,
"grad_norm": 19.4321231842041,
"learning_rate": 7.567353407290017e-06,
"loss": 2.8369,
"step": 375
},
{
"epoch": 0.28291948833709557,
"grad_norm": 48.42848587036133,
"learning_rate": 7.559429477020603e-06,
"loss": 3.0225,
"step": 376
},
{
"epoch": 0.2836719337848006,
"grad_norm": 34.190399169921875,
"learning_rate": 7.551505546751189e-06,
"loss": 2.4053,
"step": 377
},
{
"epoch": 0.28442437923250563,
"grad_norm": 19.983348846435547,
"learning_rate": 7.543581616481776e-06,
"loss": 2.6807,
"step": 378
},
{
"epoch": 0.28517682468021066,
"grad_norm": 19.783525466918945,
"learning_rate": 7.535657686212362e-06,
"loss": 2.5547,
"step": 379
},
{
"epoch": 0.28592927012791575,
"grad_norm": 49.98888397216797,
"learning_rate": 7.527733755942949e-06,
"loss": 3.2402,
"step": 380
},
{
"epoch": 0.2866817155756208,
"grad_norm": 38.5321044921875,
"learning_rate": 7.519809825673535e-06,
"loss": 2.9453,
"step": 381
},
{
"epoch": 0.2874341610233258,
"grad_norm": 36.0396842956543,
"learning_rate": 7.5118858954041205e-06,
"loss": 2.6807,
"step": 382
},
{
"epoch": 0.28818660647103084,
"grad_norm": 24.879011154174805,
"learning_rate": 7.5039619651347075e-06,
"loss": 2.4502,
"step": 383
},
{
"epoch": 0.28893905191873587,
"grad_norm": 22.27727699279785,
"learning_rate": 7.496038034865294e-06,
"loss": 2.5293,
"step": 384
},
{
"epoch": 0.28969149736644095,
"grad_norm": 45.97170639038086,
"learning_rate": 7.488114104595881e-06,
"loss": 2.7891,
"step": 385
},
{
"epoch": 0.290443942814146,
"grad_norm": 51.14545822143555,
"learning_rate": 7.480190174326466e-06,
"loss": 3.1641,
"step": 386
},
{
"epoch": 0.291196388261851,
"grad_norm": 58.7318000793457,
"learning_rate": 7.472266244057052e-06,
"loss": 2.4834,
"step": 387
},
{
"epoch": 0.29194883370955604,
"grad_norm": 38.557979583740234,
"learning_rate": 7.464342313787639e-06,
"loss": 2.4551,
"step": 388
},
{
"epoch": 0.2927012791572611,
"grad_norm": 30.150957107543945,
"learning_rate": 7.4564183835182255e-06,
"loss": 3.127,
"step": 389
},
{
"epoch": 0.29345372460496616,
"grad_norm": 24.9825496673584,
"learning_rate": 7.4484944532488126e-06,
"loss": 2.9766,
"step": 390
},
{
"epoch": 0.2942061700526712,
"grad_norm": 34.27793884277344,
"learning_rate": 7.440570522979398e-06,
"loss": 2.8467,
"step": 391
},
{
"epoch": 0.2949586155003762,
"grad_norm": 40.562530517578125,
"learning_rate": 7.432646592709984e-06,
"loss": 2.7354,
"step": 392
},
{
"epoch": 0.29571106094808125,
"grad_norm": 27.370962142944336,
"learning_rate": 7.424722662440571e-06,
"loss": 2.6748,
"step": 393
},
{
"epoch": 0.2964635063957863,
"grad_norm": 23.176212310791016,
"learning_rate": 7.416798732171157e-06,
"loss": 2.3291,
"step": 394
},
{
"epoch": 0.29721595184349137,
"grad_norm": 20.481536865234375,
"learning_rate": 7.408874801901744e-06,
"loss": 2.5684,
"step": 395
},
{
"epoch": 0.2979683972911964,
"grad_norm": 18.736602783203125,
"learning_rate": 7.40095087163233e-06,
"loss": 2.4946,
"step": 396
},
{
"epoch": 0.2987208427389014,
"grad_norm": 21.316904067993164,
"learning_rate": 7.393026941362916e-06,
"loss": 2.8203,
"step": 397
},
{
"epoch": 0.29947328818660646,
"grad_norm": 23.690597534179688,
"learning_rate": 7.385103011093503e-06,
"loss": 3.0039,
"step": 398
},
{
"epoch": 0.3002257336343115,
"grad_norm": 37.80546188354492,
"learning_rate": 7.377179080824089e-06,
"loss": 2.4521,
"step": 399
},
{
"epoch": 0.3009781790820166,
"grad_norm": 34.161041259765625,
"learning_rate": 7.369255150554676e-06,
"loss": 2.6895,
"step": 400
},
{
"epoch": 0.3017306245297216,
"grad_norm": 30.497211456298828,
"learning_rate": 7.3613312202852615e-06,
"loss": 3.082,
"step": 401
},
{
"epoch": 0.30248306997742663,
"grad_norm": 18.200454711914062,
"learning_rate": 7.3534072900158486e-06,
"loss": 2.8096,
"step": 402
},
{
"epoch": 0.30323551542513166,
"grad_norm": 21.7595157623291,
"learning_rate": 7.345483359746435e-06,
"loss": 2.9072,
"step": 403
},
{
"epoch": 0.3039879608728367,
"grad_norm": 27.392026901245117,
"learning_rate": 7.337559429477021e-06,
"loss": 3.0117,
"step": 404
},
{
"epoch": 0.3047404063205418,
"grad_norm": 26.6084041595459,
"learning_rate": 7.329635499207608e-06,
"loss": 2.3936,
"step": 405
},
{
"epoch": 0.3054928517682468,
"grad_norm": 27.991764068603516,
"learning_rate": 7.321711568938193e-06,
"loss": 2.4004,
"step": 406
},
{
"epoch": 0.30624529721595184,
"grad_norm": 14.712749481201172,
"learning_rate": 7.31378763866878e-06,
"loss": 2.458,
"step": 407
},
{
"epoch": 0.30699774266365687,
"grad_norm": 23.980497360229492,
"learning_rate": 7.305863708399367e-06,
"loss": 3.0781,
"step": 408
},
{
"epoch": 0.3077501881113619,
"grad_norm": 19.64727783203125,
"learning_rate": 7.297939778129954e-06,
"loss": 2.7021,
"step": 409
},
{
"epoch": 0.308502633559067,
"grad_norm": 24.826862335205078,
"learning_rate": 7.29001584786054e-06,
"loss": 2.7246,
"step": 410
},
{
"epoch": 0.309255079006772,
"grad_norm": 27.15070343017578,
"learning_rate": 7.282091917591125e-06,
"loss": 2.8262,
"step": 411
},
{
"epoch": 0.31000752445447705,
"grad_norm": 19.45827293395996,
"learning_rate": 7.274167987321712e-06,
"loss": 2.8467,
"step": 412
},
{
"epoch": 0.3107599699021821,
"grad_norm": 28.588729858398438,
"learning_rate": 7.266244057052298e-06,
"loss": 2.8027,
"step": 413
},
{
"epoch": 0.3115124153498871,
"grad_norm": 20.850120544433594,
"learning_rate": 7.2583201267828854e-06,
"loss": 2.4756,
"step": 414
},
{
"epoch": 0.3122648607975922,
"grad_norm": 40.70508575439453,
"learning_rate": 7.250396196513472e-06,
"loss": 2.9219,
"step": 415
},
{
"epoch": 0.3130173062452972,
"grad_norm": 44.51210403442383,
"learning_rate": 7.242472266244057e-06,
"loss": 2.8672,
"step": 416
},
{
"epoch": 0.31376975169300225,
"grad_norm": 25.947845458984375,
"learning_rate": 7.234548335974644e-06,
"loss": 2.8848,
"step": 417
},
{
"epoch": 0.3145221971407073,
"grad_norm": 31.283601760864258,
"learning_rate": 7.22662440570523e-06,
"loss": 2.834,
"step": 418
},
{
"epoch": 0.3152746425884123,
"grad_norm": 32.150917053222656,
"learning_rate": 7.218700475435817e-06,
"loss": 2.6348,
"step": 419
},
{
"epoch": 0.3160270880361174,
"grad_norm": 25.346118927001953,
"learning_rate": 7.2107765451664034e-06,
"loss": 2.8711,
"step": 420
},
{
"epoch": 0.31677953348382243,
"grad_norm": 33.96476364135742,
"learning_rate": 7.202852614896989e-06,
"loss": 3.2031,
"step": 421
},
{
"epoch": 0.31753197893152746,
"grad_norm": 19.30603790283203,
"learning_rate": 7.194928684627576e-06,
"loss": 2.668,
"step": 422
},
{
"epoch": 0.3182844243792325,
"grad_norm": 19.03653907775879,
"learning_rate": 7.187004754358162e-06,
"loss": 2.4395,
"step": 423
},
{
"epoch": 0.3190368698269376,
"grad_norm": 26.007007598876953,
"learning_rate": 7.179080824088749e-06,
"loss": 2.6426,
"step": 424
},
{
"epoch": 0.3197893152746426,
"grad_norm": 19.245302200317383,
"learning_rate": 7.171156893819335e-06,
"loss": 2.8105,
"step": 425
},
{
"epoch": 0.32054176072234764,
"grad_norm": 19.74489402770996,
"learning_rate": 7.163232963549921e-06,
"loss": 2.9902,
"step": 426
},
{
"epoch": 0.32129420617005267,
"grad_norm": 39.224300384521484,
"learning_rate": 7.155309033280508e-06,
"loss": 3.1826,
"step": 427
},
{
"epoch": 0.3220466516177577,
"grad_norm": 19.44862174987793,
"learning_rate": 7.147385103011094e-06,
"loss": 2.4424,
"step": 428
},
{
"epoch": 0.3227990970654628,
"grad_norm": 19.365638732910156,
"learning_rate": 7.139461172741681e-06,
"loss": 3.041,
"step": 429
},
{
"epoch": 0.3235515425131678,
"grad_norm": 44.24783706665039,
"learning_rate": 7.131537242472267e-06,
"loss": 2.9551,
"step": 430
},
{
"epoch": 0.32430398796087284,
"grad_norm": 44.232093811035156,
"learning_rate": 7.123613312202852e-06,
"loss": 2.8555,
"step": 431
},
{
"epoch": 0.32505643340857787,
"grad_norm": 21.96976661682129,
"learning_rate": 7.1156893819334394e-06,
"loss": 2.7236,
"step": 432
},
{
"epoch": 0.3258088788562829,
"grad_norm": 36.008201599121094,
"learning_rate": 7.107765451664026e-06,
"loss": 2.7588,
"step": 433
},
{
"epoch": 0.326561324303988,
"grad_norm": 22.36960220336914,
"learning_rate": 7.099841521394613e-06,
"loss": 2.2725,
"step": 434
},
{
"epoch": 0.327313769751693,
"grad_norm": 41.12551498413086,
"learning_rate": 7.091917591125199e-06,
"loss": 2.7461,
"step": 435
},
{
"epoch": 0.32806621519939805,
"grad_norm": 42.819915771484375,
"learning_rate": 7.083993660855785e-06,
"loss": 2.998,
"step": 436
},
{
"epoch": 0.3288186606471031,
"grad_norm": 82.13551330566406,
"learning_rate": 7.076069730586371e-06,
"loss": 3.2363,
"step": 437
},
{
"epoch": 0.3295711060948081,
"grad_norm": 43.233760833740234,
"learning_rate": 7.0681458003169574e-06,
"loss": 2.2656,
"step": 438
},
{
"epoch": 0.3303235515425132,
"grad_norm": 19.841270446777344,
"learning_rate": 7.0602218700475445e-06,
"loss": 2.7637,
"step": 439
},
{
"epoch": 0.3310759969902182,
"grad_norm": 20.722280502319336,
"learning_rate": 7.052297939778131e-06,
"loss": 3.1152,
"step": 440
},
{
"epoch": 0.33182844243792325,
"grad_norm": 56.44236755371094,
"learning_rate": 7.044374009508717e-06,
"loss": 2.8633,
"step": 441
},
{
"epoch": 0.3325808878856283,
"grad_norm": 28.693634033203125,
"learning_rate": 7.036450079239303e-06,
"loss": 2.9644,
"step": 442
},
{
"epoch": 0.3333333333333333,
"grad_norm": 20.19970703125,
"learning_rate": 7.02852614896989e-06,
"loss": 2.374,
"step": 443
},
{
"epoch": 0.3340857787810384,
"grad_norm": 37.031002044677734,
"learning_rate": 7.020602218700476e-06,
"loss": 3.0312,
"step": 444
},
{
"epoch": 0.33483822422874343,
"grad_norm": 25.726011276245117,
"learning_rate": 7.0126782884310625e-06,
"loss": 2.6094,
"step": 445
},
{
"epoch": 0.33559066967644846,
"grad_norm": 19.664955139160156,
"learning_rate": 7.004754358161649e-06,
"loss": 2.498,
"step": 446
},
{
"epoch": 0.3363431151241535,
"grad_norm": 35.93477249145508,
"learning_rate": 6.996830427892235e-06,
"loss": 2.8789,
"step": 447
},
{
"epoch": 0.3370955605718585,
"grad_norm": 24.938539505004883,
"learning_rate": 6.988906497622822e-06,
"loss": 3.0039,
"step": 448
},
{
"epoch": 0.3378480060195636,
"grad_norm": 18.138105392456055,
"learning_rate": 6.980982567353408e-06,
"loss": 2.3467,
"step": 449
},
{
"epoch": 0.33860045146726864,
"grad_norm": 30.399091720581055,
"learning_rate": 6.973058637083995e-06,
"loss": 2.3643,
"step": 450
},
{
"epoch": 0.33935289691497367,
"grad_norm": 28.36125373840332,
"learning_rate": 6.9651347068145805e-06,
"loss": 2.8232,
"step": 451
},
{
"epoch": 0.3401053423626787,
"grad_norm": 24.042072296142578,
"learning_rate": 6.957210776545167e-06,
"loss": 2.9639,
"step": 452
},
{
"epoch": 0.34085778781038373,
"grad_norm": 20.310359954833984,
"learning_rate": 6.949286846275754e-06,
"loss": 2.2578,
"step": 453
},
{
"epoch": 0.3416102332580888,
"grad_norm": 31.458209991455078,
"learning_rate": 6.94136291600634e-06,
"loss": 2.7305,
"step": 454
},
{
"epoch": 0.34236267870579384,
"grad_norm": 16.078969955444336,
"learning_rate": 6.933438985736925e-06,
"loss": 2.376,
"step": 455
},
{
"epoch": 0.3431151241534989,
"grad_norm": 24.066688537597656,
"learning_rate": 6.925515055467512e-06,
"loss": 2.7109,
"step": 456
},
{
"epoch": 0.3438675696012039,
"grad_norm": 25.2286376953125,
"learning_rate": 6.9175911251980985e-06,
"loss": 2.2617,
"step": 457
},
{
"epoch": 0.34462001504890893,
"grad_norm": 18.005447387695312,
"learning_rate": 6.9096671949286855e-06,
"loss": 2.5898,
"step": 458
},
{
"epoch": 0.345372460496614,
"grad_norm": 41.71696472167969,
"learning_rate": 6.901743264659272e-06,
"loss": 2.9941,
"step": 459
},
{
"epoch": 0.34612490594431905,
"grad_norm": 20.321922302246094,
"learning_rate": 6.893819334389857e-06,
"loss": 2.3975,
"step": 460
},
{
"epoch": 0.3468773513920241,
"grad_norm": 35.73655700683594,
"learning_rate": 6.885895404120444e-06,
"loss": 2.8447,
"step": 461
},
{
"epoch": 0.3476297968397291,
"grad_norm": 26.43678092956543,
"learning_rate": 6.87797147385103e-06,
"loss": 2.4873,
"step": 462
},
{
"epoch": 0.34838224228743414,
"grad_norm": 17.385337829589844,
"learning_rate": 6.870047543581617e-06,
"loss": 2.5557,
"step": 463
},
{
"epoch": 0.3491346877351392,
"grad_norm": 19.804067611694336,
"learning_rate": 6.8621236133122035e-06,
"loss": 2.1064,
"step": 464
},
{
"epoch": 0.34988713318284426,
"grad_norm": 40.8009033203125,
"learning_rate": 6.854199683042789e-06,
"loss": 2.8848,
"step": 465
},
{
"epoch": 0.3506395786305493,
"grad_norm": 41.40995407104492,
"learning_rate": 6.846275752773376e-06,
"loss": 2.8926,
"step": 466
},
{
"epoch": 0.3513920240782543,
"grad_norm": 26.25147819519043,
"learning_rate": 6.838351822503962e-06,
"loss": 2.6045,
"step": 467
},
{
"epoch": 0.35214446952595935,
"grad_norm": 23.980907440185547,
"learning_rate": 6.830427892234549e-06,
"loss": 2.0391,
"step": 468
},
{
"epoch": 0.35289691497366443,
"grad_norm": 23.586442947387695,
"learning_rate": 6.822503961965135e-06,
"loss": 2.6167,
"step": 469
},
{
"epoch": 0.35364936042136946,
"grad_norm": 36.71504592895508,
"learning_rate": 6.8145800316957216e-06,
"loss": 3.2363,
"step": 470
},
{
"epoch": 0.3544018058690745,
"grad_norm": 30.827280044555664,
"learning_rate": 6.806656101426308e-06,
"loss": 2.9033,
"step": 471
},
{
"epoch": 0.3551542513167795,
"grad_norm": 27.950368881225586,
"learning_rate": 6.798732171156894e-06,
"loss": 2.4824,
"step": 472
},
{
"epoch": 0.35590669676448455,
"grad_norm": 51.84375,
"learning_rate": 6.790808240887481e-06,
"loss": 2.6455,
"step": 473
},
{
"epoch": 0.35665914221218964,
"grad_norm": 19.127580642700195,
"learning_rate": 6.782884310618067e-06,
"loss": 2.043,
"step": 474
},
{
"epoch": 0.35741158765989467,
"grad_norm": 20.604124069213867,
"learning_rate": 6.774960380348653e-06,
"loss": 3.0059,
"step": 475
},
{
"epoch": 0.3581640331075997,
"grad_norm": 20.379287719726562,
"learning_rate": 6.7670364500792396e-06,
"loss": 2.6055,
"step": 476
},
{
"epoch": 0.35891647855530473,
"grad_norm": 37.65190124511719,
"learning_rate": 6.759112519809827e-06,
"loss": 2.3867,
"step": 477
},
{
"epoch": 0.35966892400300976,
"grad_norm": 27.957950592041016,
"learning_rate": 6.751188589540413e-06,
"loss": 2.7119,
"step": 478
},
{
"epoch": 0.36042136945071485,
"grad_norm": 20.14349937438965,
"learning_rate": 6.743264659270999e-06,
"loss": 2.6367,
"step": 479
},
{
"epoch": 0.3611738148984199,
"grad_norm": 21.61663818359375,
"learning_rate": 6.735340729001585e-06,
"loss": 2.7178,
"step": 480
},
{
"epoch": 0.3619262603461249,
"grad_norm": 34.000892639160156,
"learning_rate": 6.727416798732171e-06,
"loss": 3.0371,
"step": 481
},
{
"epoch": 0.36267870579382994,
"grad_norm": 29.969745635986328,
"learning_rate": 6.719492868462758e-06,
"loss": 2.4268,
"step": 482
},
{
"epoch": 0.36343115124153497,
"grad_norm": 17.91866111755371,
"learning_rate": 6.711568938193345e-06,
"loss": 2.4336,
"step": 483
},
{
"epoch": 0.36418359668924005,
"grad_norm": 17.030073165893555,
"learning_rate": 6.703645007923932e-06,
"loss": 2.4556,
"step": 484
},
{
"epoch": 0.3649360421369451,
"grad_norm": 35.608009338378906,
"learning_rate": 6.695721077654517e-06,
"loss": 2.8691,
"step": 485
},
{
"epoch": 0.3656884875846501,
"grad_norm": 30.210041046142578,
"learning_rate": 6.687797147385103e-06,
"loss": 3.2031,
"step": 486
},
{
"epoch": 0.36644093303235514,
"grad_norm": 39.14574432373047,
"learning_rate": 6.67987321711569e-06,
"loss": 2.6631,
"step": 487
},
{
"epoch": 0.3671933784800602,
"grad_norm": 21.58750343322754,
"learning_rate": 6.671949286846276e-06,
"loss": 2.6328,
"step": 488
},
{
"epoch": 0.36794582392776526,
"grad_norm": 23.24883270263672,
"learning_rate": 6.6640253565768635e-06,
"loss": 3.1035,
"step": 489
},
{
"epoch": 0.3686982693754703,
"grad_norm": 29.510072708129883,
"learning_rate": 6.656101426307449e-06,
"loss": 3.4375,
"step": 490
},
{
"epoch": 0.3694507148231753,
"grad_norm": 19.159259796142578,
"learning_rate": 6.648177496038035e-06,
"loss": 2.4209,
"step": 491
},
{
"epoch": 0.37020316027088035,
"grad_norm": 33.7953987121582,
"learning_rate": 6.640253565768622e-06,
"loss": 2.4502,
"step": 492
},
{
"epoch": 0.3709556057185854,
"grad_norm": 20.189437866210938,
"learning_rate": 6.632329635499208e-06,
"loss": 2.2578,
"step": 493
},
{
"epoch": 0.37170805116629047,
"grad_norm": 24.063936233520508,
"learning_rate": 6.624405705229795e-06,
"loss": 2.75,
"step": 494
},
{
"epoch": 0.3724604966139955,
"grad_norm": 28.4869384765625,
"learning_rate": 6.616481774960381e-06,
"loss": 2.5381,
"step": 495
},
{
"epoch": 0.3732129420617005,
"grad_norm": 30.88617706298828,
"learning_rate": 6.608557844690967e-06,
"loss": 2.5,
"step": 496
},
{
"epoch": 0.37396538750940556,
"grad_norm": 22.271657943725586,
"learning_rate": 6.600633914421554e-06,
"loss": 3.2891,
"step": 497
},
{
"epoch": 0.3747178329571106,
"grad_norm": 27.613380432128906,
"learning_rate": 6.59270998415214e-06,
"loss": 2.8301,
"step": 498
},
{
"epoch": 0.37547027840481567,
"grad_norm": 19.099576950073242,
"learning_rate": 6.584786053882727e-06,
"loss": 2.3486,
"step": 499
},
{
"epoch": 0.3762227238525207,
"grad_norm": 30.90146827697754,
"learning_rate": 6.5768621236133124e-06,
"loss": 2.4004,
"step": 500
},
{
"epoch": 0.37697516930022573,
"grad_norm": 45.43968200683594,
"learning_rate": 6.568938193343899e-06,
"loss": 2.4746,
"step": 501
},
{
"epoch": 0.37772761474793076,
"grad_norm": 19.64021110534668,
"learning_rate": 6.561014263074486e-06,
"loss": 2.0723,
"step": 502
},
{
"epoch": 0.3784800601956358,
"grad_norm": 20.16904067993164,
"learning_rate": 6.553090332805072e-06,
"loss": 2.3096,
"step": 503
},
{
"epoch": 0.3792325056433409,
"grad_norm": 40.98538589477539,
"learning_rate": 6.545166402535659e-06,
"loss": 2.3213,
"step": 504
},
{
"epoch": 0.3799849510910459,
"grad_norm": 24.2117862701416,
"learning_rate": 6.537242472266244e-06,
"loss": 2.7021,
"step": 505
},
{
"epoch": 0.38073739653875094,
"grad_norm": 23.935213088989258,
"learning_rate": 6.5293185419968304e-06,
"loss": 2.293,
"step": 506
},
{
"epoch": 0.38148984198645597,
"grad_norm": 34.53661346435547,
"learning_rate": 6.5213946117274175e-06,
"loss": 2.3984,
"step": 507
},
{
"epoch": 0.382242287434161,
"grad_norm": 25.864501953125,
"learning_rate": 6.513470681458004e-06,
"loss": 2.2148,
"step": 508
},
{
"epoch": 0.3829947328818661,
"grad_norm": 32.563419342041016,
"learning_rate": 6.505546751188591e-06,
"loss": 3.1729,
"step": 509
},
{
"epoch": 0.3837471783295711,
"grad_norm": 22.276206970214844,
"learning_rate": 6.497622820919176e-06,
"loss": 2.6211,
"step": 510
},
{
"epoch": 0.38449962377727614,
"grad_norm": 26.751834869384766,
"learning_rate": 6.489698890649762e-06,
"loss": 2.6377,
"step": 511
},
{
"epoch": 0.3852520692249812,
"grad_norm": 20.746671676635742,
"learning_rate": 6.481774960380349e-06,
"loss": 2.4854,
"step": 512
},
{
"epoch": 0.3860045146726862,
"grad_norm": 32.32805633544922,
"learning_rate": 6.4738510301109355e-06,
"loss": 3.1035,
"step": 513
},
{
"epoch": 0.3867569601203913,
"grad_norm": 17.703580856323242,
"learning_rate": 6.4659270998415225e-06,
"loss": 2.3711,
"step": 514
},
{
"epoch": 0.3875094055680963,
"grad_norm": 27.50429344177246,
"learning_rate": 6.458003169572108e-06,
"loss": 2.6309,
"step": 515
},
{
"epoch": 0.38826185101580135,
"grad_norm": 21.104827880859375,
"learning_rate": 6.450079239302695e-06,
"loss": 2.6279,
"step": 516
},
{
"epoch": 0.3890142964635064,
"grad_norm": 20.792558670043945,
"learning_rate": 6.442155309033281e-06,
"loss": 2.6025,
"step": 517
},
{
"epoch": 0.3897667419112114,
"grad_norm": 35.51402282714844,
"learning_rate": 6.434231378763868e-06,
"loss": 2.499,
"step": 518
},
{
"epoch": 0.3905191873589165,
"grad_norm": 17.71217155456543,
"learning_rate": 6.426307448494454e-06,
"loss": 2.1094,
"step": 519
},
{
"epoch": 0.3912716328066215,
"grad_norm": 20.250795364379883,
"learning_rate": 6.41838351822504e-06,
"loss": 2.8779,
"step": 520
},
{
"epoch": 0.39202407825432656,
"grad_norm": 24.222923278808594,
"learning_rate": 6.410459587955627e-06,
"loss": 2.5967,
"step": 521
},
{
"epoch": 0.3927765237020316,
"grad_norm": 61.864662170410156,
"learning_rate": 6.402535657686213e-06,
"loss": 3.1523,
"step": 522
},
{
"epoch": 0.3935289691497366,
"grad_norm": 39.048553466796875,
"learning_rate": 6.3946117274168e-06,
"loss": 2.7773,
"step": 523
},
{
"epoch": 0.3942814145974417,
"grad_norm": 17.10451316833496,
"learning_rate": 6.386687797147385e-06,
"loss": 2.4375,
"step": 524
},
{
"epoch": 0.39503386004514673,
"grad_norm": 47.01410675048828,
"learning_rate": 6.3787638668779715e-06,
"loss": 2.5957,
"step": 525
},
{
"epoch": 0.39578630549285176,
"grad_norm": 29.18340301513672,
"learning_rate": 6.3708399366085585e-06,
"loss": 2.6543,
"step": 526
},
{
"epoch": 0.3965387509405568,
"grad_norm": 20.794097900390625,
"learning_rate": 6.362916006339145e-06,
"loss": 2.6143,
"step": 527
},
{
"epoch": 0.3972911963882618,
"grad_norm": 24.712759017944336,
"learning_rate": 6.354992076069732e-06,
"loss": 2.1914,
"step": 528
},
{
"epoch": 0.3980436418359669,
"grad_norm": 35.4837646484375,
"learning_rate": 6.347068145800317e-06,
"loss": 2.5078,
"step": 529
},
{
"epoch": 0.39879608728367194,
"grad_norm": 45.86260986328125,
"learning_rate": 6.339144215530903e-06,
"loss": 3.543,
"step": 530
},
{
"epoch": 0.39954853273137697,
"grad_norm": 59.333621978759766,
"learning_rate": 6.33122028526149e-06,
"loss": 2.9795,
"step": 531
},
{
"epoch": 0.400300978179082,
"grad_norm": 24.066104888916016,
"learning_rate": 6.3232963549920765e-06,
"loss": 2.627,
"step": 532
},
{
"epoch": 0.40105342362678703,
"grad_norm": 26.95193099975586,
"learning_rate": 6.3153724247226636e-06,
"loss": 2.5371,
"step": 533
},
{
"epoch": 0.4018058690744921,
"grad_norm": 23.84280776977539,
"learning_rate": 6.307448494453249e-06,
"loss": 1.8809,
"step": 534
},
{
"epoch": 0.40255831452219715,
"grad_norm": 22.4331111907959,
"learning_rate": 6.299524564183835e-06,
"loss": 2.3213,
"step": 535
},
{
"epoch": 0.4033107599699022,
"grad_norm": 22.308320999145508,
"learning_rate": 6.291600633914422e-06,
"loss": 2.2129,
"step": 536
},
{
"epoch": 0.4040632054176072,
"grad_norm": 26.568504333496094,
"learning_rate": 6.283676703645008e-06,
"loss": 2.3184,
"step": 537
},
{
"epoch": 0.40481565086531224,
"grad_norm": 66.00670623779297,
"learning_rate": 6.275752773375595e-06,
"loss": 3.3457,
"step": 538
},
{
"epoch": 0.4055680963130173,
"grad_norm": 43.05216979980469,
"learning_rate": 6.267828843106181e-06,
"loss": 2.3125,
"step": 539
},
{
"epoch": 0.40632054176072235,
"grad_norm": 31.19127655029297,
"learning_rate": 6.259904912836767e-06,
"loss": 2.623,
"step": 540
},
{
"epoch": 0.4070729872084274,
"grad_norm": 27.80292320251465,
"learning_rate": 6.251980982567354e-06,
"loss": 2.7129,
"step": 541
},
{
"epoch": 0.4078254326561324,
"grad_norm": 22.0871524810791,
"learning_rate": 6.24405705229794e-06,
"loss": 2.5322,
"step": 542
},
{
"epoch": 0.40857787810383744,
"grad_norm": 46.346588134765625,
"learning_rate": 6.236133122028527e-06,
"loss": 2.9668,
"step": 543
},
{
"epoch": 0.40933032355154253,
"grad_norm": 55.41552734375,
"learning_rate": 6.2282091917591125e-06,
"loss": 2.542,
"step": 544
},
{
"epoch": 0.41008276899924756,
"grad_norm": 21.744953155517578,
"learning_rate": 6.220285261489699e-06,
"loss": 2.4756,
"step": 545
},
{
"epoch": 0.4108352144469526,
"grad_norm": 54.682456970214844,
"learning_rate": 6.212361331220286e-06,
"loss": 2.7295,
"step": 546
},
{
"epoch": 0.4115876598946576,
"grad_norm": 27.935394287109375,
"learning_rate": 6.204437400950872e-06,
"loss": 2.6572,
"step": 547
},
{
"epoch": 0.4123401053423627,
"grad_norm": 41.32936477661133,
"learning_rate": 6.196513470681459e-06,
"loss": 2.1665,
"step": 548
},
{
"epoch": 0.41309255079006774,
"grad_norm": 16.199787139892578,
"learning_rate": 6.188589540412044e-06,
"loss": 2.2158,
"step": 549
},
{
"epoch": 0.41384499623777277,
"grad_norm": 31.044275283813477,
"learning_rate": 6.180665610142631e-06,
"loss": 2.4707,
"step": 550
},
{
"epoch": 0.4145974416854778,
"grad_norm": 24.700780868530273,
"learning_rate": 6.172741679873218e-06,
"loss": 2.5107,
"step": 551
},
{
"epoch": 0.4153498871331828,
"grad_norm": 43.95857238769531,
"learning_rate": 6.164817749603804e-06,
"loss": 2.6475,
"step": 552
},
{
"epoch": 0.4161023325808879,
"grad_norm": 38.148746490478516,
"learning_rate": 6.156893819334391e-06,
"loss": 2.2852,
"step": 553
},
{
"epoch": 0.41685477802859294,
"grad_norm": 35.26376724243164,
"learning_rate": 6.148969889064976e-06,
"loss": 2.335,
"step": 554
},
{
"epoch": 0.417607223476298,
"grad_norm": 21.494600296020508,
"learning_rate": 6.141045958795563e-06,
"loss": 2.7549,
"step": 555
},
{
"epoch": 0.418359668924003,
"grad_norm": 18.99728012084961,
"learning_rate": 6.133122028526149e-06,
"loss": 2.2207,
"step": 556
},
{
"epoch": 0.41911211437170803,
"grad_norm": 21.479780197143555,
"learning_rate": 6.1251980982567364e-06,
"loss": 2.292,
"step": 557
},
{
"epoch": 0.4198645598194131,
"grad_norm": 41.83938980102539,
"learning_rate": 6.117274167987323e-06,
"loss": 3.2812,
"step": 558
},
{
"epoch": 0.42061700526711815,
"grad_norm": 27.518131256103516,
"learning_rate": 6.109350237717908e-06,
"loss": 2.2949,
"step": 559
},
{
"epoch": 0.4213694507148232,
"grad_norm": 24.70359992980957,
"learning_rate": 6.101426307448495e-06,
"loss": 2.5342,
"step": 560
},
{
"epoch": 0.4221218961625282,
"grad_norm": 29.205961227416992,
"learning_rate": 6.093502377179081e-06,
"loss": 2.0991,
"step": 561
},
{
"epoch": 0.42287434161023324,
"grad_norm": 53.45262908935547,
"learning_rate": 6.085578446909668e-06,
"loss": 2.9941,
"step": 562
},
{
"epoch": 0.4236267870579383,
"grad_norm": 22.154132843017578,
"learning_rate": 6.0776545166402544e-06,
"loss": 2.748,
"step": 563
},
{
"epoch": 0.42437923250564336,
"grad_norm": 26.759519577026367,
"learning_rate": 6.06973058637084e-06,
"loss": 2.833,
"step": 564
},
{
"epoch": 0.4251316779533484,
"grad_norm": 38.618797302246094,
"learning_rate": 6.061806656101427e-06,
"loss": 2.9258,
"step": 565
},
{
"epoch": 0.4258841234010534,
"grad_norm": 25.398147583007812,
"learning_rate": 6.053882725832013e-06,
"loss": 2.251,
"step": 566
},
{
"epoch": 0.42663656884875845,
"grad_norm": 21.547807693481445,
"learning_rate": 6.0459587955626e-06,
"loss": 2.4678,
"step": 567
},
{
"epoch": 0.42738901429646353,
"grad_norm": 30.388839721679688,
"learning_rate": 6.038034865293186e-06,
"loss": 2.3594,
"step": 568
},
{
"epoch": 0.42814145974416856,
"grad_norm": 23.49654197692871,
"learning_rate": 6.030110935023772e-06,
"loss": 2.5439,
"step": 569
},
{
"epoch": 0.4288939051918736,
"grad_norm": 20.27910614013672,
"learning_rate": 6.022187004754359e-06,
"loss": 2.6602,
"step": 570
},
{
"epoch": 0.4296463506395786,
"grad_norm": 24.437910079956055,
"learning_rate": 6.014263074484945e-06,
"loss": 3.2051,
"step": 571
},
{
"epoch": 0.43039879608728365,
"grad_norm": 34.834957122802734,
"learning_rate": 6.006339144215532e-06,
"loss": 2.5488,
"step": 572
},
{
"epoch": 0.43115124153498874,
"grad_norm": 27.473241806030273,
"learning_rate": 5.998415213946118e-06,
"loss": 2.6602,
"step": 573
},
{
"epoch": 0.43190368698269377,
"grad_norm": 25.978803634643555,
"learning_rate": 5.990491283676703e-06,
"loss": 2.6504,
"step": 574
},
{
"epoch": 0.4326561324303988,
"grad_norm": 33.283329010009766,
"learning_rate": 5.9825673534072905e-06,
"loss": 2.7822,
"step": 575
},
{
"epoch": 0.43340857787810383,
"grad_norm": 26.3125,
"learning_rate": 5.974643423137877e-06,
"loss": 2.6143,
"step": 576
},
{
"epoch": 0.43416102332580886,
"grad_norm": 17.16096305847168,
"learning_rate": 5.966719492868464e-06,
"loss": 2.2725,
"step": 577
},
{
"epoch": 0.43491346877351394,
"grad_norm": 16.176515579223633,
"learning_rate": 5.95879556259905e-06,
"loss": 1.9639,
"step": 578
},
{
"epoch": 0.435665914221219,
"grad_norm": 16.54779815673828,
"learning_rate": 5.950871632329635e-06,
"loss": 2.4492,
"step": 579
},
{
"epoch": 0.436418359668924,
"grad_norm": 43.56422805786133,
"learning_rate": 5.942947702060222e-06,
"loss": 3.1377,
"step": 580
},
{
"epoch": 0.43717080511662904,
"grad_norm": 52.872222900390625,
"learning_rate": 5.9350237717908085e-06,
"loss": 2.4629,
"step": 581
},
{
"epoch": 0.43792325056433407,
"grad_norm": 22.31889533996582,
"learning_rate": 5.9270998415213955e-06,
"loss": 2.6084,
"step": 582
},
{
"epoch": 0.43867569601203915,
"grad_norm": 19.257272720336914,
"learning_rate": 5.919175911251982e-06,
"loss": 2.3779,
"step": 583
},
{
"epoch": 0.4394281414597442,
"grad_norm": 42.08190155029297,
"learning_rate": 5.911251980982568e-06,
"loss": 2.5972,
"step": 584
},
{
"epoch": 0.4401805869074492,
"grad_norm": 30.580303192138672,
"learning_rate": 5.903328050713154e-06,
"loss": 2.2529,
"step": 585
},
{
"epoch": 0.44093303235515424,
"grad_norm": 23.7304744720459,
"learning_rate": 5.89540412044374e-06,
"loss": 2.4678,
"step": 586
},
{
"epoch": 0.44168547780285927,
"grad_norm": 40.74665451049805,
"learning_rate": 5.887480190174327e-06,
"loss": 2.4082,
"step": 587
},
{
"epoch": 0.44243792325056436,
"grad_norm": 40.44230270385742,
"learning_rate": 5.8795562599049135e-06,
"loss": 2.8574,
"step": 588
},
{
"epoch": 0.4431903686982694,
"grad_norm": 33.095977783203125,
"learning_rate": 5.8716323296355e-06,
"loss": 3.1426,
"step": 589
},
{
"epoch": 0.4439428141459744,
"grad_norm": 42.75359344482422,
"learning_rate": 5.863708399366086e-06,
"loss": 2.3594,
"step": 590
},
{
"epoch": 0.44469525959367945,
"grad_norm": 49.90977478027344,
"learning_rate": 5.855784469096673e-06,
"loss": 2.4434,
"step": 591
},
{
"epoch": 0.4454477050413845,
"grad_norm": 24.432693481445312,
"learning_rate": 5.847860538827259e-06,
"loss": 2.3477,
"step": 592
},
{
"epoch": 0.44620015048908956,
"grad_norm": 19.295387268066406,
"learning_rate": 5.839936608557845e-06,
"loss": 2.3105,
"step": 593
},
{
"epoch": 0.4469525959367946,
"grad_norm": 27.823911666870117,
"learning_rate": 5.8320126782884315e-06,
"loss": 3.0078,
"step": 594
},
{
"epoch": 0.4477050413844996,
"grad_norm": 43.58839797973633,
"learning_rate": 5.824088748019018e-06,
"loss": 2.6113,
"step": 595
},
{
"epoch": 0.44845748683220465,
"grad_norm": 26.71452522277832,
"learning_rate": 5.816164817749605e-06,
"loss": 2.5693,
"step": 596
},
{
"epoch": 0.4492099322799097,
"grad_norm": 18.590194702148438,
"learning_rate": 5.808240887480191e-06,
"loss": 2.3506,
"step": 597
},
{
"epoch": 0.44996237772761477,
"grad_norm": 24.772584915161133,
"learning_rate": 5.800316957210776e-06,
"loss": 2.7256,
"step": 598
},
{
"epoch": 0.4507148231753198,
"grad_norm": 31.634029388427734,
"learning_rate": 5.792393026941363e-06,
"loss": 2.8867,
"step": 599
},
{
"epoch": 0.45146726862302483,
"grad_norm": 19.37485694885254,
"learning_rate": 5.7844690966719495e-06,
"loss": 2.9492,
"step": 600
},
{
"epoch": 0.45221971407072986,
"grad_norm": 16.228641510009766,
"learning_rate": 5.7765451664025366e-06,
"loss": 2.4717,
"step": 601
},
{
"epoch": 0.4529721595184349,
"grad_norm": 19.831296920776367,
"learning_rate": 5.768621236133123e-06,
"loss": 1.9902,
"step": 602
},
{
"epoch": 0.45372460496614,
"grad_norm": 19.382762908935547,
"learning_rate": 5.760697305863708e-06,
"loss": 2.8203,
"step": 603
},
{
"epoch": 0.454477050413845,
"grad_norm": 21.581256866455078,
"learning_rate": 5.752773375594295e-06,
"loss": 2.4434,
"step": 604
},
{
"epoch": 0.45522949586155004,
"grad_norm": 26.005481719970703,
"learning_rate": 5.744849445324881e-06,
"loss": 2.6973,
"step": 605
},
{
"epoch": 0.45598194130925507,
"grad_norm": 18.75592803955078,
"learning_rate": 5.736925515055468e-06,
"loss": 2.292,
"step": 606
},
{
"epoch": 0.4567343867569601,
"grad_norm": 20.964447021484375,
"learning_rate": 5.7290015847860546e-06,
"loss": 2.6367,
"step": 607
},
{
"epoch": 0.4574868322046652,
"grad_norm": 23.393482208251953,
"learning_rate": 5.72107765451664e-06,
"loss": 2.4834,
"step": 608
},
{
"epoch": 0.4582392776523702,
"grad_norm": 25.4880428314209,
"learning_rate": 5.713153724247227e-06,
"loss": 2.3193,
"step": 609
},
{
"epoch": 0.45899172310007524,
"grad_norm": 21.29010772705078,
"learning_rate": 5.705229793977813e-06,
"loss": 3.0508,
"step": 610
},
{
"epoch": 0.4597441685477803,
"grad_norm": 32.27946853637695,
"learning_rate": 5.6973058637084e-06,
"loss": 2.3809,
"step": 611
},
{
"epoch": 0.4604966139954853,
"grad_norm": 22.592269897460938,
"learning_rate": 5.689381933438986e-06,
"loss": 2.6006,
"step": 612
},
{
"epoch": 0.4612490594431904,
"grad_norm": 17.350631713867188,
"learning_rate": 5.681458003169572e-06,
"loss": 2.3916,
"step": 613
},
{
"epoch": 0.4620015048908954,
"grad_norm": 30.66547203063965,
"learning_rate": 5.673534072900159e-06,
"loss": 3.041,
"step": 614
},
{
"epoch": 0.46275395033860045,
"grad_norm": 16.543447494506836,
"learning_rate": 5.665610142630745e-06,
"loss": 2.4219,
"step": 615
},
{
"epoch": 0.4635063957863055,
"grad_norm": 17.638381958007812,
"learning_rate": 5.657686212361332e-06,
"loss": 2.2246,
"step": 616
},
{
"epoch": 0.4642588412340105,
"grad_norm": 18.163257598876953,
"learning_rate": 5.649762282091918e-06,
"loss": 2.7139,
"step": 617
},
{
"epoch": 0.4650112866817156,
"grad_norm": 29.288700103759766,
"learning_rate": 5.6418383518225035e-06,
"loss": 2.5703,
"step": 618
},
{
"epoch": 0.4657637321294206,
"grad_norm": 34.10881042480469,
"learning_rate": 5.6339144215530906e-06,
"loss": 2.9375,
"step": 619
},
{
"epoch": 0.46651617757712566,
"grad_norm": 16.555177688598633,
"learning_rate": 5.625990491283677e-06,
"loss": 2.2275,
"step": 620
},
{
"epoch": 0.4672686230248307,
"grad_norm": 19.65677261352539,
"learning_rate": 5.618066561014264e-06,
"loss": 2.2822,
"step": 621
},
{
"epoch": 0.4680210684725357,
"grad_norm": 21.970407485961914,
"learning_rate": 5.61014263074485e-06,
"loss": 2.793,
"step": 622
},
{
"epoch": 0.4687735139202408,
"grad_norm": 21.59258460998535,
"learning_rate": 5.602218700475436e-06,
"loss": 2.166,
"step": 623
},
{
"epoch": 0.46952595936794583,
"grad_norm": 22.609561920166016,
"learning_rate": 5.594294770206022e-06,
"loss": 2.4883,
"step": 624
},
{
"epoch": 0.47027840481565086,
"grad_norm": 34.6531867980957,
"learning_rate": 5.586370839936609e-06,
"loss": 2.5146,
"step": 625
},
{
"epoch": 0.4710308502633559,
"grad_norm": 26.691869735717773,
"learning_rate": 5.578446909667196e-06,
"loss": 2.7393,
"step": 626
},
{
"epoch": 0.4717832957110609,
"grad_norm": 28.01468276977539,
"learning_rate": 5.570522979397782e-06,
"loss": 2.9395,
"step": 627
},
{
"epoch": 0.472535741158766,
"grad_norm": 25.104557037353516,
"learning_rate": 5.562599049128368e-06,
"loss": 1.9766,
"step": 628
},
{
"epoch": 0.47328818660647104,
"grad_norm": 19.037647247314453,
"learning_rate": 5.554675118858954e-06,
"loss": 2.0107,
"step": 629
},
{
"epoch": 0.47404063205417607,
"grad_norm": 19.321691513061523,
"learning_rate": 5.546751188589541e-06,
"loss": 1.8467,
"step": 630
},
{
"epoch": 0.4747930775018811,
"grad_norm": 36.88700866699219,
"learning_rate": 5.5388272583201274e-06,
"loss": 1.9414,
"step": 631
},
{
"epoch": 0.47554552294958613,
"grad_norm": 28.881858825683594,
"learning_rate": 5.5309033280507145e-06,
"loss": 2.6367,
"step": 632
},
{
"epoch": 0.4762979683972912,
"grad_norm": 32.7785758972168,
"learning_rate": 5.5229793977813e-06,
"loss": 2.2744,
"step": 633
},
{
"epoch": 0.47705041384499625,
"grad_norm": 33.875144958496094,
"learning_rate": 5.515055467511886e-06,
"loss": 3.0371,
"step": 634
},
{
"epoch": 0.4778028592927013,
"grad_norm": 26.45054817199707,
"learning_rate": 5.507131537242473e-06,
"loss": 2.4355,
"step": 635
},
{
"epoch": 0.4785553047404063,
"grad_norm": 21.72730827331543,
"learning_rate": 5.499207606973059e-06,
"loss": 2.2876,
"step": 636
},
{
"epoch": 0.47930775018811134,
"grad_norm": 42.81020736694336,
"learning_rate": 5.491283676703646e-06,
"loss": 3.3457,
"step": 637
},
{
"epoch": 0.4800601956358164,
"grad_norm": 37.48054504394531,
"learning_rate": 5.483359746434232e-06,
"loss": 2.5371,
"step": 638
},
{
"epoch": 0.48081264108352145,
"grad_norm": 38.4602165222168,
"learning_rate": 5.475435816164818e-06,
"loss": 2.4478,
"step": 639
},
{
"epoch": 0.4815650865312265,
"grad_norm": 31.48103141784668,
"learning_rate": 5.467511885895405e-06,
"loss": 2.6201,
"step": 640
},
{
"epoch": 0.4823175319789315,
"grad_norm": 43.95348358154297,
"learning_rate": 5.459587955625991e-06,
"loss": 2.6621,
"step": 641
},
{
"epoch": 0.48306997742663654,
"grad_norm": 25.062053680419922,
"learning_rate": 5.451664025356578e-06,
"loss": 2.6729,
"step": 642
},
{
"epoch": 0.48382242287434163,
"grad_norm": 41.68134689331055,
"learning_rate": 5.4437400950871634e-06,
"loss": 2.5586,
"step": 643
},
{
"epoch": 0.48457486832204666,
"grad_norm": 28.32135581970215,
"learning_rate": 5.43581616481775e-06,
"loss": 2.6699,
"step": 644
},
{
"epoch": 0.4853273137697517,
"grad_norm": 45.01399612426758,
"learning_rate": 5.427892234548337e-06,
"loss": 2.5332,
"step": 645
},
{
"epoch": 0.4860797592174567,
"grad_norm": 34.67613220214844,
"learning_rate": 5.419968304278923e-06,
"loss": 2.3018,
"step": 646
},
{
"epoch": 0.48683220466516175,
"grad_norm": 19.990108489990234,
"learning_rate": 5.41204437400951e-06,
"loss": 2.1064,
"step": 647
},
{
"epoch": 0.48758465011286684,
"grad_norm": 20.448556900024414,
"learning_rate": 5.404120443740095e-06,
"loss": 2.4502,
"step": 648
},
{
"epoch": 0.48833709556057187,
"grad_norm": 16.833580017089844,
"learning_rate": 5.3961965134706814e-06,
"loss": 2.0293,
"step": 649
},
{
"epoch": 0.4890895410082769,
"grad_norm": 31.61375617980957,
"learning_rate": 5.3882725832012685e-06,
"loss": 2.4951,
"step": 650
},
{
"epoch": 0.4898419864559819,
"grad_norm": 37.58226013183594,
"learning_rate": 5.380348652931855e-06,
"loss": 2.8955,
"step": 651
},
{
"epoch": 0.49059443190368696,
"grad_norm": 61.47941589355469,
"learning_rate": 5.372424722662442e-06,
"loss": 3.2871,
"step": 652
},
{
"epoch": 0.49134687735139204,
"grad_norm": 28.28410530090332,
"learning_rate": 5.364500792393027e-06,
"loss": 2.3145,
"step": 653
},
{
"epoch": 0.49209932279909707,
"grad_norm": 51.59183883666992,
"learning_rate": 5.356576862123613e-06,
"loss": 2.7842,
"step": 654
},
{
"epoch": 0.4928517682468021,
"grad_norm": 19.695018768310547,
"learning_rate": 5.3486529318542e-06,
"loss": 2.2412,
"step": 655
},
{
"epoch": 0.49360421369450713,
"grad_norm": 22.1850528717041,
"learning_rate": 5.3407290015847865e-06,
"loss": 2.7324,
"step": 656
},
{
"epoch": 0.49435665914221216,
"grad_norm": 22.114850997924805,
"learning_rate": 5.3328050713153735e-06,
"loss": 2.0723,
"step": 657
},
{
"epoch": 0.49510910458991725,
"grad_norm": 38.758094787597656,
"learning_rate": 5.324881141045959e-06,
"loss": 2.9834,
"step": 658
},
{
"epoch": 0.4958615500376223,
"grad_norm": 23.552003860473633,
"learning_rate": 5.316957210776545e-06,
"loss": 2.6865,
"step": 659
},
{
"epoch": 0.4966139954853273,
"grad_norm": 25.014806747436523,
"learning_rate": 5.309033280507132e-06,
"loss": 2.7842,
"step": 660
},
{
"epoch": 0.49736644093303234,
"grad_norm": 24.715150833129883,
"learning_rate": 5.301109350237718e-06,
"loss": 1.8623,
"step": 661
},
{
"epoch": 0.49811888638073737,
"grad_norm": 21.476627349853516,
"learning_rate": 5.293185419968305e-06,
"loss": 2.1055,
"step": 662
},
{
"epoch": 0.49887133182844245,
"grad_norm": 20.212873458862305,
"learning_rate": 5.285261489698891e-06,
"loss": 2.1934,
"step": 663
},
{
"epoch": 0.4996237772761475,
"grad_norm": 21.110328674316406,
"learning_rate": 5.277337559429478e-06,
"loss": 2.7734,
"step": 664
},
{
"epoch": 0.5003762227238525,
"grad_norm": 23.664304733276367,
"learning_rate": 5.269413629160064e-06,
"loss": 2.3623,
"step": 665
},
{
"epoch": 0.5011286681715575,
"grad_norm": 24.913185119628906,
"learning_rate": 5.26148969889065e-06,
"loss": 2.5967,
"step": 666
},
{
"epoch": 0.5018811136192626,
"grad_norm": 23.190227508544922,
"learning_rate": 5.253565768621236e-06,
"loss": 2.6738,
"step": 667
},
{
"epoch": 0.5026335590669676,
"grad_norm": 21.779712677001953,
"learning_rate": 5.2456418383518225e-06,
"loss": 2.6104,
"step": 668
},
{
"epoch": 0.5033860045146726,
"grad_norm": 26.490779876708984,
"learning_rate": 5.2377179080824095e-06,
"loss": 2.2998,
"step": 669
},
{
"epoch": 0.5041384499623778,
"grad_norm": 16.361080169677734,
"learning_rate": 5.229793977812996e-06,
"loss": 2.2891,
"step": 670
},
{
"epoch": 0.5048908954100828,
"grad_norm": 26.185226440429688,
"learning_rate": 5.221870047543583e-06,
"loss": 2.3906,
"step": 671
},
{
"epoch": 0.5056433408577878,
"grad_norm": 21.47825050354004,
"learning_rate": 5.213946117274168e-06,
"loss": 2.6611,
"step": 672
},
{
"epoch": 0.5063957863054929,
"grad_norm": 37.27735900878906,
"learning_rate": 5.206022187004754e-06,
"loss": 2.7812,
"step": 673
},
{
"epoch": 0.5071482317531979,
"grad_norm": 18.173118591308594,
"learning_rate": 5.198098256735341e-06,
"loss": 1.9253,
"step": 674
},
{
"epoch": 0.5079006772009029,
"grad_norm": 35.55131912231445,
"learning_rate": 5.1901743264659275e-06,
"loss": 2.1836,
"step": 675
},
{
"epoch": 0.508653122648608,
"grad_norm": 17.132335662841797,
"learning_rate": 5.182250396196515e-06,
"loss": 2.2422,
"step": 676
},
{
"epoch": 0.509405568096313,
"grad_norm": 17.84613800048828,
"learning_rate": 5.1743264659271e-06,
"loss": 2.3105,
"step": 677
},
{
"epoch": 0.510158013544018,
"grad_norm": 29.677824020385742,
"learning_rate": 5.166402535657686e-06,
"loss": 2.6875,
"step": 678
},
{
"epoch": 0.510910458991723,
"grad_norm": 34.693511962890625,
"learning_rate": 5.158478605388273e-06,
"loss": 2.5439,
"step": 679
},
{
"epoch": 0.5116629044394282,
"grad_norm": 22.879697799682617,
"learning_rate": 5.150554675118859e-06,
"loss": 2.4609,
"step": 680
},
{
"epoch": 0.5124153498871332,
"grad_norm": 21.615089416503906,
"learning_rate": 5.142630744849446e-06,
"loss": 2.5283,
"step": 681
},
{
"epoch": 0.5131677953348383,
"grad_norm": 31.141887664794922,
"learning_rate": 5.134706814580032e-06,
"loss": 2.4053,
"step": 682
},
{
"epoch": 0.5139202407825433,
"grad_norm": 36.289127349853516,
"learning_rate": 5.126782884310618e-06,
"loss": 2.4961,
"step": 683
},
{
"epoch": 0.5146726862302483,
"grad_norm": 25.587385177612305,
"learning_rate": 5.118858954041205e-06,
"loss": 2.7744,
"step": 684
},
{
"epoch": 0.5154251316779533,
"grad_norm": 30.78696060180664,
"learning_rate": 5.110935023771791e-06,
"loss": 2.2627,
"step": 685
},
{
"epoch": 0.5161775771256584,
"grad_norm": 28.154029846191406,
"learning_rate": 5.103011093502378e-06,
"loss": 2.627,
"step": 686
},
{
"epoch": 0.5169300225733634,
"grad_norm": 23.078495025634766,
"learning_rate": 5.0950871632329636e-06,
"loss": 2.251,
"step": 687
},
{
"epoch": 0.5176824680210684,
"grad_norm": 18.885520935058594,
"learning_rate": 5.08716323296355e-06,
"loss": 2.1416,
"step": 688
},
{
"epoch": 0.5184349134687735,
"grad_norm": 21.535594940185547,
"learning_rate": 5.079239302694137e-06,
"loss": 2.5967,
"step": 689
},
{
"epoch": 0.5191873589164786,
"grad_norm": 23.832807540893555,
"learning_rate": 5.071315372424723e-06,
"loss": 2.6113,
"step": 690
},
{
"epoch": 0.5199398043641836,
"grad_norm": 25.892114639282227,
"learning_rate": 5.06339144215531e-06,
"loss": 2.1582,
"step": 691
},
{
"epoch": 0.5206922498118887,
"grad_norm": 40.396018981933594,
"learning_rate": 5.055467511885895e-06,
"loss": 2.5918,
"step": 692
},
{
"epoch": 0.5214446952595937,
"grad_norm": 30.396682739257812,
"learning_rate": 5.0475435816164816e-06,
"loss": 2.5098,
"step": 693
},
{
"epoch": 0.5221971407072987,
"grad_norm": 28.615299224853516,
"learning_rate": 5.039619651347069e-06,
"loss": 2.6514,
"step": 694
},
{
"epoch": 0.5229495861550038,
"grad_norm": 16.894996643066406,
"learning_rate": 5.031695721077655e-06,
"loss": 2.249,
"step": 695
},
{
"epoch": 0.5237020316027088,
"grad_norm": 27.297014236450195,
"learning_rate": 5.023771790808242e-06,
"loss": 2.582,
"step": 696
},
{
"epoch": 0.5244544770504138,
"grad_norm": 26.86957550048828,
"learning_rate": 5.015847860538827e-06,
"loss": 2.1602,
"step": 697
},
{
"epoch": 0.5252069224981188,
"grad_norm": 22.478004455566406,
"learning_rate": 5.007923930269414e-06,
"loss": 2.208,
"step": 698
},
{
"epoch": 0.5259593679458239,
"grad_norm": 16.2978515625,
"learning_rate": 5e-06,
"loss": 2.1294,
"step": 699
},
{
"epoch": 0.526711813393529,
"grad_norm": 26.94550895690918,
"learning_rate": 4.992076069730587e-06,
"loss": 2.9023,
"step": 700
},
{
"epoch": 0.527464258841234,
"grad_norm": 21.26068687438965,
"learning_rate": 4.984152139461173e-06,
"loss": 1.9463,
"step": 701
},
{
"epoch": 0.5282167042889391,
"grad_norm": 17.80802345275879,
"learning_rate": 4.97622820919176e-06,
"loss": 2.2588,
"step": 702
},
{
"epoch": 0.5289691497366441,
"grad_norm": 22.475358963012695,
"learning_rate": 4.968304278922346e-06,
"loss": 2.25,
"step": 703
},
{
"epoch": 0.5297215951843491,
"grad_norm": 29.277667999267578,
"learning_rate": 4.960380348652932e-06,
"loss": 2.5811,
"step": 704
},
{
"epoch": 0.5304740406320542,
"grad_norm": 45.544376373291016,
"learning_rate": 4.952456418383519e-06,
"loss": 2.7852,
"step": 705
},
{
"epoch": 0.5312264860797592,
"grad_norm": 22.807954788208008,
"learning_rate": 4.944532488114105e-06,
"loss": 2.4385,
"step": 706
},
{
"epoch": 0.5319789315274642,
"grad_norm": 24.76763153076172,
"learning_rate": 4.936608557844692e-06,
"loss": 3.041,
"step": 707
},
{
"epoch": 0.5327313769751693,
"grad_norm": 32.09806823730469,
"learning_rate": 4.928684627575278e-06,
"loss": 2.3145,
"step": 708
},
{
"epoch": 0.5334838224228743,
"grad_norm": 23.560874938964844,
"learning_rate": 4.920760697305864e-06,
"loss": 3.0161,
"step": 709
},
{
"epoch": 0.5342362678705794,
"grad_norm": 20.40456199645996,
"learning_rate": 4.912836767036451e-06,
"loss": 2.251,
"step": 710
},
{
"epoch": 0.5349887133182845,
"grad_norm": 29.588829040527344,
"learning_rate": 4.904912836767036e-06,
"loss": 2.5146,
"step": 711
},
{
"epoch": 0.5357411587659895,
"grad_norm": 29.529502868652344,
"learning_rate": 4.8969889064976235e-06,
"loss": 3.1211,
"step": 712
},
{
"epoch": 0.5364936042136945,
"grad_norm": 17.787813186645508,
"learning_rate": 4.88906497622821e-06,
"loss": 2.3545,
"step": 713
},
{
"epoch": 0.5372460496613995,
"grad_norm": 21.947816848754883,
"learning_rate": 4.881141045958796e-06,
"loss": 2.375,
"step": 714
},
{
"epoch": 0.5379984951091046,
"grad_norm": 22.692014694213867,
"learning_rate": 4.873217115689383e-06,
"loss": 2.8008,
"step": 715
},
{
"epoch": 0.5387509405568096,
"grad_norm": 19.035554885864258,
"learning_rate": 4.865293185419968e-06,
"loss": 2.7129,
"step": 716
},
{
"epoch": 0.5395033860045146,
"grad_norm": 15.194079399108887,
"learning_rate": 4.857369255150555e-06,
"loss": 2.3203,
"step": 717
},
{
"epoch": 0.5402558314522197,
"grad_norm": 18.011316299438477,
"learning_rate": 4.8494453248811415e-06,
"loss": 2.3613,
"step": 718
},
{
"epoch": 0.5410082768999247,
"grad_norm": 21.924701690673828,
"learning_rate": 4.841521394611728e-06,
"loss": 2.4072,
"step": 719
},
{
"epoch": 0.5417607223476298,
"grad_norm": 40.228458404541016,
"learning_rate": 4.833597464342314e-06,
"loss": 2.6533,
"step": 720
},
{
"epoch": 0.5425131677953349,
"grad_norm": 16.44277572631836,
"learning_rate": 4.825673534072901e-06,
"loss": 1.8018,
"step": 721
},
{
"epoch": 0.5432656132430399,
"grad_norm": 25.804616928100586,
"learning_rate": 4.817749603803487e-06,
"loss": 2.4727,
"step": 722
},
{
"epoch": 0.5440180586907449,
"grad_norm": 26.694299697875977,
"learning_rate": 4.809825673534073e-06,
"loss": 3.1221,
"step": 723
},
{
"epoch": 0.54477050413845,
"grad_norm": 26.506877899169922,
"learning_rate": 4.8019017432646595e-06,
"loss": 2.3232,
"step": 724
},
{
"epoch": 0.545522949586155,
"grad_norm": 24.158933639526367,
"learning_rate": 4.793977812995246e-06,
"loss": 2.7012,
"step": 725
},
{
"epoch": 0.54627539503386,
"grad_norm": 25.846845626831055,
"learning_rate": 4.786053882725833e-06,
"loss": 2.5986,
"step": 726
},
{
"epoch": 0.547027840481565,
"grad_norm": 24.62186622619629,
"learning_rate": 4.778129952456419e-06,
"loss": 2.5259,
"step": 727
},
{
"epoch": 0.5477802859292701,
"grad_norm": 26.85883331298828,
"learning_rate": 4.770206022187005e-06,
"loss": 3.0781,
"step": 728
},
{
"epoch": 0.5485327313769752,
"grad_norm": 27.05722427368164,
"learning_rate": 4.762282091917591e-06,
"loss": 2.2344,
"step": 729
},
{
"epoch": 0.5492851768246803,
"grad_norm": 30.71999740600586,
"learning_rate": 4.7543581616481775e-06,
"loss": 2.6318,
"step": 730
},
{
"epoch": 0.5500376222723853,
"grad_norm": 19.75069236755371,
"learning_rate": 4.7464342313787645e-06,
"loss": 2.0889,
"step": 731
},
{
"epoch": 0.5507900677200903,
"grad_norm": 20.5186710357666,
"learning_rate": 4.738510301109351e-06,
"loss": 2.2495,
"step": 732
},
{
"epoch": 0.5515425131677953,
"grad_norm": 30.734027862548828,
"learning_rate": 4.730586370839937e-06,
"loss": 2.5781,
"step": 733
},
{
"epoch": 0.5522949586155004,
"grad_norm": 27.587772369384766,
"learning_rate": 4.722662440570523e-06,
"loss": 2.085,
"step": 734
},
{
"epoch": 0.5530474040632054,
"grad_norm": 21.96542739868164,
"learning_rate": 4.714738510301109e-06,
"loss": 2.6475,
"step": 735
},
{
"epoch": 0.5537998495109104,
"grad_norm": 34.89696502685547,
"learning_rate": 4.706814580031696e-06,
"loss": 2.2959,
"step": 736
},
{
"epoch": 0.5545522949586155,
"grad_norm": 23.43917465209961,
"learning_rate": 4.6988906497622825e-06,
"loss": 2.3931,
"step": 737
},
{
"epoch": 0.5553047404063205,
"grad_norm": 33.7728385925293,
"learning_rate": 4.690966719492869e-06,
"loss": 2.75,
"step": 738
},
{
"epoch": 0.5560571858540256,
"grad_norm": 25.656301498413086,
"learning_rate": 4.683042789223456e-06,
"loss": 2.7324,
"step": 739
},
{
"epoch": 0.5568096313017307,
"grad_norm": 23.849233627319336,
"learning_rate": 4.675118858954041e-06,
"loss": 2.7832,
"step": 740
},
{
"epoch": 0.5575620767494357,
"grad_norm": 24.854156494140625,
"learning_rate": 4.667194928684628e-06,
"loss": 2.8164,
"step": 741
},
{
"epoch": 0.5583145221971407,
"grad_norm": 20.215167999267578,
"learning_rate": 4.659270998415214e-06,
"loss": 2.4922,
"step": 742
},
{
"epoch": 0.5590669676448458,
"grad_norm": 21.410198211669922,
"learning_rate": 4.6513470681458005e-06,
"loss": 2.6812,
"step": 743
},
{
"epoch": 0.5598194130925508,
"grad_norm": 37.980003356933594,
"learning_rate": 4.6434231378763876e-06,
"loss": 2.6025,
"step": 744
},
{
"epoch": 0.5605718585402558,
"grad_norm": 27.364931106567383,
"learning_rate": 4.635499207606973e-06,
"loss": 2.1377,
"step": 745
},
{
"epoch": 0.5613243039879608,
"grad_norm": 36.03047180175781,
"learning_rate": 4.62757527733756e-06,
"loss": 2.4512,
"step": 746
},
{
"epoch": 0.5620767494356659,
"grad_norm": 20.39134407043457,
"learning_rate": 4.619651347068146e-06,
"loss": 2.71,
"step": 747
},
{
"epoch": 0.5628291948833709,
"grad_norm": 16.574569702148438,
"learning_rate": 4.611727416798732e-06,
"loss": 2.2793,
"step": 748
},
{
"epoch": 0.563581640331076,
"grad_norm": 27.37680435180664,
"learning_rate": 4.603803486529319e-06,
"loss": 2.752,
"step": 749
},
{
"epoch": 0.5643340857787811,
"grad_norm": 19.884410858154297,
"learning_rate": 4.595879556259905e-06,
"loss": 2.3926,
"step": 750
},
{
"epoch": 0.5650865312264861,
"grad_norm": 19.763639450073242,
"learning_rate": 4.587955625990492e-06,
"loss": 2.833,
"step": 751
},
{
"epoch": 0.5658389766741911,
"grad_norm": 19.394676208496094,
"learning_rate": 4.580031695721078e-06,
"loss": 2.3857,
"step": 752
},
{
"epoch": 0.5665914221218962,
"grad_norm": 18.420915603637695,
"learning_rate": 4.572107765451664e-06,
"loss": 2.376,
"step": 753
},
{
"epoch": 0.5673438675696012,
"grad_norm": 25.283594131469727,
"learning_rate": 4.564183835182251e-06,
"loss": 2.5928,
"step": 754
},
{
"epoch": 0.5680963130173062,
"grad_norm": 21.629606246948242,
"learning_rate": 4.556259904912837e-06,
"loss": 2.5918,
"step": 755
},
{
"epoch": 0.5688487584650113,
"grad_norm": 24.263954162597656,
"learning_rate": 4.5483359746434236e-06,
"loss": 2.2422,
"step": 756
},
{
"epoch": 0.5696012039127163,
"grad_norm": 29.89535903930664,
"learning_rate": 4.54041204437401e-06,
"loss": 2.3711,
"step": 757
},
{
"epoch": 0.5703536493604213,
"grad_norm": 21.822158813476562,
"learning_rate": 4.532488114104596e-06,
"loss": 2.6113,
"step": 758
},
{
"epoch": 0.5711060948081265,
"grad_norm": 21.718902587890625,
"learning_rate": 4.524564183835183e-06,
"loss": 2.46,
"step": 759
},
{
"epoch": 0.5718585402558315,
"grad_norm": 21.7495059967041,
"learning_rate": 4.516640253565769e-06,
"loss": 2.0186,
"step": 760
},
{
"epoch": 0.5726109857035365,
"grad_norm": 21.98088836669922,
"learning_rate": 4.508716323296355e-06,
"loss": 2.7881,
"step": 761
},
{
"epoch": 0.5733634311512416,
"grad_norm": 43.60248947143555,
"learning_rate": 4.500792393026942e-06,
"loss": 2.8164,
"step": 762
},
{
"epoch": 0.5741158765989466,
"grad_norm": 20.0263614654541,
"learning_rate": 4.492868462757528e-06,
"loss": 1.9448,
"step": 763
},
{
"epoch": 0.5748683220466516,
"grad_norm": 21.096193313598633,
"learning_rate": 4.484944532488115e-06,
"loss": 2.1396,
"step": 764
},
{
"epoch": 0.5756207674943566,
"grad_norm": 19.693714141845703,
"learning_rate": 4.477020602218701e-06,
"loss": 2.2139,
"step": 765
},
{
"epoch": 0.5763732129420617,
"grad_norm": 19.59661293029785,
"learning_rate": 4.469096671949287e-06,
"loss": 2.5938,
"step": 766
},
{
"epoch": 0.5771256583897667,
"grad_norm": 17.570878982543945,
"learning_rate": 4.461172741679873e-06,
"loss": 2.0215,
"step": 767
},
{
"epoch": 0.5778781038374717,
"grad_norm": 29.355201721191406,
"learning_rate": 4.45324881141046e-06,
"loss": 2.2734,
"step": 768
},
{
"epoch": 0.5786305492851769,
"grad_norm": 36.004547119140625,
"learning_rate": 4.445324881141047e-06,
"loss": 3.3457,
"step": 769
},
{
"epoch": 0.5793829947328819,
"grad_norm": 21.928863525390625,
"learning_rate": 4.437400950871633e-06,
"loss": 2.2832,
"step": 770
},
{
"epoch": 0.5801354401805869,
"grad_norm": 24.527210235595703,
"learning_rate": 4.429477020602219e-06,
"loss": 2.8252,
"step": 771
},
{
"epoch": 0.580887885628292,
"grad_norm": 30.435258865356445,
"learning_rate": 4.421553090332805e-06,
"loss": 2.1875,
"step": 772
},
{
"epoch": 0.581640331075997,
"grad_norm": 21.116355895996094,
"learning_rate": 4.413629160063391e-06,
"loss": 2.2197,
"step": 773
},
{
"epoch": 0.582392776523702,
"grad_norm": 22.062442779541016,
"learning_rate": 4.4057052297939784e-06,
"loss": 2.8711,
"step": 774
},
{
"epoch": 0.5831452219714071,
"grad_norm": 25.258756637573242,
"learning_rate": 4.397781299524565e-06,
"loss": 2.0986,
"step": 775
},
{
"epoch": 0.5838976674191121,
"grad_norm": 31.283647537231445,
"learning_rate": 4.389857369255151e-06,
"loss": 2.1328,
"step": 776
},
{
"epoch": 0.5846501128668171,
"grad_norm": 20.281776428222656,
"learning_rate": 4.381933438985737e-06,
"loss": 2.167,
"step": 777
},
{
"epoch": 0.5854025583145221,
"grad_norm": 27.260391235351562,
"learning_rate": 4.374009508716324e-06,
"loss": 2.0811,
"step": 778
},
{
"epoch": 0.5861550037622273,
"grad_norm": 37.832515716552734,
"learning_rate": 4.36608557844691e-06,
"loss": 2.9639,
"step": 779
},
{
"epoch": 0.5869074492099323,
"grad_norm": 22.24700164794922,
"learning_rate": 4.3581616481774964e-06,
"loss": 2.3242,
"step": 780
},
{
"epoch": 0.5876598946576373,
"grad_norm": 32.77638626098633,
"learning_rate": 4.350237717908083e-06,
"loss": 1.8867,
"step": 781
},
{
"epoch": 0.5884123401053424,
"grad_norm": 22.048324584960938,
"learning_rate": 4.342313787638669e-06,
"loss": 1.8193,
"step": 782
},
{
"epoch": 0.5891647855530474,
"grad_norm": 21.58878517150879,
"learning_rate": 4.334389857369256e-06,
"loss": 3.0986,
"step": 783
},
{
"epoch": 0.5899172310007524,
"grad_norm": 18.198331832885742,
"learning_rate": 4.326465927099842e-06,
"loss": 1.9155,
"step": 784
},
{
"epoch": 0.5906696764484575,
"grad_norm": 31.84845733642578,
"learning_rate": 4.318541996830428e-06,
"loss": 1.9473,
"step": 785
},
{
"epoch": 0.5914221218961625,
"grad_norm": 24.339372634887695,
"learning_rate": 4.3106180665610144e-06,
"loss": 2.5181,
"step": 786
},
{
"epoch": 0.5921745673438675,
"grad_norm": 26.267627716064453,
"learning_rate": 4.302694136291601e-06,
"loss": 2.3174,
"step": 787
},
{
"epoch": 0.5929270127915726,
"grad_norm": 28.163755416870117,
"learning_rate": 4.294770206022188e-06,
"loss": 2.3691,
"step": 788
},
{
"epoch": 0.5936794582392777,
"grad_norm": 23.921737670898438,
"learning_rate": 4.286846275752774e-06,
"loss": 1.9277,
"step": 789
},
{
"epoch": 0.5944319036869827,
"grad_norm": 30.403043746948242,
"learning_rate": 4.27892234548336e-06,
"loss": 2.7305,
"step": 790
},
{
"epoch": 0.5951843491346878,
"grad_norm": 21.035133361816406,
"learning_rate": 4.270998415213946e-06,
"loss": 1.9492,
"step": 791
},
{
"epoch": 0.5959367945823928,
"grad_norm": 25.801963806152344,
"learning_rate": 4.2630744849445325e-06,
"loss": 2.6982,
"step": 792
},
{
"epoch": 0.5966892400300978,
"grad_norm": 28.979450225830078,
"learning_rate": 4.2551505546751195e-06,
"loss": 2.3574,
"step": 793
},
{
"epoch": 0.5974416854778029,
"grad_norm": 34.52888107299805,
"learning_rate": 4.247226624405706e-06,
"loss": 2.5796,
"step": 794
},
{
"epoch": 0.5981941309255079,
"grad_norm": 24.638835906982422,
"learning_rate": 4.239302694136292e-06,
"loss": 1.9937,
"step": 795
},
{
"epoch": 0.5989465763732129,
"grad_norm": 32.92772674560547,
"learning_rate": 4.231378763866879e-06,
"loss": 2.0918,
"step": 796
},
{
"epoch": 0.5996990218209179,
"grad_norm": 34.20918273925781,
"learning_rate": 4.223454833597464e-06,
"loss": 2.6455,
"step": 797
},
{
"epoch": 0.600451467268623,
"grad_norm": 21.07083511352539,
"learning_rate": 4.215530903328051e-06,
"loss": 2.2432,
"step": 798
},
{
"epoch": 0.6012039127163281,
"grad_norm": 22.572980880737305,
"learning_rate": 4.2076069730586375e-06,
"loss": 2.1455,
"step": 799
},
{
"epoch": 0.6019563581640331,
"grad_norm": 19.050878524780273,
"learning_rate": 4.199683042789224e-06,
"loss": 1.998,
"step": 800
},
{
"epoch": 0.6027088036117382,
"grad_norm": 17.435394287109375,
"learning_rate": 4.191759112519811e-06,
"loss": 2.0913,
"step": 801
},
{
"epoch": 0.6034612490594432,
"grad_norm": 24.223140716552734,
"learning_rate": 4.183835182250396e-06,
"loss": 2.2686,
"step": 802
},
{
"epoch": 0.6042136945071482,
"grad_norm": 22.403162002563477,
"learning_rate": 4.175911251980983e-06,
"loss": 2.7402,
"step": 803
},
{
"epoch": 0.6049661399548533,
"grad_norm": 24.95384979248047,
"learning_rate": 4.167987321711569e-06,
"loss": 2.5898,
"step": 804
},
{
"epoch": 0.6057185854025583,
"grad_norm": 54.6262321472168,
"learning_rate": 4.1600633914421555e-06,
"loss": 2.5508,
"step": 805
},
{
"epoch": 0.6064710308502633,
"grad_norm": 31.041257858276367,
"learning_rate": 4.1521394611727425e-06,
"loss": 2.2725,
"step": 806
},
{
"epoch": 0.6072234762979684,
"grad_norm": 25.92288589477539,
"learning_rate": 4.144215530903328e-06,
"loss": 2.2832,
"step": 807
},
{
"epoch": 0.6079759217456734,
"grad_norm": 30.848102569580078,
"learning_rate": 4.136291600633915e-06,
"loss": 2.2432,
"step": 808
},
{
"epoch": 0.6087283671933785,
"grad_norm": 25.898963928222656,
"learning_rate": 4.128367670364501e-06,
"loss": 2.5078,
"step": 809
},
{
"epoch": 0.6094808126410836,
"grad_norm": 19.408071517944336,
"learning_rate": 4.120443740095087e-06,
"loss": 2.458,
"step": 810
},
{
"epoch": 0.6102332580887886,
"grad_norm": 24.330867767333984,
"learning_rate": 4.112519809825674e-06,
"loss": 2.3916,
"step": 811
},
{
"epoch": 0.6109857035364936,
"grad_norm": 38.193077087402344,
"learning_rate": 4.1045958795562605e-06,
"loss": 2.1689,
"step": 812
},
{
"epoch": 0.6117381489841986,
"grad_norm": 19.677576065063477,
"learning_rate": 4.096671949286847e-06,
"loss": 2.0405,
"step": 813
},
{
"epoch": 0.6124905944319037,
"grad_norm": 25.960607528686523,
"learning_rate": 4.088748019017433e-06,
"loss": 2.8359,
"step": 814
},
{
"epoch": 0.6132430398796087,
"grad_norm": 26.78199005126953,
"learning_rate": 4.080824088748019e-06,
"loss": 2.7383,
"step": 815
},
{
"epoch": 0.6139954853273137,
"grad_norm": 37.22509002685547,
"learning_rate": 4.072900158478606e-06,
"loss": 2.3555,
"step": 816
},
{
"epoch": 0.6147479307750188,
"grad_norm": 21.195919036865234,
"learning_rate": 4.064976228209192e-06,
"loss": 2.0752,
"step": 817
},
{
"epoch": 0.6155003762227238,
"grad_norm": 24.543123245239258,
"learning_rate": 4.0570522979397786e-06,
"loss": 2.3223,
"step": 818
},
{
"epoch": 0.6162528216704289,
"grad_norm": 23.601247787475586,
"learning_rate": 4.049128367670365e-06,
"loss": 2.4287,
"step": 819
},
{
"epoch": 0.617005267118134,
"grad_norm": 23.073978424072266,
"learning_rate": 4.041204437400951e-06,
"loss": 2.0996,
"step": 820
},
{
"epoch": 0.617757712565839,
"grad_norm": 23.573352813720703,
"learning_rate": 4.033280507131538e-06,
"loss": 2.46,
"step": 821
},
{
"epoch": 0.618510158013544,
"grad_norm": 31.260934829711914,
"learning_rate": 4.025356576862124e-06,
"loss": 2.915,
"step": 822
},
{
"epoch": 0.6192626034612491,
"grad_norm": 24.224809646606445,
"learning_rate": 4.01743264659271e-06,
"loss": 3.0488,
"step": 823
},
{
"epoch": 0.6200150489089541,
"grad_norm": 27.658130645751953,
"learning_rate": 4.0095087163232966e-06,
"loss": 2.1646,
"step": 824
},
{
"epoch": 0.6207674943566591,
"grad_norm": 30.932138442993164,
"learning_rate": 4.001584786053883e-06,
"loss": 2.7983,
"step": 825
},
{
"epoch": 0.6215199398043642,
"grad_norm": 25.3236141204834,
"learning_rate": 3.993660855784469e-06,
"loss": 2.0117,
"step": 826
},
{
"epoch": 0.6222723852520692,
"grad_norm": 24.713939666748047,
"learning_rate": 3.985736925515056e-06,
"loss": 1.8252,
"step": 827
},
{
"epoch": 0.6230248306997742,
"grad_norm": 22.71609878540039,
"learning_rate": 3.977812995245642e-06,
"loss": 2.4561,
"step": 828
},
{
"epoch": 0.6237772761474794,
"grad_norm": 19.888782501220703,
"learning_rate": 3.969889064976228e-06,
"loss": 2.2373,
"step": 829
},
{
"epoch": 0.6245297215951844,
"grad_norm": 27.303646087646484,
"learning_rate": 3.961965134706815e-06,
"loss": 2.0645,
"step": 830
},
{
"epoch": 0.6252821670428894,
"grad_norm": 18.7181396484375,
"learning_rate": 3.954041204437401e-06,
"loss": 2.4932,
"step": 831
},
{
"epoch": 0.6260346124905944,
"grad_norm": 15.501985549926758,
"learning_rate": 3.946117274167988e-06,
"loss": 2.0273,
"step": 832
},
{
"epoch": 0.6267870579382995,
"grad_norm": 28.724773406982422,
"learning_rate": 3.938193343898574e-06,
"loss": 2.4346,
"step": 833
},
{
"epoch": 0.6275395033860045,
"grad_norm": 22.180727005004883,
"learning_rate": 3.93026941362916e-06,
"loss": 2.6465,
"step": 834
},
{
"epoch": 0.6282919488337095,
"grad_norm": 18.778850555419922,
"learning_rate": 3.922345483359747e-06,
"loss": 2.0625,
"step": 835
},
{
"epoch": 0.6290443942814146,
"grad_norm": 21.990373611450195,
"learning_rate": 3.9144215530903326e-06,
"loss": 2.1973,
"step": 836
},
{
"epoch": 0.6297968397291196,
"grad_norm": 31.995012283325195,
"learning_rate": 3.90649762282092e-06,
"loss": 2.3857,
"step": 837
},
{
"epoch": 0.6305492851768246,
"grad_norm": 28.826034545898438,
"learning_rate": 3.898573692551506e-06,
"loss": 2.0898,
"step": 838
},
{
"epoch": 0.6313017306245298,
"grad_norm": 30.95318603515625,
"learning_rate": 3.890649762282092e-06,
"loss": 2.5283,
"step": 839
},
{
"epoch": 0.6320541760722348,
"grad_norm": 33.76778030395508,
"learning_rate": 3.882725832012679e-06,
"loss": 2.0483,
"step": 840
},
{
"epoch": 0.6328066215199398,
"grad_norm": 27.856613159179688,
"learning_rate": 3.874801901743264e-06,
"loss": 2.2412,
"step": 841
},
{
"epoch": 0.6335590669676449,
"grad_norm": 18.836238861083984,
"learning_rate": 3.866877971473851e-06,
"loss": 2.1514,
"step": 842
},
{
"epoch": 0.6343115124153499,
"grad_norm": 26.430967330932617,
"learning_rate": 3.858954041204438e-06,
"loss": 2.147,
"step": 843
},
{
"epoch": 0.6350639578630549,
"grad_norm": 33.526512145996094,
"learning_rate": 3.851030110935024e-06,
"loss": 1.9834,
"step": 844
},
{
"epoch": 0.63581640331076,
"grad_norm": 18.545738220214844,
"learning_rate": 3.843106180665611e-06,
"loss": 2.1514,
"step": 845
},
{
"epoch": 0.636568848758465,
"grad_norm": 30.445545196533203,
"learning_rate": 3.835182250396197e-06,
"loss": 2.6758,
"step": 846
},
{
"epoch": 0.63732129420617,
"grad_norm": 38.89741516113281,
"learning_rate": 3.827258320126783e-06,
"loss": 2.3945,
"step": 847
},
{
"epoch": 0.6380737396538751,
"grad_norm": 40.87432861328125,
"learning_rate": 3.8193343898573694e-06,
"loss": 2.9551,
"step": 848
},
{
"epoch": 0.6388261851015802,
"grad_norm": 19.067197799682617,
"learning_rate": 3.811410459587956e-06,
"loss": 2.2969,
"step": 849
},
{
"epoch": 0.6395786305492852,
"grad_norm": 27.969614028930664,
"learning_rate": 3.8034865293185427e-06,
"loss": 2.373,
"step": 850
},
{
"epoch": 0.6403310759969902,
"grad_norm": 23.686763763427734,
"learning_rate": 3.7955625990491284e-06,
"loss": 3.1475,
"step": 851
},
{
"epoch": 0.6410835214446953,
"grad_norm": 29.188365936279297,
"learning_rate": 3.787638668779715e-06,
"loss": 2.2734,
"step": 852
},
{
"epoch": 0.6418359668924003,
"grad_norm": 20.16975975036621,
"learning_rate": 3.7797147385103017e-06,
"loss": 2.4541,
"step": 853
},
{
"epoch": 0.6425884123401053,
"grad_norm": 21.829917907714844,
"learning_rate": 3.771790808240888e-06,
"loss": 2.2627,
"step": 854
},
{
"epoch": 0.6433408577878104,
"grad_norm": 27.086030960083008,
"learning_rate": 3.7638668779714745e-06,
"loss": 1.8145,
"step": 855
},
{
"epoch": 0.6440933032355154,
"grad_norm": 23.82771873474121,
"learning_rate": 3.7559429477020602e-06,
"loss": 1.876,
"step": 856
},
{
"epoch": 0.6448457486832204,
"grad_norm": 20.215105056762695,
"learning_rate": 3.748019017432647e-06,
"loss": 2.2539,
"step": 857
},
{
"epoch": 0.6455981941309256,
"grad_norm": 21.309492111206055,
"learning_rate": 3.740095087163233e-06,
"loss": 2.3628,
"step": 858
},
{
"epoch": 0.6463506395786306,
"grad_norm": 35.519500732421875,
"learning_rate": 3.7321711568938197e-06,
"loss": 3.0176,
"step": 859
},
{
"epoch": 0.6471030850263356,
"grad_norm": 35.95414352416992,
"learning_rate": 3.7242472266244063e-06,
"loss": 2.5986,
"step": 860
},
{
"epoch": 0.6478555304740407,
"grad_norm": 21.86758804321289,
"learning_rate": 3.716323296354992e-06,
"loss": 2.4932,
"step": 861
},
{
"epoch": 0.6486079759217457,
"grad_norm": 17.709407806396484,
"learning_rate": 3.7083993660855787e-06,
"loss": 2.165,
"step": 862
},
{
"epoch": 0.6493604213694507,
"grad_norm": 32.432437896728516,
"learning_rate": 3.700475435816165e-06,
"loss": 2.3838,
"step": 863
},
{
"epoch": 0.6501128668171557,
"grad_norm": 31.911968231201172,
"learning_rate": 3.6925515055467515e-06,
"loss": 2.2124,
"step": 864
},
{
"epoch": 0.6508653122648608,
"grad_norm": 19.1397762298584,
"learning_rate": 3.684627575277338e-06,
"loss": 2.5645,
"step": 865
},
{
"epoch": 0.6516177577125658,
"grad_norm": 17.174745559692383,
"learning_rate": 3.6767036450079243e-06,
"loss": 2.0762,
"step": 866
},
{
"epoch": 0.6523702031602708,
"grad_norm": 26.017173767089844,
"learning_rate": 3.6687797147385105e-06,
"loss": 2.2939,
"step": 867
},
{
"epoch": 0.653122648607976,
"grad_norm": 17.96502113342285,
"learning_rate": 3.6608557844690967e-06,
"loss": 2.7549,
"step": 868
},
{
"epoch": 0.653875094055681,
"grad_norm": 19.559343338012695,
"learning_rate": 3.6529318541996833e-06,
"loss": 2.3613,
"step": 869
},
{
"epoch": 0.654627539503386,
"grad_norm": 18.72821617126465,
"learning_rate": 3.64500792393027e-06,
"loss": 2.2822,
"step": 870
},
{
"epoch": 0.6553799849510911,
"grad_norm": 18.58492660522461,
"learning_rate": 3.637083993660856e-06,
"loss": 2.0332,
"step": 871
},
{
"epoch": 0.6561324303987961,
"grad_norm": 25.23973274230957,
"learning_rate": 3.6291600633914427e-06,
"loss": 2.7051,
"step": 872
},
{
"epoch": 0.6568848758465011,
"grad_norm": 26.061168670654297,
"learning_rate": 3.6212361331220285e-06,
"loss": 2.126,
"step": 873
},
{
"epoch": 0.6576373212942062,
"grad_norm": 30.92963409423828,
"learning_rate": 3.613312202852615e-06,
"loss": 2.2412,
"step": 874
},
{
"epoch": 0.6583897667419112,
"grad_norm": 16.77997589111328,
"learning_rate": 3.6053882725832017e-06,
"loss": 1.4937,
"step": 875
},
{
"epoch": 0.6591422121896162,
"grad_norm": 20.67428207397461,
"learning_rate": 3.597464342313788e-06,
"loss": 2.3301,
"step": 876
},
{
"epoch": 0.6598946576373212,
"grad_norm": 22.45784568786621,
"learning_rate": 3.5895404120443745e-06,
"loss": 2.0977,
"step": 877
},
{
"epoch": 0.6606471030850264,
"grad_norm": 30.148887634277344,
"learning_rate": 3.5816164817749603e-06,
"loss": 2.873,
"step": 878
},
{
"epoch": 0.6613995485327314,
"grad_norm": 21.913610458374023,
"learning_rate": 3.573692551505547e-06,
"loss": 2.583,
"step": 879
},
{
"epoch": 0.6621519939804364,
"grad_norm": 24.19639015197754,
"learning_rate": 3.5657686212361335e-06,
"loss": 2.502,
"step": 880
},
{
"epoch": 0.6629044394281415,
"grad_norm": 32.243167877197266,
"learning_rate": 3.5578446909667197e-06,
"loss": 2.2207,
"step": 881
},
{
"epoch": 0.6636568848758465,
"grad_norm": 25.025768280029297,
"learning_rate": 3.5499207606973063e-06,
"loss": 2.3164,
"step": 882
},
{
"epoch": 0.6644093303235515,
"grad_norm": 36.78255844116211,
"learning_rate": 3.5419968304278925e-06,
"loss": 2.5381,
"step": 883
},
{
"epoch": 0.6651617757712566,
"grad_norm": 25.785430908203125,
"learning_rate": 3.5340729001584787e-06,
"loss": 2.5068,
"step": 884
},
{
"epoch": 0.6659142212189616,
"grad_norm": 24.93991470336914,
"learning_rate": 3.5261489698890653e-06,
"loss": 2.9014,
"step": 885
},
{
"epoch": 0.6666666666666666,
"grad_norm": 33.389732360839844,
"learning_rate": 3.5182250396196515e-06,
"loss": 2.4287,
"step": 886
},
{
"epoch": 0.6674191121143717,
"grad_norm": 23.581132888793945,
"learning_rate": 3.510301109350238e-06,
"loss": 2.1855,
"step": 887
},
{
"epoch": 0.6681715575620768,
"grad_norm": 26.595279693603516,
"learning_rate": 3.5023771790808243e-06,
"loss": 1.9658,
"step": 888
},
{
"epoch": 0.6689240030097818,
"grad_norm": 19.963623046875,
"learning_rate": 3.494453248811411e-06,
"loss": 2.2393,
"step": 889
},
{
"epoch": 0.6696764484574869,
"grad_norm": 24.39027976989746,
"learning_rate": 3.4865293185419976e-06,
"loss": 2.4365,
"step": 890
},
{
"epoch": 0.6704288939051919,
"grad_norm": 19.47262191772461,
"learning_rate": 3.4786053882725833e-06,
"loss": 1.9961,
"step": 891
},
{
"epoch": 0.6711813393528969,
"grad_norm": 30.582433700561523,
"learning_rate": 3.47068145800317e-06,
"loss": 2.3809,
"step": 892
},
{
"epoch": 0.671933784800602,
"grad_norm": 17.60356903076172,
"learning_rate": 3.462757527733756e-06,
"loss": 2.0938,
"step": 893
},
{
"epoch": 0.672686230248307,
"grad_norm": 25.00141143798828,
"learning_rate": 3.4548335974643428e-06,
"loss": 2.1436,
"step": 894
},
{
"epoch": 0.673438675696012,
"grad_norm": 20.50116729736328,
"learning_rate": 3.4469096671949285e-06,
"loss": 2.2646,
"step": 895
},
{
"epoch": 0.674191121143717,
"grad_norm": 22.395421981811523,
"learning_rate": 3.438985736925515e-06,
"loss": 2.4668,
"step": 896
},
{
"epoch": 0.6749435665914221,
"grad_norm": 23.272846221923828,
"learning_rate": 3.4310618066561018e-06,
"loss": 2.5137,
"step": 897
},
{
"epoch": 0.6756960120391272,
"grad_norm": 18.45476722717285,
"learning_rate": 3.423137876386688e-06,
"loss": 2.1172,
"step": 898
},
{
"epoch": 0.6764484574868322,
"grad_norm": 20.40255355834961,
"learning_rate": 3.4152139461172746e-06,
"loss": 2.5439,
"step": 899
},
{
"epoch": 0.6772009029345373,
"grad_norm": 20.203140258789062,
"learning_rate": 3.4072900158478608e-06,
"loss": 1.998,
"step": 900
},
{
"epoch": 0.6779533483822423,
"grad_norm": 36.98493576049805,
"learning_rate": 3.399366085578447e-06,
"loss": 2.4912,
"step": 901
},
{
"epoch": 0.6787057938299473,
"grad_norm": 25.70787811279297,
"learning_rate": 3.3914421553090336e-06,
"loss": 2.9082,
"step": 902
},
{
"epoch": 0.6794582392776524,
"grad_norm": 21.873355865478516,
"learning_rate": 3.3835182250396198e-06,
"loss": 2.3164,
"step": 903
},
{
"epoch": 0.6802106847253574,
"grad_norm": 20.708681106567383,
"learning_rate": 3.3755942947702064e-06,
"loss": 2.626,
"step": 904
},
{
"epoch": 0.6809631301730624,
"grad_norm": 24.416845321655273,
"learning_rate": 3.3676703645007926e-06,
"loss": 2.1973,
"step": 905
},
{
"epoch": 0.6817155756207675,
"grad_norm": 18.85466957092285,
"learning_rate": 3.359746434231379e-06,
"loss": 1.7402,
"step": 906
},
{
"epoch": 0.6824680210684725,
"grad_norm": 33.928104400634766,
"learning_rate": 3.351822503961966e-06,
"loss": 3.1211,
"step": 907
},
{
"epoch": 0.6832204665161776,
"grad_norm": 19.205385208129883,
"learning_rate": 3.3438985736925516e-06,
"loss": 2.2646,
"step": 908
},
{
"epoch": 0.6839729119638827,
"grad_norm": 23.29824447631836,
"learning_rate": 3.335974643423138e-06,
"loss": 2.5674,
"step": 909
},
{
"epoch": 0.6847253574115877,
"grad_norm": 51.69862365722656,
"learning_rate": 3.3280507131537244e-06,
"loss": 2.8838,
"step": 910
},
{
"epoch": 0.6854778028592927,
"grad_norm": 31.652620315551758,
"learning_rate": 3.320126782884311e-06,
"loss": 1.8994,
"step": 911
},
{
"epoch": 0.6862302483069977,
"grad_norm": 34.18386459350586,
"learning_rate": 3.3122028526148976e-06,
"loss": 2.2725,
"step": 912
},
{
"epoch": 0.6869826937547028,
"grad_norm": 25.95589256286621,
"learning_rate": 3.3042789223454834e-06,
"loss": 1.8262,
"step": 913
},
{
"epoch": 0.6877351392024078,
"grad_norm": 25.984094619750977,
"learning_rate": 3.29635499207607e-06,
"loss": 2.5732,
"step": 914
},
{
"epoch": 0.6884875846501128,
"grad_norm": 20.5795841217041,
"learning_rate": 3.2884310618066562e-06,
"loss": 2.209,
"step": 915
},
{
"epoch": 0.6892400300978179,
"grad_norm": 24.327163696289062,
"learning_rate": 3.280507131537243e-06,
"loss": 2.834,
"step": 916
},
{
"epoch": 0.6899924755455229,
"grad_norm": 23.493921279907227,
"learning_rate": 3.2725832012678294e-06,
"loss": 2.3008,
"step": 917
},
{
"epoch": 0.690744920993228,
"grad_norm": 22.002779006958008,
"learning_rate": 3.2646592709984152e-06,
"loss": 2.5127,
"step": 918
},
{
"epoch": 0.6914973664409331,
"grad_norm": 20.9849853515625,
"learning_rate": 3.256735340729002e-06,
"loss": 2.3838,
"step": 919
},
{
"epoch": 0.6922498118886381,
"grad_norm": 23.12964630126953,
"learning_rate": 3.248811410459588e-06,
"loss": 2.4521,
"step": 920
},
{
"epoch": 0.6930022573363431,
"grad_norm": 18.361614227294922,
"learning_rate": 3.2408874801901746e-06,
"loss": 1.9404,
"step": 921
},
{
"epoch": 0.6937547027840482,
"grad_norm": 25.768903732299805,
"learning_rate": 3.2329635499207613e-06,
"loss": 2.582,
"step": 922
},
{
"epoch": 0.6945071482317532,
"grad_norm": 18.827003479003906,
"learning_rate": 3.2250396196513475e-06,
"loss": 2.0742,
"step": 923
},
{
"epoch": 0.6952595936794582,
"grad_norm": 22.65555763244629,
"learning_rate": 3.217115689381934e-06,
"loss": 2.5566,
"step": 924
},
{
"epoch": 0.6960120391271633,
"grad_norm": 18.19965171813965,
"learning_rate": 3.20919175911252e-06,
"loss": 2.0063,
"step": 925
},
{
"epoch": 0.6967644845748683,
"grad_norm": 31.940683364868164,
"learning_rate": 3.2012678288431065e-06,
"loss": 2.1514,
"step": 926
},
{
"epoch": 0.6975169300225733,
"grad_norm": 23.10796546936035,
"learning_rate": 3.1933438985736926e-06,
"loss": 2.3877,
"step": 927
},
{
"epoch": 0.6982693754702785,
"grad_norm": 23.25333023071289,
"learning_rate": 3.1854199683042793e-06,
"loss": 2.3535,
"step": 928
},
{
"epoch": 0.6990218209179835,
"grad_norm": 20.867780685424805,
"learning_rate": 3.177496038034866e-06,
"loss": 2.2783,
"step": 929
},
{
"epoch": 0.6997742663656885,
"grad_norm": 17.495346069335938,
"learning_rate": 3.1695721077654516e-06,
"loss": 1.9824,
"step": 930
},
{
"epoch": 0.7005267118133935,
"grad_norm": 19.127239227294922,
"learning_rate": 3.1616481774960383e-06,
"loss": 2.0283,
"step": 931
},
{
"epoch": 0.7012791572610986,
"grad_norm": 25.830289840698242,
"learning_rate": 3.1537242472266245e-06,
"loss": 3.0107,
"step": 932
},
{
"epoch": 0.7020316027088036,
"grad_norm": 21.201894760131836,
"learning_rate": 3.145800316957211e-06,
"loss": 2.0293,
"step": 933
},
{
"epoch": 0.7027840481565086,
"grad_norm": 29.03411102294922,
"learning_rate": 3.1378763866877977e-06,
"loss": 2.5771,
"step": 934
},
{
"epoch": 0.7035364936042137,
"grad_norm": 20.404911041259766,
"learning_rate": 3.1299524564183835e-06,
"loss": 2.0645,
"step": 935
},
{
"epoch": 0.7042889390519187,
"grad_norm": 29.42574691772461,
"learning_rate": 3.12202852614897e-06,
"loss": 2.1699,
"step": 936
},
{
"epoch": 0.7050413844996237,
"grad_norm": 20.876218795776367,
"learning_rate": 3.1141045958795563e-06,
"loss": 1.6353,
"step": 937
},
{
"epoch": 0.7057938299473289,
"grad_norm": 37.073631286621094,
"learning_rate": 3.106180665610143e-06,
"loss": 2.0352,
"step": 938
},
{
"epoch": 0.7065462753950339,
"grad_norm": 25.04046630859375,
"learning_rate": 3.0982567353407295e-06,
"loss": 2.1602,
"step": 939
},
{
"epoch": 0.7072987208427389,
"grad_norm": 27.084985733032227,
"learning_rate": 3.0903328050713157e-06,
"loss": 2.5342,
"step": 940
},
{
"epoch": 0.708051166290444,
"grad_norm": 27.979019165039062,
"learning_rate": 3.082408874801902e-06,
"loss": 2.0264,
"step": 941
},
{
"epoch": 0.708803611738149,
"grad_norm": 24.28731346130371,
"learning_rate": 3.074484944532488e-06,
"loss": 2.541,
"step": 942
},
{
"epoch": 0.709556057185854,
"grad_norm": 26.896106719970703,
"learning_rate": 3.0665610142630747e-06,
"loss": 1.5684,
"step": 943
},
{
"epoch": 0.710308502633559,
"grad_norm": 38.26861572265625,
"learning_rate": 3.0586370839936613e-06,
"loss": 2.4727,
"step": 944
},
{
"epoch": 0.7110609480812641,
"grad_norm": 34.82157516479492,
"learning_rate": 3.0507131537242475e-06,
"loss": 2.6074,
"step": 945
},
{
"epoch": 0.7118133935289691,
"grad_norm": 22.82600975036621,
"learning_rate": 3.042789223454834e-06,
"loss": 2.4297,
"step": 946
},
{
"epoch": 0.7125658389766741,
"grad_norm": 22.42252540588379,
"learning_rate": 3.03486529318542e-06,
"loss": 2.4795,
"step": 947
},
{
"epoch": 0.7133182844243793,
"grad_norm": 25.8705997467041,
"learning_rate": 3.0269413629160065e-06,
"loss": 3.0166,
"step": 948
},
{
"epoch": 0.7140707298720843,
"grad_norm": 23.13258171081543,
"learning_rate": 3.019017432646593e-06,
"loss": 2.0781,
"step": 949
},
{
"epoch": 0.7148231753197893,
"grad_norm": 19.896459579467773,
"learning_rate": 3.0110935023771793e-06,
"loss": 2.2402,
"step": 950
},
{
"epoch": 0.7155756207674944,
"grad_norm": 20.398405075073242,
"learning_rate": 3.003169572107766e-06,
"loss": 2.0166,
"step": 951
},
{
"epoch": 0.7163280662151994,
"grad_norm": 25.631702423095703,
"learning_rate": 2.9952456418383517e-06,
"loss": 2.0332,
"step": 952
},
{
"epoch": 0.7170805116629044,
"grad_norm": 19.276575088500977,
"learning_rate": 2.9873217115689383e-06,
"loss": 2.5947,
"step": 953
},
{
"epoch": 0.7178329571106095,
"grad_norm": 22.23748207092285,
"learning_rate": 2.979397781299525e-06,
"loss": 2.0342,
"step": 954
},
{
"epoch": 0.7185854025583145,
"grad_norm": 35.471561431884766,
"learning_rate": 2.971473851030111e-06,
"loss": 2.542,
"step": 955
},
{
"epoch": 0.7193378480060195,
"grad_norm": 20.142765045166016,
"learning_rate": 2.9635499207606977e-06,
"loss": 2.168,
"step": 956
},
{
"epoch": 0.7200902934537246,
"grad_norm": 25.604598999023438,
"learning_rate": 2.955625990491284e-06,
"loss": 2.084,
"step": 957
},
{
"epoch": 0.7208427389014297,
"grad_norm": 24.70061492919922,
"learning_rate": 2.94770206022187e-06,
"loss": 2.3701,
"step": 958
},
{
"epoch": 0.7215951843491347,
"grad_norm": 19.22873306274414,
"learning_rate": 2.9397781299524568e-06,
"loss": 2.5967,
"step": 959
},
{
"epoch": 0.7223476297968398,
"grad_norm": 19.727336883544922,
"learning_rate": 2.931854199683043e-06,
"loss": 2.0752,
"step": 960
},
{
"epoch": 0.7231000752445448,
"grad_norm": 25.627744674682617,
"learning_rate": 2.9239302694136296e-06,
"loss": 2.4873,
"step": 961
},
{
"epoch": 0.7238525206922498,
"grad_norm": 23.095905303955078,
"learning_rate": 2.9160063391442158e-06,
"loss": 2.3765,
"step": 962
},
{
"epoch": 0.7246049661399548,
"grad_norm": 21.401283264160156,
"learning_rate": 2.9080824088748024e-06,
"loss": 2.4014,
"step": 963
},
{
"epoch": 0.7253574115876599,
"grad_norm": 23.942296981811523,
"learning_rate": 2.900158478605388e-06,
"loss": 2.5908,
"step": 964
},
{
"epoch": 0.7261098570353649,
"grad_norm": 18.88973617553711,
"learning_rate": 2.8922345483359748e-06,
"loss": 2.042,
"step": 965
},
{
"epoch": 0.7268623024830699,
"grad_norm": 21.262531280517578,
"learning_rate": 2.8843106180665614e-06,
"loss": 2.4756,
"step": 966
},
{
"epoch": 0.7276147479307751,
"grad_norm": 18.916324615478516,
"learning_rate": 2.8763866877971476e-06,
"loss": 2.6748,
"step": 967
},
{
"epoch": 0.7283671933784801,
"grad_norm": 21.240524291992188,
"learning_rate": 2.868462757527734e-06,
"loss": 2.4922,
"step": 968
},
{
"epoch": 0.7291196388261851,
"grad_norm": 21.823871612548828,
"learning_rate": 2.86053882725832e-06,
"loss": 2.3828,
"step": 969
},
{
"epoch": 0.7298720842738902,
"grad_norm": 38.113922119140625,
"learning_rate": 2.8526148969889066e-06,
"loss": 2.752,
"step": 970
},
{
"epoch": 0.7306245297215952,
"grad_norm": 21.07139015197754,
"learning_rate": 2.844690966719493e-06,
"loss": 2.3223,
"step": 971
},
{
"epoch": 0.7313769751693002,
"grad_norm": 18.12544822692871,
"learning_rate": 2.8367670364500794e-06,
"loss": 2.3076,
"step": 972
},
{
"epoch": 0.7321294206170053,
"grad_norm": 25.153772354125977,
"learning_rate": 2.828843106180666e-06,
"loss": 2.5049,
"step": 973
},
{
"epoch": 0.7328818660647103,
"grad_norm": 24.048927307128906,
"learning_rate": 2.8209191759112518e-06,
"loss": 2.7891,
"step": 974
},
{
"epoch": 0.7336343115124153,
"grad_norm": 23.4043025970459,
"learning_rate": 2.8129952456418384e-06,
"loss": 2.7061,
"step": 975
},
{
"epoch": 0.7343867569601203,
"grad_norm": 24.055015563964844,
"learning_rate": 2.805071315372425e-06,
"loss": 2.3691,
"step": 976
},
{
"epoch": 0.7351392024078255,
"grad_norm": 30.86815643310547,
"learning_rate": 2.797147385103011e-06,
"loss": 2.4199,
"step": 977
},
{
"epoch": 0.7358916478555305,
"grad_norm": 19.904773712158203,
"learning_rate": 2.789223454833598e-06,
"loss": 2.4951,
"step": 978
},
{
"epoch": 0.7366440933032355,
"grad_norm": 18.34711265563965,
"learning_rate": 2.781299524564184e-06,
"loss": 2.5,
"step": 979
},
{
"epoch": 0.7373965387509406,
"grad_norm": 17.771268844604492,
"learning_rate": 2.7733755942947706e-06,
"loss": 1.8232,
"step": 980
},
{
"epoch": 0.7381489841986456,
"grad_norm": 20.759653091430664,
"learning_rate": 2.7654516640253572e-06,
"loss": 2.1533,
"step": 981
},
{
"epoch": 0.7389014296463506,
"grad_norm": 27.191268920898438,
"learning_rate": 2.757527733755943e-06,
"loss": 2.2979,
"step": 982
},
{
"epoch": 0.7396538750940557,
"grad_norm": 23.227584838867188,
"learning_rate": 2.7496038034865296e-06,
"loss": 2.0762,
"step": 983
},
{
"epoch": 0.7404063205417607,
"grad_norm": 21.38162612915039,
"learning_rate": 2.741679873217116e-06,
"loss": 2.709,
"step": 984
},
{
"epoch": 0.7411587659894657,
"grad_norm": 17.612733840942383,
"learning_rate": 2.7337559429477024e-06,
"loss": 2.3223,
"step": 985
},
{
"epoch": 0.7419112114371708,
"grad_norm": 26.95534896850586,
"learning_rate": 2.725832012678289e-06,
"loss": 2.3506,
"step": 986
},
{
"epoch": 0.7426636568848759,
"grad_norm": 46.48407745361328,
"learning_rate": 2.717908082408875e-06,
"loss": 2.0186,
"step": 987
},
{
"epoch": 0.7434161023325809,
"grad_norm": 24.3732967376709,
"learning_rate": 2.7099841521394614e-06,
"loss": 2.2715,
"step": 988
},
{
"epoch": 0.744168547780286,
"grad_norm": 18.800397872924805,
"learning_rate": 2.7020602218700476e-06,
"loss": 2.3564,
"step": 989
},
{
"epoch": 0.744920993227991,
"grad_norm": 28.8432674407959,
"learning_rate": 2.6941362916006342e-06,
"loss": 2.8242,
"step": 990
},
{
"epoch": 0.745673438675696,
"grad_norm": 23.0736026763916,
"learning_rate": 2.686212361331221e-06,
"loss": 2.1826,
"step": 991
},
{
"epoch": 0.746425884123401,
"grad_norm": 27.80742073059082,
"learning_rate": 2.6782884310618066e-06,
"loss": 2.2583,
"step": 992
},
{
"epoch": 0.7471783295711061,
"grad_norm": 21.215221405029297,
"learning_rate": 2.6703645007923932e-06,
"loss": 2.6289,
"step": 993
},
{
"epoch": 0.7479307750188111,
"grad_norm": 25.544788360595703,
"learning_rate": 2.6624405705229794e-06,
"loss": 2.4414,
"step": 994
},
{
"epoch": 0.7486832204665161,
"grad_norm": 27.689598083496094,
"learning_rate": 2.654516640253566e-06,
"loss": 2.5352,
"step": 995
},
{
"epoch": 0.7494356659142212,
"grad_norm": 21.94173240661621,
"learning_rate": 2.6465927099841527e-06,
"loss": 2.2261,
"step": 996
},
{
"epoch": 0.7501881113619263,
"grad_norm": 20.872621536254883,
"learning_rate": 2.638668779714739e-06,
"loss": 1.9814,
"step": 997
},
{
"epoch": 0.7509405568096313,
"grad_norm": 21.449371337890625,
"learning_rate": 2.630744849445325e-06,
"loss": 2.4668,
"step": 998
},
{
"epoch": 0.7516930022573364,
"grad_norm": 18.553009033203125,
"learning_rate": 2.6228209191759112e-06,
"loss": 2.1934,
"step": 999
},
{
"epoch": 0.7524454477050414,
"grad_norm": 31.450166702270508,
"learning_rate": 2.614896988906498e-06,
"loss": 2.2305,
"step": 1000
},
{
"epoch": 0.7531978931527464,
"grad_norm": 30.71107292175293,
"learning_rate": 2.606973058637084e-06,
"loss": 2.3838,
"step": 1001
},
{
"epoch": 0.7539503386004515,
"grad_norm": 19.482921600341797,
"learning_rate": 2.5990491283676707e-06,
"loss": 2.0806,
"step": 1002
},
{
"epoch": 0.7547027840481565,
"grad_norm": 27.4390926361084,
"learning_rate": 2.5911251980982573e-06,
"loss": 2.5801,
"step": 1003
},
{
"epoch": 0.7554552294958615,
"grad_norm": 39.73894119262695,
"learning_rate": 2.583201267828843e-06,
"loss": 2.5068,
"step": 1004
},
{
"epoch": 0.7562076749435666,
"grad_norm": 35.52018356323242,
"learning_rate": 2.5752773375594297e-06,
"loss": 2.6641,
"step": 1005
},
{
"epoch": 0.7569601203912716,
"grad_norm": 23.1878719329834,
"learning_rate": 2.567353407290016e-06,
"loss": 2.2603,
"step": 1006
},
{
"epoch": 0.7577125658389767,
"grad_norm": 21.26180648803711,
"learning_rate": 2.5594294770206025e-06,
"loss": 2.0269,
"step": 1007
},
{
"epoch": 0.7584650112866818,
"grad_norm": 18.922929763793945,
"learning_rate": 2.551505546751189e-06,
"loss": 1.9541,
"step": 1008
},
{
"epoch": 0.7592174567343868,
"grad_norm": 24.16057586669922,
"learning_rate": 2.543581616481775e-06,
"loss": 3.0117,
"step": 1009
},
{
"epoch": 0.7599699021820918,
"grad_norm": 19.66029930114746,
"learning_rate": 2.5356576862123615e-06,
"loss": 2.0205,
"step": 1010
},
{
"epoch": 0.7607223476297968,
"grad_norm": 24.95394515991211,
"learning_rate": 2.5277337559429477e-06,
"loss": 2.6768,
"step": 1011
},
{
"epoch": 0.7614747930775019,
"grad_norm": 30.851152420043945,
"learning_rate": 2.5198098256735343e-06,
"loss": 1.8813,
"step": 1012
},
{
"epoch": 0.7622272385252069,
"grad_norm": 23.487869262695312,
"learning_rate": 2.511885895404121e-06,
"loss": 1.9902,
"step": 1013
},
{
"epoch": 0.7629796839729119,
"grad_norm": 20.58772087097168,
"learning_rate": 2.503961965134707e-06,
"loss": 2.7119,
"step": 1014
},
{
"epoch": 0.763732129420617,
"grad_norm": 19.332704544067383,
"learning_rate": 2.4960380348652933e-06,
"loss": 2.4258,
"step": 1015
},
{
"epoch": 0.764484574868322,
"grad_norm": 31.803115844726562,
"learning_rate": 2.48811410459588e-06,
"loss": 2.707,
"step": 1016
},
{
"epoch": 0.7652370203160271,
"grad_norm": 28.797060012817383,
"learning_rate": 2.480190174326466e-06,
"loss": 1.7207,
"step": 1017
},
{
"epoch": 0.7659894657637322,
"grad_norm": 30.22066307067871,
"learning_rate": 2.4722662440570523e-06,
"loss": 2.2822,
"step": 1018
},
{
"epoch": 0.7667419112114372,
"grad_norm": 15.914982795715332,
"learning_rate": 2.464342313787639e-06,
"loss": 2.2627,
"step": 1019
},
{
"epoch": 0.7674943566591422,
"grad_norm": 24.08812713623047,
"learning_rate": 2.4564183835182255e-06,
"loss": 2.1035,
"step": 1020
},
{
"epoch": 0.7682468021068473,
"grad_norm": 35.42763137817383,
"learning_rate": 2.4484944532488117e-06,
"loss": 2.832,
"step": 1021
},
{
"epoch": 0.7689992475545523,
"grad_norm": 19.57898712158203,
"learning_rate": 2.440570522979398e-06,
"loss": 2.3027,
"step": 1022
},
{
"epoch": 0.7697516930022573,
"grad_norm": 24.63896942138672,
"learning_rate": 2.432646592709984e-06,
"loss": 2.1934,
"step": 1023
},
{
"epoch": 0.7705041384499624,
"grad_norm": 18.475221633911133,
"learning_rate": 2.4247226624405707e-06,
"loss": 1.8838,
"step": 1024
},
{
"epoch": 0.7712565838976674,
"grad_norm": 20.874549865722656,
"learning_rate": 2.416798732171157e-06,
"loss": 2.0713,
"step": 1025
},
{
"epoch": 0.7720090293453724,
"grad_norm": 25.372650146484375,
"learning_rate": 2.4088748019017435e-06,
"loss": 2.2949,
"step": 1026
},
{
"epoch": 0.7727614747930776,
"grad_norm": 19.395593643188477,
"learning_rate": 2.4009508716323297e-06,
"loss": 2.252,
"step": 1027
},
{
"epoch": 0.7735139202407826,
"grad_norm": 25.478004455566406,
"learning_rate": 2.3930269413629164e-06,
"loss": 2.3438,
"step": 1028
},
{
"epoch": 0.7742663656884876,
"grad_norm": 22.913898468017578,
"learning_rate": 2.3851030110935025e-06,
"loss": 2.7061,
"step": 1029
},
{
"epoch": 0.7750188111361926,
"grad_norm": 17.623754501342773,
"learning_rate": 2.3771790808240887e-06,
"loss": 1.833,
"step": 1030
},
{
"epoch": 0.7757712565838977,
"grad_norm": 36.00138854980469,
"learning_rate": 2.3692551505546754e-06,
"loss": 1.9961,
"step": 1031
},
{
"epoch": 0.7765237020316027,
"grad_norm": 22.17951202392578,
"learning_rate": 2.3613312202852615e-06,
"loss": 2.291,
"step": 1032
},
{
"epoch": 0.7772761474793077,
"grad_norm": 27.934125900268555,
"learning_rate": 2.353407290015848e-06,
"loss": 2.0273,
"step": 1033
},
{
"epoch": 0.7780285929270128,
"grad_norm": 25.72597885131836,
"learning_rate": 2.3454833597464344e-06,
"loss": 2.3418,
"step": 1034
},
{
"epoch": 0.7787810383747178,
"grad_norm": 20.993690490722656,
"learning_rate": 2.3375594294770205e-06,
"loss": 2.2773,
"step": 1035
},
{
"epoch": 0.7795334838224228,
"grad_norm": 19.845046997070312,
"learning_rate": 2.329635499207607e-06,
"loss": 2.4473,
"step": 1036
},
{
"epoch": 0.780285929270128,
"grad_norm": 19.89436149597168,
"learning_rate": 2.3217115689381938e-06,
"loss": 2.186,
"step": 1037
},
{
"epoch": 0.781038374717833,
"grad_norm": 21.185094833374023,
"learning_rate": 2.31378763866878e-06,
"loss": 1.8525,
"step": 1038
},
{
"epoch": 0.781790820165538,
"grad_norm": 19.601686477661133,
"learning_rate": 2.305863708399366e-06,
"loss": 2.0459,
"step": 1039
},
{
"epoch": 0.782543265613243,
"grad_norm": 28.16640853881836,
"learning_rate": 2.2979397781299524e-06,
"loss": 1.9453,
"step": 1040
},
{
"epoch": 0.7832957110609481,
"grad_norm": 20.31542205810547,
"learning_rate": 2.290015847860539e-06,
"loss": 2.0532,
"step": 1041
},
{
"epoch": 0.7840481565086531,
"grad_norm": 43.06874465942383,
"learning_rate": 2.2820919175911256e-06,
"loss": 2.6689,
"step": 1042
},
{
"epoch": 0.7848006019563581,
"grad_norm": 24.530195236206055,
"learning_rate": 2.2741679873217118e-06,
"loss": 2.4707,
"step": 1043
},
{
"epoch": 0.7855530474040632,
"grad_norm": 20.09838104248047,
"learning_rate": 2.266244057052298e-06,
"loss": 1.9854,
"step": 1044
},
{
"epoch": 0.7863054928517682,
"grad_norm": 26.703359603881836,
"learning_rate": 2.2583201267828846e-06,
"loss": 2.6191,
"step": 1045
},
{
"epoch": 0.7870579382994732,
"grad_norm": 20.001371383666992,
"learning_rate": 2.250396196513471e-06,
"loss": 2.1367,
"step": 1046
},
{
"epoch": 0.7878103837471784,
"grad_norm": 21.10999870300293,
"learning_rate": 2.2424722662440574e-06,
"loss": 2.0635,
"step": 1047
},
{
"epoch": 0.7885628291948834,
"grad_norm": 30.246315002441406,
"learning_rate": 2.2345483359746436e-06,
"loss": 2.8379,
"step": 1048
},
{
"epoch": 0.7893152746425884,
"grad_norm": 19.928213119506836,
"learning_rate": 2.22662440570523e-06,
"loss": 2.2905,
"step": 1049
},
{
"epoch": 0.7900677200902935,
"grad_norm": 34.11455535888672,
"learning_rate": 2.2187004754358164e-06,
"loss": 1.7715,
"step": 1050
},
{
"epoch": 0.7908201655379985,
"grad_norm": 21.632551193237305,
"learning_rate": 2.2107765451664026e-06,
"loss": 2.2568,
"step": 1051
},
{
"epoch": 0.7915726109857035,
"grad_norm": 37.97250747680664,
"learning_rate": 2.2028526148969892e-06,
"loss": 2.3799,
"step": 1052
},
{
"epoch": 0.7923250564334086,
"grad_norm": 27.17977523803711,
"learning_rate": 2.1949286846275754e-06,
"loss": 2.9277,
"step": 1053
},
{
"epoch": 0.7930775018811136,
"grad_norm": 20.97044563293457,
"learning_rate": 2.187004754358162e-06,
"loss": 2.209,
"step": 1054
},
{
"epoch": 0.7938299473288186,
"grad_norm": 25.320844650268555,
"learning_rate": 2.1790808240887482e-06,
"loss": 2.5098,
"step": 1055
},
{
"epoch": 0.7945823927765236,
"grad_norm": 21.632658004760742,
"learning_rate": 2.1711568938193344e-06,
"loss": 2.4844,
"step": 1056
},
{
"epoch": 0.7953348382242288,
"grad_norm": 17.359174728393555,
"learning_rate": 2.163232963549921e-06,
"loss": 2.0874,
"step": 1057
},
{
"epoch": 0.7960872836719338,
"grad_norm": 32.31191635131836,
"learning_rate": 2.1553090332805072e-06,
"loss": 2.207,
"step": 1058
},
{
"epoch": 0.7968397291196389,
"grad_norm": 26.761028289794922,
"learning_rate": 2.147385103011094e-06,
"loss": 2.7266,
"step": 1059
},
{
"epoch": 0.7975921745673439,
"grad_norm": 22.91716194152832,
"learning_rate": 2.13946117274168e-06,
"loss": 2.0654,
"step": 1060
},
{
"epoch": 0.7983446200150489,
"grad_norm": 29.451723098754883,
"learning_rate": 2.1315372424722662e-06,
"loss": 1.999,
"step": 1061
},
{
"epoch": 0.7990970654627539,
"grad_norm": 20.911706924438477,
"learning_rate": 2.123613312202853e-06,
"loss": 2.0498,
"step": 1062
},
{
"epoch": 0.799849510910459,
"grad_norm": 23.506391525268555,
"learning_rate": 2.1156893819334395e-06,
"loss": 2.2852,
"step": 1063
},
{
"epoch": 0.800601956358164,
"grad_norm": 16.55670738220215,
"learning_rate": 2.1077654516640257e-06,
"loss": 1.9072,
"step": 1064
},
{
"epoch": 0.801354401805869,
"grad_norm": 22.04123878479004,
"learning_rate": 2.099841521394612e-06,
"loss": 2.6279,
"step": 1065
},
{
"epoch": 0.8021068472535741,
"grad_norm": 23.356552124023438,
"learning_rate": 2.091917591125198e-06,
"loss": 2.0391,
"step": 1066
},
{
"epoch": 0.8028592927012792,
"grad_norm": 21.152170181274414,
"learning_rate": 2.0839936608557847e-06,
"loss": 2.3486,
"step": 1067
},
{
"epoch": 0.8036117381489842,
"grad_norm": 41.95413589477539,
"learning_rate": 2.0760697305863713e-06,
"loss": 2.5762,
"step": 1068
},
{
"epoch": 0.8043641835966893,
"grad_norm": 22.67193031311035,
"learning_rate": 2.0681458003169575e-06,
"loss": 2.9912,
"step": 1069
},
{
"epoch": 0.8051166290443943,
"grad_norm": 22.507898330688477,
"learning_rate": 2.0602218700475437e-06,
"loss": 2.1948,
"step": 1070
},
{
"epoch": 0.8058690744920993,
"grad_norm": 38.1120719909668,
"learning_rate": 2.0522979397781303e-06,
"loss": 2.4033,
"step": 1071
},
{
"epoch": 0.8066215199398044,
"grad_norm": 27.03844451904297,
"learning_rate": 2.0443740095087165e-06,
"loss": 2.5049,
"step": 1072
},
{
"epoch": 0.8073739653875094,
"grad_norm": 25.915897369384766,
"learning_rate": 2.036450079239303e-06,
"loss": 2.4033,
"step": 1073
},
{
"epoch": 0.8081264108352144,
"grad_norm": 20.6876277923584,
"learning_rate": 2.0285261489698893e-06,
"loss": 2.2363,
"step": 1074
},
{
"epoch": 0.8088788562829194,
"grad_norm": 23.289859771728516,
"learning_rate": 2.0206022187004755e-06,
"loss": 2.2559,
"step": 1075
},
{
"epoch": 0.8096313017306245,
"grad_norm": 19.997568130493164,
"learning_rate": 2.012678288431062e-06,
"loss": 2.0186,
"step": 1076
},
{
"epoch": 0.8103837471783296,
"grad_norm": 17.515745162963867,
"learning_rate": 2.0047543581616483e-06,
"loss": 2.3486,
"step": 1077
},
{
"epoch": 0.8111361926260346,
"grad_norm": 21.844892501831055,
"learning_rate": 1.9968304278922345e-06,
"loss": 2.249,
"step": 1078
},
{
"epoch": 0.8118886380737397,
"grad_norm": 30.721351623535156,
"learning_rate": 1.988906497622821e-06,
"loss": 2.7441,
"step": 1079
},
{
"epoch": 0.8126410835214447,
"grad_norm": 26.29536247253418,
"learning_rate": 1.9809825673534077e-06,
"loss": 2.1123,
"step": 1080
},
{
"epoch": 0.8133935289691497,
"grad_norm": 25.76918601989746,
"learning_rate": 1.973058637083994e-06,
"loss": 2.167,
"step": 1081
},
{
"epoch": 0.8141459744168548,
"grad_norm": 36.153045654296875,
"learning_rate": 1.96513470681458e-06,
"loss": 2.9219,
"step": 1082
},
{
"epoch": 0.8148984198645598,
"grad_norm": 19.623029708862305,
"learning_rate": 1.9572107765451663e-06,
"loss": 1.9341,
"step": 1083
},
{
"epoch": 0.8156508653122648,
"grad_norm": 28.89622688293457,
"learning_rate": 1.949286846275753e-06,
"loss": 2.748,
"step": 1084
},
{
"epoch": 0.8164033107599699,
"grad_norm": 17.98982048034668,
"learning_rate": 1.9413629160063395e-06,
"loss": 1.7686,
"step": 1085
},
{
"epoch": 0.8171557562076749,
"grad_norm": 25.77059555053711,
"learning_rate": 1.9334389857369257e-06,
"loss": 1.8994,
"step": 1086
},
{
"epoch": 0.81790820165538,
"grad_norm": 26.73072052001953,
"learning_rate": 1.925515055467512e-06,
"loss": 2.0908,
"step": 1087
},
{
"epoch": 0.8186606471030851,
"grad_norm": 31.21881866455078,
"learning_rate": 1.9175911251980985e-06,
"loss": 2.2412,
"step": 1088
},
{
"epoch": 0.8194130925507901,
"grad_norm": 20.786651611328125,
"learning_rate": 1.9096671949286847e-06,
"loss": 2.2842,
"step": 1089
},
{
"epoch": 0.8201655379984951,
"grad_norm": 24.010360717773438,
"learning_rate": 1.9017432646592713e-06,
"loss": 2.0205,
"step": 1090
},
{
"epoch": 0.8209179834462002,
"grad_norm": 20.723363876342773,
"learning_rate": 1.8938193343898575e-06,
"loss": 2.041,
"step": 1091
},
{
"epoch": 0.8216704288939052,
"grad_norm": 25.89142608642578,
"learning_rate": 1.885895404120444e-06,
"loss": 1.8486,
"step": 1092
},
{
"epoch": 0.8224228743416102,
"grad_norm": 20.13483428955078,
"learning_rate": 1.8779714738510301e-06,
"loss": 1.9575,
"step": 1093
},
{
"epoch": 0.8231753197893152,
"grad_norm": 29.47180938720703,
"learning_rate": 1.8700475435816165e-06,
"loss": 2.2529,
"step": 1094
},
{
"epoch": 0.8239277652370203,
"grad_norm": 21.102237701416016,
"learning_rate": 1.8621236133122031e-06,
"loss": 2.2578,
"step": 1095
},
{
"epoch": 0.8246802106847254,
"grad_norm": 23.39295768737793,
"learning_rate": 1.8541996830427893e-06,
"loss": 2.8486,
"step": 1096
},
{
"epoch": 0.8254326561324304,
"grad_norm": 22.92432403564453,
"learning_rate": 1.8462757527733757e-06,
"loss": 2.0596,
"step": 1097
},
{
"epoch": 0.8261851015801355,
"grad_norm": 26.11507797241211,
"learning_rate": 1.8383518225039621e-06,
"loss": 2.1094,
"step": 1098
},
{
"epoch": 0.8269375470278405,
"grad_norm": 22.83144760131836,
"learning_rate": 1.8304278922345483e-06,
"loss": 2.252,
"step": 1099
},
{
"epoch": 0.8276899924755455,
"grad_norm": 23.018451690673828,
"learning_rate": 1.822503961965135e-06,
"loss": 2.3848,
"step": 1100
},
{
"epoch": 0.8284424379232506,
"grad_norm": 22.875572204589844,
"learning_rate": 1.8145800316957214e-06,
"loss": 2.7886,
"step": 1101
},
{
"epoch": 0.8291948833709556,
"grad_norm": 23.032974243164062,
"learning_rate": 1.8066561014263076e-06,
"loss": 2.2539,
"step": 1102
},
{
"epoch": 0.8299473288186606,
"grad_norm": 18.94373893737793,
"learning_rate": 1.798732171156894e-06,
"loss": 1.9492,
"step": 1103
},
{
"epoch": 0.8306997742663657,
"grad_norm": 24.989089965820312,
"learning_rate": 1.7908082408874801e-06,
"loss": 2.1313,
"step": 1104
},
{
"epoch": 0.8314522197140707,
"grad_norm": 17.287385940551758,
"learning_rate": 1.7828843106180668e-06,
"loss": 1.7773,
"step": 1105
},
{
"epoch": 0.8322046651617758,
"grad_norm": 39.13855743408203,
"learning_rate": 1.7749603803486532e-06,
"loss": 2.3174,
"step": 1106
},
{
"epoch": 0.8329571106094809,
"grad_norm": 27.369783401489258,
"learning_rate": 1.7670364500792394e-06,
"loss": 2.6826,
"step": 1107
},
{
"epoch": 0.8337095560571859,
"grad_norm": 29.145048141479492,
"learning_rate": 1.7591125198098258e-06,
"loss": 2.6401,
"step": 1108
},
{
"epoch": 0.8344620015048909,
"grad_norm": 19.961219787597656,
"learning_rate": 1.7511885895404122e-06,
"loss": 1.9082,
"step": 1109
},
{
"epoch": 0.835214446952596,
"grad_norm": 35.9928092956543,
"learning_rate": 1.7432646592709988e-06,
"loss": 2.5713,
"step": 1110
},
{
"epoch": 0.835966892400301,
"grad_norm": 20.073556900024414,
"learning_rate": 1.735340729001585e-06,
"loss": 2.1055,
"step": 1111
},
{
"epoch": 0.836719337848006,
"grad_norm": 22.781147003173828,
"learning_rate": 1.7274167987321714e-06,
"loss": 2.4678,
"step": 1112
},
{
"epoch": 0.837471783295711,
"grad_norm": 18.387203216552734,
"learning_rate": 1.7194928684627576e-06,
"loss": 2.2944,
"step": 1113
},
{
"epoch": 0.8382242287434161,
"grad_norm": 26.837833404541016,
"learning_rate": 1.711568938193344e-06,
"loss": 2.8203,
"step": 1114
},
{
"epoch": 0.8389766741911211,
"grad_norm": 22.23270034790039,
"learning_rate": 1.7036450079239304e-06,
"loss": 1.7646,
"step": 1115
},
{
"epoch": 0.8397291196388262,
"grad_norm": 23.629858016967773,
"learning_rate": 1.6957210776545168e-06,
"loss": 1.7197,
"step": 1116
},
{
"epoch": 0.8404815650865313,
"grad_norm": 21.33991813659668,
"learning_rate": 1.6877971473851032e-06,
"loss": 2.0195,
"step": 1117
},
{
"epoch": 0.8412340105342363,
"grad_norm": 21.652618408203125,
"learning_rate": 1.6798732171156896e-06,
"loss": 1.9268,
"step": 1118
},
{
"epoch": 0.8419864559819413,
"grad_norm": 29.22657012939453,
"learning_rate": 1.6719492868462758e-06,
"loss": 2.5576,
"step": 1119
},
{
"epoch": 0.8427389014296464,
"grad_norm": 26.373394012451172,
"learning_rate": 1.6640253565768622e-06,
"loss": 2.292,
"step": 1120
},
{
"epoch": 0.8434913468773514,
"grad_norm": 21.04275131225586,
"learning_rate": 1.6561014263074488e-06,
"loss": 2.2852,
"step": 1121
},
{
"epoch": 0.8442437923250564,
"grad_norm": 24.09245491027832,
"learning_rate": 1.648177496038035e-06,
"loss": 2.3447,
"step": 1122
},
{
"epoch": 0.8449962377727614,
"grad_norm": 29.287586212158203,
"learning_rate": 1.6402535657686214e-06,
"loss": 2.23,
"step": 1123
},
{
"epoch": 0.8457486832204665,
"grad_norm": 19.69169807434082,
"learning_rate": 1.6323296354992076e-06,
"loss": 2.1064,
"step": 1124
},
{
"epoch": 0.8465011286681715,
"grad_norm": 19.223487854003906,
"learning_rate": 1.624405705229794e-06,
"loss": 2.291,
"step": 1125
},
{
"epoch": 0.8472535741158767,
"grad_norm": 21.353008270263672,
"learning_rate": 1.6164817749603806e-06,
"loss": 1.6641,
"step": 1126
},
{
"epoch": 0.8480060195635817,
"grad_norm": 28.44927978515625,
"learning_rate": 1.608557844690967e-06,
"loss": 2.8379,
"step": 1127
},
{
"epoch": 0.8487584650112867,
"grad_norm": 20.611202239990234,
"learning_rate": 1.6006339144215532e-06,
"loss": 2.1348,
"step": 1128
},
{
"epoch": 0.8495109104589917,
"grad_norm": 20.636144638061523,
"learning_rate": 1.5927099841521396e-06,
"loss": 1.8086,
"step": 1129
},
{
"epoch": 0.8502633559066968,
"grad_norm": 29.253482818603516,
"learning_rate": 1.5847860538827258e-06,
"loss": 2.395,
"step": 1130
},
{
"epoch": 0.8510158013544018,
"grad_norm": 27.642684936523438,
"learning_rate": 1.5768621236133122e-06,
"loss": 1.9863,
"step": 1131
},
{
"epoch": 0.8517682468021068,
"grad_norm": 38.00718688964844,
"learning_rate": 1.5689381933438988e-06,
"loss": 2.6221,
"step": 1132
},
{
"epoch": 0.8525206922498119,
"grad_norm": 31.242435455322266,
"learning_rate": 1.561014263074485e-06,
"loss": 2.2764,
"step": 1133
},
{
"epoch": 0.8532731376975169,
"grad_norm": 37.08576202392578,
"learning_rate": 1.5530903328050714e-06,
"loss": 2.2881,
"step": 1134
},
{
"epoch": 0.8540255831452219,
"grad_norm": 25.496156692504883,
"learning_rate": 1.5451664025356578e-06,
"loss": 2.1523,
"step": 1135
},
{
"epoch": 0.8547780285929271,
"grad_norm": 24.68577766418457,
"learning_rate": 1.537242472266244e-06,
"loss": 2.0137,
"step": 1136
},
{
"epoch": 0.8555304740406321,
"grad_norm": 19.2375431060791,
"learning_rate": 1.5293185419968307e-06,
"loss": 1.9766,
"step": 1137
},
{
"epoch": 0.8562829194883371,
"grad_norm": 23.542686462402344,
"learning_rate": 1.521394611727417e-06,
"loss": 2.2139,
"step": 1138
},
{
"epoch": 0.8570353649360422,
"grad_norm": 20.412443161010742,
"learning_rate": 1.5134706814580033e-06,
"loss": 2.1104,
"step": 1139
},
{
"epoch": 0.8577878103837472,
"grad_norm": 28.579959869384766,
"learning_rate": 1.5055467511885897e-06,
"loss": 2.4404,
"step": 1140
},
{
"epoch": 0.8585402558314522,
"grad_norm": 17.60219383239746,
"learning_rate": 1.4976228209191759e-06,
"loss": 2.1152,
"step": 1141
},
{
"epoch": 0.8592927012791572,
"grad_norm": 23.072465896606445,
"learning_rate": 1.4896988906497625e-06,
"loss": 2.292,
"step": 1142
},
{
"epoch": 0.8600451467268623,
"grad_norm": 18.52507972717285,
"learning_rate": 1.4817749603803489e-06,
"loss": 2.2676,
"step": 1143
},
{
"epoch": 0.8607975921745673,
"grad_norm": 20.507993698120117,
"learning_rate": 1.473851030110935e-06,
"loss": 1.5454,
"step": 1144
},
{
"epoch": 0.8615500376222723,
"grad_norm": 21.937292098999023,
"learning_rate": 1.4659270998415215e-06,
"loss": 2.5098,
"step": 1145
},
{
"epoch": 0.8623024830699775,
"grad_norm": 30.636445999145508,
"learning_rate": 1.4580031695721079e-06,
"loss": 2.6172,
"step": 1146
},
{
"epoch": 0.8630549285176825,
"grad_norm": 22.22297477722168,
"learning_rate": 1.450079239302694e-06,
"loss": 1.6914,
"step": 1147
},
{
"epoch": 0.8638073739653875,
"grad_norm": 22.2825984954834,
"learning_rate": 1.4421553090332807e-06,
"loss": 2.167,
"step": 1148
},
{
"epoch": 0.8645598194130926,
"grad_norm": 18.126550674438477,
"learning_rate": 1.434231378763867e-06,
"loss": 2.0254,
"step": 1149
},
{
"epoch": 0.8653122648607976,
"grad_norm": 19.636568069458008,
"learning_rate": 1.4263074484944533e-06,
"loss": 1.687,
"step": 1150
},
{
"epoch": 0.8660647103085026,
"grad_norm": 20.647727966308594,
"learning_rate": 1.4183835182250397e-06,
"loss": 2.0898,
"step": 1151
},
{
"epoch": 0.8668171557562077,
"grad_norm": 22.94881248474121,
"learning_rate": 1.4104595879556259e-06,
"loss": 2.3364,
"step": 1152
},
{
"epoch": 0.8675696012039127,
"grad_norm": 25.914772033691406,
"learning_rate": 1.4025356576862125e-06,
"loss": 1.8203,
"step": 1153
},
{
"epoch": 0.8683220466516177,
"grad_norm": 27.735950469970703,
"learning_rate": 1.394611727416799e-06,
"loss": 2.1641,
"step": 1154
},
{
"epoch": 0.8690744920993227,
"grad_norm": 23.504257202148438,
"learning_rate": 1.3866877971473853e-06,
"loss": 2.0322,
"step": 1155
},
{
"epoch": 0.8698269375470279,
"grad_norm": 19.194591522216797,
"learning_rate": 1.3787638668779715e-06,
"loss": 1.8711,
"step": 1156
},
{
"epoch": 0.8705793829947329,
"grad_norm": 22.447521209716797,
"learning_rate": 1.370839936608558e-06,
"loss": 2.062,
"step": 1157
},
{
"epoch": 0.871331828442438,
"grad_norm": 21.444046020507812,
"learning_rate": 1.3629160063391445e-06,
"loss": 2.0732,
"step": 1158
},
{
"epoch": 0.872084273890143,
"grad_norm": 32.98313522338867,
"learning_rate": 1.3549920760697307e-06,
"loss": 2.6113,
"step": 1159
},
{
"epoch": 0.872836719337848,
"grad_norm": 30.398639678955078,
"learning_rate": 1.3470681458003171e-06,
"loss": 2.2158,
"step": 1160
},
{
"epoch": 0.873589164785553,
"grad_norm": 24.345083236694336,
"learning_rate": 1.3391442155309033e-06,
"loss": 2.1172,
"step": 1161
},
{
"epoch": 0.8743416102332581,
"grad_norm": 29.294410705566406,
"learning_rate": 1.3312202852614897e-06,
"loss": 2.8584,
"step": 1162
},
{
"epoch": 0.8750940556809631,
"grad_norm": 27.93486785888672,
"learning_rate": 1.3232963549920763e-06,
"loss": 1.9658,
"step": 1163
},
{
"epoch": 0.8758465011286681,
"grad_norm": 24.938095092773438,
"learning_rate": 1.3153724247226625e-06,
"loss": 2.3027,
"step": 1164
},
{
"epoch": 0.8765989465763732,
"grad_norm": 22.570066452026367,
"learning_rate": 1.307448494453249e-06,
"loss": 2.3174,
"step": 1165
},
{
"epoch": 0.8773513920240783,
"grad_norm": 19.348644256591797,
"learning_rate": 1.2995245641838353e-06,
"loss": 1.8423,
"step": 1166
},
{
"epoch": 0.8781038374717833,
"grad_norm": 23.712268829345703,
"learning_rate": 1.2916006339144215e-06,
"loss": 2.5195,
"step": 1167
},
{
"epoch": 0.8788562829194884,
"grad_norm": 21.44355583190918,
"learning_rate": 1.283676703645008e-06,
"loss": 1.875,
"step": 1168
},
{
"epoch": 0.8796087283671934,
"grad_norm": 24.918310165405273,
"learning_rate": 1.2757527733755946e-06,
"loss": 2.2021,
"step": 1169
},
{
"epoch": 0.8803611738148984,
"grad_norm": 21.855985641479492,
"learning_rate": 1.2678288431061807e-06,
"loss": 2.022,
"step": 1170
},
{
"epoch": 0.8811136192626035,
"grad_norm": 21.790687561035156,
"learning_rate": 1.2599049128367671e-06,
"loss": 2.29,
"step": 1171
},
{
"epoch": 0.8818660647103085,
"grad_norm": 20.67224884033203,
"learning_rate": 1.2519809825673536e-06,
"loss": 1.9756,
"step": 1172
},
{
"epoch": 0.8826185101580135,
"grad_norm": 23.416410446166992,
"learning_rate": 1.24405705229794e-06,
"loss": 2.1387,
"step": 1173
},
{
"epoch": 0.8833709556057185,
"grad_norm": 29.197629928588867,
"learning_rate": 1.2361331220285262e-06,
"loss": 2.4033,
"step": 1174
},
{
"epoch": 0.8841234010534236,
"grad_norm": 22.050270080566406,
"learning_rate": 1.2282091917591128e-06,
"loss": 1.6289,
"step": 1175
},
{
"epoch": 0.8848758465011287,
"grad_norm": 16.857227325439453,
"learning_rate": 1.220285261489699e-06,
"loss": 1.8237,
"step": 1176
},
{
"epoch": 0.8856282919488337,
"grad_norm": 28.409175872802734,
"learning_rate": 1.2123613312202854e-06,
"loss": 2.082,
"step": 1177
},
{
"epoch": 0.8863807373965388,
"grad_norm": 22.208608627319336,
"learning_rate": 1.2044374009508718e-06,
"loss": 2.5762,
"step": 1178
},
{
"epoch": 0.8871331828442438,
"grad_norm": 15.535444259643555,
"learning_rate": 1.1965134706814582e-06,
"loss": 1.9907,
"step": 1179
},
{
"epoch": 0.8878856282919488,
"grad_norm": 27.926977157592773,
"learning_rate": 1.1885895404120444e-06,
"loss": 1.917,
"step": 1180
},
{
"epoch": 0.8886380737396539,
"grad_norm": 32.624481201171875,
"learning_rate": 1.1806656101426308e-06,
"loss": 1.686,
"step": 1181
},
{
"epoch": 0.8893905191873589,
"grad_norm": 19.761180877685547,
"learning_rate": 1.1727416798732172e-06,
"loss": 2.1934,
"step": 1182
},
{
"epoch": 0.8901429646350639,
"grad_norm": 28.41836929321289,
"learning_rate": 1.1648177496038036e-06,
"loss": 2.5391,
"step": 1183
},
{
"epoch": 0.890895410082769,
"grad_norm": 23.069141387939453,
"learning_rate": 1.15689381933439e-06,
"loss": 2.126,
"step": 1184
},
{
"epoch": 0.891647855530474,
"grad_norm": 31.388771057128906,
"learning_rate": 1.1489698890649762e-06,
"loss": 2.4609,
"step": 1185
},
{
"epoch": 0.8924003009781791,
"grad_norm": 18.07545280456543,
"learning_rate": 1.1410459587955628e-06,
"loss": 1.8765,
"step": 1186
},
{
"epoch": 0.8931527464258842,
"grad_norm": 21.415142059326172,
"learning_rate": 1.133122028526149e-06,
"loss": 1.7529,
"step": 1187
},
{
"epoch": 0.8939051918735892,
"grad_norm": 17.008604049682617,
"learning_rate": 1.1251980982567354e-06,
"loss": 1.4946,
"step": 1188
},
{
"epoch": 0.8946576373212942,
"grad_norm": 23.807188034057617,
"learning_rate": 1.1172741679873218e-06,
"loss": 2.0234,
"step": 1189
},
{
"epoch": 0.8954100827689992,
"grad_norm": 18.214134216308594,
"learning_rate": 1.1093502377179082e-06,
"loss": 1.8613,
"step": 1190
},
{
"epoch": 0.8961625282167043,
"grad_norm": 24.6344051361084,
"learning_rate": 1.1014263074484946e-06,
"loss": 2.5996,
"step": 1191
},
{
"epoch": 0.8969149736644093,
"grad_norm": 17.742876052856445,
"learning_rate": 1.093502377179081e-06,
"loss": 1.7769,
"step": 1192
},
{
"epoch": 0.8976674191121143,
"grad_norm": 23.508689880371094,
"learning_rate": 1.0855784469096672e-06,
"loss": 2.0527,
"step": 1193
},
{
"epoch": 0.8984198645598194,
"grad_norm": 27.96549415588379,
"learning_rate": 1.0776545166402536e-06,
"loss": 2.5054,
"step": 1194
},
{
"epoch": 0.8991723100075244,
"grad_norm": 23.357175827026367,
"learning_rate": 1.06973058637084e-06,
"loss": 1.9805,
"step": 1195
},
{
"epoch": 0.8999247554552295,
"grad_norm": 26.538541793823242,
"learning_rate": 1.0618066561014264e-06,
"loss": 2.2412,
"step": 1196
},
{
"epoch": 0.9006772009029346,
"grad_norm": 20.19118881225586,
"learning_rate": 1.0538827258320128e-06,
"loss": 1.9414,
"step": 1197
},
{
"epoch": 0.9014296463506396,
"grad_norm": 24.33431625366211,
"learning_rate": 1.045958795562599e-06,
"loss": 2.0566,
"step": 1198
},
{
"epoch": 0.9021820917983446,
"grad_norm": 22.80762481689453,
"learning_rate": 1.0380348652931856e-06,
"loss": 2.4951,
"step": 1199
},
{
"epoch": 0.9029345372460497,
"grad_norm": 25.68887710571289,
"learning_rate": 1.0301109350237718e-06,
"loss": 2.457,
"step": 1200
},
{
"epoch": 0.9036869826937547,
"grad_norm": 32.62685012817383,
"learning_rate": 1.0221870047543582e-06,
"loss": 2.1187,
"step": 1201
},
{
"epoch": 0.9044394281414597,
"grad_norm": 21.595977783203125,
"learning_rate": 1.0142630744849446e-06,
"loss": 2.019,
"step": 1202
},
{
"epoch": 0.9051918735891648,
"grad_norm": 22.366758346557617,
"learning_rate": 1.006339144215531e-06,
"loss": 2.5293,
"step": 1203
},
{
"epoch": 0.9059443190368698,
"grad_norm": 22.554767608642578,
"learning_rate": 9.984152139461172e-07,
"loss": 2.3076,
"step": 1204
},
{
"epoch": 0.9066967644845748,
"grad_norm": 23.065105438232422,
"learning_rate": 9.904912836767039e-07,
"loss": 2.1777,
"step": 1205
},
{
"epoch": 0.90744920993228,
"grad_norm": 20.888399124145508,
"learning_rate": 9.8256735340729e-07,
"loss": 2.0293,
"step": 1206
},
{
"epoch": 0.908201655379985,
"grad_norm": 23.42367935180664,
"learning_rate": 9.746434231378764e-07,
"loss": 1.9868,
"step": 1207
},
{
"epoch": 0.90895410082769,
"grad_norm": 28.955568313598633,
"learning_rate": 9.667194928684629e-07,
"loss": 2.3193,
"step": 1208
},
{
"epoch": 0.909706546275395,
"grad_norm": 24.234487533569336,
"learning_rate": 9.587955625990493e-07,
"loss": 2.29,
"step": 1209
},
{
"epoch": 0.9104589917231001,
"grad_norm": 28.754636764526367,
"learning_rate": 9.508716323296357e-07,
"loss": 1.8267,
"step": 1210
},
{
"epoch": 0.9112114371708051,
"grad_norm": 39.35755920410156,
"learning_rate": 9.42947702060222e-07,
"loss": 2.5771,
"step": 1211
},
{
"epoch": 0.9119638826185101,
"grad_norm": 32.07710647583008,
"learning_rate": 9.350237717908083e-07,
"loss": 2.2168,
"step": 1212
},
{
"epoch": 0.9127163280662152,
"grad_norm": 20.59889030456543,
"learning_rate": 9.270998415213947e-07,
"loss": 1.8052,
"step": 1213
},
{
"epoch": 0.9134687735139202,
"grad_norm": 28.78321647644043,
"learning_rate": 9.191759112519811e-07,
"loss": 2.0625,
"step": 1214
},
{
"epoch": 0.9142212189616253,
"grad_norm": 28.022022247314453,
"learning_rate": 9.112519809825675e-07,
"loss": 2.3125,
"step": 1215
},
{
"epoch": 0.9149736644093304,
"grad_norm": 25.90964698791504,
"learning_rate": 9.033280507131538e-07,
"loss": 2.3125,
"step": 1216
},
{
"epoch": 0.9157261098570354,
"grad_norm": 23.057811737060547,
"learning_rate": 8.954041204437401e-07,
"loss": 1.9546,
"step": 1217
},
{
"epoch": 0.9164785553047404,
"grad_norm": 29.27845001220703,
"learning_rate": 8.874801901743266e-07,
"loss": 2.0498,
"step": 1218
},
{
"epoch": 0.9172310007524455,
"grad_norm": 18.934534072875977,
"learning_rate": 8.795562599049129e-07,
"loss": 1.6738,
"step": 1219
},
{
"epoch": 0.9179834462001505,
"grad_norm": 24.765535354614258,
"learning_rate": 8.716323296354994e-07,
"loss": 1.8662,
"step": 1220
},
{
"epoch": 0.9187358916478555,
"grad_norm": 24.855249404907227,
"learning_rate": 8.637083993660857e-07,
"loss": 2.2725,
"step": 1221
},
{
"epoch": 0.9194883370955605,
"grad_norm": 25.016971588134766,
"learning_rate": 8.55784469096672e-07,
"loss": 2.1836,
"step": 1222
},
{
"epoch": 0.9202407825432656,
"grad_norm": 21.017953872680664,
"learning_rate": 8.478605388272584e-07,
"loss": 1.5679,
"step": 1223
},
{
"epoch": 0.9209932279909706,
"grad_norm": 22.584678649902344,
"learning_rate": 8.399366085578448e-07,
"loss": 2.208,
"step": 1224
},
{
"epoch": 0.9217456734386757,
"grad_norm": 28.47764778137207,
"learning_rate": 8.320126782884311e-07,
"loss": 2.3271,
"step": 1225
},
{
"epoch": 0.9224981188863808,
"grad_norm": 19.13991928100586,
"learning_rate": 8.240887480190175e-07,
"loss": 2.0029,
"step": 1226
},
{
"epoch": 0.9232505643340858,
"grad_norm": 22.010602951049805,
"learning_rate": 8.161648177496038e-07,
"loss": 2.271,
"step": 1227
},
{
"epoch": 0.9240030097817908,
"grad_norm": 20.220909118652344,
"learning_rate": 8.082408874801903e-07,
"loss": 1.8848,
"step": 1228
},
{
"epoch": 0.9247554552294959,
"grad_norm": 22.18108558654785,
"learning_rate": 8.003169572107766e-07,
"loss": 1.9492,
"step": 1229
},
{
"epoch": 0.9255079006772009,
"grad_norm": 23.985708236694336,
"learning_rate": 7.923930269413629e-07,
"loss": 2.5166,
"step": 1230
},
{
"epoch": 0.9262603461249059,
"grad_norm": 22.24486541748047,
"learning_rate": 7.844690966719494e-07,
"loss": 2.1641,
"step": 1231
},
{
"epoch": 0.927012791572611,
"grad_norm": 24.414819717407227,
"learning_rate": 7.765451664025357e-07,
"loss": 1.9678,
"step": 1232
},
{
"epoch": 0.927765237020316,
"grad_norm": 24.730697631835938,
"learning_rate": 7.68621236133122e-07,
"loss": 2.6123,
"step": 1233
},
{
"epoch": 0.928517682468021,
"grad_norm": 27.271892547607422,
"learning_rate": 7.606973058637085e-07,
"loss": 1.9854,
"step": 1234
},
{
"epoch": 0.9292701279157262,
"grad_norm": 29.186485290527344,
"learning_rate": 7.527733755942948e-07,
"loss": 2.0908,
"step": 1235
},
{
"epoch": 0.9300225733634312,
"grad_norm": 28.539831161499023,
"learning_rate": 7.448494453248812e-07,
"loss": 2.0962,
"step": 1236
},
{
"epoch": 0.9307750188111362,
"grad_norm": 37.84783172607422,
"learning_rate": 7.369255150554675e-07,
"loss": 1.8511,
"step": 1237
},
{
"epoch": 0.9315274642588413,
"grad_norm": 20.103530883789062,
"learning_rate": 7.290015847860539e-07,
"loss": 2.293,
"step": 1238
},
{
"epoch": 0.9322799097065463,
"grad_norm": 20.439922332763672,
"learning_rate": 7.210776545166403e-07,
"loss": 2.252,
"step": 1239
},
{
"epoch": 0.9330323551542513,
"grad_norm": 23.894981384277344,
"learning_rate": 7.131537242472266e-07,
"loss": 2.6875,
"step": 1240
},
{
"epoch": 0.9337848006019563,
"grad_norm": 19.7099666595459,
"learning_rate": 7.052297939778129e-07,
"loss": 2.0059,
"step": 1241
},
{
"epoch": 0.9345372460496614,
"grad_norm": 33.36943054199219,
"learning_rate": 6.973058637083995e-07,
"loss": 2.6074,
"step": 1242
},
{
"epoch": 0.9352896914973664,
"grad_norm": 35.51583480834961,
"learning_rate": 6.893819334389858e-07,
"loss": 2.46,
"step": 1243
},
{
"epoch": 0.9360421369450714,
"grad_norm": 24.268138885498047,
"learning_rate": 6.814580031695723e-07,
"loss": 2.5137,
"step": 1244
},
{
"epoch": 0.9367945823927766,
"grad_norm": 23.564882278442383,
"learning_rate": 6.735340729001586e-07,
"loss": 2.5801,
"step": 1245
},
{
"epoch": 0.9375470278404816,
"grad_norm": 25.725584030151367,
"learning_rate": 6.656101426307449e-07,
"loss": 2.377,
"step": 1246
},
{
"epoch": 0.9382994732881866,
"grad_norm": 22.05669593811035,
"learning_rate": 6.576862123613313e-07,
"loss": 2.0371,
"step": 1247
},
{
"epoch": 0.9390519187358917,
"grad_norm": 22.974348068237305,
"learning_rate": 6.497622820919177e-07,
"loss": 2.25,
"step": 1248
},
{
"epoch": 0.9398043641835967,
"grad_norm": 18.700010299682617,
"learning_rate": 6.41838351822504e-07,
"loss": 1.9766,
"step": 1249
},
{
"epoch": 0.9405568096313017,
"grad_norm": 30.736019134521484,
"learning_rate": 6.339144215530904e-07,
"loss": 2.3047,
"step": 1250
},
{
"epoch": 0.9413092550790068,
"grad_norm": 19.181961059570312,
"learning_rate": 6.259904912836768e-07,
"loss": 1.6123,
"step": 1251
},
{
"epoch": 0.9420617005267118,
"grad_norm": 24.28353500366211,
"learning_rate": 6.180665610142631e-07,
"loss": 3.0742,
"step": 1252
},
{
"epoch": 0.9428141459744168,
"grad_norm": 17.89773941040039,
"learning_rate": 6.101426307448495e-07,
"loss": 2.1025,
"step": 1253
},
{
"epoch": 0.9435665914221218,
"grad_norm": 20.147546768188477,
"learning_rate": 6.022187004754359e-07,
"loss": 1.6606,
"step": 1254
},
{
"epoch": 0.944319036869827,
"grad_norm": 22.10161781311035,
"learning_rate": 5.942947702060222e-07,
"loss": 2.126,
"step": 1255
},
{
"epoch": 0.945071482317532,
"grad_norm": 20.982725143432617,
"learning_rate": 5.863708399366086e-07,
"loss": 1.7944,
"step": 1256
},
{
"epoch": 0.945823927765237,
"grad_norm": 28.338546752929688,
"learning_rate": 5.78446909667195e-07,
"loss": 1.9727,
"step": 1257
},
{
"epoch": 0.9465763732129421,
"grad_norm": 24.861492156982422,
"learning_rate": 5.705229793977814e-07,
"loss": 2.1104,
"step": 1258
},
{
"epoch": 0.9473288186606471,
"grad_norm": 20.712827682495117,
"learning_rate": 5.625990491283677e-07,
"loss": 2.1768,
"step": 1259
},
{
"epoch": 0.9480812641083521,
"grad_norm": 17.77390480041504,
"learning_rate": 5.546751188589541e-07,
"loss": 1.8994,
"step": 1260
},
{
"epoch": 0.9488337095560572,
"grad_norm": 16.790987014770508,
"learning_rate": 5.467511885895405e-07,
"loss": 2.24,
"step": 1261
},
{
"epoch": 0.9495861550037622,
"grad_norm": 30.8397159576416,
"learning_rate": 5.388272583201268e-07,
"loss": 2.4854,
"step": 1262
},
{
"epoch": 0.9503386004514672,
"grad_norm": 21.42900276184082,
"learning_rate": 5.309033280507132e-07,
"loss": 2.0771,
"step": 1263
},
{
"epoch": 0.9510910458991723,
"grad_norm": 18.064916610717773,
"learning_rate": 5.229793977812995e-07,
"loss": 1.6284,
"step": 1264
},
{
"epoch": 0.9518434913468774,
"grad_norm": 25.005502700805664,
"learning_rate": 5.150554675118859e-07,
"loss": 2.2305,
"step": 1265
},
{
"epoch": 0.9525959367945824,
"grad_norm": 27.3756160736084,
"learning_rate": 5.071315372424723e-07,
"loss": 2.4912,
"step": 1266
},
{
"epoch": 0.9533483822422875,
"grad_norm": 19.913427352905273,
"learning_rate": 4.992076069730586e-07,
"loss": 2.0059,
"step": 1267
},
{
"epoch": 0.9541008276899925,
"grad_norm": 33.70522689819336,
"learning_rate": 4.91283676703645e-07,
"loss": 1.7095,
"step": 1268
},
{
"epoch": 0.9548532731376975,
"grad_norm": 20.042531967163086,
"learning_rate": 4.833597464342314e-07,
"loss": 2.4277,
"step": 1269
},
{
"epoch": 0.9556057185854026,
"grad_norm": 23.788270950317383,
"learning_rate": 4.7543581616481783e-07,
"loss": 2.3633,
"step": 1270
},
{
"epoch": 0.9563581640331076,
"grad_norm": 20.8429012298584,
"learning_rate": 4.6751188589540413e-07,
"loss": 2.3979,
"step": 1271
},
{
"epoch": 0.9571106094808126,
"grad_norm": 36.584800720214844,
"learning_rate": 4.5958795562599054e-07,
"loss": 2.1558,
"step": 1272
},
{
"epoch": 0.9578630549285176,
"grad_norm": 25.138290405273438,
"learning_rate": 4.516640253565769e-07,
"loss": 2.4678,
"step": 1273
},
{
"epoch": 0.9586155003762227,
"grad_norm": 32.93474197387695,
"learning_rate": 4.437400950871633e-07,
"loss": 2.0742,
"step": 1274
},
{
"epoch": 0.9593679458239278,
"grad_norm": 26.20588493347168,
"learning_rate": 4.358161648177497e-07,
"loss": 1.9326,
"step": 1275
},
{
"epoch": 0.9601203912716328,
"grad_norm": 25.365707397460938,
"learning_rate": 4.27892234548336e-07,
"loss": 2.0967,
"step": 1276
},
{
"epoch": 0.9608728367193379,
"grad_norm": 22.695459365844727,
"learning_rate": 4.199683042789224e-07,
"loss": 2.4688,
"step": 1277
},
{
"epoch": 0.9616252821670429,
"grad_norm": 19.69232177734375,
"learning_rate": 4.1204437400950875e-07,
"loss": 2.249,
"step": 1278
},
{
"epoch": 0.9623777276147479,
"grad_norm": 23.908605575561523,
"learning_rate": 4.0412044374009516e-07,
"loss": 2.3076,
"step": 1279
},
{
"epoch": 0.963130173062453,
"grad_norm": 22.21122169494629,
"learning_rate": 3.9619651347068146e-07,
"loss": 1.8652,
"step": 1280
},
{
"epoch": 0.963882618510158,
"grad_norm": 20.96678352355957,
"learning_rate": 3.8827258320126786e-07,
"loss": 2.583,
"step": 1281
},
{
"epoch": 0.964635063957863,
"grad_norm": 22.674808502197266,
"learning_rate": 3.8034865293185427e-07,
"loss": 2.2803,
"step": 1282
},
{
"epoch": 0.9653875094055681,
"grad_norm": 24.316547393798828,
"learning_rate": 3.724247226624406e-07,
"loss": 2.5156,
"step": 1283
},
{
"epoch": 0.9661399548532731,
"grad_norm": 21.583770751953125,
"learning_rate": 3.6450079239302697e-07,
"loss": 2.2202,
"step": 1284
},
{
"epoch": 0.9668924003009782,
"grad_norm": 22.84387969970703,
"learning_rate": 3.565768621236133e-07,
"loss": 2.4961,
"step": 1285
},
{
"epoch": 0.9676448457486833,
"grad_norm": 27.4648380279541,
"learning_rate": 3.486529318541997e-07,
"loss": 2.7754,
"step": 1286
},
{
"epoch": 0.9683972911963883,
"grad_norm": 23.863908767700195,
"learning_rate": 3.4072900158478613e-07,
"loss": 2.4111,
"step": 1287
},
{
"epoch": 0.9691497366440933,
"grad_norm": 27.128982543945312,
"learning_rate": 3.3280507131537243e-07,
"loss": 2.2827,
"step": 1288
},
{
"epoch": 0.9699021820917983,
"grad_norm": 25.93006134033203,
"learning_rate": 3.2488114104595883e-07,
"loss": 2.3442,
"step": 1289
},
{
"epoch": 0.9706546275395034,
"grad_norm": 26.65546417236328,
"learning_rate": 3.169572107765452e-07,
"loss": 2.1821,
"step": 1290
},
{
"epoch": 0.9714070729872084,
"grad_norm": 43.7320442199707,
"learning_rate": 3.0903328050713154e-07,
"loss": 2.998,
"step": 1291
},
{
"epoch": 0.9721595184349134,
"grad_norm": 20.625991821289062,
"learning_rate": 3.0110935023771794e-07,
"loss": 2.6406,
"step": 1292
},
{
"epoch": 0.9729119638826185,
"grad_norm": 19.000179290771484,
"learning_rate": 2.931854199683043e-07,
"loss": 2.1602,
"step": 1293
},
{
"epoch": 0.9736644093303235,
"grad_norm": 21.424129486083984,
"learning_rate": 2.852614896988907e-07,
"loss": 2.0127,
"step": 1294
},
{
"epoch": 0.9744168547780286,
"grad_norm": 29.558860778808594,
"learning_rate": 2.7733755942947705e-07,
"loss": 2.7988,
"step": 1295
},
{
"epoch": 0.9751693002257337,
"grad_norm": 26.818078994750977,
"learning_rate": 2.694136291600634e-07,
"loss": 1.9385,
"step": 1296
},
{
"epoch": 0.9759217456734387,
"grad_norm": 23.0302791595459,
"learning_rate": 2.6148969889064975e-07,
"loss": 2.0264,
"step": 1297
},
{
"epoch": 0.9766741911211437,
"grad_norm": 19.594850540161133,
"learning_rate": 2.5356576862123616e-07,
"loss": 1.9297,
"step": 1298
},
{
"epoch": 0.9774266365688488,
"grad_norm": 29.594823837280273,
"learning_rate": 2.456418383518225e-07,
"loss": 2.3984,
"step": 1299
},
{
"epoch": 0.9781790820165538,
"grad_norm": 18.358137130737305,
"learning_rate": 2.3771790808240892e-07,
"loss": 2.019,
"step": 1300
},
{
"epoch": 0.9789315274642588,
"grad_norm": 26.450931549072266,
"learning_rate": 2.2979397781299527e-07,
"loss": 2.0542,
"step": 1301
},
{
"epoch": 0.9796839729119639,
"grad_norm": 20.932947158813477,
"learning_rate": 2.2187004754358165e-07,
"loss": 2.2026,
"step": 1302
},
{
"epoch": 0.9804364183596689,
"grad_norm": 21.195083618164062,
"learning_rate": 2.13946117274168e-07,
"loss": 2.2148,
"step": 1303
},
{
"epoch": 0.9811888638073739,
"grad_norm": 25.654787063598633,
"learning_rate": 2.0602218700475438e-07,
"loss": 2.0049,
"step": 1304
},
{
"epoch": 0.981941309255079,
"grad_norm": 28.244712829589844,
"learning_rate": 1.9809825673534073e-07,
"loss": 1.793,
"step": 1305
},
{
"epoch": 0.9826937547027841,
"grad_norm": 30.86054801940918,
"learning_rate": 1.9017432646592713e-07,
"loss": 2.1162,
"step": 1306
},
{
"epoch": 0.9834462001504891,
"grad_norm": 19.956132888793945,
"learning_rate": 1.8225039619651348e-07,
"loss": 2.2725,
"step": 1307
},
{
"epoch": 0.9841986455981941,
"grad_norm": 22.997634887695312,
"learning_rate": 1.7432646592709986e-07,
"loss": 2.4033,
"step": 1308
},
{
"epoch": 0.9849510910458992,
"grad_norm": 32.66743087768555,
"learning_rate": 1.6640253565768621e-07,
"loss": 2.792,
"step": 1309
},
{
"epoch": 0.9857035364936042,
"grad_norm": 18.8122501373291,
"learning_rate": 1.584786053882726e-07,
"loss": 1.9287,
"step": 1310
},
{
"epoch": 0.9864559819413092,
"grad_norm": 18.69032096862793,
"learning_rate": 1.5055467511885897e-07,
"loss": 2.1504,
"step": 1311
},
{
"epoch": 0.9872084273890143,
"grad_norm": 23.050729751586914,
"learning_rate": 1.4263074484944535e-07,
"loss": 2.1992,
"step": 1312
},
{
"epoch": 0.9879608728367193,
"grad_norm": 24.075252532958984,
"learning_rate": 1.347068145800317e-07,
"loss": 2.7788,
"step": 1313
},
{
"epoch": 0.9887133182844243,
"grad_norm": 21.160234451293945,
"learning_rate": 1.2678288431061808e-07,
"loss": 2.4111,
"step": 1314
},
{
"epoch": 0.9894657637321295,
"grad_norm": 21.52849578857422,
"learning_rate": 1.1885895404120446e-07,
"loss": 1.6572,
"step": 1315
},
{
"epoch": 0.9902182091798345,
"grad_norm": 25.144681930541992,
"learning_rate": 1.1093502377179082e-07,
"loss": 2.0186,
"step": 1316
},
{
"epoch": 0.9909706546275395,
"grad_norm": 24.97117042541504,
"learning_rate": 1.0301109350237719e-07,
"loss": 2.4766,
"step": 1317
},
{
"epoch": 0.9917231000752446,
"grad_norm": 21.990854263305664,
"learning_rate": 9.508716323296357e-08,
"loss": 2.3086,
"step": 1318
},
{
"epoch": 0.9924755455229496,
"grad_norm": 20.860063552856445,
"learning_rate": 8.716323296354993e-08,
"loss": 1.8662,
"step": 1319
},
{
"epoch": 0.9932279909706546,
"grad_norm": 31.04228401184082,
"learning_rate": 7.92393026941363e-08,
"loss": 2.043,
"step": 1320
},
{
"epoch": 0.9939804364183596,
"grad_norm": 22.26874351501465,
"learning_rate": 7.131537242472267e-08,
"loss": 2.2002,
"step": 1321
},
{
"epoch": 0.9947328818660647,
"grad_norm": 25.829164505004883,
"learning_rate": 6.339144215530904e-08,
"loss": 2.2036,
"step": 1322
},
{
"epoch": 0.9954853273137697,
"grad_norm": 23.04068374633789,
"learning_rate": 5.546751188589541e-08,
"loss": 2.2529,
"step": 1323
},
{
"epoch": 0.9962377727614747,
"grad_norm": 21.899280548095703,
"learning_rate": 4.754358161648178e-08,
"loss": 2.6411,
"step": 1324
},
{
"epoch": 0.9969902182091799,
"grad_norm": 28.25452423095703,
"learning_rate": 3.961965134706815e-08,
"loss": 2.1074,
"step": 1325
},
{
"epoch": 0.9977426636568849,
"grad_norm": 22.75929069519043,
"learning_rate": 3.169572107765452e-08,
"loss": 2.2275,
"step": 1326
},
{
"epoch": 0.9984951091045899,
"grad_norm": 32.01602554321289,
"learning_rate": 2.377179080824089e-08,
"loss": 2.252,
"step": 1327
},
{
"epoch": 0.999247554552295,
"grad_norm": 20.957666397094727,
"learning_rate": 1.584786053882726e-08,
"loss": 2.002,
"step": 1328
},
{
"epoch": 1.0,
"grad_norm": 21.312353134155273,
"learning_rate": 7.92393026941363e-09,
"loss": 2.4668,
"step": 1329
},
{
"epoch": 1.0,
"step": 1329,
"total_flos": 2.392704719865774e+18,
"train_loss": 2.6215605380572797,
"train_runtime": 1486.6252,
"train_samples_per_second": 228.706,
"train_steps_per_second": 0.894
}
],
"logging_steps": 1,
"max_steps": 1329,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.392704719865774e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}