{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.873015873015873,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015873015873015872,
"grad_norm": 0.4978693127632141,
"learning_rate": 5e-05,
"loss": 0.9589,
"step": 1
},
{
"epoch": 0.031746031746031744,
"grad_norm": 0.4809919595718384,
"learning_rate": 0.0001,
"loss": 0.9265,
"step": 2
},
{
"epoch": 0.047619047619047616,
"grad_norm": 0.5111315250396729,
"learning_rate": 9.999975227016531e-05,
"loss": 0.9665,
"step": 3
},
{
"epoch": 0.06349206349206349,
"grad_norm": 0.670375406742096,
"learning_rate": 9.999900908311602e-05,
"loss": 0.9922,
"step": 4
},
{
"epoch": 0.07936507936507936,
"grad_norm": 0.6541376113891602,
"learning_rate": 9.999777044621652e-05,
"loss": 0.7833,
"step": 5
},
{
"epoch": 0.09523809523809523,
"grad_norm": 0.8511355519294739,
"learning_rate": 9.999603637174071e-05,
"loss": 0.8339,
"step": 6
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.933815598487854,
"learning_rate": 9.999380687687188e-05,
"loss": 0.7231,
"step": 7
},
{
"epoch": 0.12698412698412698,
"grad_norm": 1.2876204252243042,
"learning_rate": 9.999108198370249e-05,
"loss": 0.9078,
"step": 8
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.2767138481140137,
"learning_rate": 9.998786171923407e-05,
"loss": 0.8635,
"step": 9
},
{
"epoch": 0.15873015873015872,
"grad_norm": 1.2097059488296509,
"learning_rate": 9.998414611537681e-05,
"loss": 0.8626,
"step": 10
},
{
"epoch": 0.1746031746031746,
"grad_norm": 1.4272280931472778,
"learning_rate": 9.997993520894937e-05,
"loss": 0.9185,
"step": 11
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.8439553380012512,
"learning_rate": 9.997522904167844e-05,
"loss": 0.6789,
"step": 12
},
{
"epoch": 0.20634920634920634,
"grad_norm": 0.8309609293937683,
"learning_rate": 9.997002766019832e-05,
"loss": 0.8529,
"step": 13
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.9561108350753784,
"learning_rate": 9.996433111605052e-05,
"loss": 0.8475,
"step": 14
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.6515036821365356,
"learning_rate": 9.99581394656832e-05,
"loss": 0.8345,
"step": 15
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.45641326904296875,
"learning_rate": 9.995145277045061e-05,
"loss": 0.6654,
"step": 16
},
{
"epoch": 0.2698412698412698,
"grad_norm": 0.2796855568885803,
"learning_rate": 9.994427109661253e-05,
"loss": 0.6643,
"step": 17
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.2947935461997986,
"learning_rate": 9.993659451533353e-05,
"loss": 0.7327,
"step": 18
},
{
"epoch": 0.30158730158730157,
"grad_norm": 0.3294975161552429,
"learning_rate": 9.992842310268233e-05,
"loss": 0.7466,
"step": 19
},
{
"epoch": 0.31746031746031744,
"grad_norm": 0.25266706943511963,
"learning_rate": 9.991975693963107e-05,
"loss": 0.6628,
"step": 20
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.26995259523391724,
"learning_rate": 9.99105961120544e-05,
"loss": 0.7145,
"step": 21
},
{
"epoch": 0.3492063492063492,
"grad_norm": 0.29081106185913086,
"learning_rate": 9.990094071072877e-05,
"loss": 0.6947,
"step": 22
},
{
"epoch": 0.36507936507936506,
"grad_norm": 0.277067095041275,
"learning_rate": 9.989079083133139e-05,
"loss": 0.7225,
"step": 23
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.27529773116111755,
"learning_rate": 9.988014657443941e-05,
"loss": 0.7122,
"step": 24
},
{
"epoch": 0.3968253968253968,
"grad_norm": 0.30684614181518555,
"learning_rate": 9.986900804552878e-05,
"loss": 0.7015,
"step": 25
},
{
"epoch": 0.4126984126984127,
"grad_norm": 0.30238044261932373,
"learning_rate": 9.985737535497337e-05,
"loss": 0.5781,
"step": 26
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.30560147762298584,
"learning_rate": 9.984524861804376e-05,
"loss": 0.5947,
"step": 27
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.280203253030777,
"learning_rate": 9.983262795490613e-05,
"loss": 0.7072,
"step": 28
},
{
"epoch": 0.4603174603174603,
"grad_norm": 0.28849631547927856,
"learning_rate": 9.981951349062106e-05,
"loss": 0.7074,
"step": 29
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.2815149426460266,
"learning_rate": 9.980590535514233e-05,
"loss": 0.5274,
"step": 30
},
{
"epoch": 0.49206349206349204,
"grad_norm": 0.26764699816703796,
"learning_rate": 9.979180368331558e-05,
"loss": 0.6645,
"step": 31
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.29958057403564453,
"learning_rate": 9.9777208614877e-05,
"loss": 0.7361,
"step": 32
},
{
"epoch": 0.5238095238095238,
"grad_norm": 0.26811736822128296,
"learning_rate": 9.976212029445194e-05,
"loss": 0.6962,
"step": 33
},
{
"epoch": 0.5396825396825397,
"grad_norm": 0.2567647695541382,
"learning_rate": 9.97465388715535e-05,
"loss": 0.6077,
"step": 34
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.25592276453971863,
"learning_rate": 9.9730464500581e-05,
"loss": 0.6288,
"step": 35
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.24128927290439606,
"learning_rate": 9.971389734081848e-05,
"loss": 0.5665,
"step": 36
},
{
"epoch": 0.5873015873015873,
"grad_norm": 0.2471931427717209,
"learning_rate": 9.969683755643317e-05,
"loss": 0.7,
"step": 37
},
{
"epoch": 0.6031746031746031,
"grad_norm": 0.24910229444503784,
"learning_rate": 9.967928531647374e-05,
"loss": 0.5286,
"step": 38
},
{
"epoch": 0.6190476190476191,
"grad_norm": 0.29654461145401,
"learning_rate": 9.966124079486872e-05,
"loss": 0.6379,
"step": 39
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.23167571425437927,
"learning_rate": 9.96427041704248e-05,
"loss": 0.5028,
"step": 40
},
{
"epoch": 0.6507936507936508,
"grad_norm": 0.3802570402622223,
"learning_rate": 9.962367562682496e-05,
"loss": 0.7501,
"step": 41
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.2911546230316162,
"learning_rate": 9.960415535262671e-05,
"loss": 0.7529,
"step": 42
},
{
"epoch": 0.6825396825396826,
"grad_norm": 0.27725136280059814,
"learning_rate": 9.958414354126022e-05,
"loss": 0.6338,
"step": 43
},
{
"epoch": 0.6984126984126984,
"grad_norm": 0.29778677225112915,
"learning_rate": 9.956364039102642e-05,
"loss": 0.6084,
"step": 44
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.3038597106933594,
"learning_rate": 9.954264610509497e-05,
"loss": 0.7813,
"step": 45
},
{
"epoch": 0.7301587301587301,
"grad_norm": 0.24961970746517181,
"learning_rate": 9.952116089150232e-05,
"loss": 0.5784,
"step": 46
},
{
"epoch": 0.746031746031746,
"grad_norm": 0.41124090552330017,
"learning_rate": 9.94991849631496e-05,
"loss": 0.8362,
"step": 47
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.2612743079662323,
"learning_rate": 9.947671853780054e-05,
"loss": 0.5879,
"step": 48
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.3509594798088074,
"learning_rate": 9.94537618380793e-05,
"loss": 0.6429,
"step": 49
},
{
"epoch": 0.7936507936507936,
"grad_norm": 0.4222470223903656,
"learning_rate": 9.943031509146825e-05,
"loss": 0.8086,
"step": 50
},
{
"epoch": 0.8095238095238095,
"grad_norm": 0.34031662344932556,
"learning_rate": 9.940637853030572e-05,
"loss": 0.7058,
"step": 51
},
{
"epoch": 0.8253968253968254,
"grad_norm": 0.25386595726013184,
"learning_rate": 9.938195239178374e-05,
"loss": 0.5537,
"step": 52
},
{
"epoch": 0.8412698412698413,
"grad_norm": 0.27435001730918884,
"learning_rate": 9.935703691794565e-05,
"loss": 0.5793,
"step": 53
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.360727995634079,
"learning_rate": 9.933163235568367e-05,
"loss": 0.6103,
"step": 54
},
{
"epoch": 0.873015873015873,
"grad_norm": 0.29674389958381653,
"learning_rate": 9.930573895673657e-05,
"loss": 0.7375,
"step": 55
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.3319956958293915,
"learning_rate": 9.927935697768698e-05,
"loss": 0.5953,
"step": 56
},
{
"epoch": 0.9047619047619048,
"grad_norm": 0.3237013518810272,
"learning_rate": 9.925248667995907e-05,
"loss": 0.6891,
"step": 57
},
{
"epoch": 0.9206349206349206,
"grad_norm": 0.2946189343929291,
"learning_rate": 9.922512832981584e-05,
"loss": 0.5815,
"step": 58
},
{
"epoch": 0.9365079365079365,
"grad_norm": 0.31961193680763245,
"learning_rate": 9.919728219835643e-05,
"loss": 0.6767,
"step": 59
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.30548524856567383,
"learning_rate": 9.916894856151357e-05,
"loss": 0.6222,
"step": 60
},
{
"epoch": 0.9682539682539683,
"grad_norm": 0.2908201515674591,
"learning_rate": 9.914012770005072e-05,
"loss": 0.6102,
"step": 61
},
{
"epoch": 0.9841269841269841,
"grad_norm": 0.3024301826953888,
"learning_rate": 9.91108198995594e-05,
"loss": 0.6281,
"step": 62
},
{
"epoch": 1.0,
"grad_norm": 0.37488242983818054,
"learning_rate": 9.908102545045625e-05,
"loss": 0.5405,
"step": 63
},
{
"epoch": 1.0158730158730158,
"grad_norm": 0.3462425172328949,
"learning_rate": 9.905074464798024e-05,
"loss": 0.5831,
"step": 64
},
{
"epoch": 1.0317460317460316,
"grad_norm": 0.32379499077796936,
"learning_rate": 9.901997779218967e-05,
"loss": 0.6897,
"step": 65
},
{
"epoch": 1.0476190476190477,
"grad_norm": 0.3253431022167206,
"learning_rate": 9.898872518795932e-05,
"loss": 0.5935,
"step": 66
},
{
"epoch": 1.0634920634920635,
"grad_norm": 0.31801578402519226,
"learning_rate": 9.895698714497724e-05,
"loss": 0.5721,
"step": 67
},
{
"epoch": 1.0793650793650793,
"grad_norm": 0.29547229409217834,
"learning_rate": 9.892476397774186e-05,
"loss": 0.5041,
"step": 68
},
{
"epoch": 1.0952380952380953,
"grad_norm": 0.30208516120910645,
"learning_rate": 9.889205600555877e-05,
"loss": 0.5027,
"step": 69
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.37307029962539673,
"learning_rate": 9.885886355253758e-05,
"loss": 0.6963,
"step": 70
},
{
"epoch": 1.126984126984127,
"grad_norm": 0.31057053804397583,
"learning_rate": 9.882518694758875e-05,
"loss": 0.4872,
"step": 71
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.35556697845458984,
"learning_rate": 9.879102652442024e-05,
"loss": 0.6017,
"step": 72
},
{
"epoch": 1.1587301587301586,
"grad_norm": 0.37607231736183167,
"learning_rate": 9.875638262153431e-05,
"loss": 0.6837,
"step": 73
},
{
"epoch": 1.1746031746031746,
"grad_norm": 0.34590160846710205,
"learning_rate": 9.872125558222409e-05,
"loss": 0.5724,
"step": 74
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.3449731469154358,
"learning_rate": 9.868564575457023e-05,
"loss": 0.6157,
"step": 75
},
{
"epoch": 1.2063492063492063,
"grad_norm": 0.4771505892276764,
"learning_rate": 9.864955349143734e-05,
"loss": 0.5829,
"step": 76
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.374600887298584,
"learning_rate": 9.861297915047069e-05,
"loss": 0.6213,
"step": 77
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.40953242778778076,
"learning_rate": 9.857592309409247e-05,
"loss": 0.5805,
"step": 78
},
{
"epoch": 1.253968253968254,
"grad_norm": 0.3891858756542206,
"learning_rate": 9.853838568949831e-05,
"loss": 0.5201,
"step": 79
},
{
"epoch": 1.2698412698412698,
"grad_norm": 0.4599400758743286,
"learning_rate": 9.850036730865364e-05,
"loss": 0.6509,
"step": 80
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.47590476274490356,
"learning_rate": 9.846186832828989e-05,
"loss": 0.5522,
"step": 81
},
{
"epoch": 1.3015873015873016,
"grad_norm": 0.42077696323394775,
"learning_rate": 9.842288912990096e-05,
"loss": 0.6272,
"step": 82
},
{
"epoch": 1.3174603174603174,
"grad_norm": 0.4116186201572418,
"learning_rate": 9.838343009973925e-05,
"loss": 0.5974,
"step": 83
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.4247848689556122,
"learning_rate": 9.83434916288119e-05,
"loss": 0.4948,
"step": 84
},
{
"epoch": 1.3492063492063493,
"grad_norm": 0.3873782455921173,
"learning_rate": 9.830307411287695e-05,
"loss": 0.496,
"step": 85
},
{
"epoch": 1.3650793650793651,
"grad_norm": 0.4587806463241577,
"learning_rate": 9.82621779524394e-05,
"loss": 0.6617,
"step": 86
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.4379841089248657,
"learning_rate": 9.822080355274719e-05,
"loss": 0.5294,
"step": 87
},
{
"epoch": 1.3968253968253967,
"grad_norm": 0.464910626411438,
"learning_rate": 9.817895132378725e-05,
"loss": 0.6855,
"step": 88
},
{
"epoch": 1.4126984126984126,
"grad_norm": 0.4157741963863373,
"learning_rate": 9.813662168028144e-05,
"loss": 0.5563,
"step": 89
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.4436641037464142,
"learning_rate": 9.809381504168234e-05,
"loss": 0.5291,
"step": 90
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.4180889427661896,
"learning_rate": 9.805053183216923e-05,
"loss": 0.5158,
"step": 91
},
{
"epoch": 1.4603174603174602,
"grad_norm": 0.4933961033821106,
"learning_rate": 9.800677248064382e-05,
"loss": 0.6885,
"step": 92
},
{
"epoch": 1.4761904761904763,
"grad_norm": 0.4813699722290039,
"learning_rate": 9.796253742072596e-05,
"loss": 0.6305,
"step": 93
},
{
"epoch": 1.492063492063492,
"grad_norm": 0.4272967278957367,
"learning_rate": 9.791782709074944e-05,
"loss": 0.5119,
"step": 94
},
{
"epoch": 1.507936507936508,
"grad_norm": 0.4510858356952667,
"learning_rate": 9.787264193375753e-05,
"loss": 0.5693,
"step": 95
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.5349589586257935,
"learning_rate": 9.782698239749873e-05,
"loss": 0.6708,
"step": 96
},
{
"epoch": 1.5396825396825395,
"grad_norm": 0.5285341739654541,
"learning_rate": 9.778084893442218e-05,
"loss": 0.6712,
"step": 97
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.4625120460987091,
"learning_rate": 9.77342420016733e-05,
"loss": 0.5257,
"step": 98
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.4635828733444214,
"learning_rate": 9.768716206108921e-05,
"loss": 0.482,
"step": 99
},
{
"epoch": 1.5873015873015874,
"grad_norm": 0.47050222754478455,
"learning_rate": 9.763960957919413e-05,
"loss": 0.4347,
"step": 100
},
{
"epoch": 1.6031746031746033,
"grad_norm": 0.42742452025413513,
"learning_rate": 9.759158502719481e-05,
"loss": 0.4208,
"step": 101
},
{
"epoch": 1.619047619047619,
"grad_norm": 0.48628243803977966,
"learning_rate": 9.754308888097583e-05,
"loss": 0.5814,
"step": 102
},
{
"epoch": 1.6349206349206349,
"grad_norm": 0.4874871075153351,
"learning_rate": 9.749412162109485e-05,
"loss": 0.5278,
"step": 103
},
{
"epoch": 1.6507936507936507,
"grad_norm": 0.5010098814964294,
"learning_rate": 9.744468373277797e-05,
"loss": 0.5341,
"step": 104
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.4798610508441925,
"learning_rate": 9.739477570591473e-05,
"loss": 0.5088,
"step": 105
},
{
"epoch": 1.6825396825396826,
"grad_norm": 0.5140134692192078,
"learning_rate": 9.734439803505345e-05,
"loss": 0.5922,
"step": 106
},
{
"epoch": 1.6984126984126984,
"grad_norm": 0.49391329288482666,
"learning_rate": 9.729355121939621e-05,
"loss": 0.5445,
"step": 107
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.5012408494949341,
"learning_rate": 9.724223576279395e-05,
"loss": 0.5175,
"step": 108
},
{
"epoch": 1.7301587301587302,
"grad_norm": 0.5038516521453857,
"learning_rate": 9.719045217374143e-05,
"loss": 0.4399,
"step": 109
},
{
"epoch": 1.746031746031746,
"grad_norm": 0.49503833055496216,
"learning_rate": 9.713820096537225e-05,
"loss": 0.483,
"step": 110
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.5000967979431152,
"learning_rate": 9.708548265545375e-05,
"loss": 0.6131,
"step": 111
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.504001796245575,
"learning_rate": 9.703229776638185e-05,
"loss": 0.5121,
"step": 112
},
{
"epoch": 1.7936507936507935,
"grad_norm": 0.5135077238082886,
"learning_rate": 9.697864682517592e-05,
"loss": 0.4606,
"step": 113
},
{
"epoch": 1.8095238095238095,
"grad_norm": 0.5064616799354553,
"learning_rate": 9.692453036347351e-05,
"loss": 0.4862,
"step": 114
},
{
"epoch": 1.8253968253968254,
"grad_norm": 0.5660854578018188,
"learning_rate": 9.686994891752508e-05,
"loss": 0.5925,
"step": 115
},
{
"epoch": 1.8412698412698414,
"grad_norm": 0.5516797304153442,
"learning_rate": 9.681490302818874e-05,
"loss": 0.5986,
"step": 116
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.5815762281417847,
"learning_rate": 9.675939324092486e-05,
"loss": 0.6187,
"step": 117
},
{
"epoch": 1.873015873015873,
"grad_norm": 0.5087511539459229,
"learning_rate": 9.670342010579065e-05,
"loss": 0.499,
"step": 118
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.45885273814201355,
"learning_rate": 9.664698417743475e-05,
"loss": 0.4405,
"step": 119
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.537526547908783,
"learning_rate": 9.659008601509168e-05,
"loss": 0.5208,
"step": 120
},
{
"epoch": 1.9206349206349205,
"grad_norm": 0.4978830814361572,
"learning_rate": 9.653272618257631e-05,
"loss": 0.5475,
"step": 121
},
{
"epoch": 1.9365079365079365,
"grad_norm": 0.5565654635429382,
"learning_rate": 9.647490524827834e-05,
"loss": 0.5459,
"step": 122
},
{
"epoch": 1.9523809523809523,
"grad_norm": 0.5845757126808167,
"learning_rate": 9.641662378515659e-05,
"loss": 0.6169,
"step": 123
},
{
"epoch": 1.9682539682539684,
"grad_norm": 0.5273924469947815,
"learning_rate": 9.635788237073334e-05,
"loss": 0.519,
"step": 124
},
{
"epoch": 1.9841269841269842,
"grad_norm": 0.5515849590301514,
"learning_rate": 9.629868158708861e-05,
"loss": 0.52,
"step": 125
},
{
"epoch": 2.0,
"grad_norm": 0.7463253736495972,
"learning_rate": 9.623902202085444e-05,
"loss": 0.5024,
"step": 126
},
{
"epoch": 2.015873015873016,
"grad_norm": 0.5206636190414429,
"learning_rate": 9.617890426320899e-05,
"loss": 0.4819,
"step": 127
},
{
"epoch": 2.0317460317460316,
"grad_norm": 0.4978935122489929,
"learning_rate": 9.611832890987076e-05,
"loss": 0.4031,
"step": 128
},
{
"epoch": 2.0476190476190474,
"grad_norm": 0.5565934181213379,
"learning_rate": 9.605729656109265e-05,
"loss": 0.5879,
"step": 129
},
{
"epoch": 2.0634920634920633,
"grad_norm": 0.5003566741943359,
"learning_rate": 9.599580782165598e-05,
"loss": 0.3628,
"step": 130
},
{
"epoch": 2.0793650793650795,
"grad_norm": 0.4868488609790802,
"learning_rate": 9.593386330086458e-05,
"loss": 0.3807,
"step": 131
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.5097118616104126,
"learning_rate": 9.587146361253868e-05,
"loss": 0.4166,
"step": 132
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.5274227857589722,
"learning_rate": 9.580860937500884e-05,
"loss": 0.385,
"step": 133
},
{
"epoch": 2.126984126984127,
"grad_norm": 0.5781636238098145,
"learning_rate": 9.57453012111099e-05,
"loss": 0.3981,
"step": 134
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.6308386921882629,
"learning_rate": 9.568153974817464e-05,
"loss": 0.4357,
"step": 135
},
{
"epoch": 2.1587301587301586,
"grad_norm": 0.6387614011764526,
"learning_rate": 9.561732561802778e-05,
"loss": 0.4168,
"step": 136
},
{
"epoch": 2.1746031746031744,
"grad_norm": 0.6377487182617188,
"learning_rate": 9.555265945697953e-05,
"loss": 0.3831,
"step": 137
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.7271438241004944,
"learning_rate": 9.548754190581939e-05,
"loss": 0.3844,
"step": 138
},
{
"epoch": 2.2063492063492065,
"grad_norm": 0.8928720951080322,
"learning_rate": 9.542197360980978e-05,
"loss": 0.5863,
"step": 139
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.8302777409553528,
"learning_rate": 9.53559552186796e-05,
"loss": 0.4477,
"step": 140
},
{
"epoch": 2.238095238095238,
"grad_norm": 0.7997470498085022,
"learning_rate": 9.528948738661784e-05,
"loss": 0.3644,
"step": 141
},
{
"epoch": 2.253968253968254,
"grad_norm": 0.8765047192573547,
"learning_rate": 9.522257077226717e-05,
"loss": 0.3806,
"step": 142
},
{
"epoch": 2.2698412698412698,
"grad_norm": 0.7953476309776306,
"learning_rate": 9.51552060387172e-05,
"loss": 0.3829,
"step": 143
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.8067965507507324,
"learning_rate": 9.508739385349812e-05,
"loss": 0.4414,
"step": 144
},
{
"epoch": 2.3015873015873014,
"grad_norm": 0.7154417037963867,
"learning_rate": 9.501913488857399e-05,
"loss": 0.3377,
"step": 145
},
{
"epoch": 2.317460317460317,
"grad_norm": 0.8233152627944946,
"learning_rate": 9.49504298203361e-05,
"loss": 0.4463,
"step": 146
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.8649589419364929,
"learning_rate": 9.488127932959625e-05,
"loss": 0.3966,
"step": 147
},
{
"epoch": 2.3492063492063493,
"grad_norm": 0.834513247013092,
"learning_rate": 9.481168410158003e-05,
"loss": 0.5009,
"step": 148
},
{
"epoch": 2.365079365079365,
"grad_norm": 0.7996335625648499,
"learning_rate": 9.474164482592002e-05,
"loss": 0.4546,
"step": 149
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.9039611220359802,
"learning_rate": 9.467116219664894e-05,
"loss": 0.4492,
"step": 150
},
{
"epoch": 2.3968253968253967,
"grad_norm": 0.8271594643592834,
"learning_rate": 9.460023691219277e-05,
"loss": 0.3569,
"step": 151
},
{
"epoch": 2.4126984126984126,
"grad_norm": 0.9009270071983337,
"learning_rate": 9.45288696753639e-05,
"loss": 0.4727,
"step": 152
},
{
"epoch": 2.4285714285714284,
"grad_norm": 0.7487375736236572,
"learning_rate": 9.445706119335407e-05,
"loss": 0.3298,
"step": 153
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.8869822025299072,
"learning_rate": 9.438481217772744e-05,
"loss": 0.4476,
"step": 154
},
{
"epoch": 2.4603174603174605,
"grad_norm": 0.8800178170204163,
"learning_rate": 9.431212334441343e-05,
"loss": 0.4377,
"step": 155
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.8610995411872864,
"learning_rate": 9.423899541369978e-05,
"loss": 0.409,
"step": 156
},
{
"epoch": 2.492063492063492,
"grad_norm": 0.8344472050666809,
"learning_rate": 9.41654291102253e-05,
"loss": 0.427,
"step": 157
},
{
"epoch": 2.507936507936508,
"grad_norm": 0.9956201314926147,
"learning_rate": 9.409142516297269e-05,
"loss": 0.5661,
"step": 158
},
{
"epoch": 2.5238095238095237,
"grad_norm": 0.8969646692276001,
"learning_rate": 9.401698430526142e-05,
"loss": 0.4215,
"step": 159
},
{
"epoch": 2.5396825396825395,
"grad_norm": 0.918438732624054,
"learning_rate": 9.394210727474028e-05,
"loss": 0.4774,
"step": 160
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.8604788780212402,
"learning_rate": 9.386679481338033e-05,
"loss": 0.3978,
"step": 161
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.7847458124160767,
"learning_rate": 9.379104766746722e-05,
"loss": 0.3602,
"step": 162
},
{
"epoch": 2.5873015873015874,
"grad_norm": 0.8306839466094971,
"learning_rate": 9.371486658759416e-05,
"loss": 0.466,
"step": 163
},
{
"epoch": 2.6031746031746033,
"grad_norm": 0.8458616137504578,
"learning_rate": 9.363825232865413e-05,
"loss": 0.4077,
"step": 164
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.933336615562439,
"learning_rate": 9.356120564983266e-05,
"loss": 0.4652,
"step": 165
},
{
"epoch": 2.634920634920635,
"grad_norm": 0.9182778596878052,
"learning_rate": 9.348372731460023e-05,
"loss": 0.3775,
"step": 166
},
{
"epoch": 2.6507936507936507,
"grad_norm": 0.9331458806991577,
"learning_rate": 9.340581809070459e-05,
"loss": 0.4362,
"step": 167
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.8755380511283875,
"learning_rate": 9.332747875016332e-05,
"loss": 0.363,
"step": 168
},
{
"epoch": 2.682539682539683,
"grad_norm": 0.8975720405578613,
"learning_rate": 9.324871006925613e-05,
"loss": 0.4007,
"step": 169
},
{
"epoch": 2.6984126984126986,
"grad_norm": 1.1305972337722778,
"learning_rate": 9.316951282851707e-05,
"loss": 0.5013,
"step": 170
},
{
"epoch": 2.7142857142857144,
"grad_norm": 0.8970773220062256,
"learning_rate": 9.308988781272694e-05,
"loss": 0.4052,
"step": 171
},
{
"epoch": 2.7301587301587302,
"grad_norm": 1.0294140577316284,
"learning_rate": 9.300983581090541e-05,
"loss": 0.4707,
"step": 172
},
{
"epoch": 2.746031746031746,
"grad_norm": 0.9334731698036194,
"learning_rate": 9.292935761630326e-05,
"loss": 0.3639,
"step": 173
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.9174486398696899,
"learning_rate": 9.284845402639446e-05,
"loss": 0.3959,
"step": 174
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.9317827224731445,
"learning_rate": 9.276712584286833e-05,
"loss": 0.3916,
"step": 175
},
{
"epoch": 2.7936507936507935,
"grad_norm": 0.9498136639595032,
"learning_rate": 9.26853738716216e-05,
"loss": 0.4551,
"step": 176
},
{
"epoch": 2.8095238095238093,
"grad_norm": 0.8333742022514343,
"learning_rate": 9.260319892275034e-05,
"loss": 0.3518,
"step": 177
},
{
"epoch": 2.825396825396825,
"grad_norm": 0.8575045466423035,
"learning_rate": 9.2520601810542e-05,
"loss": 0.3623,
"step": 178
},
{
"epoch": 2.8412698412698414,
"grad_norm": 1.100193977355957,
"learning_rate": 9.243758335346735e-05,
"loss": 0.5737,
"step": 179
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.9462725520133972,
"learning_rate": 9.235414437417234e-05,
"loss": 0.4491,
"step": 180
},
{
"epoch": 2.873015873015873,
"grad_norm": 0.8208152651786804,
"learning_rate": 9.227028569946996e-05,
"loss": 0.3799,
"step": 181
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.8733758330345154,
"learning_rate": 9.2186008160332e-05,
"loss": 0.4313,
"step": 182
},
{
"epoch": 2.9047619047619047,
"grad_norm": 0.8397769927978516,
"learning_rate": 9.210131259188095e-05,
"loss": 0.3718,
"step": 183
},
{
"epoch": 2.9206349206349205,
"grad_norm": 1.0263302326202393,
"learning_rate": 9.201619983338153e-05,
"loss": 0.5163,
"step": 184
},
{
"epoch": 2.9365079365079367,
"grad_norm": 0.7651734948158264,
"learning_rate": 9.193067072823251e-05,
"loss": 0.3483,
"step": 185
},
{
"epoch": 2.9523809523809526,
"grad_norm": 0.92905592918396,
"learning_rate": 9.18447261239584e-05,
"loss": 0.5041,
"step": 186
},
{
"epoch": 2.9682539682539684,
"grad_norm": 0.8523809909820557,
"learning_rate": 9.175836687220084e-05,
"loss": 0.381,
"step": 187
},
{
"epoch": 2.984126984126984,
"grad_norm": 0.8607370257377625,
"learning_rate": 9.167159382871039e-05,
"loss": 0.3953,
"step": 188
},
{
"epoch": 3.0,
"grad_norm": 1.321708083152771,
"learning_rate": 9.15844078533379e-05,
"loss": 0.4583,
"step": 189
},
{
"epoch": 3.015873015873016,
"grad_norm": 0.7019425630569458,
"learning_rate": 9.149680981002609e-05,
"loss": 0.2773,
"step": 190
},
{
"epoch": 3.0317460317460316,
"grad_norm": 0.6896389126777649,
"learning_rate": 9.140880056680088e-05,
"loss": 0.2746,
"step": 191
},
{
"epoch": 3.0476190476190474,
"grad_norm": 0.779511570930481,
"learning_rate": 9.13203809957629e-05,
"loss": 0.3052,
"step": 192
},
{
"epoch": 3.0634920634920633,
"grad_norm": 0.8268155455589294,
"learning_rate": 9.123155197307876e-05,
"loss": 0.3045,
"step": 193
},
{
"epoch": 3.0793650793650795,
"grad_norm": 0.7496017813682556,
"learning_rate": 9.114231437897244e-05,
"loss": 0.2231,
"step": 194
},
{
"epoch": 3.0952380952380953,
"grad_norm": 0.8415669798851013,
"learning_rate": 9.105266909771653e-05,
"loss": 0.2298,
"step": 195
},
{
"epoch": 3.111111111111111,
"grad_norm": 1.05263090133667,
"learning_rate": 9.096261701762342e-05,
"loss": 0.2488,
"step": 196
},
{
"epoch": 3.126984126984127,
"grad_norm": 1.238415241241455,
"learning_rate": 9.087215903103662e-05,
"loss": 0.2806,
"step": 197
},
{
"epoch": 3.142857142857143,
"grad_norm": 1.1588196754455566,
"learning_rate": 9.078129603432181e-05,
"loss": 0.245,
"step": 198
},
{
"epoch": 3.1587301587301586,
"grad_norm": 1.584652304649353,
"learning_rate": 9.069002892785797e-05,
"loss": 0.295,
"step": 199
},
{
"epoch": 3.1746031746031744,
"grad_norm": 1.3894325494766235,
"learning_rate": 9.059835861602853e-05,
"loss": 0.2349,
"step": 200
},
{
"epoch": 3.1904761904761907,
"grad_norm": 1.66408109664917,
"learning_rate": 9.050628600721234e-05,
"loss": 0.2627,
"step": 201
},
{
"epoch": 3.2063492063492065,
"grad_norm": 1.2087987661361694,
"learning_rate": 9.041381201377468e-05,
"loss": 0.2159,
"step": 202
},
{
"epoch": 3.2222222222222223,
"grad_norm": 1.369932770729065,
"learning_rate": 9.032093755205822e-05,
"loss": 0.2341,
"step": 203
},
{
"epoch": 3.238095238095238,
"grad_norm": 1.6366993188858032,
"learning_rate": 9.0227663542374e-05,
"loss": 0.2893,
"step": 204
},
{
"epoch": 3.253968253968254,
"grad_norm": 1.529963731765747,
"learning_rate": 9.013399090899217e-05,
"loss": 0.2395,
"step": 205
},
{
"epoch": 3.2698412698412698,
"grad_norm": 1.7285979986190796,
"learning_rate": 9.003992058013302e-05,
"loss": 0.3451,
"step": 206
},
{
"epoch": 3.2857142857142856,
"grad_norm": 1.3240851163864136,
"learning_rate": 8.99454534879576e-05,
"loss": 0.2469,
"step": 207
},
{
"epoch": 3.3015873015873014,
"grad_norm": 1.3964006900787354,
"learning_rate": 8.985059056855858e-05,
"loss": 0.2456,
"step": 208
},
{
"epoch": 3.317460317460317,
"grad_norm": 1.405621886253357,
"learning_rate": 8.975533276195102e-05,
"loss": 0.2347,
"step": 209
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.3338896036148071,
"learning_rate": 8.965968101206291e-05,
"loss": 0.2988,
"step": 210
},
{
"epoch": 3.3492063492063493,
"grad_norm": 1.329379677772522,
"learning_rate": 8.956363626672595e-05,
"loss": 0.2651,
"step": 211
},
{
"epoch": 3.365079365079365,
"grad_norm": 1.3324720859527588,
"learning_rate": 8.94671994776661e-05,
"loss": 0.2527,
"step": 212
},
{
"epoch": 3.380952380952381,
"grad_norm": 1.2702524662017822,
"learning_rate": 8.937037160049416e-05,
"loss": 0.2763,
"step": 213
},
{
"epoch": 3.3968253968253967,
"grad_norm": 1.270229458808899,
"learning_rate": 8.927315359469626e-05,
"loss": 0.236,
"step": 214
},
{
"epoch": 3.4126984126984126,
"grad_norm": 1.3164818286895752,
"learning_rate": 8.917554642362443e-05,
"loss": 0.2476,
"step": 215
},
{
"epoch": 3.4285714285714284,
"grad_norm": 1.2434004545211792,
"learning_rate": 8.907755105448704e-05,
"loss": 0.2387,
"step": 216
},
{
"epoch": 3.4444444444444446,
"grad_norm": 1.0932611227035522,
"learning_rate": 8.89791684583391e-05,
"loss": 0.2195,
"step": 217
},
{
"epoch": 3.4603174603174605,
"grad_norm": 1.334930181503296,
"learning_rate": 8.888039961007282e-05,
"loss": 0.2725,
"step": 218
},
{
"epoch": 3.4761904761904763,
"grad_norm": 1.1716219186782837,
"learning_rate": 8.87812454884078e-05,
"loss": 0.2515,
"step": 219
},
{
"epoch": 3.492063492063492,
"grad_norm": 1.1771153211593628,
"learning_rate": 8.868170707588142e-05,
"loss": 0.2286,
"step": 220
},
{
"epoch": 3.507936507936508,
"grad_norm": 1.2309902906417847,
"learning_rate": 8.858178535883905e-05,
"loss": 0.2365,
"step": 221
},
{
"epoch": 3.5238095238095237,
"grad_norm": 0.9976351261138916,
"learning_rate": 8.848148132742431e-05,
"loss": 0.22,
"step": 222
},
{
"epoch": 3.5396825396825395,
"grad_norm": 1.1791083812713623,
"learning_rate": 8.838079597556925e-05,
"loss": 0.2683,
"step": 223
},
{
"epoch": 3.5555555555555554,
"grad_norm": 1.1750749349594116,
"learning_rate": 8.827973030098448e-05,
"loss": 0.2396,
"step": 224
},
{
"epoch": 3.571428571428571,
"grad_norm": 1.054264783859253,
"learning_rate": 8.81782853051493e-05,
"loss": 0.2396,
"step": 225
},
{
"epoch": 3.5873015873015874,
"grad_norm": 1.1976933479309082,
"learning_rate": 8.807646199330187e-05,
"loss": 0.2393,
"step": 226
},
{
"epoch": 3.6031746031746033,
"grad_norm": 1.4662325382232666,
"learning_rate": 8.797426137442897e-05,
"loss": 0.3188,
"step": 227
},
{
"epoch": 3.619047619047619,
"grad_norm": 1.5771795511245728,
"learning_rate": 8.787168446125638e-05,
"loss": 0.3204,
"step": 228
},
{
"epoch": 3.634920634920635,
"grad_norm": 1.3994357585906982,
"learning_rate": 8.776873227023852e-05,
"loss": 0.3045,
"step": 229
},
{
"epoch": 3.6507936507936507,
"grad_norm": 1.9753646850585938,
"learning_rate": 8.766540582154859e-05,
"loss": 0.2306,
"step": 230
},
{
"epoch": 3.6666666666666665,
"grad_norm": 1.4474598169326782,
"learning_rate": 8.756170613906833e-05,
"loss": 0.2581,
"step": 231
},
{
"epoch": 3.682539682539683,
"grad_norm": 1.1273548603057861,
"learning_rate": 8.745763425037797e-05,
"loss": 0.2213,
"step": 232
},
{
"epoch": 3.6984126984126986,
"grad_norm": 1.0989768505096436,
"learning_rate": 8.735319118674596e-05,
"loss": 0.2063,
"step": 233
},
{
"epoch": 3.7142857142857144,
"grad_norm": 1.243393063545227,
"learning_rate": 8.724837798311882e-05,
"loss": 0.2539,
"step": 234
},
{
"epoch": 3.7301587301587302,
"grad_norm": 1.1233344078063965,
"learning_rate": 8.714319567811088e-05,
"loss": 0.2225,
"step": 235
},
{
"epoch": 3.746031746031746,
"grad_norm": 1.2728500366210938,
"learning_rate": 8.703764531399392e-05,
"loss": 0.246,
"step": 236
},
{
"epoch": 3.761904761904762,
"grad_norm": 1.2673249244689941,
"learning_rate": 8.69317279366869e-05,
"loss": 0.2881,
"step": 237
},
{
"epoch": 3.7777777777777777,
"grad_norm": 1.4421532154083252,
"learning_rate": 8.682544459574562e-05,
"loss": 0.3309,
"step": 238
},
{
"epoch": 3.7936507936507935,
"grad_norm": 1.217529296875,
"learning_rate": 8.671879634435224e-05,
"loss": 0.2815,
"step": 239
},
{
"epoch": 3.8095238095238093,
"grad_norm": 1.1456962823867798,
"learning_rate": 8.661178423930491e-05,
"loss": 0.2557,
"step": 240
},
{
"epoch": 3.825396825396825,
"grad_norm": 1.0717531442642212,
"learning_rate": 8.650440934100728e-05,
"loss": 0.2471,
"step": 241
},
{
"epoch": 3.8412698412698414,
"grad_norm": 1.217034935951233,
"learning_rate": 8.6396672713458e-05,
"loss": 0.2883,
"step": 242
},
{
"epoch": 3.857142857142857,
"grad_norm": 1.237244725227356,
"learning_rate": 8.628857542424009e-05,
"loss": 0.2953,
"step": 243
},
{
"epoch": 3.873015873015873,
"grad_norm": 1.2947179079055786,
"learning_rate": 8.618011854451056e-05,
"loss": 0.3134,
"step": 244
},
{
"epoch": 3.888888888888889,
"grad_norm": 1.2005493640899658,
"learning_rate": 8.607130314898956e-05,
"loss": 0.2655,
"step": 245
},
{
"epoch": 3.9047619047619047,
"grad_norm": 1.387406826019287,
"learning_rate": 8.596213031594991e-05,
"loss": 0.3133,
"step": 246
},
{
"epoch": 3.9206349206349205,
"grad_norm": 1.297012209892273,
"learning_rate": 8.585260112720631e-05,
"loss": 0.2747,
"step": 247
},
{
"epoch": 3.9365079365079367,
"grad_norm": 1.12217378616333,
"learning_rate": 8.57427166681047e-05,
"loss": 0.2444,
"step": 248
},
{
"epoch": 3.9523809523809526,
"grad_norm": 1.2482068538665771,
"learning_rate": 8.56324780275114e-05,
"loss": 0.2887,
"step": 249
},
{
"epoch": 3.9682539682539684,
"grad_norm": 1.2814184427261353,
"learning_rate": 8.552188629780244e-05,
"loss": 0.284,
"step": 250
},
{
"epoch": 3.984126984126984,
"grad_norm": 1.1486774682998657,
"learning_rate": 8.541094257485265e-05,
"loss": 0.2636,
"step": 251
},
{
"epoch": 4.0,
"grad_norm": 1.6360046863555908,
"learning_rate": 8.529964795802485e-05,
"loss": 0.2305,
"step": 252
},
{
"epoch": 4.015873015873016,
"grad_norm": 0.7815824151039124,
"learning_rate": 8.518800355015892e-05,
"loss": 0.1427,
"step": 253
},
{
"epoch": 4.031746031746032,
"grad_norm": 0.9590736031532288,
"learning_rate": 8.507601045756085e-05,
"loss": 0.1609,
"step": 254
},
{
"epoch": 4.0476190476190474,
"grad_norm": 0.9721108078956604,
"learning_rate": 8.49636697899919e-05,
"loss": 0.1429,
"step": 255
},
{
"epoch": 4.063492063492063,
"grad_norm": 1.0513888597488403,
"learning_rate": 8.485098266065744e-05,
"loss": 0.1344,
"step": 256
},
{
"epoch": 4.079365079365079,
"grad_norm": 1.1911511421203613,
"learning_rate": 8.473795018619604e-05,
"loss": 0.135,
"step": 257
},
{
"epoch": 4.095238095238095,
"grad_norm": 1.052157998085022,
"learning_rate": 8.462457348666835e-05,
"loss": 0.1146,
"step": 258
},
{
"epoch": 4.111111111111111,
"grad_norm": 1.4159713983535767,
"learning_rate": 8.4510853685546e-05,
"loss": 0.1359,
"step": 259
},
{
"epoch": 4.1269841269841265,
"grad_norm": 1.6234732866287231,
"learning_rate": 8.439679190970052e-05,
"loss": 0.1634,
"step": 260
},
{
"epoch": 4.142857142857143,
"grad_norm": 1.2149155139923096,
"learning_rate": 8.428238928939207e-05,
"loss": 0.1051,
"step": 261
},
{
"epoch": 4.158730158730159,
"grad_norm": 1.527443528175354,
"learning_rate": 8.416764695825834e-05,
"loss": 0.1519,
"step": 262
},
{
"epoch": 4.174603174603175,
"grad_norm": 1.3665393590927124,
"learning_rate": 8.405256605330331e-05,
"loss": 0.1366,
"step": 263
},
{
"epoch": 4.190476190476191,
"grad_norm": 1.2650479078292847,
"learning_rate": 8.39371477148859e-05,
"loss": 0.1314,
"step": 264
},
{
"epoch": 4.2063492063492065,
"grad_norm": 0.9967718124389648,
"learning_rate": 8.382139308670875e-05,
"loss": 0.1173,
"step": 265
},
{
"epoch": 4.222222222222222,
"grad_norm": 1.1094558238983154,
"learning_rate": 8.370530331580686e-05,
"loss": 0.1126,
"step": 266
},
{
"epoch": 4.238095238095238,
"grad_norm": 1.0152033567428589,
"learning_rate": 8.35888795525362e-05,
"loss": 0.089,
"step": 267
},
{
"epoch": 4.253968253968254,
"grad_norm": 1.2841627597808838,
"learning_rate": 8.347212295056239e-05,
"loss": 0.1292,
"step": 268
},
{
"epoch": 4.26984126984127,
"grad_norm": 1.416364073753357,
"learning_rate": 8.335503466684915e-05,
"loss": 0.1444,
"step": 269
},
{
"epoch": 4.285714285714286,
"grad_norm": 1.2542331218719482,
"learning_rate": 8.323761586164695e-05,
"loss": 0.1313,
"step": 270
},
{
"epoch": 4.301587301587301,
"grad_norm": 1.3430452346801758,
"learning_rate": 8.311986769848141e-05,
"loss": 0.1405,
"step": 271
},
{
"epoch": 4.317460317460317,
"grad_norm": 1.3169519901275635,
"learning_rate": 8.300179134414188e-05,
"loss": 0.1429,
"step": 272
},
{
"epoch": 4.333333333333333,
"grad_norm": 1.2539156675338745,
"learning_rate": 8.288338796866976e-05,
"loss": 0.1382,
"step": 273
},
{
"epoch": 4.349206349206349,
"grad_norm": 1.365218997001648,
"learning_rate": 8.276465874534702e-05,
"loss": 0.1236,
"step": 274
},
{
"epoch": 4.365079365079365,
"grad_norm": 1.4856258630752563,
"learning_rate": 8.264560485068446e-05,
"loss": 0.1516,
"step": 275
},
{
"epoch": 4.380952380952381,
"grad_norm": 1.139467477798462,
"learning_rate": 8.252622746441021e-05,
"loss": 0.1187,
"step": 276
},
{
"epoch": 4.396825396825397,
"grad_norm": 1.1698997020721436,
"learning_rate": 8.240652776945781e-05,
"loss": 0.133,
"step": 277
},
{
"epoch": 4.412698412698413,
"grad_norm": 1.284920334815979,
"learning_rate": 8.228650695195472e-05,
"loss": 0.1564,
"step": 278
},
{
"epoch": 4.428571428571429,
"grad_norm": 1.2975406646728516,
"learning_rate": 8.216616620121043e-05,
"loss": 0.1476,
"step": 279
},
{
"epoch": 4.444444444444445,
"grad_norm": 1.28453528881073,
"learning_rate": 8.204550670970469e-05,
"loss": 0.1444,
"step": 280
},
{
"epoch": 4.4603174603174605,
"grad_norm": 1.2703144550323486,
"learning_rate": 8.192452967307576e-05,
"loss": 0.1627,
"step": 281
},
{
"epoch": 4.476190476190476,
"grad_norm": 1.2940740585327148,
"learning_rate": 8.180323629010848e-05,
"loss": 0.1384,
"step": 282
},
{
"epoch": 4.492063492063492,
"grad_norm": 1.2578924894332886,
"learning_rate": 8.168162776272244e-05,
"loss": 0.1301,
"step": 283
},
{
"epoch": 4.507936507936508,
"grad_norm": 1.2214442491531372,
"learning_rate": 8.155970529596006e-05,
"loss": 0.139,
"step": 284
},
{
"epoch": 4.523809523809524,
"grad_norm": 1.436343789100647,
"learning_rate": 8.143747009797464e-05,
"loss": 0.1522,
"step": 285
},
{
"epoch": 4.5396825396825395,
"grad_norm": 1.179060459136963,
"learning_rate": 8.131492338001839e-05,
"loss": 0.1236,
"step": 286
},
{
"epoch": 4.555555555555555,
"grad_norm": 1.3683005571365356,
"learning_rate": 8.119206635643045e-05,
"loss": 0.1489,
"step": 287
},
{
"epoch": 4.571428571428571,
"grad_norm": 1.2832778692245483,
"learning_rate": 8.106890024462481e-05,
"loss": 0.1388,
"step": 288
},
{
"epoch": 4.587301587301587,
"grad_norm": 1.0831190347671509,
"learning_rate": 8.094542626507828e-05,
"loss": 0.1219,
"step": 289
},
{
"epoch": 4.603174603174603,
"grad_norm": 1.212108850479126,
"learning_rate": 8.082164564131845e-05,
"loss": 0.1331,
"step": 290
},
{
"epoch": 4.619047619047619,
"grad_norm": 1.157487154006958,
"learning_rate": 8.069755959991142e-05,
"loss": 0.1306,
"step": 291
},
{
"epoch": 4.634920634920634,
"grad_norm": 1.194389820098877,
"learning_rate": 8.057316937044977e-05,
"loss": 0.1361,
"step": 292
},
{
"epoch": 4.650793650793651,
"grad_norm": 1.2109564542770386,
"learning_rate": 8.044847618554034e-05,
"loss": 0.138,
"step": 293
},
{
"epoch": 4.666666666666667,
"grad_norm": 1.0707926750183105,
"learning_rate": 8.032348128079203e-05,
"loss": 0.1078,
"step": 294
},
{
"epoch": 4.682539682539683,
"grad_norm": 1.179071307182312,
"learning_rate": 8.019818589480352e-05,
"loss": 0.1397,
"step": 295
},
{
"epoch": 4.698412698412699,
"grad_norm": 1.3288228511810303,
"learning_rate": 8.0072591269151e-05,
"loss": 0.1613,
"step": 296
},
{
"epoch": 4.714285714285714,
"grad_norm": 1.24984872341156,
"learning_rate": 7.994669864837594e-05,
"loss": 0.1457,
"step": 297
},
{
"epoch": 4.73015873015873,
"grad_norm": 1.2009999752044678,
"learning_rate": 7.982050927997264e-05,
"loss": 0.1257,
"step": 298
},
{
"epoch": 4.746031746031746,
"grad_norm": 1.207233190536499,
"learning_rate": 7.969402441437594e-05,
"loss": 0.1567,
"step": 299
},
{
"epoch": 4.761904761904762,
"grad_norm": 1.1672086715698242,
"learning_rate": 7.956724530494887e-05,
"loss": 0.1274,
"step": 300
},
{
"epoch": 4.777777777777778,
"grad_norm": 1.506867527961731,
"learning_rate": 7.944017320797013e-05,
"loss": 0.139,
"step": 301
},
{
"epoch": 4.7936507936507935,
"grad_norm": 1.4278178215026855,
"learning_rate": 7.931280938262169e-05,
"loss": 0.1357,
"step": 302
},
{
"epoch": 4.809523809523809,
"grad_norm": 1.599716067314148,
"learning_rate": 7.918515509097634e-05,
"loss": 0.1704,
"step": 303
},
{
"epoch": 4.825396825396825,
"grad_norm": 1.3049015998840332,
"learning_rate": 7.905721159798513e-05,
"loss": 0.1379,
"step": 304
},
{
"epoch": 4.841269841269841,
"grad_norm": 1.3524868488311768,
"learning_rate": 7.89289801714649e-05,
"loss": 0.1545,
"step": 305
},
{
"epoch": 4.857142857142857,
"grad_norm": 1.2142527103424072,
"learning_rate": 7.880046208208563e-05,
"loss": 0.1453,
"step": 306
},
{
"epoch": 4.8730158730158735,
"grad_norm": 1.1084891557693481,
"learning_rate": 7.867165860335792e-05,
"loss": 0.1427,
"step": 307
},
{
"epoch": 4.888888888888889,
"grad_norm": 1.266802191734314,
"learning_rate": 7.854257101162037e-05,
"loss": 0.1396,
"step": 308
},
{
"epoch": 4.904761904761905,
"grad_norm": 1.1826775074005127,
"learning_rate": 7.841320058602688e-05,
"loss": 0.1514,
"step": 309
},
{
"epoch": 4.920634920634921,
"grad_norm": 1.4232659339904785,
"learning_rate": 7.828354860853399e-05,
"loss": 0.1472,
"step": 310
},
{
"epoch": 4.936507936507937,
"grad_norm": 1.1436830759048462,
"learning_rate": 7.815361636388827e-05,
"loss": 0.1249,
"step": 311
},
{
"epoch": 4.9523809523809526,
"grad_norm": 1.3001309633255005,
"learning_rate": 7.802340513961342e-05,
"loss": 0.1663,
"step": 312
},
{
"epoch": 4.968253968253968,
"grad_norm": 1.4213690757751465,
"learning_rate": 7.789291622599767e-05,
"loss": 0.1538,
"step": 313
},
{
"epoch": 4.984126984126984,
"grad_norm": 1.5043220520019531,
"learning_rate": 7.776215091608085e-05,
"loss": 0.151,
"step": 314
},
{
"epoch": 5.0,
"grad_norm": 1.6825261116027832,
"learning_rate": 7.763111050564178e-05,
"loss": 0.1485,
"step": 315
},
{
"epoch": 5.015873015873016,
"grad_norm": 0.8601322174072266,
"learning_rate": 7.749979629318516e-05,
"loss": 0.0703,
"step": 316
},
{
"epoch": 5.031746031746032,
"grad_norm": 0.7637147903442383,
"learning_rate": 7.736820957992895e-05,
"loss": 0.0633,
"step": 317
},
{
"epoch": 5.0476190476190474,
"grad_norm": 0.8896054625511169,
"learning_rate": 7.723635166979133e-05,
"loss": 0.0652,
"step": 318
},
{
"epoch": 5.063492063492063,
"grad_norm": 0.9216472506523132,
"learning_rate": 7.710422386937784e-05,
"loss": 0.0585,
"step": 319
},
{
"epoch": 5.079365079365079,
"grad_norm": 0.7166661024093628,
"learning_rate": 7.697182748796841e-05,
"loss": 0.0531,
"step": 320
},
{
"epoch": 5.095238095238095,
"grad_norm": 0.9962891936302185,
"learning_rate": 7.683916383750436e-05,
"loss": 0.072,
"step": 321
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.7969011068344116,
"learning_rate": 7.670623423257548e-05,
"loss": 0.0554,
"step": 322
},
{
"epoch": 5.1269841269841265,
"grad_norm": 0.8427059650421143,
"learning_rate": 7.657303999040693e-05,
"loss": 0.0534,
"step": 323
},
{
"epoch": 5.142857142857143,
"grad_norm": 0.9813700914382935,
"learning_rate": 7.64395824308462e-05,
"loss": 0.0696,
"step": 324
},
{
"epoch": 5.158730158730159,
"grad_norm": 0.8625731468200684,
"learning_rate": 7.630586287635008e-05,
"loss": 0.0562,
"step": 325
},
{
"epoch": 5.174603174603175,
"grad_norm": 0.9820646047592163,
"learning_rate": 7.617188265197148e-05,
"loss": 0.063,
"step": 326
},
{
"epoch": 5.190476190476191,
"grad_norm": 1.0742745399475098,
"learning_rate": 7.603764308534636e-05,
"loss": 0.0689,
"step": 327
},
{
"epoch": 5.2063492063492065,
"grad_norm": 0.9532903432846069,
"learning_rate": 7.590314550668054e-05,
"loss": 0.0667,
"step": 328
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.8958349227905273,
"learning_rate": 7.576839124873653e-05,
"loss": 0.0538,
"step": 329
},
{
"epoch": 5.238095238095238,
"grad_norm": 0.9804512858390808,
"learning_rate": 7.563338164682036e-05,
"loss": 0.0689,
"step": 330
},
{
"epoch": 5.253968253968254,
"grad_norm": 1.0487632751464844,
"learning_rate": 7.549811803876825e-05,
"loss": 0.0671,
"step": 331
},
{
"epoch": 5.26984126984127,
"grad_norm": 0.9195834994316101,
"learning_rate": 7.536260176493348e-05,
"loss": 0.0669,
"step": 332
},
{
"epoch": 5.285714285714286,
"grad_norm": 0.9964186549186707,
"learning_rate": 7.5226834168173e-05,
"loss": 0.0688,
"step": 333
},
{
"epoch": 5.301587301587301,
"grad_norm": 0.8904131054878235,
"learning_rate": 7.509081659383417e-05,
"loss": 0.0636,
"step": 334
},
{
"epoch": 5.317460317460317,
"grad_norm": 0.965900182723999,
"learning_rate": 7.495455038974146e-05,
"loss": 0.0769,
"step": 335
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.9203529357910156,
"learning_rate": 7.481803690618303e-05,
"loss": 0.0554,
"step": 336
},
{
"epoch": 5.349206349206349,
"grad_norm": 1.2281473875045776,
"learning_rate": 7.46812774958974e-05,
"loss": 0.0735,
"step": 337
},
{
"epoch": 5.365079365079365,
"grad_norm": 1.0948208570480347,
"learning_rate": 7.454427351405999e-05,
"loss": 0.0705,
"step": 338
},
{
"epoch": 5.380952380952381,
"grad_norm": 1.0401225090026855,
"learning_rate": 7.440702631826977e-05,
"loss": 0.07,
"step": 339
},
{
"epoch": 5.396825396825397,
"grad_norm": 0.9042516350746155,
"learning_rate": 7.426953726853574e-05,
"loss": 0.0628,
"step": 340
},
{
"epoch": 5.412698412698413,
"grad_norm": 0.9594908356666565,
"learning_rate": 7.413180772726348e-05,
"loss": 0.0606,
"step": 341
},
{
"epoch": 5.428571428571429,
"grad_norm": 1.0593825578689575,
"learning_rate": 7.399383905924165e-05,
"loss": 0.0652,
"step": 342
},
{
"epoch": 5.444444444444445,
"grad_norm": 1.0469237565994263,
"learning_rate": 7.385563263162847e-05,
"loss": 0.0636,
"step": 343
},
{
"epoch": 5.4603174603174605,
"grad_norm": 0.9159653782844543,
"learning_rate": 7.371718981393815e-05,
"loss": 0.0566,
"step": 344
},
{
"epoch": 5.476190476190476,
"grad_norm": 0.9596768021583557,
"learning_rate": 7.357851197802735e-05,
"loss": 0.0659,
"step": 345
},
{
"epoch": 5.492063492063492,
"grad_norm": 0.8929640054702759,
"learning_rate": 7.343960049808156e-05,
"loss": 0.0586,
"step": 346
},
{
"epoch": 5.507936507936508,
"grad_norm": 0.859683632850647,
"learning_rate": 7.330045675060149e-05,
"loss": 0.0522,
"step": 347
},
{
"epoch": 5.523809523809524,
"grad_norm": 1.026452898979187,
"learning_rate": 7.316108211438945e-05,
"loss": 0.0679,
"step": 348
},
{
"epoch": 5.5396825396825395,
"grad_norm": 0.9891062378883362,
"learning_rate": 7.302147797053569e-05,
"loss": 0.072,
"step": 349
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.9392737150192261,
"learning_rate": 7.288164570240463e-05,
"loss": 0.062,
"step": 350
},
{
"epoch": 5.571428571428571,
"grad_norm": 1.1346358060836792,
"learning_rate": 7.274158669562126e-05,
"loss": 0.0666,
"step": 351
},
{
"epoch": 5.587301587301587,
"grad_norm": 0.8670554757118225,
"learning_rate": 7.26013023380574e-05,
"loss": 0.0572,
"step": 352
},
{
"epoch": 5.603174603174603,
"grad_norm": 1.020330786705017,
"learning_rate": 7.246079401981784e-05,
"loss": 0.0617,
"step": 353
},
{
"epoch": 5.619047619047619,
"grad_norm": 1.0136491060256958,
"learning_rate": 7.232006313322667e-05,
"loss": 0.0853,
"step": 354
},
{
"epoch": 5.634920634920634,
"grad_norm": 1.010423183441162,
"learning_rate": 7.217911107281352e-05,
"loss": 0.0705,
"step": 355
},
{
"epoch": 5.650793650793651,
"grad_norm": 0.9768037796020508,
"learning_rate": 7.203793923529956e-05,
"loss": 0.0853,
"step": 356
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.9990655183792114,
"learning_rate": 7.189654901958385e-05,
"loss": 0.0715,
"step": 357
},
{
"epoch": 5.682539682539683,
"grad_norm": 1.0247498750686646,
"learning_rate": 7.175494182672939e-05,
"loss": 0.0712,
"step": 358
},
{
"epoch": 5.698412698412699,
"grad_norm": 1.0099873542785645,
"learning_rate": 7.161311905994922e-05,
"loss": 0.0712,
"step": 359
},
{
"epoch": 5.714285714285714,
"grad_norm": 1.0355095863342285,
"learning_rate": 7.147108212459257e-05,
"loss": 0.0722,
"step": 360
},
{
"epoch": 5.73015873015873,
"grad_norm": 1.1409605741500854,
"learning_rate": 7.13288324281309e-05,
"loss": 0.0688,
"step": 361
},
{
"epoch": 5.746031746031746,
"grad_norm": 1.1082065105438232,
"learning_rate": 7.118637138014396e-05,
"loss": 0.0781,
"step": 362
},
{
"epoch": 5.761904761904762,
"grad_norm": 1.1074239015579224,
"learning_rate": 7.104370039230583e-05,
"loss": 0.0705,
"step": 363
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.9265062212944031,
"learning_rate": 7.090082087837091e-05,
"loss": 0.0593,
"step": 364
},
{
"epoch": 5.7936507936507935,
"grad_norm": 0.911005437374115,
"learning_rate": 7.075773425415994e-05,
"loss": 0.0678,
"step": 365
},
{
"epoch": 5.809523809523809,
"grad_norm": 1.0349949598312378,
"learning_rate": 7.061444193754596e-05,
"loss": 0.078,
"step": 366
},
{
"epoch": 5.825396825396825,
"grad_norm": 1.0515737533569336,
"learning_rate": 7.047094534844023e-05,
"loss": 0.0666,
"step": 367
},
{
"epoch": 5.841269841269841,
"grad_norm": 1.179187297821045,
"learning_rate": 7.032724590877821e-05,
"loss": 0.0774,
"step": 368
},
{
"epoch": 5.857142857142857,
"grad_norm": 1.1190379858016968,
"learning_rate": 7.018334504250545e-05,
"loss": 0.0891,
"step": 369
},
{
"epoch": 5.8730158730158735,
"grad_norm": 0.9958922863006592,
"learning_rate": 7.003924417556343e-05,
"loss": 0.0711,
"step": 370
},
{
"epoch": 5.888888888888889,
"grad_norm": 1.053802728652954,
"learning_rate": 6.989494473587554e-05,
"loss": 0.0759,
"step": 371
},
{
"epoch": 5.904761904761905,
"grad_norm": 0.9447202682495117,
"learning_rate": 6.975044815333282e-05,
"loss": 0.0713,
"step": 372
},
{
"epoch": 5.920634920634921,
"grad_norm": 0.9191451668739319,
"learning_rate": 6.960575585977984e-05,
"loss": 0.0655,
"step": 373
},
{
"epoch": 5.936507936507937,
"grad_norm": 1.1037213802337646,
"learning_rate": 6.946086928900054e-05,
"loss": 0.0831,
"step": 374
},
{
"epoch": 5.9523809523809526,
"grad_norm": 0.9468006491661072,
"learning_rate": 6.931578987670396e-05,
"loss": 0.059,
"step": 375
},
{
"epoch": 5.968253968253968,
"grad_norm": 1.1110552549362183,
"learning_rate": 6.917051906051006e-05,
"loss": 0.0709,
"step": 376
},
{
"epoch": 5.984126984126984,
"grad_norm": 1.1933718919754028,
"learning_rate": 6.902505827993541e-05,
"loss": 0.1004,
"step": 377
},
{
"epoch": 6.0,
"grad_norm": 1.4565590620040894,
"learning_rate": 6.887940897637908e-05,
"loss": 0.0915,
"step": 378
},
{
"epoch": 6.015873015873016,
"grad_norm": 0.6238571405410767,
"learning_rate": 6.873357259310815e-05,
"loss": 0.0431,
"step": 379
},
{
"epoch": 6.031746031746032,
"grad_norm": 0.4840649366378784,
"learning_rate": 6.858755057524354e-05,
"loss": 0.0358,
"step": 380
},
{
"epoch": 6.0476190476190474,
"grad_norm": 0.48597481846809387,
"learning_rate": 6.844134436974567e-05,
"loss": 0.0222,
"step": 381
},
{
"epoch": 6.063492063492063,
"grad_norm": 0.6410611867904663,
"learning_rate": 6.829495542540013e-05,
"loss": 0.0404,
"step": 382
},
{
"epoch": 6.079365079365079,
"grad_norm": 0.5220045447349548,
"learning_rate": 6.814838519280324e-05,
"loss": 0.0303,
"step": 383
},
{
"epoch": 6.095238095238095,
"grad_norm": 0.6196178793907166,
"learning_rate": 6.80016351243478e-05,
"loss": 0.0391,
"step": 384
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.64337158203125,
"learning_rate": 6.785470667420862e-05,
"loss": 0.0338,
"step": 385
},
{
"epoch": 6.1269841269841265,
"grad_norm": 0.8072399497032166,
"learning_rate": 6.77076012983281e-05,
"loss": 0.0413,
"step": 386
},
{
"epoch": 6.142857142857143,
"grad_norm": 0.6252787709236145,
"learning_rate": 6.75603204544019e-05,
"loss": 0.0332,
"step": 387
},
{
"epoch": 6.158730158730159,
"grad_norm": 0.7571528553962708,
"learning_rate": 6.741286560186437e-05,
"loss": 0.0375,
"step": 388
},
{
"epoch": 6.174603174603175,
"grad_norm": 0.5972614884376526,
"learning_rate": 6.726523820187413e-05,
"loss": 0.0333,
"step": 389
},
{
"epoch": 6.190476190476191,
"grad_norm": 0.6365858316421509,
"learning_rate": 6.711743971729967e-05,
"loss": 0.0264,
"step": 390
},
{
"epoch": 6.2063492063492065,
"grad_norm": 0.7397788763046265,
"learning_rate": 6.696947161270476e-05,
"loss": 0.0319,
"step": 391
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.6979987025260925,
"learning_rate": 6.682133535433393e-05,
"loss": 0.0415,
"step": 392
},
{
"epoch": 6.238095238095238,
"grad_norm": 0.6048802733421326,
"learning_rate": 6.667303241009803e-05,
"loss": 0.031,
"step": 393
},
{
"epoch": 6.253968253968254,
"grad_norm": 0.7918148040771484,
"learning_rate": 6.652456424955963e-05,
"loss": 0.0342,
"step": 394
},
{
"epoch": 6.26984126984127,
"grad_norm": 0.5297304391860962,
"learning_rate": 6.637593234391843e-05,
"loss": 0.0283,
"step": 395
},
{
"epoch": 6.285714285714286,
"grad_norm": 0.6882847547531128,
"learning_rate": 6.622713816599673e-05,
"loss": 0.0327,
"step": 396
},
{
"epoch": 6.301587301587301,
"grad_norm": 0.5969606637954712,
"learning_rate": 6.60781831902248e-05,
"loss": 0.0344,
"step": 397
},
{
"epoch": 6.317460317460317,
"grad_norm": 0.5623995065689087,
"learning_rate": 6.592906889262632e-05,
"loss": 0.0292,
"step": 398
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.7312327027320862,
"learning_rate": 6.577979675080369e-05,
"loss": 0.0358,
"step": 399
},
{
"epoch": 6.349206349206349,
"grad_norm": 0.5290599465370178,
"learning_rate": 6.563036824392344e-05,
"loss": 0.0265,
"step": 400
},
{
"epoch": 6.365079365079365,
"grad_norm": 0.604269802570343,
"learning_rate": 6.548078485270152e-05,
"loss": 0.0311,
"step": 401
},
{
"epoch": 6.380952380952381,
"grad_norm": 0.6508985161781311,
"learning_rate": 6.533104805938873e-05,
"loss": 0.0325,
"step": 402
},
{
"epoch": 6.396825396825397,
"grad_norm": 0.7835598587989807,
"learning_rate": 6.518115934775585e-05,
"loss": 0.0311,
"step": 403
},
{
"epoch": 6.412698412698413,
"grad_norm": 0.6879574656486511,
"learning_rate": 6.503112020307916e-05,
"loss": 0.039,
"step": 404
},
{
"epoch": 6.428571428571429,
"grad_norm": 0.8170531392097473,
"learning_rate": 6.488093211212555e-05,
"loss": 0.0476,
"step": 405
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.635261058807373,
"learning_rate": 6.473059656313782e-05,
"loss": 0.0315,
"step": 406
},
{
"epoch": 6.4603174603174605,
"grad_norm": 0.6152068972587585,
"learning_rate": 6.458011504582005e-05,
"loss": 0.0303,
"step": 407
},
{
"epoch": 6.476190476190476,
"grad_norm": 0.6500536799430847,
"learning_rate": 6.442948905132266e-05,
"loss": 0.0227,
"step": 408
},
{
"epoch": 6.492063492063492,
"grad_norm": 0.792615532875061,
"learning_rate": 6.427872007222777e-05,
"loss": 0.0254,
"step": 409
},
{
"epoch": 6.507936507936508,
"grad_norm": 0.7331106066703796,
"learning_rate": 6.412780960253436e-05,
"loss": 0.0307,
"step": 410
},
{
"epoch": 6.523809523809524,
"grad_norm": 0.7086438536643982,
"learning_rate": 6.397675913764347e-05,
"loss": 0.0275,
"step": 411
},
{
"epoch": 6.5396825396825395,
"grad_norm": 0.8358487486839294,
"learning_rate": 6.382557017434332e-05,
"loss": 0.0466,
"step": 412
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.6510606408119202,
"learning_rate": 6.367424421079463e-05,
"loss": 0.037,
"step": 413
},
{
"epoch": 6.571428571428571,
"grad_norm": 0.8983582854270935,
"learning_rate": 6.352278274651561e-05,
"loss": 0.0379,
"step": 414
},
{
"epoch": 6.587301587301587,
"grad_norm": 0.7613969445228577,
"learning_rate": 6.337118728236721e-05,
"loss": 0.0358,
"step": 415
},
{
"epoch": 6.603174603174603,
"grad_norm": 0.8371831774711609,
"learning_rate": 6.321945932053822e-05,
"loss": 0.046,
"step": 416
},
{
"epoch": 6.619047619047619,
"grad_norm": 0.7133164405822754,
"learning_rate": 6.306760036453035e-05,
"loss": 0.0276,
"step": 417
},
{
"epoch": 6.634920634920634,
"grad_norm": 0.6740472316741943,
"learning_rate": 6.291561191914333e-05,
"loss": 0.0383,
"step": 418
},
{
"epoch": 6.650793650793651,
"grad_norm": 0.6885079741477966,
"learning_rate": 6.276349549046007e-05,
"loss": 0.0368,
"step": 419
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.8201141953468323,
"learning_rate": 6.261125258583171e-05,
"loss": 0.0487,
"step": 420
},
{
"epoch": 6.682539682539683,
"grad_norm": 0.6679426431655884,
"learning_rate": 6.245888471386263e-05,
"loss": 0.0318,
"step": 421
},
{
"epoch": 6.698412698412699,
"grad_norm": 0.8221629858016968,
"learning_rate": 6.230639338439549e-05,
"loss": 0.0392,
"step": 422
},
{
"epoch": 6.714285714285714,
"grad_norm": 0.7618691921234131,
"learning_rate": 6.215378010849641e-05,
"loss": 0.0373,
"step": 423
},
{
"epoch": 6.73015873015873,
"grad_norm": 0.7761756181716919,
"learning_rate": 6.200104639843985e-05,
"loss": 0.0366,
"step": 424
},
{
"epoch": 6.746031746031746,
"grad_norm": 0.8383869528770447,
"learning_rate": 6.184819376769364e-05,
"loss": 0.0375,
"step": 425
},
{
"epoch": 6.761904761904762,
"grad_norm": 0.78884357213974,
"learning_rate": 6.169522373090412e-05,
"loss": 0.0487,
"step": 426
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.7803629040718079,
"learning_rate": 6.154213780388092e-05,
"loss": 0.0373,
"step": 427
},
{
"epoch": 6.7936507936507935,
"grad_norm": 0.5684940218925476,
"learning_rate": 6.138893750358212e-05,
"loss": 0.0297,
"step": 428
},
{
"epoch": 6.809523809523809,
"grad_norm": 0.7369560599327087,
"learning_rate": 6.123562434809912e-05,
"loss": 0.0372,
"step": 429
},
{
"epoch": 6.825396825396825,
"grad_norm": 0.47202688455581665,
"learning_rate": 6.108219985664161e-05,
"loss": 0.0243,
"step": 430
},
{
"epoch": 6.841269841269841,
"grad_norm": 0.6708411574363708,
"learning_rate": 6.0928665549522554e-05,
"loss": 0.0348,
"step": 431
},
{
"epoch": 6.857142857142857,
"grad_norm": 0.8175257444381714,
"learning_rate": 6.0775022948143115e-05,
"loss": 0.05,
"step": 432
},
{
"epoch": 6.8730158730158735,
"grad_norm": 0.7456179261207581,
"learning_rate": 6.06212735749775e-05,
"loss": 0.0356,
"step": 433
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.615135908126831,
"learning_rate": 6.046741895355802e-05,
"loss": 0.0292,
"step": 434
},
{
"epoch": 6.904761904761905,
"grad_norm": 0.6926703453063965,
"learning_rate": 6.031346060845986e-05,
"loss": 0.035,
"step": 435
},
{
"epoch": 6.920634920634921,
"grad_norm": 0.9521751403808594,
"learning_rate": 6.015940006528602e-05,
"loss": 0.0478,
"step": 436
},
{
"epoch": 6.936507936507937,
"grad_norm": 0.6635673642158508,
"learning_rate": 6.0005238850652234e-05,
"loss": 0.0405,
"step": 437
},
{
"epoch": 6.9523809523809526,
"grad_norm": 0.6299306154251099,
"learning_rate": 5.9850978492171794e-05,
"loss": 0.0328,
"step": 438
},
{
"epoch": 6.968253968253968,
"grad_norm": 0.7513844966888428,
"learning_rate": 5.96966205184404e-05,
"loss": 0.0335,
"step": 439
},
{
"epoch": 6.984126984126984,
"grad_norm": 0.9874755144119263,
"learning_rate": 5.954216645902109e-05,
"loss": 0.0416,
"step": 440
},
{
"epoch": 7.0,
"grad_norm": 0.8250815272331238,
"learning_rate": 5.9387617844429e-05,
"loss": 0.0368,
"step": 441
},
{
"epoch": 7.015873015873016,
"grad_norm": 0.4338611364364624,
"learning_rate": 5.923297620611623e-05,
"loss": 0.0189,
"step": 442
},
{
"epoch": 7.031746031746032,
"grad_norm": 0.5719791054725647,
"learning_rate": 5.907824307645669e-05,
"loss": 0.0169,
"step": 443
},
{
"epoch": 7.0476190476190474,
"grad_norm": 0.38255706429481506,
"learning_rate": 5.892341998873089e-05,
"loss": 0.0186,
"step": 444
},
{
"epoch": 7.063492063492063,
"grad_norm": 0.3592822253704071,
"learning_rate": 5.876850847711073e-05,
"loss": 0.0166,
"step": 445
},
{
"epoch": 7.079365079365079,
"grad_norm": 0.6182012557983398,
"learning_rate": 5.861351007664434e-05,
"loss": 0.0236,
"step": 446
},
{
"epoch": 7.095238095238095,
"grad_norm": 0.5176107883453369,
"learning_rate": 5.845842632324088e-05,
"loss": 0.0253,
"step": 447
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.4049137830734253,
"learning_rate": 5.83032587536552e-05,
"loss": 0.0221,
"step": 448
},
{
"epoch": 7.1269841269841265,
"grad_norm": 0.4034527540206909,
"learning_rate": 5.814800890547278e-05,
"loss": 0.0182,
"step": 449
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.4478590488433838,
"learning_rate": 5.799267831709442e-05,
"loss": 0.0208,
"step": 450
},
{
"epoch": 7.158730158730159,
"grad_norm": 0.4524051547050476,
"learning_rate": 5.78372685277209e-05,
"loss": 0.0147,
"step": 451
},
{
"epoch": 7.174603174603175,
"grad_norm": 0.4985044300556183,
"learning_rate": 5.7681781077337905e-05,
"loss": 0.0198,
"step": 452
},
{
"epoch": 7.190476190476191,
"grad_norm": 0.4616793692111969,
"learning_rate": 5.752621750670068e-05,
"loss": 0.0171,
"step": 453
},
{
"epoch": 7.2063492063492065,
"grad_norm": 0.4235040247440338,
"learning_rate": 5.737057935731868e-05,
"loss": 0.0159,
"step": 454
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.42039763927459717,
"learning_rate": 5.721486817144044e-05,
"loss": 0.0168,
"step": 455
},
{
"epoch": 7.238095238095238,
"grad_norm": 0.40982750058174133,
"learning_rate": 5.705908549203823e-05,
"loss": 0.0153,
"step": 456
},
{
"epoch": 7.253968253968254,
"grad_norm": 0.44600027799606323,
"learning_rate": 5.690323286279274e-05,
"loss": 0.0167,
"step": 457
},
{
"epoch": 7.26984126984127,
"grad_norm": 0.5298761129379272,
"learning_rate": 5.674731182807781e-05,
"loss": 0.0158,
"step": 458
},
{
"epoch": 7.285714285714286,
"grad_norm": 0.3657887279987335,
"learning_rate": 5.659132393294514e-05,
"loss": 0.0188,
"step": 459
},
{
"epoch": 7.301587301587301,
"grad_norm": 0.4426786005496979,
"learning_rate": 5.643527072310891e-05,
"loss": 0.0197,
"step": 460
},
{
"epoch": 7.317460317460317,
"grad_norm": 0.5749462842941284,
"learning_rate": 5.627915374493061e-05,
"loss": 0.0181,
"step": 461
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.5059666633605957,
"learning_rate": 5.612297454540352e-05,
"loss": 0.0206,
"step": 462
},
{
"epoch": 7.349206349206349,
"grad_norm": 0.5599040389060974,
"learning_rate": 5.596673467213756e-05,
"loss": 0.0148,
"step": 463
},
{
"epoch": 7.365079365079365,
"grad_norm": 0.5010665059089661,
"learning_rate": 5.581043567334383e-05,
"loss": 0.0186,
"step": 464
},
{
"epoch": 7.380952380952381,
"grad_norm": 0.49025240540504456,
"learning_rate": 5.5654079097819345e-05,
"loss": 0.0237,
"step": 465
},
{
"epoch": 7.396825396825397,
"grad_norm": 0.4369467794895172,
"learning_rate": 5.5497666494931654e-05,
"loss": 0.017,
"step": 466
},
{
"epoch": 7.412698412698413,
"grad_norm": 0.4754543602466583,
"learning_rate": 5.5341199414603493e-05,
"loss": 0.0202,
"step": 467
},
{
"epoch": 7.428571428571429,
"grad_norm": 0.4779890179634094,
"learning_rate": 5.518467940729739e-05,
"loss": 0.0221,
"step": 468
},
{
"epoch": 7.444444444444445,
"grad_norm": 0.5082346796989441,
"learning_rate": 5.502810802400039e-05,
"loss": 0.0191,
"step": 469
},
{
"epoch": 7.4603174603174605,
"grad_norm": 0.4045872688293457,
"learning_rate": 5.487148681620862e-05,
"loss": 0.0181,
"step": 470
},
{
"epoch": 7.476190476190476,
"grad_norm": 0.306020587682724,
"learning_rate": 5.4714817335911894e-05,
"loss": 0.011,
"step": 471
},
{
"epoch": 7.492063492063492,
"grad_norm": 0.4682234823703766,
"learning_rate": 5.455810113557839e-05,
"loss": 0.0126,
"step": 472
},
{
"epoch": 7.507936507936508,
"grad_norm": 0.46444806456565857,
"learning_rate": 5.440133976813926e-05,
"loss": 0.0205,
"step": 473
},
{
"epoch": 7.523809523809524,
"grad_norm": 1.0911283493041992,
"learning_rate": 5.4244534786973214e-05,
"loss": 0.0209,
"step": 474
},
{
"epoch": 7.5396825396825395,
"grad_norm": 0.4805389642715454,
"learning_rate": 5.40876877458911e-05,
"loss": 0.0186,
"step": 475
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.5102893114089966,
"learning_rate": 5.3930800199120616e-05,
"loss": 0.02,
"step": 476
},
{
"epoch": 7.571428571428571,
"grad_norm": 0.44652751088142395,
"learning_rate": 5.377387370129079e-05,
"loss": 0.0176,
"step": 477
},
{
"epoch": 7.587301587301587,
"grad_norm": 0.5319653153419495,
"learning_rate": 5.361690980741663e-05,
"loss": 0.0276,
"step": 478
},
{
"epoch": 7.603174603174603,
"grad_norm": 0.42663267254829407,
"learning_rate": 5.345991007288371e-05,
"loss": 0.0165,
"step": 479
},
{
"epoch": 7.619047619047619,
"grad_norm": 0.5141676068305969,
"learning_rate": 5.330287605343279e-05,
"loss": 0.0206,
"step": 480
},
{
"epoch": 7.634920634920634,
"grad_norm": 0.37202200293540955,
"learning_rate": 5.314580930514431e-05,
"loss": 0.014,
"step": 481
},
{
"epoch": 7.650793650793651,
"grad_norm": 0.5131287574768066,
"learning_rate": 5.298871138442307e-05,
"loss": 0.018,
"step": 482
},
{
"epoch": 7.666666666666667,
"grad_norm": 0.5241144895553589,
"learning_rate": 5.283158384798275e-05,
"loss": 0.0174,
"step": 483
},
{
"epoch": 7.682539682539683,
"grad_norm": 0.4443790316581726,
"learning_rate": 5.267442825283048e-05,
"loss": 0.0194,
"step": 484
},
{
"epoch": 7.698412698412699,
"grad_norm": 0.46092358231544495,
"learning_rate": 5.2517246156251455e-05,
"loss": 0.0138,
"step": 485
},
{
"epoch": 7.714285714285714,
"grad_norm": 0.5907039046287537,
"learning_rate": 5.236003911579345e-05,
"loss": 0.028,
"step": 486
},
{
"epoch": 7.73015873015873,
"grad_norm": 0.5472407341003418,
"learning_rate": 5.220280868925145e-05,
"loss": 0.0201,
"step": 487
},
{
"epoch": 7.746031746031746,
"grad_norm": 0.522294282913208,
"learning_rate": 5.204555643465215e-05,
"loss": 0.021,
"step": 488
},
{
"epoch": 7.761904761904762,
"grad_norm": 0.5975657105445862,
"learning_rate": 5.1888283910238555e-05,
"loss": 0.0198,
"step": 489
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.6385313868522644,
"learning_rate": 5.173099267445451e-05,
"loss": 0.0222,
"step": 490
},
{
"epoch": 7.7936507936507935,
"grad_norm": 0.5334087014198303,
"learning_rate": 5.157368428592933e-05,
"loss": 0.0183,
"step": 491
},
{
"epoch": 7.809523809523809,
"grad_norm": 0.6203488111495972,
"learning_rate": 5.1416360303462206e-05,
"loss": 0.0329,
"step": 492
},
{
"epoch": 7.825396825396825,
"grad_norm": 0.5505366325378418,
"learning_rate": 5.125902228600693e-05,
"loss": 0.0169,
"step": 493
},
{
"epoch": 7.841269841269841,
"grad_norm": 0.4648919999599457,
"learning_rate": 5.110167179265636e-05,
"loss": 0.0182,
"step": 494
},
{
"epoch": 7.857142857142857,
"grad_norm": 0.3623007833957672,
"learning_rate": 5.094431038262693e-05,
"loss": 0.0155,
"step": 495
},
{
"epoch": 7.8730158730158735,
"grad_norm": 0.4798755347728729,
"learning_rate": 5.078693961524329e-05,
"loss": 0.02,
"step": 496
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.5778583288192749,
"learning_rate": 5.062956104992285e-05,
"loss": 0.0318,
"step": 497
},
{
"epoch": 7.904761904761905,
"grad_norm": 0.37309491634368896,
"learning_rate": 5.0472176246160184e-05,
"loss": 0.0116,
"step": 498
},
{
"epoch": 7.920634920634921,
"grad_norm": 0.6432266235351562,
"learning_rate": 5.031478676351179e-05,
"loss": 0.0188,
"step": 499
},
{
"epoch": 7.936507936507937,
"grad_norm": 0.43156516551971436,
"learning_rate": 5.01573941615805e-05,
"loss": 0.0179,
"step": 500
},
{
"epoch": 7.9523809523809526,
"grad_norm": 0.553710401058197,
"learning_rate": 5e-05,
"loss": 0.0192,
"step": 501
},
{
"epoch": 7.968253968253968,
"grad_norm": 0.39197760820388794,
"learning_rate": 4.984260583841953e-05,
"loss": 0.0177,
"step": 502
},
{
"epoch": 7.984126984126984,
"grad_norm": 0.5970882773399353,
"learning_rate": 4.9685213236488216e-05,
"loss": 0.025,
"step": 503
},
{
"epoch": 8.0,
"grad_norm": 0.44673952460289,
"learning_rate": 4.9527823753839834e-05,
"loss": 0.0121,
"step": 504
},
{
"epoch": 8.015873015873016,
"grad_norm": 0.3288459777832031,
"learning_rate": 4.937043895007717e-05,
"loss": 0.0167,
"step": 505
},
{
"epoch": 8.031746031746032,
"grad_norm": 0.410833477973938,
"learning_rate": 4.9213060384756716e-05,
"loss": 0.0147,
"step": 506
},
{
"epoch": 8.047619047619047,
"grad_norm": 0.34271591901779175,
"learning_rate": 4.9055689617373084e-05,
"loss": 0.0108,
"step": 507
},
{
"epoch": 8.063492063492063,
"grad_norm": 0.22280845046043396,
"learning_rate": 4.8898328207343666e-05,
"loss": 0.0076,
"step": 508
},
{
"epoch": 8.079365079365079,
"grad_norm": 0.404482364654541,
"learning_rate": 4.874097771399308e-05,
"loss": 0.0124,
"step": 509
},
{
"epoch": 8.095238095238095,
"grad_norm": 0.3690173327922821,
"learning_rate": 4.858363969653781e-05,
"loss": 0.0167,
"step": 510
},
{
"epoch": 8.11111111111111,
"grad_norm": 0.31355366110801697,
"learning_rate": 4.8426315714070684e-05,
"loss": 0.0143,
"step": 511
},
{
"epoch": 8.126984126984127,
"grad_norm": 0.24391916394233704,
"learning_rate": 4.8269007325545506e-05,
"loss": 0.0111,
"step": 512
},
{
"epoch": 8.142857142857142,
"grad_norm": 0.39755526185035706,
"learning_rate": 4.8111716089761456e-05,
"loss": 0.0145,
"step": 513
},
{
"epoch": 8.158730158730158,
"grad_norm": 0.27595722675323486,
"learning_rate": 4.7954443565347865e-05,
"loss": 0.01,
"step": 514
},
{
"epoch": 8.174603174603174,
"grad_norm": 0.304116815328598,
"learning_rate": 4.779719131074857e-05,
"loss": 0.0105,
"step": 515
},
{
"epoch": 8.19047619047619,
"grad_norm": 0.2722436487674713,
"learning_rate": 4.7639960884206576e-05,
"loss": 0.0089,
"step": 516
},
{
"epoch": 8.206349206349206,
"grad_norm": 0.2728959321975708,
"learning_rate": 4.7482753843748564e-05,
"loss": 0.0108,
"step": 517
},
{
"epoch": 8.222222222222221,
"grad_norm": 0.2411596029996872,
"learning_rate": 4.7325571747169545e-05,
"loss": 0.0085,
"step": 518
},
{
"epoch": 8.238095238095237,
"grad_norm": 0.23578131198883057,
"learning_rate": 4.716841615201726e-05,
"loss": 0.008,
"step": 519
},
{
"epoch": 8.253968253968253,
"grad_norm": 0.3611275255680084,
"learning_rate": 4.7011288615576934e-05,
"loss": 0.0141,
"step": 520
},
{
"epoch": 8.26984126984127,
"grad_norm": 0.3158744275569916,
"learning_rate": 4.6854190694855694e-05,
"loss": 0.0115,
"step": 521
},
{
"epoch": 8.285714285714286,
"grad_norm": 0.40253180265426636,
"learning_rate": 4.6697123946567227e-05,
"loss": 0.013,
"step": 522
},
{
"epoch": 8.301587301587302,
"grad_norm": 0.290996789932251,
"learning_rate": 4.65400899271163e-05,
"loss": 0.0103,
"step": 523
},
{
"epoch": 8.317460317460318,
"grad_norm": 0.37486013770103455,
"learning_rate": 4.63830901925834e-05,
"loss": 0.0155,
"step": 524
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.42451635003089905,
"learning_rate": 4.6226126298709224e-05,
"loss": 0.0175,
"step": 525
},
{
"epoch": 8.34920634920635,
"grad_norm": 0.4372078776359558,
"learning_rate": 4.60691998008794e-05,
"loss": 0.0203,
"step": 526
},
{
"epoch": 8.365079365079366,
"grad_norm": 0.3044324517250061,
"learning_rate": 4.5912312254108905e-05,
"loss": 0.0139,
"step": 527
},
{
"epoch": 8.380952380952381,
"grad_norm": 0.39817896485328674,
"learning_rate": 4.575546521302681e-05,
"loss": 0.0135,
"step": 528
},
{
"epoch": 8.396825396825397,
"grad_norm": 0.3401551842689514,
"learning_rate": 4.5598660231860746e-05,
"loss": 0.0107,
"step": 529
},
{
"epoch": 8.412698412698413,
"grad_norm": 0.3589102625846863,
"learning_rate": 4.544189886442162e-05,
"loss": 0.0131,
"step": 530
},
{
"epoch": 8.428571428571429,
"grad_norm": 0.4164977967739105,
"learning_rate": 4.528518266408811e-05,
"loss": 0.015,
"step": 531
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.5136562585830688,
"learning_rate": 4.5128513183791386e-05,
"loss": 0.016,
"step": 532
},
{
"epoch": 8.46031746031746,
"grad_norm": 0.36152708530426025,
"learning_rate": 4.49718919759996e-05,
"loss": 0.015,
"step": 533
},
{
"epoch": 8.476190476190476,
"grad_norm": 0.2721676230430603,
"learning_rate": 4.481532059270262e-05,
"loss": 0.0083,
"step": 534
},
{
"epoch": 8.492063492063492,
"grad_norm": 0.2820744216442108,
"learning_rate": 4.465880058539652e-05,
"loss": 0.01,
"step": 535
},
{
"epoch": 8.507936507936508,
"grad_norm": 0.3638380467891693,
"learning_rate": 4.450233350506836e-05,
"loss": 0.0101,
"step": 536
},
{
"epoch": 8.523809523809524,
"grad_norm": 0.3278939723968506,
"learning_rate": 4.4345920902180647e-05,
"loss": 0.0104,
"step": 537
},
{
"epoch": 8.53968253968254,
"grad_norm": 0.3926644027233124,
"learning_rate": 4.418956432665618e-05,
"loss": 0.0125,
"step": 538
},
{
"epoch": 8.555555555555555,
"grad_norm": 0.3797055780887604,
"learning_rate": 4.403326532786245e-05,
"loss": 0.0111,
"step": 539
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.26904818415641785,
"learning_rate": 4.387702545459649e-05,
"loss": 0.009,
"step": 540
},
{
"epoch": 8.587301587301587,
"grad_norm": 0.32789549231529236,
"learning_rate": 4.3720846255069406e-05,
"loss": 0.0075,
"step": 541
},
{
"epoch": 8.603174603174603,
"grad_norm": 0.19732752442359924,
"learning_rate": 4.356472927689109e-05,
"loss": 0.008,
"step": 542
},
{
"epoch": 8.619047619047619,
"grad_norm": 0.23964589834213257,
"learning_rate": 4.3408676067054866e-05,
"loss": 0.0102,
"step": 543
},
{
"epoch": 8.634920634920634,
"grad_norm": 0.4041917026042938,
"learning_rate": 4.32526881719222e-05,
"loss": 0.0188,
"step": 544
},
{
"epoch": 8.65079365079365,
"grad_norm": 0.4420047998428345,
"learning_rate": 4.3096767137207256e-05,
"loss": 0.0138,
"step": 545
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.43801549077033997,
"learning_rate": 4.2940914507961775e-05,
"loss": 0.012,
"step": 546
},
{
"epoch": 8.682539682539682,
"grad_norm": 0.24375741183757782,
"learning_rate": 4.278513182855956e-05,
"loss": 0.0078,
"step": 547
},
{
"epoch": 8.698412698412698,
"grad_norm": 0.48987898230552673,
"learning_rate": 4.262942064268134e-05,
"loss": 0.0184,
"step": 548
},
{
"epoch": 8.714285714285714,
"grad_norm": 0.38676026463508606,
"learning_rate": 4.247378249329933e-05,
"loss": 0.0122,
"step": 549
},
{
"epoch": 8.73015873015873,
"grad_norm": 0.20567281544208527,
"learning_rate": 4.23182189226621e-05,
"loss": 0.0076,
"step": 550
},
{
"epoch": 8.746031746031747,
"grad_norm": 0.28698331117630005,
"learning_rate": 4.21627314722791e-05,
"loss": 0.0084,
"step": 551
},
{
"epoch": 8.761904761904763,
"grad_norm": 0.3160061836242676,
"learning_rate": 4.20073216829056e-05,
"loss": 0.0111,
"step": 552
},
{
"epoch": 8.777777777777779,
"grad_norm": 0.2930062711238861,
"learning_rate": 4.185199109452721e-05,
"loss": 0.0107,
"step": 553
},
{
"epoch": 8.793650793650794,
"grad_norm": 0.3634200692176819,
"learning_rate": 4.169674124634481e-05,
"loss": 0.0101,
"step": 554
},
{
"epoch": 8.80952380952381,
"grad_norm": 0.37438124418258667,
"learning_rate": 4.1541573676759126e-05,
"loss": 0.014,
"step": 555
},
{
"epoch": 8.825396825396826,
"grad_norm": 0.3476526141166687,
"learning_rate": 4.138648992335566e-05,
"loss": 0.0129,
"step": 556
},
{
"epoch": 8.841269841269842,
"grad_norm": 0.18964612483978271,
"learning_rate": 4.12314915228893e-05,
"loss": 0.0062,
"step": 557
},
{
"epoch": 8.857142857142858,
"grad_norm": 0.35653162002563477,
"learning_rate": 4.107658001126913e-05,
"loss": 0.0131,
"step": 558
},
{
"epoch": 8.873015873015873,
"grad_norm": 0.38258370757102966,
"learning_rate": 4.092175692354333e-05,
"loss": 0.0119,
"step": 559
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.2177157700061798,
"learning_rate": 4.0767023793883785e-05,
"loss": 0.0062,
"step": 560
},
{
"epoch": 8.904761904761905,
"grad_norm": 0.3157006502151489,
"learning_rate": 4.0612382155571026e-05,
"loss": 0.0116,
"step": 561
},
{
"epoch": 8.920634920634921,
"grad_norm": 0.5421932935714722,
"learning_rate": 4.045783354097893e-05,
"loss": 0.0251,
"step": 562
},
{
"epoch": 8.936507936507937,
"grad_norm": 0.4682704210281372,
"learning_rate": 4.0303379481559623e-05,
"loss": 0.0193,
"step": 563
},
{
"epoch": 8.952380952380953,
"grad_norm": 0.36263760924339294,
"learning_rate": 4.0149021507828224e-05,
"loss": 0.0155,
"step": 564
},
{
"epoch": 8.968253968253968,
"grad_norm": 0.3147249221801758,
"learning_rate": 3.9994761149347784e-05,
"loss": 0.0114,
"step": 565
},
{
"epoch": 8.984126984126984,
"grad_norm": 0.41839832067489624,
"learning_rate": 3.984059993471399e-05,
"loss": 0.0154,
"step": 566
},
{
"epoch": 9.0,
"grad_norm": 0.37561434507369995,
"learning_rate": 3.968653939154017e-05,
"loss": 0.0103,
"step": 567
},
{
"epoch": 9.015873015873016,
"grad_norm": 0.31883716583251953,
"learning_rate": 3.9532581046442e-05,
"loss": 0.0082,
"step": 568
},
{
"epoch": 9.031746031746032,
"grad_norm": 0.23053289949893951,
"learning_rate": 3.937872642502252e-05,
"loss": 0.0073,
"step": 569
},
{
"epoch": 9.047619047619047,
"grad_norm": 0.25523173809051514,
"learning_rate": 3.9224977051856904e-05,
"loss": 0.008,
"step": 570
},
{
"epoch": 9.063492063492063,
"grad_norm": 0.20138682425022125,
"learning_rate": 3.907133445047747e-05,
"loss": 0.007,
"step": 571
},
{
"epoch": 9.079365079365079,
"grad_norm": 0.2522388696670532,
"learning_rate": 3.8917800143358404e-05,
"loss": 0.0064,
"step": 572
},
{
"epoch": 9.095238095238095,
"grad_norm": 0.32254767417907715,
"learning_rate": 3.8764375651900906e-05,
"loss": 0.0121,
"step": 573
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.2257680743932724,
"learning_rate": 3.861106249641789e-05,
"loss": 0.0069,
"step": 574
},
{
"epoch": 9.126984126984127,
"grad_norm": 0.20319634675979614,
"learning_rate": 3.84578621961191e-05,
"loss": 0.0083,
"step": 575
},
{
"epoch": 9.142857142857142,
"grad_norm": 0.21617092192173004,
"learning_rate": 3.830477626909589e-05,
"loss": 0.0081,
"step": 576
},
{
"epoch": 9.158730158730158,
"grad_norm": 0.3438735902309418,
"learning_rate": 3.8151806232306374e-05,
"loss": 0.0113,
"step": 577
},
{
"epoch": 9.174603174603174,
"grad_norm": 0.29311296343803406,
"learning_rate": 3.7998953601560175e-05,
"loss": 0.0097,
"step": 578
},
{
"epoch": 9.19047619047619,
"grad_norm": 0.16206145286560059,
"learning_rate": 3.784621989150361e-05,
"loss": 0.0059,
"step": 579
},
{
"epoch": 9.206349206349206,
"grad_norm": 0.22121606767177582,
"learning_rate": 3.769360661560453e-05,
"loss": 0.0084,
"step": 580
},
{
"epoch": 9.222222222222221,
"grad_norm": 0.25994566082954407,
"learning_rate": 3.75411152861374e-05,
"loss": 0.0104,
"step": 581
},
{
"epoch": 9.238095238095237,
"grad_norm": 0.18151433765888214,
"learning_rate": 3.73887474141683e-05,
"loss": 0.0056,
"step": 582
},
{
"epoch": 9.253968253968253,
"grad_norm": 0.18867704272270203,
"learning_rate": 3.723650450953994e-05,
"loss": 0.006,
"step": 583
},
{
"epoch": 9.26984126984127,
"grad_norm": 0.3016846776008606,
"learning_rate": 3.708438808085668e-05,
"loss": 0.0136,
"step": 584
},
{
"epoch": 9.285714285714286,
"grad_norm": 0.41189849376678467,
"learning_rate": 3.693239963546967e-05,
"loss": 0.0168,
"step": 585
},
{
"epoch": 9.301587301587302,
"grad_norm": 0.2735559940338135,
"learning_rate": 3.6780540679461784e-05,
"loss": 0.0097,
"step": 586
},
{
"epoch": 9.317460317460318,
"grad_norm": 0.23788434267044067,
"learning_rate": 3.662881271763279e-05,
"loss": 0.0068,
"step": 587
},
{
"epoch": 9.333333333333334,
"grad_norm": 0.14663733541965485,
"learning_rate": 3.64772172534844e-05,
"loss": 0.0046,
"step": 588
},
{
"epoch": 9.34920634920635,
"grad_norm": 0.3166827857494354,
"learning_rate": 3.63257557892054e-05,
"loss": 0.0082,
"step": 589
},
{
"epoch": 9.365079365079366,
"grad_norm": 0.24929101765155792,
"learning_rate": 3.6174429825656685e-05,
"loss": 0.0104,
"step": 590
},
{
"epoch": 9.380952380952381,
"grad_norm": 0.27766042947769165,
"learning_rate": 3.602324086235655e-05,
"loss": 0.0079,
"step": 591
},
{
"epoch": 9.396825396825397,
"grad_norm": 0.25808480381965637,
"learning_rate": 3.587219039746564e-05,
"loss": 0.0076,
"step": 592
},
{
"epoch": 9.412698412698413,
"grad_norm": 0.2501043677330017,
"learning_rate": 3.572127992777223e-05,
"loss": 0.0103,
"step": 593
},
{
"epoch": 9.428571428571429,
"grad_norm": 0.2836500108242035,
"learning_rate": 3.557051094867735e-05,
"loss": 0.0082,
"step": 594
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.3479957580566406,
"learning_rate": 3.541988495417997e-05,
"loss": 0.0126,
"step": 595
},
{
"epoch": 9.46031746031746,
"grad_norm": 0.2896635830402374,
"learning_rate": 3.5269403436862175e-05,
"loss": 0.0072,
"step": 596
},
{
"epoch": 9.476190476190476,
"grad_norm": 0.2840765416622162,
"learning_rate": 3.511906788787447e-05,
"loss": 0.0101,
"step": 597
},
{
"epoch": 9.492063492063492,
"grad_norm": 0.3210354745388031,
"learning_rate": 3.496887979692084e-05,
"loss": 0.0085,
"step": 598
},
{
"epoch": 9.507936507936508,
"grad_norm": 0.27587252855300903,
"learning_rate": 3.481884065224415e-05,
"loss": 0.0087,
"step": 599
},
{
"epoch": 9.523809523809524,
"grad_norm": 0.3219284117221832,
"learning_rate": 3.466895194061128e-05,
"loss": 0.009,
"step": 600
},
{
"epoch": 9.53968253968254,
"grad_norm": 0.17630243301391602,
"learning_rate": 3.451921514729848e-05,
"loss": 0.0059,
"step": 601
},
{
"epoch": 9.555555555555555,
"grad_norm": 0.25327348709106445,
"learning_rate": 3.436963175607656e-05,
"loss": 0.0081,
"step": 602
},
{
"epoch": 9.571428571428571,
"grad_norm": 0.3768535554409027,
"learning_rate": 3.422020324919632e-05,
"loss": 0.0113,
"step": 603
},
{
"epoch": 9.587301587301587,
"grad_norm": 0.1651473492383957,
"learning_rate": 3.4070931107373675e-05,
"loss": 0.0049,
"step": 604
},
{
"epoch": 9.603174603174603,
"grad_norm": 0.23368506133556366,
"learning_rate": 3.39218168097752e-05,
"loss": 0.008,
"step": 605
},
{
"epoch": 9.619047619047619,
"grad_norm": 0.1572844684123993,
"learning_rate": 3.377286183400328e-05,
"loss": 0.0048,
"step": 606
},
{
"epoch": 9.634920634920634,
"grad_norm": 0.2425893396139145,
"learning_rate": 3.362406765608158e-05,
"loss": 0.0084,
"step": 607
},
{
"epoch": 9.65079365079365,
"grad_norm": 0.280091255903244,
"learning_rate": 3.3475435750440356e-05,
"loss": 0.0114,
"step": 608
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.34356409311294556,
"learning_rate": 3.332696758990197e-05,
"loss": 0.0101,
"step": 609
},
{
"epoch": 9.682539682539682,
"grad_norm": 0.26575177907943726,
"learning_rate": 3.3178664645666066e-05,
"loss": 0.0076,
"step": 610
},
{
"epoch": 9.698412698412698,
"grad_norm": 0.38795173168182373,
"learning_rate": 3.303052838729525e-05,
"loss": 0.0141,
"step": 611
},
{
"epoch": 9.714285714285714,
"grad_norm": 0.17991788685321808,
"learning_rate": 3.2882560282700336e-05,
"loss": 0.0071,
"step": 612
},
{
"epoch": 9.73015873015873,
"grad_norm": 0.26826414465904236,
"learning_rate": 3.273476179812588e-05,
"loss": 0.0084,
"step": 613
},
{
"epoch": 9.746031746031747,
"grad_norm": 0.4353213906288147,
"learning_rate": 3.258713439813566e-05,
"loss": 0.0138,
"step": 614
},
{
"epoch": 9.761904761904763,
"grad_norm": 0.27039167284965515,
"learning_rate": 3.243967954559811e-05,
"loss": 0.0075,
"step": 615
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.1729506552219391,
"learning_rate": 3.229239870167191e-05,
"loss": 0.0066,
"step": 616
},
{
"epoch": 9.793650793650794,
"grad_norm": 0.31375908851623535,
"learning_rate": 3.2145293325791395e-05,
"loss": 0.0091,
"step": 617
},
{
"epoch": 9.80952380952381,
"grad_norm": 0.2373589277267456,
"learning_rate": 3.199836487565222e-05,
"loss": 0.0077,
"step": 618
},
{
"epoch": 9.825396825396826,
"grad_norm": 0.3218036890029907,
"learning_rate": 3.1851614807196774e-05,
"loss": 0.0142,
"step": 619
},
{
"epoch": 9.841269841269842,
"grad_norm": 0.2621251940727234,
"learning_rate": 3.170504457459989e-05,
"loss": 0.0085,
"step": 620
},
{
"epoch": 9.857142857142858,
"grad_norm": 0.2235831618309021,
"learning_rate": 3.155865563025433e-05,
"loss": 0.0085,
"step": 621
},
{
"epoch": 9.873015873015873,
"grad_norm": 0.3102441728115082,
"learning_rate": 3.1412449424756474e-05,
"loss": 0.0091,
"step": 622
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.3454819321632385,
"learning_rate": 3.1266427406891856e-05,
"loss": 0.0078,
"step": 623
},
{
"epoch": 9.904761904761905,
"grad_norm": 0.1699669510126114,
"learning_rate": 3.112059102362093e-05,
"loss": 0.005,
"step": 624
},
{
"epoch": 9.920634920634921,
"grad_norm": 0.21184861660003662,
"learning_rate": 3.0974941720064585e-05,
"loss": 0.0059,
"step": 625
},
{
"epoch": 9.936507936507937,
"grad_norm": 0.21373149752616882,
"learning_rate": 3.082948093948997e-05,
"loss": 0.0067,
"step": 626
},
{
"epoch": 9.952380952380953,
"grad_norm": 0.17170457541942596,
"learning_rate": 3.0684210123296055e-05,
"loss": 0.0061,
"step": 627
},
{
"epoch": 9.968253968253968,
"grad_norm": 0.33514630794525146,
"learning_rate": 3.053913071099947e-05,
"loss": 0.0136,
"step": 628
},
{
"epoch": 9.984126984126984,
"grad_norm": 0.34444811940193176,
"learning_rate": 3.0394244140220163e-05,
"loss": 0.0129,
"step": 629
},
{
"epoch": 10.0,
"grad_norm": 0.2810363173484802,
"learning_rate": 3.0249551846667207e-05,
"loss": 0.0072,
"step": 630
},
{
"epoch": 10.015873015873016,
"grad_norm": 0.16898448765277863,
"learning_rate": 3.010505526412447e-05,
"loss": 0.0057,
"step": 631
},
{
"epoch": 10.031746031746032,
"grad_norm": 0.27064862847328186,
"learning_rate": 2.996075582443658e-05,
"loss": 0.0081,
"step": 632
},
{
"epoch": 10.047619047619047,
"grad_norm": 0.11674167960882187,
"learning_rate": 2.981665495749457e-05,
"loss": 0.0044,
"step": 633
},
{
"epoch": 10.063492063492063,
"grad_norm": 0.18693989515304565,
"learning_rate": 2.9672754091221805e-05,
"loss": 0.0071,
"step": 634
},
{
"epoch": 10.079365079365079,
"grad_norm": 0.19624684751033783,
"learning_rate": 2.9529054651559772e-05,
"loss": 0.0065,
"step": 635
},
{
"epoch": 10.095238095238095,
"grad_norm": 0.13836269080638885,
"learning_rate": 2.938555806245406e-05,
"loss": 0.0045,
"step": 636
},
{
"epoch": 10.11111111111111,
"grad_norm": 0.2417069971561432,
"learning_rate": 2.9242265745840063e-05,
"loss": 0.0091,
"step": 637
},
{
"epoch": 10.126984126984127,
"grad_norm": 0.18066619336605072,
"learning_rate": 2.9099179121629117e-05,
"loss": 0.006,
"step": 638
},
{
"epoch": 10.142857142857142,
"grad_norm": 0.2307615429162979,
"learning_rate": 2.895629960769417e-05,
"loss": 0.0078,
"step": 639
},
{
"epoch": 10.158730158730158,
"grad_norm": 0.1858942061662674,
"learning_rate": 2.881362861985606e-05,
"loss": 0.007,
"step": 640
},
{
"epoch": 10.174603174603174,
"grad_norm": 0.20081129670143127,
"learning_rate": 2.867116757186911e-05,
"loss": 0.0073,
"step": 641
},
{
"epoch": 10.19047619047619,
"grad_norm": 0.2889654338359833,
"learning_rate": 2.8528917875407433e-05,
"loss": 0.0088,
"step": 642
},
{
"epoch": 10.206349206349206,
"grad_norm": 0.22024375200271606,
"learning_rate": 2.838688094005078e-05,
"loss": 0.0061,
"step": 643
},
{
"epoch": 10.222222222222221,
"grad_norm": 0.2205890566110611,
"learning_rate": 2.8245058173270622e-05,
"loss": 0.0072,
"step": 644
},
{
"epoch": 10.238095238095237,
"grad_norm": 0.21441209316253662,
"learning_rate": 2.8103450980416136e-05,
"loss": 0.0054,
"step": 645
},
{
"epoch": 10.253968253968253,
"grad_norm": 0.18930909037590027,
"learning_rate": 2.796206076470044e-05,
"loss": 0.0066,
"step": 646
},
{
"epoch": 10.26984126984127,
"grad_norm": 0.16868965327739716,
"learning_rate": 2.7820888927186483e-05,
"loss": 0.0048,
"step": 647
},
{
"epoch": 10.285714285714286,
"grad_norm": 0.3065090775489807,
"learning_rate": 2.7679936866773315e-05,
"loss": 0.0088,
"step": 648
},
{
"epoch": 10.301587301587302,
"grad_norm": 0.21105839312076569,
"learning_rate": 2.753920598018217e-05,
"loss": 0.0057,
"step": 649
},
{
"epoch": 10.317460317460318,
"grad_norm": 0.07848194986581802,
"learning_rate": 2.739869766194263e-05,
"loss": 0.0031,
"step": 650
},
{
"epoch": 10.333333333333334,
"grad_norm": 0.23540142178535461,
"learning_rate": 2.7258413304378734e-05,
"loss": 0.0078,
"step": 651
},
{
"epoch": 10.34920634920635,
"grad_norm": 0.2934277057647705,
"learning_rate": 2.7118354297595396e-05,
"loss": 0.0065,
"step": 652
},
{
"epoch": 10.365079365079366,
"grad_norm": 0.2042340338230133,
"learning_rate": 2.6978522029464325e-05,
"loss": 0.005,
"step": 653
},
{
"epoch": 10.380952380952381,
"grad_norm": 0.2258983999490738,
"learning_rate": 2.683891788561055e-05,
"loss": 0.0074,
"step": 654
},
{
"epoch": 10.396825396825397,
"grad_norm": 0.18975599110126495,
"learning_rate": 2.669954324939852e-05,
"loss": 0.0071,
"step": 655
},
{
"epoch": 10.412698412698413,
"grad_norm": 0.16135640442371368,
"learning_rate": 2.6560399501918465e-05,
"loss": 0.0058,
"step": 656
},
{
"epoch": 10.428571428571429,
"grad_norm": 0.30178365111351013,
"learning_rate": 2.6421488021972673e-05,
"loss": 0.0086,
"step": 657
},
{
"epoch": 10.444444444444445,
"grad_norm": 0.3351801037788391,
"learning_rate": 2.6282810186061862e-05,
"loss": 0.0132,
"step": 658
},
{
"epoch": 10.46031746031746,
"grad_norm": 0.25116395950317383,
"learning_rate": 2.6144367368371535e-05,
"loss": 0.0081,
"step": 659
},
{
"epoch": 10.476190476190476,
"grad_norm": 0.2531328797340393,
"learning_rate": 2.600616094075835e-05,
"loss": 0.0082,
"step": 660
},
{
"epoch": 10.492063492063492,
"grad_norm": 0.22533273696899414,
"learning_rate": 2.5868192272736514e-05,
"loss": 0.0065,
"step": 661
},
{
"epoch": 10.507936507936508,
"grad_norm": 0.18789933621883392,
"learning_rate": 2.5730462731464273e-05,
"loss": 0.0048,
"step": 662
},
{
"epoch": 10.523809523809524,
"grad_norm": 0.2593654990196228,
"learning_rate": 2.5592973681730236e-05,
"loss": 0.008,
"step": 663
},
{
"epoch": 10.53968253968254,
"grad_norm": 0.2563331425189972,
"learning_rate": 2.5455726485940012e-05,
"loss": 0.0099,
"step": 664
},
{
"epoch": 10.555555555555555,
"grad_norm": 0.2012241631746292,
"learning_rate": 2.5318722504102604e-05,
"loss": 0.0051,
"step": 665
},
{
"epoch": 10.571428571428571,
"grad_norm": 0.3327932059764862,
"learning_rate": 2.5181963093816962e-05,
"loss": 0.0077,
"step": 666
},
{
"epoch": 10.587301587301587,
"grad_norm": 0.2965086102485657,
"learning_rate": 2.504544961025853e-05,
"loss": 0.0089,
"step": 667
},
{
"epoch": 10.603174603174603,
"grad_norm": 0.2296365350484848,
"learning_rate": 2.4909183406165836e-05,
"loss": 0.0068,
"step": 668
},
{
"epoch": 10.619047619047619,
"grad_norm": 0.3457624018192291,
"learning_rate": 2.4773165831827018e-05,
"loss": 0.0083,
"step": 669
},
{
"epoch": 10.634920634920634,
"grad_norm": 0.20112329721450806,
"learning_rate": 2.4637398235066527e-05,
"loss": 0.0061,
"step": 670
},
{
"epoch": 10.65079365079365,
"grad_norm": 0.19829870760440826,
"learning_rate": 2.450188196123177e-05,
"loss": 0.0063,
"step": 671
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.17704661190509796,
"learning_rate": 2.4366618353179644e-05,
"loss": 0.0045,
"step": 672
},
{
"epoch": 10.682539682539682,
"grad_norm": 0.27905184030532837,
"learning_rate": 2.423160875126348e-05,
"loss": 0.009,
"step": 673
},
{
"epoch": 10.698412698412698,
"grad_norm": 0.18189361691474915,
"learning_rate": 2.4096854493319477e-05,
"loss": 0.0069,
"step": 674
},
{
"epoch": 10.714285714285714,
"grad_norm": 0.2877546548843384,
"learning_rate": 2.3962356914653657e-05,
"loss": 0.0064,
"step": 675
},
{
"epoch": 10.73015873015873,
"grad_norm": 0.27436089515686035,
"learning_rate": 2.3828117348028528e-05,
"loss": 0.009,
"step": 676
},
{
"epoch": 10.746031746031747,
"grad_norm": 0.11570344120264053,
"learning_rate": 2.3694137123649946e-05,
"loss": 0.0038,
"step": 677
},
{
"epoch": 10.761904761904763,
"grad_norm": 0.29015523195266724,
"learning_rate": 2.3560417569153796e-05,
"loss": 0.0079,
"step": 678
},
{
"epoch": 10.777777777777779,
"grad_norm": 0.23264740407466888,
"learning_rate": 2.342696000959309e-05,
"loss": 0.0087,
"step": 679
},
{
"epoch": 10.793650793650794,
"grad_norm": 0.23853233456611633,
"learning_rate": 2.3293765767424537e-05,
"loss": 0.0068,
"step": 680
},
{
"epoch": 10.80952380952381,
"grad_norm": 0.11449386179447174,
"learning_rate": 2.3160836162495653e-05,
"loss": 0.0033,
"step": 681
},
{
"epoch": 10.825396825396826,
"grad_norm": 0.15624088048934937,
"learning_rate": 2.3028172512031604e-05,
"loss": 0.005,
"step": 682
},
{
"epoch": 10.841269841269842,
"grad_norm": 0.17482654750347137,
"learning_rate": 2.289577613062218e-05,
"loss": 0.0053,
"step": 683
},
{
"epoch": 10.857142857142858,
"grad_norm": 0.1657302975654602,
"learning_rate": 2.2763648330208688e-05,
"loss": 0.0044,
"step": 684
},
{
"epoch": 10.873015873015873,
"grad_norm": 0.3183576762676239,
"learning_rate": 2.2631790420071064e-05,
"loss": 0.0087,
"step": 685
},
{
"epoch": 10.88888888888889,
"grad_norm": 0.2113347351551056,
"learning_rate": 2.2500203706814856e-05,
"loss": 0.0057,
"step": 686
},
{
"epoch": 10.904761904761905,
"grad_norm": 0.20787814259529114,
"learning_rate": 2.2368889494358235e-05,
"loss": 0.0066,
"step": 687
},
{
"epoch": 10.920634920634921,
"grad_norm": 0.19461645185947418,
"learning_rate": 2.2237849083919142e-05,
"loss": 0.0056,
"step": 688
},
{
"epoch": 10.936507936507937,
"grad_norm": 0.3162117302417755,
"learning_rate": 2.2107083774002364e-05,
"loss": 0.0102,
"step": 689
},
{
"epoch": 10.952380952380953,
"grad_norm": 0.1498049944639206,
"learning_rate": 2.1976594860386597e-05,
"loss": 0.0054,
"step": 690
},
{
"epoch": 10.968253968253968,
"grad_norm": 0.25862017273902893,
"learning_rate": 2.1846383636111743e-05,
"loss": 0.0063,
"step": 691
},
{
"epoch": 10.984126984126984,
"grad_norm": 0.2787252962589264,
"learning_rate": 2.1716451391466008e-05,
"loss": 0.004,
"step": 692
},
{
"epoch": 11.0,
"grad_norm": 0.5165538787841797,
"learning_rate": 2.1586799413973135e-05,
"loss": 0.0117,
"step": 693
},
{
"epoch": 11.015873015873016,
"grad_norm": 0.16975046694278717,
"learning_rate": 2.1457428988379635e-05,
"loss": 0.0053,
"step": 694
},
{
"epoch": 11.031746031746032,
"grad_norm": 0.09435385465621948,
"learning_rate": 2.1328341396642093e-05,
"loss": 0.0032,
"step": 695
},
{
"epoch": 11.047619047619047,
"grad_norm": 0.0928262248635292,
"learning_rate": 2.1199537917914386e-05,
"loss": 0.0031,
"step": 696
},
{
"epoch": 11.063492063492063,
"grad_norm": 0.1879938691854477,
"learning_rate": 2.107101982853511e-05,
"loss": 0.0052,
"step": 697
},
{
"epoch": 11.079365079365079,
"grad_norm": 0.13509397208690643,
"learning_rate": 2.0942788402014867e-05,
"loss": 0.005,
"step": 698
},
{
"epoch": 11.095238095238095,
"grad_norm": 0.10293649882078171,
"learning_rate": 2.0814844909023663e-05,
"loss": 0.0038,
"step": 699
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.26907050609588623,
"learning_rate": 2.068719061737831e-05,
"loss": 0.0086,
"step": 700
},
{
"epoch": 11.126984126984127,
"grad_norm": 0.1459931880235672,
"learning_rate": 2.0559826792029884e-05,
"loss": 0.0045,
"step": 701
},
{
"epoch": 11.142857142857142,
"grad_norm": 0.10803816467523575,
"learning_rate": 2.0432754695051136e-05,
"loss": 0.0034,
"step": 702
},
{
"epoch": 11.158730158730158,
"grad_norm": 0.07795245200395584,
"learning_rate": 2.0305975585624058e-05,
"loss": 0.0031,
"step": 703
},
{
"epoch": 11.174603174603174,
"grad_norm": 0.14636225998401642,
"learning_rate": 2.0179490720027372e-05,
"loss": 0.0055,
"step": 704
},
{
"epoch": 11.19047619047619,
"grad_norm": 0.0945882797241211,
"learning_rate": 2.005330135162408e-05,
"loss": 0.0036,
"step": 705
},
{
"epoch": 11.206349206349206,
"grad_norm": 0.16662253439426422,
"learning_rate": 1.992740873084899e-05,
"loss": 0.0042,
"step": 706
},
{
"epoch": 11.222222222222221,
"grad_norm": 0.2733784019947052,
"learning_rate": 1.9801814105196497e-05,
"loss": 0.0066,
"step": 707
},
{
"epoch": 11.238095238095237,
"grad_norm": 0.27156999707221985,
"learning_rate": 1.9676518719207977e-05,
"loss": 0.0069,
"step": 708
},
{
"epoch": 11.253968253968253,
"grad_norm": 0.23552264273166656,
"learning_rate": 1.9551523814459665e-05,
"loss": 0.0071,
"step": 709
},
{
"epoch": 11.26984126984127,
"grad_norm": 0.09834027290344238,
"learning_rate": 1.9426830629550242e-05,
"loss": 0.0035,
"step": 710
},
{
"epoch": 11.285714285714286,
"grad_norm": 0.1471029371023178,
"learning_rate": 1.9302440400088606e-05,
"loss": 0.0055,
"step": 711
},
{
"epoch": 11.301587301587302,
"grad_norm": 0.20986461639404297,
"learning_rate": 1.917835435868155e-05,
"loss": 0.0063,
"step": 712
},
{
"epoch": 11.317460317460318,
"grad_norm": 0.29454532265663147,
"learning_rate": 1.9054573734921714e-05,
"loss": 0.0098,
"step": 713
},
{
"epoch": 11.333333333333334,
"grad_norm": 0.1742410510778427,
"learning_rate": 1.8931099755375203e-05,
"loss": 0.0044,
"step": 714
},
{
"epoch": 11.34920634920635,
"grad_norm": 0.13173726201057434,
"learning_rate": 1.880793364356956e-05,
"loss": 0.0055,
"step": 715
},
{
"epoch": 11.365079365079366,
"grad_norm": 0.20177853107452393,
"learning_rate": 1.8685076619981608e-05,
"loss": 0.006,
"step": 716
},
{
"epoch": 11.380952380952381,
"grad_norm": 0.1103038340806961,
"learning_rate": 1.8562529902025372e-05,
"loss": 0.0037,
"step": 717
},
{
"epoch": 11.396825396825397,
"grad_norm": 0.22189675271511078,
"learning_rate": 1.844029470403993e-05,
"loss": 0.0066,
"step": 718
},
{
"epoch": 11.412698412698413,
"grad_norm": 0.21314705908298492,
"learning_rate": 1.8318372237277565e-05,
"loss": 0.0065,
"step": 719
},
{
"epoch": 11.428571428571429,
"grad_norm": 0.1456424593925476,
"learning_rate": 1.8196763709891524e-05,
"loss": 0.0049,
"step": 720
},
{
"epoch": 11.444444444444445,
"grad_norm": 0.1834188550710678,
"learning_rate": 1.8075470326924243e-05,
"loss": 0.0067,
"step": 721
},
{
"epoch": 11.46031746031746,
"grad_norm": 0.2855736017227173,
"learning_rate": 1.795449329029531e-05,
"loss": 0.009,
"step": 722
},
{
"epoch": 11.476190476190476,
"grad_norm": 0.15806177258491516,
"learning_rate": 1.7833833798789595e-05,
"loss": 0.0044,
"step": 723
},
{
"epoch": 11.492063492063492,
"grad_norm": 0.16890814900398254,
"learning_rate": 1.7713493048045294e-05,
"loss": 0.0056,
"step": 724
},
{
"epoch": 11.507936507936508,
"grad_norm": 0.24409544467926025,
"learning_rate": 1.7593472230542202e-05,
"loss": 0.0069,
"step": 725
},
{
"epoch": 11.523809523809524,
"grad_norm": 0.2861270010471344,
"learning_rate": 1.747377253558982e-05,
"loss": 0.0078,
"step": 726
},
{
"epoch": 11.53968253968254,
"grad_norm": 0.17466863989830017,
"learning_rate": 1.7354395149315534e-05,
"loss": 0.0044,
"step": 727
},
{
"epoch": 11.555555555555555,
"grad_norm": 0.2202078104019165,
"learning_rate": 1.7235341254653005e-05,
"loss": 0.0071,
"step": 728
},
{
"epoch": 11.571428571428571,
"grad_norm": 0.25968992710113525,
"learning_rate": 1.711661203133026e-05,
"loss": 0.0052,
"step": 729
},
{
"epoch": 11.587301587301587,
"grad_norm": 0.10932864248752594,
"learning_rate": 1.6998208655858137e-05,
"loss": 0.0033,
"step": 730
},
{
"epoch": 11.603174603174603,
"grad_norm": 0.1846671849489212,
"learning_rate": 1.6880132301518598e-05,
"loss": 0.0049,
"step": 731
},
{
"epoch": 11.619047619047619,
"grad_norm": 0.18320026993751526,
"learning_rate": 1.6762384138353078e-05,
"loss": 0.0048,
"step": 732
},
{
"epoch": 11.634920634920634,
"grad_norm": 0.18667708337306976,
"learning_rate": 1.6644965333150847e-05,
"loss": 0.0041,
"step": 733
},
{
"epoch": 11.65079365079365,
"grad_norm": 0.29703792929649353,
"learning_rate": 1.6527877049437622e-05,
"loss": 0.0098,
"step": 734
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.1451849490404129,
"learning_rate": 1.6411120447463807e-05,
"loss": 0.0034,
"step": 735
},
{
"epoch": 11.682539682539682,
"grad_norm": 0.28783440589904785,
"learning_rate": 1.6294696684193154e-05,
"loss": 0.009,
"step": 736
},
{
"epoch": 11.698412698412698,
"grad_norm": 0.22581429779529572,
"learning_rate": 1.617860691329126e-05,
"loss": 0.0044,
"step": 737
},
{
"epoch": 11.714285714285714,
"grad_norm": 0.20482461154460907,
"learning_rate": 1.6062852285114123e-05,
"loss": 0.007,
"step": 738
},
{
"epoch": 11.73015873015873,
"grad_norm": 0.10219337791204453,
"learning_rate": 1.5947433946696693e-05,
"loss": 0.0031,
"step": 739
},
{
"epoch": 11.746031746031747,
"grad_norm": 0.2273254692554474,
"learning_rate": 1.583235304174167e-05,
"loss": 0.0069,
"step": 740
},
{
"epoch": 11.761904761904763,
"grad_norm": 0.3083495497703552,
"learning_rate": 1.5717610710607948e-05,
"loss": 0.0116,
"step": 741
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.2324836254119873,
"learning_rate": 1.5603208090299498e-05,
"loss": 0.0065,
"step": 742
},
{
"epoch": 11.793650793650794,
"grad_norm": 0.14565986394882202,
"learning_rate": 1.5489146314454002e-05,
"loss": 0.0041,
"step": 743
},
{
"epoch": 11.80952380952381,
"grad_norm": 0.18284986913204193,
"learning_rate": 1.537542651333167e-05,
"loss": 0.0043,
"step": 744
},
{
"epoch": 11.825396825396826,
"grad_norm": 0.21167722344398499,
"learning_rate": 1.5262049813803958e-05,
"loss": 0.0066,
"step": 745
},
{
"epoch": 11.841269841269842,
"grad_norm": 0.16525444388389587,
"learning_rate": 1.5149017339342574e-05,
"loss": 0.0047,
"step": 746
},
{
"epoch": 11.857142857142858,
"grad_norm": 0.17935959994792938,
"learning_rate": 1.503633021000812e-05,
"loss": 0.0053,
"step": 747
},
{
"epoch": 11.873015873015873,
"grad_norm": 0.2582390010356903,
"learning_rate": 1.4923989542439159e-05,
"loss": 0.0052,
"step": 748
},
{
"epoch": 11.88888888888889,
"grad_norm": 0.06719334423542023,
"learning_rate": 1.4811996449841098e-05,
"loss": 0.0025,
"step": 749
},
{
"epoch": 11.904761904761905,
"grad_norm": 0.19448348879814148,
"learning_rate": 1.4700352041975163e-05,
"loss": 0.0059,
"step": 750
},
{
"epoch": 11.920634920634921,
"grad_norm": 0.30000415444374084,
"learning_rate": 1.458905742514734e-05,
"loss": 0.0089,
"step": 751
},
{
"epoch": 11.936507936507937,
"grad_norm": 0.19624555110931396,
"learning_rate": 1.447811370219757e-05,
"loss": 0.0067,
"step": 752
},
{
"epoch": 11.952380952380953,
"grad_norm": 0.16108612716197968,
"learning_rate": 1.4367521972488612e-05,
"loss": 0.0036,
"step": 753
},
{
"epoch": 11.968253968253968,
"grad_norm": 0.10793477296829224,
"learning_rate": 1.4257283331895315e-05,
"loss": 0.0032,
"step": 754
},
{
"epoch": 11.984126984126984,
"grad_norm": 0.19331948459148407,
"learning_rate": 1.4147398872793693e-05,
"loss": 0.0054,
"step": 755
},
{
"epoch": 12.0,
"grad_norm": 0.3868754208087921,
"learning_rate": 1.4037869684050115e-05,
"loss": 0.0066,
"step": 756
},
{
"epoch": 12.015873015873016,
"grad_norm": 0.1854810267686844,
"learning_rate": 1.3928696851010443e-05,
"loss": 0.0052,
"step": 757
},
{
"epoch": 12.031746031746032,
"grad_norm": 0.1465175747871399,
"learning_rate": 1.3819881455489458e-05,
"loss": 0.0064,
"step": 758
},
{
"epoch": 12.047619047619047,
"grad_norm": 0.09918566048145294,
"learning_rate": 1.3711424575759912e-05,
"loss": 0.0033,
"step": 759
},
{
"epoch": 12.063492063492063,
"grad_norm": 0.1635628491640091,
"learning_rate": 1.3603327286542023e-05,
"loss": 0.0044,
"step": 760
},
{
"epoch": 12.079365079365079,
"grad_norm": 0.1613842099905014,
"learning_rate": 1.3495590658992718e-05,
"loss": 0.0048,
"step": 761
},
{
"epoch": 12.095238095238095,
"grad_norm": 0.13634873926639557,
"learning_rate": 1.33882157606951e-05,
"loss": 0.0034,
"step": 762
},
{
"epoch": 12.11111111111111,
"grad_norm": 0.15302757918834686,
"learning_rate": 1.3281203655647756e-05,
"loss": 0.0047,
"step": 763
},
{
"epoch": 12.126984126984127,
"grad_norm": 0.10601391643285751,
"learning_rate": 1.317455540425439e-05,
"loss": 0.0031,
"step": 764
},
{
"epoch": 12.142857142857142,
"grad_norm": 0.16901229321956635,
"learning_rate": 1.3068272063313102e-05,
"loss": 0.004,
"step": 765
},
{
"epoch": 12.158730158730158,
"grad_norm": 0.11270225793123245,
"learning_rate": 1.2962354686006084e-05,
"loss": 0.0036,
"step": 766
},
{
"epoch": 12.174603174603174,
"grad_norm": 0.17881913483142853,
"learning_rate": 1.2856804321889115e-05,
"loss": 0.0061,
"step": 767
},
{
"epoch": 12.19047619047619,
"grad_norm": 0.27680760622024536,
"learning_rate": 1.2751622016881182e-05,
"loss": 0.0087,
"step": 768
},
{
"epoch": 12.206349206349206,
"grad_norm": 0.14763417840003967,
"learning_rate": 1.2646808813254042e-05,
"loss": 0.0039,
"step": 769
},
{
"epoch": 12.222222222222221,
"grad_norm": 0.21186058223247528,
"learning_rate": 1.2542365749622049e-05,
"loss": 0.0065,
"step": 770
},
{
"epoch": 12.238095238095237,
"grad_norm": 0.13028453290462494,
"learning_rate": 1.2438293860931682e-05,
"loss": 0.0037,
"step": 771
},
{
"epoch": 12.253968253968253,
"grad_norm": 0.1220482587814331,
"learning_rate": 1.2334594178451425e-05,
"loss": 0.0034,
"step": 772
},
{
"epoch": 12.26984126984127,
"grad_norm": 0.10451938956975937,
"learning_rate": 1.2231267729761487e-05,
"loss": 0.0034,
"step": 773
},
{
"epoch": 12.285714285714286,
"grad_norm": 0.06596413254737854,
"learning_rate": 1.2128315538743646e-05,
"loss": 0.0025,
"step": 774
},
{
"epoch": 12.301587301587302,
"grad_norm": 0.18053588271141052,
"learning_rate": 1.2025738625571026e-05,
"loss": 0.0043,
"step": 775
},
{
"epoch": 12.317460317460318,
"grad_norm": 0.2295704185962677,
"learning_rate": 1.1923538006698154e-05,
"loss": 0.0076,
"step": 776
},
{
"epoch": 12.333333333333334,
"grad_norm": 0.21795432269573212,
"learning_rate": 1.1821714694850689e-05,
"loss": 0.0062,
"step": 777
},
{
"epoch": 12.34920634920635,
"grad_norm": 0.110650934278965,
"learning_rate": 1.172026969901553e-05,
"loss": 0.0033,
"step": 778
},
{
"epoch": 12.365079365079366,
"grad_norm": 0.14939086139202118,
"learning_rate": 1.161920402443077e-05,
"loss": 0.0053,
"step": 779
},
{
"epoch": 12.380952380952381,
"grad_norm": 0.14100809395313263,
"learning_rate": 1.1518518672575701e-05,
"loss": 0.0047,
"step": 780
},
{
"epoch": 12.396825396825397,
"grad_norm": 0.1589258462190628,
"learning_rate": 1.1418214641160958e-05,
"loss": 0.0041,
"step": 781
},
{
"epoch": 12.412698412698413,
"grad_norm": 0.22199559211730957,
"learning_rate": 1.1318292924118584e-05,
"loss": 0.0048,
"step": 782
},
{
"epoch": 12.428571428571429,
"grad_norm": 0.1654834747314453,
"learning_rate": 1.1218754511592217e-05,
"loss": 0.0052,
"step": 783
},
{
"epoch": 12.444444444444445,
"grad_norm": 0.18298682570457458,
"learning_rate": 1.1119600389927182e-05,
"loss": 0.0053,
"step": 784
},
{
"epoch": 12.46031746031746,
"grad_norm": 0.13524076342582703,
"learning_rate": 1.1020831541660915e-05,
"loss": 0.0038,
"step": 785
},
{
"epoch": 12.476190476190476,
"grad_norm": 0.17973224818706512,
"learning_rate": 1.092244894551298e-05,
"loss": 0.0047,
"step": 786
},
{
"epoch": 12.492063492063492,
"grad_norm": 0.06217047572135925,
"learning_rate": 1.0824453576375576e-05,
"loss": 0.0026,
"step": 787
},
{
"epoch": 12.507936507936508,
"grad_norm": 0.17186515033245087,
"learning_rate": 1.0726846405303754e-05,
"loss": 0.0043,
"step": 788
},
{
"epoch": 12.523809523809524,
"grad_norm": 0.22013287246227264,
"learning_rate": 1.062962839950587e-05,
"loss": 0.0057,
"step": 789
},
{
"epoch": 12.53968253968254,
"grad_norm": 0.1783435344696045,
"learning_rate": 1.0532800522333897e-05,
"loss": 0.0057,
"step": 790
},
{
"epoch": 12.555555555555555,
"grad_norm": 0.21852487325668335,
"learning_rate": 1.0436363733274057e-05,
"loss": 0.0053,
"step": 791
},
{
"epoch": 12.571428571428571,
"grad_norm": 0.22835583984851837,
"learning_rate": 1.0340318987937097e-05,
"loss": 0.0056,
"step": 792
},
{
"epoch": 12.587301587301587,
"grad_norm": 0.14611005783081055,
"learning_rate": 1.0244667238048988e-05,
"loss": 0.004,
"step": 793
},
{
"epoch": 12.603174603174603,
"grad_norm": 0.13122573494911194,
"learning_rate": 1.014940943144142e-05,
"loss": 0.0034,
"step": 794
},
{
"epoch": 12.619047619047619,
"grad_norm": 0.1692192703485489,
"learning_rate": 1.0054546512042424e-05,
"loss": 0.0036,
"step": 795
},
{
"epoch": 12.634920634920634,
"grad_norm": 0.10081874579191208,
"learning_rate": 9.960079419866985e-06,
"loss": 0.0028,
"step": 796
},
{
"epoch": 12.65079365079365,
"grad_norm": 0.16554361581802368,
"learning_rate": 9.866009091007833e-06,
"loss": 0.004,
"step": 797
},
{
"epoch": 12.666666666666666,
"grad_norm": 0.11980407685041428,
"learning_rate": 9.772336457626014e-06,
"loss": 0.0033,
"step": 798
},
{
"epoch": 12.682539682539682,
"grad_norm": 0.19346101582050323,
"learning_rate": 9.679062447941778e-06,
"loss": 0.0054,
"step": 799
},
{
"epoch": 12.698412698412698,
"grad_norm": 0.21870972216129303,
"learning_rate": 9.586187986225325e-06,
"loss": 0.0056,
"step": 800
},
{
"epoch": 12.714285714285714,
"grad_norm": 0.18945957720279694,
"learning_rate": 9.493713992787672e-06,
"loss": 0.0056,
"step": 801
},
{
"epoch": 12.73015873015873,
"grad_norm": 0.25288915634155273,
"learning_rate": 9.401641383971477e-06,
"loss": 0.0067,
"step": 802
},
{
"epoch": 12.746031746031747,
"grad_norm": 0.15972785651683807,
"learning_rate": 9.309971072142038e-06,
"loss": 0.0041,
"step": 803
},
{
"epoch": 12.761904761904763,
"grad_norm": 0.2357502579689026,
"learning_rate": 9.218703965678204e-06,
"loss": 0.0059,
"step": 804
},
{
"epoch": 12.777777777777779,
"grad_norm": 0.23380345106124878,
"learning_rate": 9.127840968963381e-06,
"loss": 0.0072,
"step": 805
},
{
"epoch": 12.793650793650794,
"grad_norm": 0.13809677958488464,
"learning_rate": 9.03738298237658e-06,
"loss": 0.0046,
"step": 806
},
{
"epoch": 12.80952380952381,
"grad_norm": 0.26843348145484924,
"learning_rate": 8.94733090228349e-06,
"loss": 0.007,
"step": 807
},
{
"epoch": 12.825396825396826,
"grad_norm": 0.30479297041893005,
"learning_rate": 8.857685621027568e-06,
"loss": 0.0072,
"step": 808
},
{
"epoch": 12.841269841269842,
"grad_norm": 0.09838364273309708,
"learning_rate": 8.768448026921245e-06,
"loss": 0.0032,
"step": 809
},
{
"epoch": 12.857142857142858,
"grad_norm": 0.13536061346530914,
"learning_rate": 8.67961900423711e-06,
"loss": 0.0031,
"step": 810
},
{
"epoch": 12.873015873015873,
"grad_norm": 0.12725569307804108,
"learning_rate": 8.591199433199126e-06,
"loss": 0.0034,
"step": 811
},
{
"epoch": 12.88888888888889,
"grad_norm": 0.1910911351442337,
"learning_rate": 8.503190189973914e-06,
"loss": 0.0048,
"step": 812
},
{
"epoch": 12.904761904761905,
"grad_norm": 0.08065954595804214,
"learning_rate": 8.415592146662104e-06,
"loss": 0.0027,
"step": 813
},
{
"epoch": 12.920634920634921,
"grad_norm": 0.20949719846248627,
"learning_rate": 8.328406171289621e-06,
"loss": 0.0056,
"step": 814
},
{
"epoch": 12.936507936507937,
"grad_norm": 0.11893566697835922,
"learning_rate": 8.24163312779917e-06,
"loss": 0.0036,
"step": 815
},
{
"epoch": 12.952380952380953,
"grad_norm": 0.28728553652763367,
"learning_rate": 8.155273876041614e-06,
"loss": 0.0098,
"step": 816
},
{
"epoch": 12.968253968253968,
"grad_norm": 0.2053646296262741,
"learning_rate": 8.069329271767484e-06,
"loss": 0.0057,
"step": 817
},
{
"epoch": 12.984126984126984,
"grad_norm": 0.186600461602211,
"learning_rate": 7.983800166618482e-06,
"loss": 0.0044,
"step": 818
},
{
"epoch": 13.0,
"grad_norm": 0.18637099862098694,
"learning_rate": 7.898687408119065e-06,
"loss": 0.0034,
"step": 819
},
{
"epoch": 13.015873015873016,
"grad_norm": 0.23288948833942413,
"learning_rate": 7.813991839667995e-06,
"loss": 0.006,
"step": 820
},
{
"epoch": 13.031746031746032,
"grad_norm": 0.11603759229183197,
"learning_rate": 7.72971430053005e-06,
"loss": 0.0032,
"step": 821
},
{
"epoch": 13.047619047619047,
"grad_norm": 0.11359909176826477,
"learning_rate": 7.645855625827658e-06,
"loss": 0.0036,
"step": 822
},
{
"epoch": 13.063492063492063,
"grad_norm": 0.1750001609325409,
"learning_rate": 7.56241664653266e-06,
"loss": 0.0047,
"step": 823
},
{
"epoch": 13.079365079365079,
"grad_norm": 0.08407314866781235,
"learning_rate": 7.4793981894580034e-06,
"loss": 0.003,
"step": 824
},
{
"epoch": 13.095238095238095,
"grad_norm": 0.15450453758239746,
"learning_rate": 7.396801077249676e-06,
"loss": 0.004,
"step": 825
},
{
"epoch": 13.11111111111111,
"grad_norm": 0.1506980061531067,
"learning_rate": 7.3146261283784104e-06,
"loss": 0.004,
"step": 826
},
{
"epoch": 13.126984126984127,
"grad_norm": 0.0932818278670311,
"learning_rate": 7.2328741571316696e-06,
"loss": 0.0028,
"step": 827
},
{
"epoch": 13.142857142857142,
"grad_norm": 0.1964637041091919,
"learning_rate": 7.1515459736055505e-06,
"loss": 0.0047,
"step": 828
},
{
"epoch": 13.158730158730158,
"grad_norm": 0.11378604173660278,
"learning_rate": 7.070642383696763e-06,
"loss": 0.0036,
"step": 829
},
{
"epoch": 13.174603174603174,
"grad_norm": 0.07380079478025436,
"learning_rate": 6.990164189094589e-06,
"loss": 0.0024,
"step": 830
},
{
"epoch": 13.19047619047619,
"grad_norm": 0.11589548736810684,
"learning_rate": 6.910112187273066e-06,
"loss": 0.0036,
"step": 831
},
{
"epoch": 13.206349206349206,
"grad_norm": 0.2268502563238144,
"learning_rate": 6.830487171482935e-06,
"loss": 0.0065,
"step": 832
},
{
"epoch": 13.222222222222221,
"grad_norm": 0.1941031664609909,
"learning_rate": 6.751289930743882e-06,
"loss": 0.0043,
"step": 833
},
{
"epoch": 13.238095238095237,
"grad_norm": 0.14726468920707703,
"learning_rate": 6.6725212498366885e-06,
"loss": 0.0044,
"step": 834
},
{
"epoch": 13.253968253968253,
"grad_norm": 0.09331656992435455,
"learning_rate": 6.594181909295427e-06,
"loss": 0.003,
"step": 835
},
{
"epoch": 13.26984126984127,
"grad_norm": 0.1862584948539734,
"learning_rate": 6.516272685399788e-06,
"loss": 0.005,
"step": 836
},
{
"epoch": 13.285714285714286,
"grad_norm": 0.14406166970729828,
"learning_rate": 6.438794350167337e-06,
"loss": 0.0039,
"step": 837
},
{
"epoch": 13.301587301587302,
"grad_norm": 0.09058280289173126,
"learning_rate": 6.36174767134588e-06,
"loss": 0.0033,
"step": 838
},
{
"epoch": 13.317460317460318,
"grad_norm": 0.1405523121356964,
"learning_rate": 6.285133412405858e-06,
"loss": 0.0035,
"step": 839
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.14805886149406433,
"learning_rate": 6.20895233253278e-06,
"loss": 0.0046,
"step": 840
},
{
"epoch": 13.34920634920635,
"grad_norm": 0.2134266048669815,
"learning_rate": 6.133205186619695e-06,
"loss": 0.0064,
"step": 841
},
{
"epoch": 13.365079365079366,
"grad_norm": 0.09715571254491806,
"learning_rate": 6.057892725259717e-06,
"loss": 0.003,
"step": 842
},
{
"epoch": 13.380952380952381,
"grad_norm": 0.16215340793132782,
"learning_rate": 5.983015694738597e-06,
"loss": 0.0052,
"step": 843
},
{
"epoch": 13.396825396825397,
"grad_norm": 0.12831249833106995,
"learning_rate": 5.908574837027309e-06,
"loss": 0.0035,
"step": 844
},
{
"epoch": 13.412698412698413,
"grad_norm": 0.23066161572933197,
"learning_rate": 5.83457088977471e-06,
"loss": 0.006,
"step": 845
},
{
"epoch": 13.428571428571429,
"grad_norm": 0.16153094172477722,
"learning_rate": 5.761004586300234e-06,
"loss": 0.0032,
"step": 846
},
{
"epoch": 13.444444444444445,
"grad_norm": 0.1263124942779541,
"learning_rate": 5.687876655586583e-06,
"loss": 0.0033,
"step": 847
},
{
"epoch": 13.46031746031746,
"grad_norm": 0.22407254576683044,
"learning_rate": 5.615187822272583e-06,
"loss": 0.0069,
"step": 848
},
{
"epoch": 13.476190476190476,
"grad_norm": 0.2908068001270294,
"learning_rate": 5.542938806645931e-06,
"loss": 0.0087,
"step": 849
},
{
"epoch": 13.492063492063492,
"grad_norm": 0.18590912222862244,
"learning_rate": 5.4711303246361144e-06,
"loss": 0.0048,
"step": 850
},
{
"epoch": 13.507936507936508,
"grad_norm": 0.17562605440616608,
"learning_rate": 5.399763087807236e-06,
"loss": 0.0044,
"step": 851
},
{
"epoch": 13.523809523809524,
"grad_norm": 0.07766014337539673,
"learning_rate": 5.328837803351083e-06,
"loss": 0.0025,
"step": 852
},
{
"epoch": 13.53968253968254,
"grad_norm": 0.16575992107391357,
"learning_rate": 5.258355174079993e-06,
"loss": 0.0045,
"step": 853
},
{
"epoch": 13.555555555555555,
"grad_norm": 0.1963498741388321,
"learning_rate": 5.188315898419971e-06,
"loss": 0.0061,
"step": 854
},
{
"epoch": 13.571428571428571,
"grad_norm": 0.2301764041185379,
"learning_rate": 5.118720670403748e-06,
"loss": 0.0051,
"step": 855
},
{
"epoch": 13.587301587301587,
"grad_norm": 0.16544826328754425,
"learning_rate": 5.04957017966391e-06,
"loss": 0.0033,
"step": 856
},
{
"epoch": 13.603174603174603,
"grad_norm": 0.17942006886005402,
"learning_rate": 4.980865111426003e-06,
"loss": 0.004,
"step": 857
},
{
"epoch": 13.619047619047619,
"grad_norm": 0.14243295788764954,
"learning_rate": 4.912606146501886e-06,
"loss": 0.0035,
"step": 858
},
{
"epoch": 13.634920634920634,
"grad_norm": 0.14227573573589325,
"learning_rate": 4.844793961282812e-06,
"loss": 0.0034,
"step": 859
},
{
"epoch": 13.65079365079365,
"grad_norm": 0.14716386795043945,
"learning_rate": 4.777429227732844e-06,
"loss": 0.0033,
"step": 860
},
{
"epoch": 13.666666666666666,
"grad_norm": 0.2278168946504593,
"learning_rate": 4.710512613382151e-06,
"loss": 0.006,
"step": 861
},
{
"epoch": 13.682539682539682,
"grad_norm": 0.2408359944820404,
"learning_rate": 4.644044781320422e-06,
"loss": 0.0072,
"step": 862
},
{
"epoch": 13.698412698412698,
"grad_norm": 0.23890067636966705,
"learning_rate": 4.578026390190232e-06,
"loss": 0.0051,
"step": 863
},
{
"epoch": 13.714285714285714,
"grad_norm": 0.2770053446292877,
"learning_rate": 4.5124580941806165e-06,
"loss": 0.0078,
"step": 864
},
{
"epoch": 13.73015873015873,
"grad_norm": 0.16485559940338135,
"learning_rate": 4.447340543020473e-06,
"loss": 0.0033,
"step": 865
},
{
"epoch": 13.746031746031747,
"grad_norm": 0.1674467772245407,
"learning_rate": 4.382674381972224e-06,
"loss": 0.0041,
"step": 866
},
{
"epoch": 13.761904761904763,
"grad_norm": 0.09436249732971191,
"learning_rate": 4.318460251825357e-06,
"loss": 0.0031,
"step": 867
},
{
"epoch": 13.777777777777779,
"grad_norm": 0.0673573687672615,
"learning_rate": 4.254698788890127e-06,
"loss": 0.0025,
"step": 868
},
{
"epoch": 13.793650793650794,
"grad_norm": 0.18255870044231415,
"learning_rate": 4.191390624991159e-06,
"loss": 0.0047,
"step": 869
},
{
"epoch": 13.80952380952381,
"grad_norm": 0.13948306441307068,
"learning_rate": 4.12853638746134e-06,
"loss": 0.0032,
"step": 870
},
{
"epoch": 13.825396825396826,
"grad_norm": 0.24183286726474762,
"learning_rate": 4.0661366991354365e-06,
"loss": 0.0063,
"step": 871
},
{
"epoch": 13.841269841269842,
"grad_norm": 0.10236512869596481,
"learning_rate": 4.004192178344029e-06,
"loss": 0.003,
"step": 872
},
{
"epoch": 13.857142857142858,
"grad_norm": 0.10468772053718567,
"learning_rate": 3.942703438907358e-06,
"loss": 0.003,
"step": 873
},
{
"epoch": 13.873015873015873,
"grad_norm": 0.1839323341846466,
"learning_rate": 3.881671090129247e-06,
"loss": 0.0047,
"step": 874
},
{
"epoch": 13.88888888888889,
"grad_norm": 0.245498925447464,
"learning_rate": 3.821095736791008e-06,
"loss": 0.0063,
"step": 875
},
{
"epoch": 13.904761904761905,
"grad_norm": 0.08903949707746506,
"learning_rate": 3.7609779791455744e-06,
"loss": 0.0027,
"step": 876
},
{
"epoch": 13.920634920634921,
"grad_norm": 0.10096840560436249,
"learning_rate": 3.7013184129113976e-06,
"loss": 0.0025,
"step": 877
},
{
"epoch": 13.936507936507937,
"grad_norm": 0.16196174919605255,
"learning_rate": 3.6421176292666783e-06,
"loss": 0.0049,
"step": 878
},
{
"epoch": 13.952380952380953,
"grad_norm": 0.2010921686887741,
"learning_rate": 3.58337621484342e-06,
"loss": 0.0047,
"step": 879
},
{
"epoch": 13.968253968253968,
"grad_norm": 0.20379731059074402,
"learning_rate": 3.525094751721669e-06,
"loss": 0.0049,
"step": 880
},
{
"epoch": 13.984126984126984,
"grad_norm": 0.13787353038787842,
"learning_rate": 3.4672738174236884e-06,
"loss": 0.0038,
"step": 881
},
{
"epoch": 14.0,
"grad_norm": 0.21144546568393707,
"learning_rate": 3.4099139849083307e-06,
"loss": 0.0058,
"step": 882
},
{
"epoch": 14.015873015873016,
"grad_norm": 0.16598111391067505,
"learning_rate": 3.353015822565253e-06,
"loss": 0.0046,
"step": 883
},
{
"epoch": 14.031746031746032,
"grad_norm": 0.11316211521625519,
"learning_rate": 3.296579894209345e-06,
"loss": 0.0033,
"step": 884
},
{
"epoch": 14.047619047619047,
"grad_norm": 0.1642863005399704,
"learning_rate": 3.2406067590751433e-06,
"loss": 0.0042,
"step": 885
},
{
"epoch": 14.063492063492063,
"grad_norm": 0.06231338158249855,
"learning_rate": 3.1850969718112745e-06,
"loss": 0.0023,
"step": 886
},
{
"epoch": 14.079365079365079,
"grad_norm": 0.07541368156671524,
"learning_rate": 3.1300510824749273e-06,
"loss": 0.0027,
"step": 887
},
{
"epoch": 14.095238095238095,
"grad_norm": 0.10008185356855392,
"learning_rate": 3.0754696365265068e-06,
"loss": 0.0028,
"step": 888
},
{
"epoch": 14.11111111111111,
"grad_norm": 0.10464094579219818,
"learning_rate": 3.0213531748240764e-06,
"loss": 0.0031,
"step": 889
},
{
"epoch": 14.126984126984127,
"grad_norm": 0.09949090331792831,
"learning_rate": 2.9677022336181413e-06,
"loss": 0.003,
"step": 890
},
{
"epoch": 14.142857142857142,
"grad_norm": 0.08555309474468231,
"learning_rate": 2.914517344546258e-06,
"loss": 0.0028,
"step": 891
},
{
"epoch": 14.158730158730158,
"grad_norm": 0.10682200640439987,
"learning_rate": 2.8617990346277657e-06,
"loss": 0.0032,
"step": 892
},
{
"epoch": 14.174603174603174,
"grad_norm": 0.09806779026985168,
"learning_rate": 2.8095478262585907e-06,
"loss": 0.0035,
"step": 893
},
{
"epoch": 14.19047619047619,
"grad_norm": 0.13682028651237488,
"learning_rate": 2.7577642372060673e-06,
"loss": 0.003,
"step": 894
},
{
"epoch": 14.206349206349206,
"grad_norm": 0.1651875525712967,
"learning_rate": 2.7064487806037985e-06,
"loss": 0.0043,
"step": 895
},
{
"epoch": 14.222222222222221,
"grad_norm": 0.14128713309764862,
"learning_rate": 2.6556019649465525e-06,
"loss": 0.0032,
"step": 896
},
{
"epoch": 14.238095238095237,
"grad_norm": 0.18472391366958618,
"learning_rate": 2.6052242940852787e-06,
"loss": 0.0055,
"step": 897
},
{
"epoch": 14.253968253968253,
"grad_norm": 0.12015866488218307,
"learning_rate": 2.5553162672220465e-06,
"loss": 0.0027,
"step": 898
},
{
"epoch": 14.26984126984127,
"grad_norm": 0.20532798767089844,
"learning_rate": 2.5058783789051467e-06,
"loss": 0.006,
"step": 899
},
{
"epoch": 14.285714285714286,
"grad_norm": 0.11659039556980133,
"learning_rate": 2.45691111902418e-06,
"loss": 0.0033,
"step": 900
},
{
"epoch": 14.301587301587302,
"grad_norm": 0.22555606067180634,
"learning_rate": 2.4084149728051952e-06,
"loss": 0.0057,
"step": 901
},
{
"epoch": 14.317460317460318,
"grad_norm": 0.10364361107349396,
"learning_rate": 2.360390420805869e-06,
"loss": 0.003,
"step": 902
},
{
"epoch": 14.333333333333334,
"grad_norm": 0.15920886397361755,
"learning_rate": 2.3128379389108e-06,
"loss": 0.0039,
"step": 903
},
{
"epoch": 14.34920634920635,
"grad_norm": 0.10381603240966797,
"learning_rate": 2.2657579983267064e-06,
"loss": 0.0028,
"step": 904
},
{
"epoch": 14.365079365079366,
"grad_norm": 0.17512689530849457,
"learning_rate": 2.219151065577829e-06,
"loss": 0.0046,
"step": 905
},
{
"epoch": 14.380952380952381,
"grad_norm": 0.22503690421581268,
"learning_rate": 2.1730176025012816e-06,
"loss": 0.0063,
"step": 906
},
{
"epoch": 14.396825396825397,
"grad_norm": 0.17018793523311615,
"learning_rate": 2.1273580662424796e-06,
"loss": 0.0048,
"step": 907
},
{
"epoch": 14.412698412698413,
"grad_norm": 0.22725212574005127,
"learning_rate": 2.082172909250568e-06,
"loss": 0.0069,
"step": 908
},
{
"epoch": 14.428571428571429,
"grad_norm": 0.12136708199977875,
"learning_rate": 2.0374625792740464e-06,
"loss": 0.003,
"step": 909
},
{
"epoch": 14.444444444444445,
"grad_norm": 0.06128573417663574,
"learning_rate": 1.993227519356189e-06,
"loss": 0.0022,
"step": 910
},
{
"epoch": 14.46031746031746,
"grad_norm": 0.0800539031624794,
"learning_rate": 1.9494681678307703e-06,
"loss": 0.0025,
"step": 911
},
{
"epoch": 14.476190476190476,
"grad_norm": 0.2250363528728485,
"learning_rate": 1.906184958317664e-06,
"loss": 0.0056,
"step": 912
},
{
"epoch": 14.492063492063492,
"grad_norm": 0.06574003398418427,
"learning_rate": 1.8633783197185783e-06,
"loss": 0.0025,
"step": 913
},
{
"epoch": 14.507936507936508,
"grad_norm": 0.1733701229095459,
"learning_rate": 1.8210486762127499e-06,
"loss": 0.0052,
"step": 914
},
{
"epoch": 14.523809523809524,
"grad_norm": 0.14052851498126984,
"learning_rate": 1.7791964472528232e-06,
"loss": 0.0035,
"step": 915
},
{
"epoch": 14.53968253968254,
"grad_norm": 0.20883136987686157,
"learning_rate": 1.737822047560611e-06,
"loss": 0.006,
"step": 916
},
{
"epoch": 14.555555555555555,
"grad_norm": 0.18126244843006134,
"learning_rate": 1.696925887123052e-06,
"loss": 0.0043,
"step": 917
},
{
"epoch": 14.571428571428571,
"grad_norm": 0.19093488156795502,
"learning_rate": 1.656508371188109e-06,
"loss": 0.0045,
"step": 918
},
{
"epoch": 14.587301587301587,
"grad_norm": 0.16476662456989288,
"learning_rate": 1.6165699002607671e-06,
"loss": 0.0037,
"step": 919
},
{
"epoch": 14.603174603174603,
"grad_norm": 0.12128468602895737,
"learning_rate": 1.5771108700990412e-06,
"loss": 0.0034,
"step": 920
},
{
"epoch": 14.619047619047619,
"grad_norm": 0.07109358161687851,
"learning_rate": 1.538131671710108e-06,
"loss": 0.0027,
"step": 921
},
{
"epoch": 14.634920634920634,
"grad_norm": 0.12868039309978485,
"learning_rate": 1.4996326913463754e-06,
"loss": 0.0042,
"step": 922
},
{
"epoch": 14.65079365079365,
"grad_norm": 0.10166194289922714,
"learning_rate": 1.461614310501691e-06,
"loss": 0.0027,
"step": 923
},
{
"epoch": 14.666666666666666,
"grad_norm": 0.1676546037197113,
"learning_rate": 1.4240769059075342e-06,
"loss": 0.0045,
"step": 924
},
{
"epoch": 14.682539682539682,
"grad_norm": 0.16010187566280365,
"learning_rate": 1.387020849529319e-06,
"loss": 0.0048,
"step": 925
},
{
"epoch": 14.698412698412698,
"grad_norm": 0.22581593692302704,
"learning_rate": 1.3504465085626638e-06,
"loss": 0.0076,
"step": 926
},
{
"epoch": 14.714285714285714,
"grad_norm": 0.1580013781785965,
"learning_rate": 1.3143542454297885e-06,
"loss": 0.004,
"step": 927
},
{
"epoch": 14.73015873015873,
"grad_norm": 0.2010050266981125,
"learning_rate": 1.2787444177759068e-06,
"loss": 0.0058,
"step": 928
},
{
"epoch": 14.746031746031747,
"grad_norm": 0.2182077020406723,
"learning_rate": 1.243617378465689e-06,
"loss": 0.0053,
"step": 929
},
{
"epoch": 14.761904761904763,
"grad_norm": 0.23424509167671204,
"learning_rate": 1.208973475579761e-06,
"loss": 0.0055,
"step": 930
},
{
"epoch": 14.777777777777779,
"grad_norm": 0.1593056619167328,
"learning_rate": 1.1748130524112666e-06,
"loss": 0.0038,
"step": 931
},
{
"epoch": 14.793650793650794,
"grad_norm": 0.1183331161737442,
"learning_rate": 1.1411364474624264e-06,
"loss": 0.0035,
"step": 932
},
{
"epoch": 14.80952380952381,
"grad_norm": 0.1267019659280777,
"learning_rate": 1.1079439944412406e-06,
"loss": 0.0037,
"step": 933
},
{
"epoch": 14.825396825396826,
"grad_norm": 0.1250416487455368,
"learning_rate": 1.075236022258147e-06,
"loss": 0.0033,
"step": 934
},
{
"epoch": 14.841269841269842,
"grad_norm": 0.19605623185634613,
"learning_rate": 1.0430128550227625e-06,
"loss": 0.0034,
"step": 935
},
{
"epoch": 14.857142857142858,
"grad_norm": 0.18377277255058289,
"learning_rate": 1.0112748120406856e-06,
"loss": 0.007,
"step": 936
},
{
"epoch": 14.873015873015873,
"grad_norm": 0.1912008672952652,
"learning_rate": 9.800222078103271e-07,
"loss": 0.0042,
"step": 937
},
{
"epoch": 14.88888888888889,
"grad_norm": 0.1927856057882309,
"learning_rate": 9.492553520197733e-07,
"loss": 0.0055,
"step": 938
},
{
"epoch": 14.904761904761905,
"grad_norm": 0.103274405002594,
"learning_rate": 9.189745495437608e-07,
"loss": 0.0034,
"step": 939
},
{
"epoch": 14.920634920634921,
"grad_norm": 0.1846938282251358,
"learning_rate": 8.891801004406119e-07,
"loss": 0.0047,
"step": 940
},
{
"epoch": 14.936507936507937,
"grad_norm": 0.12870021164417267,
"learning_rate": 8.59872299949288e-07,
"loss": 0.0028,
"step": 941
},
{
"epoch": 14.952380952380953,
"grad_norm": 0.09814100712537766,
"learning_rate": 8.31051438486441e-07,
"loss": 0.0027,
"step": 942
},
{
"epoch": 14.968253968253968,
"grad_norm": 0.12259647250175476,
"learning_rate": 8.027178016435765e-07,
"loss": 0.003,
"step": 943
},
{
"epoch": 14.984126984126984,
"grad_norm": 0.2572350800037384,
"learning_rate": 7.748716701841685e-07,
"loss": 0.006,
"step": 944
},
{
"epoch": 15.0,
"grad_norm": 0.26040682196617126,
"learning_rate": 7.475133200409212e-07,
"loss": 0.0048,
"step": 945
},
{
"epoch": 15.015873015873016,
"grad_norm": 0.1166323646903038,
"learning_rate": 7.206430223130278e-07,
"loss": 0.0028,
"step": 946
},
{
"epoch": 15.031746031746032,
"grad_norm": 0.11518598347902298,
"learning_rate": 6.9426104326345e-07,
"loss": 0.0031,
"step": 947
},
{
"epoch": 15.047619047619047,
"grad_norm": 0.18673783540725708,
"learning_rate": 6.683676443163311e-07,
"loss": 0.0048,
"step": 948
},
{
"epoch": 15.063492063492063,
"grad_norm": 0.1127839982509613,
"learning_rate": 6.429630820543598e-07,
"loss": 0.0031,
"step": 949
},
{
"epoch": 15.079365079365079,
"grad_norm": 0.18263711035251617,
"learning_rate": 6.180476082162656e-07,
"loss": 0.004,
"step": 950
},
{
"epoch": 15.095238095238095,
"grad_norm": 0.1486678421497345,
"learning_rate": 5.936214696942887e-07,
"loss": 0.0037,
"step": 951
},
{
"epoch": 15.11111111111111,
"grad_norm": 0.2178022712469101,
"learning_rate": 5.696849085317646e-07,
"loss": 0.0057,
"step": 952
},
{
"epoch": 15.126984126984127,
"grad_norm": 0.12073294818401337,
"learning_rate": 5.462381619207091e-07,
"loss": 0.0031,
"step": 953
},
{
"epoch": 15.142857142857142,
"grad_norm": 0.12311496585607529,
"learning_rate": 5.232814621994598e-07,
"loss": 0.0036,
"step": 954
},
{
"epoch": 15.158730158730158,
"grad_norm": 0.16713330149650574,
"learning_rate": 5.008150368503994e-07,
"loss": 0.0038,
"step": 955
},
{
"epoch": 15.174603174603174,
"grad_norm": 0.1170608177781105,
"learning_rate": 4.788391084976862e-07,
"loss": 0.0033,
"step": 956
},
{
"epoch": 15.19047619047619,
"grad_norm": 0.06233490630984306,
"learning_rate": 4.573538949050327e-07,
"loss": 0.0023,
"step": 957
},
{
"epoch": 15.206349206349206,
"grad_norm": 0.13149504363536835,
"learning_rate": 4.363596089735911e-07,
"loss": 0.0031,
"step": 958
},
{
"epoch": 15.222222222222221,
"grad_norm": 0.16984321177005768,
"learning_rate": 4.1585645873978284e-07,
"loss": 0.0046,
"step": 959
},
{
"epoch": 15.238095238095237,
"grad_norm": 0.14544299244880676,
"learning_rate": 3.958446473733002e-07,
"loss": 0.0033,
"step": 960
},
{
"epoch": 15.253968253968253,
"grad_norm": 0.17623476684093475,
"learning_rate": 3.7632437317505207e-07,
"loss": 0.0046,
"step": 961
},
{
"epoch": 15.26984126984127,
"grad_norm": 0.12246549874544144,
"learning_rate": 3.572958295752049e-07,
"loss": 0.0034,
"step": 962
},
{
"epoch": 15.285714285714286,
"grad_norm": 0.14989396929740906,
"learning_rate": 3.387592051312782e-07,
"loss": 0.0036,
"step": 963
},
{
"epoch": 15.301587301587302,
"grad_norm": 0.19900646805763245,
"learning_rate": 3.207146835262742e-07,
"loss": 0.0057,
"step": 964
},
{
"epoch": 15.317460317460318,
"grad_norm": 0.1741442084312439,
"learning_rate": 3.0316244356683454e-07,
"loss": 0.0047,
"step": 965
},
{
"epoch": 15.333333333333334,
"grad_norm": 0.15245862305164337,
"learning_rate": 2.8610265918151414e-07,
"loss": 0.0046,
"step": 966
},
{
"epoch": 15.34920634920635,
"grad_norm": 0.19708728790283203,
"learning_rate": 2.695354994190047e-07,
"loss": 0.0058,
"step": 967
},
{
"epoch": 15.365079365079366,
"grad_norm": 0.13684900104999542,
"learning_rate": 2.534611284465083e-07,
"loss": 0.0037,
"step": 968
},
{
"epoch": 15.380952380952381,
"grad_norm": 0.18838024139404297,
"learning_rate": 2.3787970554806084e-07,
"loss": 0.0043,
"step": 969
},
{
"epoch": 15.396825396825397,
"grad_norm": 0.18869999051094055,
"learning_rate": 2.2279138512300567e-07,
"loss": 0.0056,
"step": 970
},
{
"epoch": 15.412698412698413,
"grad_norm": 0.14952099323272705,
"learning_rate": 2.0819631668442253e-07,
"loss": 0.0038,
"step": 971
},
{
"epoch": 15.428571428571429,
"grad_norm": 0.20797456800937653,
"learning_rate": 1.940946448576675e-07,
"loss": 0.0056,
"step": 972
},
{
"epoch": 15.444444444444445,
"grad_norm": 0.17077018320560455,
"learning_rate": 1.8048650937893542e-07,
"loss": 0.0049,
"step": 973
},
{
"epoch": 15.46031746031746,
"grad_norm": 0.16229721903800964,
"learning_rate": 1.6737204509387206e-07,
"loss": 0.0038,
"step": 974
},
{
"epoch": 15.476190476190476,
"grad_norm": 0.06878882646560669,
"learning_rate": 1.5475138195623629e-07,
"loss": 0.0024,
"step": 975
},
{
"epoch": 15.492063492063492,
"grad_norm": 0.1002248004078865,
"learning_rate": 1.4262464502663443e-07,
"loss": 0.0028,
"step": 976
},
{
"epoch": 15.507936507936508,
"grad_norm": 0.1598724126815796,
"learning_rate": 1.309919544712268e-07,
"loss": 0.0051,
"step": 977
},
{
"epoch": 15.523809523809524,
"grad_norm": 0.20375491678714752,
"learning_rate": 1.1985342556060652e-07,
"loss": 0.0048,
"step": 978
},
{
"epoch": 15.53968253968254,
"grad_norm": 0.15220007300376892,
"learning_rate": 1.0920916866861142e-07,
"loss": 0.0037,
"step": 979
},
{
"epoch": 15.555555555555555,
"grad_norm": 0.13166747987270355,
"learning_rate": 9.905928927123609e-08,
"loss": 0.0041,
"step": 980
},
{
"epoch": 15.571428571428571,
"grad_norm": 0.16521938145160675,
"learning_rate": 8.940388794559939e-08,
"loss": 0.0043,
"step": 981
},
{
"epoch": 15.587301587301587,
"grad_norm": 0.22669538855552673,
"learning_rate": 8.02430603689397e-08,
"loss": 0.006,
"step": 982
},
{
"epoch": 15.603174603174603,
"grad_norm": 0.09708595275878906,
"learning_rate": 7.157689731767669e-08,
"loss": 0.0025,
"step": 983
},
{
"epoch": 15.619047619047619,
"grad_norm": 0.2131219506263733,
"learning_rate": 6.340548466648443e-08,
"loss": 0.0051,
"step": 984
},
{
"epoch": 15.634920634920634,
"grad_norm": 0.1999976485967636,
"learning_rate": 5.572890338748082e-08,
"loss": 0.0046,
"step": 985
},
{
"epoch": 15.65079365079365,
"grad_norm": 0.10222487151622772,
"learning_rate": 4.8547229549383844e-08,
"loss": 0.0037,
"step": 986
},
{
"epoch": 15.666666666666666,
"grad_norm": 0.25009259581565857,
"learning_rate": 4.186053431680104e-08,
"loss": 0.0068,
"step": 987
},
{
"epoch": 15.682539682539682,
"grad_norm": 0.06356369704008102,
"learning_rate": 3.566888394948009e-08,
"loss": 0.0022,
"step": 988
},
{
"epoch": 15.698412698412698,
"grad_norm": 0.13318653404712677,
"learning_rate": 2.997233980168157e-08,
"loss": 0.0038,
"step": 989
},
{
"epoch": 15.714285714285714,
"grad_norm": 0.05918239429593086,
"learning_rate": 2.4770958321568283e-08,
"loss": 0.0022,
"step": 990
},
{
"epoch": 15.73015873015873,
"grad_norm": 0.1082151010632515,
"learning_rate": 2.0064791050633526e-08,
"loss": 0.0031,
"step": 991
},
{
"epoch": 15.746031746031747,
"grad_norm": 0.22153517603874207,
"learning_rate": 1.5853884623195925e-08,
"loss": 0.0049,
"step": 992
},
{
"epoch": 15.761904761904763,
"grad_norm": 0.09333167225122452,
"learning_rate": 1.2138280765944254e-08,
"loss": 0.0028,
"step": 993
},
{
"epoch": 15.777777777777779,
"grad_norm": 0.14806319773197174,
"learning_rate": 8.918016297515541e-09,
"loss": 0.0031,
"step": 994
},
{
"epoch": 15.793650793650794,
"grad_norm": 0.15807633101940155,
"learning_rate": 6.193123128134248e-09,
"loss": 0.0041,
"step": 995
},
{
"epoch": 15.80952380952381,
"grad_norm": 0.1491064578294754,
"learning_rate": 3.963628259290308e-09,
"loss": 0.0039,
"step": 996
},
{
"epoch": 15.825396825396826,
"grad_norm": 0.1288636475801468,
"learning_rate": 2.229553783478222e-09,
"loss": 0.0035,
"step": 997
},
{
"epoch": 15.841269841269842,
"grad_norm": 0.17619061470031738,
"learning_rate": 9.90916883986115e-10,
"loss": 0.0056,
"step": 998
},
{
"epoch": 15.857142857142858,
"grad_norm": 0.1407734900712967,
"learning_rate": 2.477298346958978e-10,
"loss": 0.0038,
"step": 999
},
{
"epoch": 15.873015873015873,
"grad_norm": 0.172795370221138,
"learning_rate": 0.0,
"loss": 0.0048,
"step": 1000
},
{
"epoch": 15.873015873015873,
"step": 1000,
"total_flos": 1.610662192030679e+17,
"train_loss": 0.14257763476669788,
"train_runtime": 58489.782,
"train_samples_per_second": 0.274,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 16,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.610662192030679e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}