baseline-gemma-2-2b-it-sft / trainer_state.json
Commit: Model save · 2243e3c (verified) · by ZhangShenao
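The file below is the raw `trainer_state.json` written by the Hugging Face `Trainer` during this SFT run; its `log_history` array holds one record per logging step with `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`. As a minimal sketch of how one might inspect it — assuming the file has been downloaded locally as `trainer_state.json` and that `matplotlib` is installed (both assumptions, not part of this repo) — the loss and learning-rate curves can be plotted directly from those fields:

```python
import json

import matplotlib.pyplot as plt

# Assumption: the JSON shown below has been saved locally as "trainer_state.json".
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training-log records; evaluation records (if any) would not carry "loss".
records = [r for r in state["log_history"] if "loss" in r]

steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]
lrs = [r["learning_rate"] for r in records]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.suptitle(f"SFT run: {state['epoch']:.2f} epochs, {state['global_step']} steps")
plt.tight_layout()
plt.show()
```

From the logged values, the learning rate appears to ramp up linearly to 2e-05 over roughly the first 110 steps and then decay smoothly for the remainder of the ~1098-step, ~3-epoch run, while the training loss drops from about 1.07 at step 1 to around 0.17–0.21 by the end of the portion shown here.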
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9959072305593453,
"eval_steps": 500,
"global_step": 1098,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002728512960436562,
"grad_norm": 9.15278434753418,
"learning_rate": 1.8181818181818183e-07,
"loss": 1.071,
"step": 1
},
{
"epoch": 0.005457025920873124,
"grad_norm": 9.647517204284668,
"learning_rate": 3.6363636363636366e-07,
"loss": 1.0791,
"step": 2
},
{
"epoch": 0.008185538881309686,
"grad_norm": 9.722785949707031,
"learning_rate": 5.454545454545455e-07,
"loss": 1.0874,
"step": 3
},
{
"epoch": 0.010914051841746248,
"grad_norm": 9.583983421325684,
"learning_rate": 7.272727272727273e-07,
"loss": 1.0872,
"step": 4
},
{
"epoch": 0.013642564802182811,
"grad_norm": 9.145880699157715,
"learning_rate": 9.090909090909091e-07,
"loss": 1.0747,
"step": 5
},
{
"epoch": 0.01637107776261937,
"grad_norm": 9.105477333068848,
"learning_rate": 1.090909090909091e-06,
"loss": 1.0738,
"step": 6
},
{
"epoch": 0.019099590723055934,
"grad_norm": 8.226037979125977,
"learning_rate": 1.2727272727272728e-06,
"loss": 1.0432,
"step": 7
},
{
"epoch": 0.021828103683492497,
"grad_norm": 7.477120876312256,
"learning_rate": 1.4545454545454546e-06,
"loss": 1.0028,
"step": 8
},
{
"epoch": 0.02455661664392906,
"grad_norm": 6.091797351837158,
"learning_rate": 1.6363636363636365e-06,
"loss": 0.9729,
"step": 9
},
{
"epoch": 0.027285129604365622,
"grad_norm": 5.667421817779541,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.9413,
"step": 10
},
{
"epoch": 0.030013642564802184,
"grad_norm": 4.3738813400268555,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.8404,
"step": 11
},
{
"epoch": 0.03274215552523874,
"grad_norm": 4.342959880828857,
"learning_rate": 2.181818181818182e-06,
"loss": 0.8228,
"step": 12
},
{
"epoch": 0.03547066848567531,
"grad_norm": 3.8612661361694336,
"learning_rate": 2.363636363636364e-06,
"loss": 0.8026,
"step": 13
},
{
"epoch": 0.03819918144611187,
"grad_norm": 3.513092517852783,
"learning_rate": 2.5454545454545456e-06,
"loss": 0.7898,
"step": 14
},
{
"epoch": 0.040927694406548434,
"grad_norm": 4.365728378295898,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.7217,
"step": 15
},
{
"epoch": 0.04365620736698499,
"grad_norm": 3.0272576808929443,
"learning_rate": 2.9090909090909093e-06,
"loss": 0.7098,
"step": 16
},
{
"epoch": 0.04638472032742155,
"grad_norm": 1.917324423789978,
"learning_rate": 3.090909090909091e-06,
"loss": 0.6913,
"step": 17
},
{
"epoch": 0.04911323328785812,
"grad_norm": 1.4591542482376099,
"learning_rate": 3.272727272727273e-06,
"loss": 0.6585,
"step": 18
},
{
"epoch": 0.05184174624829468,
"grad_norm": 1.2868497371673584,
"learning_rate": 3.454545454545455e-06,
"loss": 0.6543,
"step": 19
},
{
"epoch": 0.054570259208731244,
"grad_norm": 1.2492839097976685,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.6487,
"step": 20
},
{
"epoch": 0.0572987721691678,
"grad_norm": 1.1911948919296265,
"learning_rate": 3.818181818181819e-06,
"loss": 0.6244,
"step": 21
},
{
"epoch": 0.06002728512960437,
"grad_norm": 0.9519822597503662,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6212,
"step": 22
},
{
"epoch": 0.06275579809004093,
"grad_norm": 0.92195725440979,
"learning_rate": 4.181818181818182e-06,
"loss": 0.6121,
"step": 23
},
{
"epoch": 0.06548431105047749,
"grad_norm": 0.9371785521507263,
"learning_rate": 4.363636363636364e-06,
"loss": 0.595,
"step": 24
},
{
"epoch": 0.06821282401091405,
"grad_norm": 0.8275936245918274,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.5948,
"step": 25
},
{
"epoch": 0.07094133697135062,
"grad_norm": 0.7959325313568115,
"learning_rate": 4.727272727272728e-06,
"loss": 0.5912,
"step": 26
},
{
"epoch": 0.07366984993178717,
"grad_norm": 0.7594197392463684,
"learning_rate": 4.90909090909091e-06,
"loss": 0.5825,
"step": 27
},
{
"epoch": 0.07639836289222374,
"grad_norm": 0.7820079326629639,
"learning_rate": 5.090909090909091e-06,
"loss": 0.5589,
"step": 28
},
{
"epoch": 0.0791268758526603,
"grad_norm": 0.7125181555747986,
"learning_rate": 5.272727272727273e-06,
"loss": 0.5774,
"step": 29
},
{
"epoch": 0.08185538881309687,
"grad_norm": 0.6750136613845825,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.5613,
"step": 30
},
{
"epoch": 0.08458390177353342,
"grad_norm": 0.6903117895126343,
"learning_rate": 5.636363636363636e-06,
"loss": 0.5659,
"step": 31
},
{
"epoch": 0.08731241473396999,
"grad_norm": 0.7005020380020142,
"learning_rate": 5.8181818181818185e-06,
"loss": 0.565,
"step": 32
},
{
"epoch": 0.09004092769440655,
"grad_norm": 0.6101182103157043,
"learning_rate": 6e-06,
"loss": 0.5515,
"step": 33
},
{
"epoch": 0.0927694406548431,
"grad_norm": 0.6128501892089844,
"learning_rate": 6.181818181818182e-06,
"loss": 0.5444,
"step": 34
},
{
"epoch": 0.09549795361527967,
"grad_norm": 0.6540465950965881,
"learning_rate": 6.363636363636364e-06,
"loss": 0.5529,
"step": 35
},
{
"epoch": 0.09822646657571624,
"grad_norm": 0.5642831325531006,
"learning_rate": 6.545454545454546e-06,
"loss": 0.5462,
"step": 36
},
{
"epoch": 0.1009549795361528,
"grad_norm": 0.5906216502189636,
"learning_rate": 6.7272727272727275e-06,
"loss": 0.5429,
"step": 37
},
{
"epoch": 0.10368349249658936,
"grad_norm": 0.5924307107925415,
"learning_rate": 6.90909090909091e-06,
"loss": 0.5317,
"step": 38
},
{
"epoch": 0.10641200545702592,
"grad_norm": 0.5813631415367126,
"learning_rate": 7.0909090909090916e-06,
"loss": 0.5353,
"step": 39
},
{
"epoch": 0.10914051841746249,
"grad_norm": 0.5987147092819214,
"learning_rate": 7.272727272727273e-06,
"loss": 0.5262,
"step": 40
},
{
"epoch": 0.11186903137789904,
"grad_norm": 0.5837368369102478,
"learning_rate": 7.454545454545456e-06,
"loss": 0.52,
"step": 41
},
{
"epoch": 0.1145975443383356,
"grad_norm": 0.5774114727973938,
"learning_rate": 7.636363636363638e-06,
"loss": 0.5186,
"step": 42
},
{
"epoch": 0.11732605729877217,
"grad_norm": 0.6058359146118164,
"learning_rate": 7.81818181818182e-06,
"loss": 0.5162,
"step": 43
},
{
"epoch": 0.12005457025920874,
"grad_norm": 0.5552087426185608,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5134,
"step": 44
},
{
"epoch": 0.12278308321964529,
"grad_norm": 0.5530264973640442,
"learning_rate": 8.181818181818183e-06,
"loss": 0.5115,
"step": 45
},
{
"epoch": 0.12551159618008187,
"grad_norm": 0.5766640901565552,
"learning_rate": 8.363636363636365e-06,
"loss": 0.5107,
"step": 46
},
{
"epoch": 0.12824010914051842,
"grad_norm": 0.5763387680053711,
"learning_rate": 8.545454545454546e-06,
"loss": 0.5169,
"step": 47
},
{
"epoch": 0.13096862210095497,
"grad_norm": 0.5950232744216919,
"learning_rate": 8.727272727272728e-06,
"loss": 0.496,
"step": 48
},
{
"epoch": 0.13369713506139155,
"grad_norm": 0.550932765007019,
"learning_rate": 8.90909090909091e-06,
"loss": 0.4977,
"step": 49
},
{
"epoch": 0.1364256480218281,
"grad_norm": 0.5710775256156921,
"learning_rate": 9.090909090909091e-06,
"loss": 0.4979,
"step": 50
},
{
"epoch": 0.13915416098226466,
"grad_norm": 0.5536239743232727,
"learning_rate": 9.272727272727273e-06,
"loss": 0.4901,
"step": 51
},
{
"epoch": 0.14188267394270124,
"grad_norm": 0.5787481665611267,
"learning_rate": 9.454545454545456e-06,
"loss": 0.4985,
"step": 52
},
{
"epoch": 0.1446111869031378,
"grad_norm": 0.5732221007347107,
"learning_rate": 9.636363636363638e-06,
"loss": 0.4995,
"step": 53
},
{
"epoch": 0.14733969986357434,
"grad_norm": 0.5549193024635315,
"learning_rate": 9.81818181818182e-06,
"loss": 0.4786,
"step": 54
},
{
"epoch": 0.15006821282401092,
"grad_norm": 0.5745016932487488,
"learning_rate": 1e-05,
"loss": 0.4814,
"step": 55
},
{
"epoch": 0.15279672578444747,
"grad_norm": 0.5580504536628723,
"learning_rate": 1.0181818181818182e-05,
"loss": 0.4786,
"step": 56
},
{
"epoch": 0.15552523874488403,
"grad_norm": 0.5935932397842407,
"learning_rate": 1.0363636363636364e-05,
"loss": 0.492,
"step": 57
},
{
"epoch": 0.1582537517053206,
"grad_norm": 0.5717213153839111,
"learning_rate": 1.0545454545454546e-05,
"loss": 0.47,
"step": 58
},
{
"epoch": 0.16098226466575716,
"grad_norm": 0.5752708315849304,
"learning_rate": 1.0727272727272729e-05,
"loss": 0.4728,
"step": 59
},
{
"epoch": 0.16371077762619374,
"grad_norm": 0.6000607013702393,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.4667,
"step": 60
},
{
"epoch": 0.1664392905866303,
"grad_norm": 0.5734297633171082,
"learning_rate": 1.1090909090909092e-05,
"loss": 0.4666,
"step": 61
},
{
"epoch": 0.16916780354706684,
"grad_norm": 0.6120443940162659,
"learning_rate": 1.1272727272727272e-05,
"loss": 0.4629,
"step": 62
},
{
"epoch": 0.17189631650750342,
"grad_norm": 0.5620084404945374,
"learning_rate": 1.1454545454545455e-05,
"loss": 0.4587,
"step": 63
},
{
"epoch": 0.17462482946793997,
"grad_norm": 0.6068463921546936,
"learning_rate": 1.1636363636363637e-05,
"loss": 0.4671,
"step": 64
},
{
"epoch": 0.17735334242837653,
"grad_norm": 0.5794389843940735,
"learning_rate": 1.181818181818182e-05,
"loss": 0.4559,
"step": 65
},
{
"epoch": 0.1800818553888131,
"grad_norm": 0.6076776385307312,
"learning_rate": 1.2e-05,
"loss": 0.4536,
"step": 66
},
{
"epoch": 0.18281036834924966,
"grad_norm": 0.5867766737937927,
"learning_rate": 1.2181818181818184e-05,
"loss": 0.4535,
"step": 67
},
{
"epoch": 0.1855388813096862,
"grad_norm": 0.639927089214325,
"learning_rate": 1.2363636363636364e-05,
"loss": 0.446,
"step": 68
},
{
"epoch": 0.1882673942701228,
"grad_norm": 0.6484801173210144,
"learning_rate": 1.2545454545454547e-05,
"loss": 0.4394,
"step": 69
},
{
"epoch": 0.19099590723055934,
"grad_norm": 0.6096192002296448,
"learning_rate": 1.2727272727272728e-05,
"loss": 0.4378,
"step": 70
},
{
"epoch": 0.1937244201909959,
"grad_norm": 0.6389114260673523,
"learning_rate": 1.2909090909090912e-05,
"loss": 0.4445,
"step": 71
},
{
"epoch": 0.19645293315143247,
"grad_norm": 0.6160559058189392,
"learning_rate": 1.3090909090909092e-05,
"loss": 0.4359,
"step": 72
},
{
"epoch": 0.19918144611186903,
"grad_norm": 0.6258884072303772,
"learning_rate": 1.3272727272727275e-05,
"loss": 0.4244,
"step": 73
},
{
"epoch": 0.2019099590723056,
"grad_norm": 0.6996473073959351,
"learning_rate": 1.3454545454545455e-05,
"loss": 0.437,
"step": 74
},
{
"epoch": 0.20463847203274216,
"grad_norm": 0.6465001702308655,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.4224,
"step": 75
},
{
"epoch": 0.2073669849931787,
"grad_norm": 0.6399327516555786,
"learning_rate": 1.381818181818182e-05,
"loss": 0.4258,
"step": 76
},
{
"epoch": 0.2100954979536153,
"grad_norm": 0.7422960996627808,
"learning_rate": 1.4e-05,
"loss": 0.4199,
"step": 77
},
{
"epoch": 0.21282401091405184,
"grad_norm": 0.6545052528381348,
"learning_rate": 1.4181818181818183e-05,
"loss": 0.4243,
"step": 78
},
{
"epoch": 0.2155525238744884,
"grad_norm": 0.6757943630218506,
"learning_rate": 1.4363636363636365e-05,
"loss": 0.4093,
"step": 79
},
{
"epoch": 0.21828103683492497,
"grad_norm": 1.1193770170211792,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.4144,
"step": 80
},
{
"epoch": 0.22100954979536153,
"grad_norm": 0.8388747572898865,
"learning_rate": 1.4727272727272728e-05,
"loss": 0.4079,
"step": 81
},
{
"epoch": 0.22373806275579808,
"grad_norm": 0.7611749172210693,
"learning_rate": 1.4909090909090911e-05,
"loss": 0.4066,
"step": 82
},
{
"epoch": 0.22646657571623466,
"grad_norm": 0.8053273558616638,
"learning_rate": 1.5090909090909091e-05,
"loss": 0.4045,
"step": 83
},
{
"epoch": 0.2291950886766712,
"grad_norm": 0.8546133637428284,
"learning_rate": 1.5272727272727276e-05,
"loss": 0.3915,
"step": 84
},
{
"epoch": 0.23192360163710776,
"grad_norm": 0.7772160172462463,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.3951,
"step": 85
},
{
"epoch": 0.23465211459754434,
"grad_norm": 0.7378780245780945,
"learning_rate": 1.563636363636364e-05,
"loss": 0.3938,
"step": 86
},
{
"epoch": 0.2373806275579809,
"grad_norm": 0.8644944429397583,
"learning_rate": 1.5818181818181818e-05,
"loss": 0.3924,
"step": 87
},
{
"epoch": 0.24010914051841747,
"grad_norm": 0.7195233702659607,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.3832,
"step": 88
},
{
"epoch": 0.24283765347885403,
"grad_norm": 0.714919924736023,
"learning_rate": 1.6181818181818184e-05,
"loss": 0.3865,
"step": 89
},
{
"epoch": 0.24556616643929058,
"grad_norm": 0.7049028873443604,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.3889,
"step": 90
},
{
"epoch": 0.24829467939972716,
"grad_norm": 0.6938926577568054,
"learning_rate": 1.6545454545454548e-05,
"loss": 0.3854,
"step": 91
},
{
"epoch": 0.25102319236016374,
"grad_norm": 0.7241790294647217,
"learning_rate": 1.672727272727273e-05,
"loss": 0.3745,
"step": 92
},
{
"epoch": 0.25375170532060026,
"grad_norm": 0.7766016721725464,
"learning_rate": 1.690909090909091e-05,
"loss": 0.3766,
"step": 93
},
{
"epoch": 0.25648021828103684,
"grad_norm": 0.6900867223739624,
"learning_rate": 1.7090909090909092e-05,
"loss": 0.3661,
"step": 94
},
{
"epoch": 0.2592087312414734,
"grad_norm": 0.7429078221321106,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.369,
"step": 95
},
{
"epoch": 0.26193724420190995,
"grad_norm": 0.7221667766571045,
"learning_rate": 1.7454545454545456e-05,
"loss": 0.36,
"step": 96
},
{
"epoch": 0.2646657571623465,
"grad_norm": 0.8097471594810486,
"learning_rate": 1.7636363636363637e-05,
"loss": 0.3594,
"step": 97
},
{
"epoch": 0.2673942701227831,
"grad_norm": 0.7111004590988159,
"learning_rate": 1.781818181818182e-05,
"loss": 0.3649,
"step": 98
},
{
"epoch": 0.27012278308321963,
"grad_norm": 0.8246558904647827,
"learning_rate": 1.8e-05,
"loss": 0.3726,
"step": 99
},
{
"epoch": 0.2728512960436562,
"grad_norm": 0.7303751111030579,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.356,
"step": 100
},
{
"epoch": 0.2755798090040928,
"grad_norm": 0.7322264909744263,
"learning_rate": 1.8363636363636367e-05,
"loss": 0.3542,
"step": 101
},
{
"epoch": 0.2783083219645293,
"grad_norm": 0.7856989502906799,
"learning_rate": 1.8545454545454545e-05,
"loss": 0.3479,
"step": 102
},
{
"epoch": 0.2810368349249659,
"grad_norm": 0.6907728910446167,
"learning_rate": 1.872727272727273e-05,
"loss": 0.3428,
"step": 103
},
{
"epoch": 0.2837653478854025,
"grad_norm": 0.8893505930900574,
"learning_rate": 1.8909090909090912e-05,
"loss": 0.3453,
"step": 104
},
{
"epoch": 0.286493860845839,
"grad_norm": 0.8685447573661804,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.3458,
"step": 105
},
{
"epoch": 0.2892223738062756,
"grad_norm": 0.7228476405143738,
"learning_rate": 1.9272727272727275e-05,
"loss": 0.3329,
"step": 106
},
{
"epoch": 0.29195088676671216,
"grad_norm": 0.8950058221817017,
"learning_rate": 1.9454545454545457e-05,
"loss": 0.3318,
"step": 107
},
{
"epoch": 0.2946793997271487,
"grad_norm": 0.7596233487129211,
"learning_rate": 1.963636363636364e-05,
"loss": 0.3435,
"step": 108
},
{
"epoch": 0.29740791268758526,
"grad_norm": 0.745883047580719,
"learning_rate": 1.981818181818182e-05,
"loss": 0.3379,
"step": 109
},
{
"epoch": 0.30013642564802184,
"grad_norm": 0.729240894317627,
"learning_rate": 2e-05,
"loss": 0.3338,
"step": 110
},
{
"epoch": 0.30286493860845837,
"grad_norm": 0.7607076168060303,
"learning_rate": 1.9999949446003432e-05,
"loss": 0.3302,
"step": 111
},
{
"epoch": 0.30559345156889495,
"grad_norm": 0.7514939308166504,
"learning_rate": 1.9999797784524866e-05,
"loss": 0.3305,
"step": 112
},
{
"epoch": 0.3083219645293315,
"grad_norm": 0.6919752955436707,
"learning_rate": 1.9999545017097726e-05,
"loss": 0.3256,
"step": 113
},
{
"epoch": 0.31105047748976805,
"grad_norm": 0.745072603225708,
"learning_rate": 1.999919114627769e-05,
"loss": 0.3234,
"step": 114
},
{
"epoch": 0.31377899045020463,
"grad_norm": 0.7782521843910217,
"learning_rate": 1.9998736175642674e-05,
"loss": 0.3341,
"step": 115
},
{
"epoch": 0.3165075034106412,
"grad_norm": 0.70183265209198,
"learning_rate": 1.9998180109792793e-05,
"loss": 0.3079,
"step": 116
},
{
"epoch": 0.31923601637107774,
"grad_norm": 0.8084478378295898,
"learning_rate": 1.999752295435032e-05,
"loss": 0.3245,
"step": 117
},
{
"epoch": 0.3219645293315143,
"grad_norm": 0.70412278175354,
"learning_rate": 1.999676471595962e-05,
"loss": 0.318,
"step": 118
},
{
"epoch": 0.3246930422919509,
"grad_norm": 0.74163818359375,
"learning_rate": 1.9995905402287094e-05,
"loss": 0.3139,
"step": 119
},
{
"epoch": 0.3274215552523875,
"grad_norm": 0.6615740656852722,
"learning_rate": 1.9994945022021085e-05,
"loss": 0.3058,
"step": 120
},
{
"epoch": 0.330150068212824,
"grad_norm": 0.8651596903800964,
"learning_rate": 1.9993883584871807e-05,
"loss": 0.3171,
"step": 121
},
{
"epoch": 0.3328785811732606,
"grad_norm": 0.712332010269165,
"learning_rate": 1.9992721101571238e-05,
"loss": 0.2981,
"step": 122
},
{
"epoch": 0.33560709413369716,
"grad_norm": 0.7112699747085571,
"learning_rate": 1.999145758387301e-05,
"loss": 0.3184,
"step": 123
},
{
"epoch": 0.3383356070941337,
"grad_norm": 0.7106016874313354,
"learning_rate": 1.9990093044552304e-05,
"loss": 0.2973,
"step": 124
},
{
"epoch": 0.34106412005457026,
"grad_norm": 0.7371537685394287,
"learning_rate": 1.9988627497405696e-05,
"loss": 0.3082,
"step": 125
},
{
"epoch": 0.34379263301500684,
"grad_norm": 0.6868679523468018,
"learning_rate": 1.9987060957251047e-05,
"loss": 0.3042,
"step": 126
},
{
"epoch": 0.34652114597544337,
"grad_norm": 0.7139438986778259,
"learning_rate": 1.9985393439927325e-05,
"loss": 0.3072,
"step": 127
},
{
"epoch": 0.34924965893587995,
"grad_norm": 0.7296750545501709,
"learning_rate": 1.998362496229446e-05,
"loss": 0.3042,
"step": 128
},
{
"epoch": 0.3519781718963165,
"grad_norm": 0.6611348390579224,
"learning_rate": 1.9981755542233175e-05,
"loss": 0.2931,
"step": 129
},
{
"epoch": 0.35470668485675305,
"grad_norm": 0.6664003133773804,
"learning_rate": 1.997978519864481e-05,
"loss": 0.3008,
"step": 130
},
{
"epoch": 0.35743519781718963,
"grad_norm": 0.6638462543487549,
"learning_rate": 1.9977713951451102e-05,
"loss": 0.3034,
"step": 131
},
{
"epoch": 0.3601637107776262,
"grad_norm": 0.7581326961517334,
"learning_rate": 1.9975541821594028e-05,
"loss": 0.3027,
"step": 132
},
{
"epoch": 0.36289222373806274,
"grad_norm": 0.6742005348205566,
"learning_rate": 1.9973268831035547e-05,
"loss": 0.2966,
"step": 133
},
{
"epoch": 0.3656207366984993,
"grad_norm": 0.7190444469451904,
"learning_rate": 1.9970895002757413e-05,
"loss": 0.2928,
"step": 134
},
{
"epoch": 0.3683492496589359,
"grad_norm": 0.6588881015777588,
"learning_rate": 1.996842036076093e-05,
"loss": 0.2966,
"step": 135
},
{
"epoch": 0.3710777626193724,
"grad_norm": 0.7533734440803528,
"learning_rate": 1.99658449300667e-05,
"loss": 0.2939,
"step": 136
},
{
"epoch": 0.373806275579809,
"grad_norm": 0.6481987833976746,
"learning_rate": 1.9963168736714395e-05,
"loss": 0.2903,
"step": 137
},
{
"epoch": 0.3765347885402456,
"grad_norm": 0.6547331809997559,
"learning_rate": 1.9960391807762462e-05,
"loss": 0.2849,
"step": 138
},
{
"epoch": 0.3792633015006821,
"grad_norm": 0.6273127794265747,
"learning_rate": 1.9957514171287875e-05,
"loss": 0.2858,
"step": 139
},
{
"epoch": 0.3819918144611187,
"grad_norm": 0.6538695096969604,
"learning_rate": 1.995453585638584e-05,
"loss": 0.2884,
"step": 140
},
{
"epoch": 0.38472032742155526,
"grad_norm": 0.6586217284202576,
"learning_rate": 1.9951456893169497e-05,
"loss": 0.2807,
"step": 141
},
{
"epoch": 0.3874488403819918,
"grad_norm": 0.6591027975082397,
"learning_rate": 1.994827731276963e-05,
"loss": 0.2919,
"step": 142
},
{
"epoch": 0.39017735334242837,
"grad_norm": 0.616389274597168,
"learning_rate": 1.994499714733434e-05,
"loss": 0.2843,
"step": 143
},
{
"epoch": 0.39290586630286495,
"grad_norm": 0.6623064875602722,
"learning_rate": 1.9941616430028713e-05,
"loss": 0.2841,
"step": 144
},
{
"epoch": 0.3956343792633015,
"grad_norm": 0.617335319519043,
"learning_rate": 1.993813519503451e-05,
"loss": 0.2805,
"step": 145
},
{
"epoch": 0.39836289222373805,
"grad_norm": 0.6545597314834595,
"learning_rate": 1.9934553477549795e-05,
"loss": 0.2816,
"step": 146
},
{
"epoch": 0.40109140518417463,
"grad_norm": 0.6183223724365234,
"learning_rate": 1.99308713137886e-05,
"loss": 0.2766,
"step": 147
},
{
"epoch": 0.4038199181446112,
"grad_norm": 0.6186001896858215,
"learning_rate": 1.992708874098054e-05,
"loss": 0.272,
"step": 148
},
{
"epoch": 0.40654843110504774,
"grad_norm": 0.6923176646232605,
"learning_rate": 1.992320579737045e-05,
"loss": 0.2771,
"step": 149
},
{
"epoch": 0.4092769440654843,
"grad_norm": 0.659702718257904,
"learning_rate": 1.9919222522217998e-05,
"loss": 0.2795,
"step": 150
},
{
"epoch": 0.4120054570259209,
"grad_norm": 0.6168344616889954,
"learning_rate": 1.9915138955797272e-05,
"loss": 0.2759,
"step": 151
},
{
"epoch": 0.4147339699863574,
"grad_norm": 0.6312188506126404,
"learning_rate": 1.9910955139396395e-05,
"loss": 0.274,
"step": 152
},
{
"epoch": 0.417462482946794,
"grad_norm": 0.6516560316085815,
"learning_rate": 1.99066711153171e-05,
"loss": 0.2716,
"step": 153
},
{
"epoch": 0.4201909959072306,
"grad_norm": 0.6017420291900635,
"learning_rate": 1.990228692687429e-05,
"loss": 0.2659,
"step": 154
},
{
"epoch": 0.4229195088676671,
"grad_norm": 0.6232393980026245,
"learning_rate": 1.9897802618395614e-05,
"loss": 0.2747,
"step": 155
},
{
"epoch": 0.4256480218281037,
"grad_norm": 0.611860454082489,
"learning_rate": 1.9893218235221016e-05,
"loss": 0.2704,
"step": 156
},
{
"epoch": 0.42837653478854026,
"grad_norm": 0.6973581314086914,
"learning_rate": 1.988853382370228e-05,
"loss": 0.2801,
"step": 157
},
{
"epoch": 0.4311050477489768,
"grad_norm": 0.5803492069244385,
"learning_rate": 1.988374943120254e-05,
"loss": 0.2726,
"step": 158
},
{
"epoch": 0.43383356070941337,
"grad_norm": 0.6552534699440002,
"learning_rate": 1.9878865106095838e-05,
"loss": 0.2585,
"step": 159
},
{
"epoch": 0.43656207366984995,
"grad_norm": 0.5801110863685608,
"learning_rate": 1.9873880897766597e-05,
"loss": 0.2628,
"step": 160
},
{
"epoch": 0.4392905866302865,
"grad_norm": 0.7232557535171509,
"learning_rate": 1.9868796856609154e-05,
"loss": 0.2661,
"step": 161
},
{
"epoch": 0.44201909959072305,
"grad_norm": 0.6580977439880371,
"learning_rate": 1.9863613034027224e-05,
"loss": 0.2679,
"step": 162
},
{
"epoch": 0.44474761255115963,
"grad_norm": 0.5943677425384521,
"learning_rate": 1.9858329482433404e-05,
"loss": 0.2585,
"step": 163
},
{
"epoch": 0.44747612551159616,
"grad_norm": 0.6069351434707642,
"learning_rate": 1.985294625524861e-05,
"loss": 0.2627,
"step": 164
},
{
"epoch": 0.45020463847203274,
"grad_norm": 0.5810995101928711,
"learning_rate": 1.984746340690159e-05,
"loss": 0.2622,
"step": 165
},
{
"epoch": 0.4529331514324693,
"grad_norm": 0.6588981747627258,
"learning_rate": 1.9841880992828306e-05,
"loss": 0.26,
"step": 166
},
{
"epoch": 0.45566166439290584,
"grad_norm": 0.5870964527130127,
"learning_rate": 1.983619906947144e-05,
"loss": 0.2649,
"step": 167
},
{
"epoch": 0.4583901773533424,
"grad_norm": 0.6157024502754211,
"learning_rate": 1.9830417694279766e-05,
"loss": 0.2576,
"step": 168
},
{
"epoch": 0.461118690313779,
"grad_norm": 0.5841497778892517,
"learning_rate": 1.9824536925707622e-05,
"loss": 0.2506,
"step": 169
},
{
"epoch": 0.4638472032742155,
"grad_norm": 0.616535484790802,
"learning_rate": 1.981855682321427e-05,
"loss": 0.2556,
"step": 170
},
{
"epoch": 0.4665757162346521,
"grad_norm": 0.6029694080352783,
"learning_rate": 1.9812477447263324e-05,
"loss": 0.2567,
"step": 171
},
{
"epoch": 0.4693042291950887,
"grad_norm": 0.5664442777633667,
"learning_rate": 1.9806298859322143e-05,
"loss": 0.2549,
"step": 172
},
{
"epoch": 0.47203274215552526,
"grad_norm": 0.5878991484642029,
"learning_rate": 1.980002112186118e-05,
"loss": 0.2608,
"step": 173
},
{
"epoch": 0.4747612551159618,
"grad_norm": 0.5764362812042236,
"learning_rate": 1.979364429835339e-05,
"loss": 0.2514,
"step": 174
},
{
"epoch": 0.47748976807639837,
"grad_norm": 0.6009331345558167,
"learning_rate": 1.9787168453273546e-05,
"loss": 0.2488,
"step": 175
},
{
"epoch": 0.48021828103683495,
"grad_norm": 0.6152218580245972,
"learning_rate": 1.978059365209762e-05,
"loss": 0.2608,
"step": 176
},
{
"epoch": 0.4829467939972715,
"grad_norm": 0.5641770958900452,
"learning_rate": 1.9773919961302113e-05,
"loss": 0.2612,
"step": 177
},
{
"epoch": 0.48567530695770805,
"grad_norm": 0.5944207906723022,
"learning_rate": 1.9767147448363366e-05,
"loss": 0.2635,
"step": 178
},
{
"epoch": 0.48840381991814463,
"grad_norm": 0.5424714684486389,
"learning_rate": 1.9760276181756905e-05,
"loss": 0.2566,
"step": 179
},
{
"epoch": 0.49113233287858116,
"grad_norm": 0.6175258755683899,
"learning_rate": 1.975330623095672e-05,
"loss": 0.2543,
"step": 180
},
{
"epoch": 0.49386084583901774,
"grad_norm": 0.5744900107383728,
"learning_rate": 1.9746237666434588e-05,
"loss": 0.2471,
"step": 181
},
{
"epoch": 0.4965893587994543,
"grad_norm": 0.5477325320243835,
"learning_rate": 1.9739070559659347e-05,
"loss": 0.255,
"step": 182
},
{
"epoch": 0.49931787175989084,
"grad_norm": 0.599983811378479,
"learning_rate": 1.973180498309618e-05,
"loss": 0.252,
"step": 183
},
{
"epoch": 0.5020463847203275,
"grad_norm": 0.5861368179321289,
"learning_rate": 1.9724441010205865e-05,
"loss": 0.2527,
"step": 184
},
{
"epoch": 0.504774897680764,
"grad_norm": 0.5599756836891174,
"learning_rate": 1.9716978715444056e-05,
"loss": 0.249,
"step": 185
},
{
"epoch": 0.5075034106412005,
"grad_norm": 0.5516128540039062,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.2433,
"step": 186
},
{
"epoch": 0.5102319236016372,
"grad_norm": 0.5438660383224487,
"learning_rate": 1.9701759463098377e-05,
"loss": 0.2479,
"step": 187
},
{
"epoch": 0.5129604365620737,
"grad_norm": 0.5530990362167358,
"learning_rate": 1.9694002659393306e-05,
"loss": 0.2503,
"step": 188
},
{
"epoch": 0.5156889495225102,
"grad_norm": 0.5980175733566284,
"learning_rate": 1.9686147841572803e-05,
"loss": 0.2452,
"step": 189
},
{
"epoch": 0.5184174624829468,
"grad_norm": 0.5309689044952393,
"learning_rate": 1.9678195089055347e-05,
"loss": 0.2381,
"step": 190
},
{
"epoch": 0.5211459754433834,
"grad_norm": 0.5269597172737122,
"learning_rate": 1.967014448224963e-05,
"loss": 0.24,
"step": 191
},
{
"epoch": 0.5238744884038199,
"grad_norm": 0.5427187085151672,
"learning_rate": 1.9661996102553716e-05,
"loss": 0.2453,
"step": 192
},
{
"epoch": 0.5266030013642565,
"grad_norm": 0.5384966731071472,
"learning_rate": 1.965375003235424e-05,
"loss": 0.2487,
"step": 193
},
{
"epoch": 0.529331514324693,
"grad_norm": 0.5402637124061584,
"learning_rate": 1.9645406355025565e-05,
"loss": 0.2412,
"step": 194
},
{
"epoch": 0.5320600272851296,
"grad_norm": 0.556141197681427,
"learning_rate": 1.9636965154928932e-05,
"loss": 0.2453,
"step": 195
},
{
"epoch": 0.5347885402455662,
"grad_norm": 0.5466804504394531,
"learning_rate": 1.9628426517411625e-05,
"loss": 0.2418,
"step": 196
},
{
"epoch": 0.5375170532060027,
"grad_norm": 0.5225583910942078,
"learning_rate": 1.9619790528806092e-05,
"loss": 0.242,
"step": 197
},
{
"epoch": 0.5402455661664393,
"grad_norm": 0.5193293690681458,
"learning_rate": 1.9611057276429085e-05,
"loss": 0.2352,
"step": 198
},
{
"epoch": 0.5429740791268759,
"grad_norm": 0.5324943661689758,
"learning_rate": 1.9602226848580762e-05,
"loss": 0.2473,
"step": 199
},
{
"epoch": 0.5457025920873124,
"grad_norm": 0.5119441747665405,
"learning_rate": 1.959329933454381e-05,
"loss": 0.2391,
"step": 200
},
{
"epoch": 0.548431105047749,
"grad_norm": 0.5389058589935303,
"learning_rate": 1.958427482458253e-05,
"loss": 0.239,
"step": 201
},
{
"epoch": 0.5511596180081856,
"grad_norm": 0.5182425379753113,
"learning_rate": 1.957515340994193e-05,
"loss": 0.2392,
"step": 202
},
{
"epoch": 0.5538881309686221,
"grad_norm": 0.5547599196434021,
"learning_rate": 1.95659351828468e-05,
"loss": 0.2439,
"step": 203
},
{
"epoch": 0.5566166439290586,
"grad_norm": 0.5299258232116699,
"learning_rate": 1.9556620236500794e-05,
"loss": 0.2423,
"step": 204
},
{
"epoch": 0.5593451568894953,
"grad_norm": 0.5086020231246948,
"learning_rate": 1.954720866508546e-05,
"loss": 0.234,
"step": 205
},
{
"epoch": 0.5620736698499318,
"grad_norm": 0.5384035706520081,
"learning_rate": 1.9537700563759303e-05,
"loss": 0.2405,
"step": 206
},
{
"epoch": 0.5648021828103683,
"grad_norm": 0.5312212705612183,
"learning_rate": 1.9528096028656835e-05,
"loss": 0.2419,
"step": 207
},
{
"epoch": 0.567530695770805,
"grad_norm": 0.5274946093559265,
"learning_rate": 1.9518395156887574e-05,
"loss": 0.2318,
"step": 208
},
{
"epoch": 0.5702592087312415,
"grad_norm": 0.5244715213775635,
"learning_rate": 1.9508598046535095e-05,
"loss": 0.2363,
"step": 209
},
{
"epoch": 0.572987721691678,
"grad_norm": 0.529675304889679,
"learning_rate": 1.949870479665602e-05,
"loss": 0.242,
"step": 210
},
{
"epoch": 0.5757162346521146,
"grad_norm": 0.510682225227356,
"learning_rate": 1.9488715507279e-05,
"loss": 0.2368,
"step": 211
},
{
"epoch": 0.5784447476125512,
"grad_norm": 0.5228487253189087,
"learning_rate": 1.9478630279403737e-05,
"loss": 0.2319,
"step": 212
},
{
"epoch": 0.5811732605729877,
"grad_norm": 0.5426226854324341,
"learning_rate": 1.9468449214999956e-05,
"loss": 0.239,
"step": 213
},
{
"epoch": 0.5839017735334243,
"grad_norm": 0.5186337828636169,
"learning_rate": 1.9458172417006347e-05,
"loss": 0.2386,
"step": 214
},
{
"epoch": 0.5866302864938608,
"grad_norm": 0.5370061993598938,
"learning_rate": 1.9447799989329557e-05,
"loss": 0.2387,
"step": 215
},
{
"epoch": 0.5893587994542974,
"grad_norm": 0.5024000406265259,
"learning_rate": 1.943733203684312e-05,
"loss": 0.2317,
"step": 216
},
{
"epoch": 0.592087312414734,
"grad_norm": 0.5344046354293823,
"learning_rate": 1.9426768665386397e-05,
"loss": 0.2308,
"step": 217
},
{
"epoch": 0.5948158253751705,
"grad_norm": 0.4995371699333191,
"learning_rate": 1.9416109981763526e-05,
"loss": 0.2342,
"step": 218
},
{
"epoch": 0.597544338335607,
"grad_norm": 0.5385648608207703,
"learning_rate": 1.9405356093742314e-05,
"loss": 0.2313,
"step": 219
},
{
"epoch": 0.6002728512960437,
"grad_norm": 0.5008872747421265,
"learning_rate": 1.939450711005316e-05,
"loss": 0.2365,
"step": 220
},
{
"epoch": 0.6030013642564802,
"grad_norm": 0.504681408405304,
"learning_rate": 1.9383563140387966e-05,
"loss": 0.2333,
"step": 221
},
{
"epoch": 0.6057298772169167,
"grad_norm": 0.49399399757385254,
"learning_rate": 1.9372524295399014e-05,
"loss": 0.2352,
"step": 222
},
{
"epoch": 0.6084583901773534,
"grad_norm": 0.5233116149902344,
"learning_rate": 1.9361390686697847e-05,
"loss": 0.2351,
"step": 223
},
{
"epoch": 0.6111869031377899,
"grad_norm": 0.5101408958435059,
"learning_rate": 1.9350162426854152e-05,
"loss": 0.2335,
"step": 224
},
{
"epoch": 0.6139154160982264,
"grad_norm": 0.5167925357818604,
"learning_rate": 1.9338839629394606e-05,
"loss": 0.233,
"step": 225
},
{
"epoch": 0.616643929058663,
"grad_norm": 0.4952821731567383,
"learning_rate": 1.9327422408801744e-05,
"loss": 0.2356,
"step": 226
},
{
"epoch": 0.6193724420190996,
"grad_norm": 0.48435306549072266,
"learning_rate": 1.9315910880512792e-05,
"loss": 0.2293,
"step": 227
},
{
"epoch": 0.6221009549795361,
"grad_norm": 0.5238944292068481,
"learning_rate": 1.93043051609185e-05,
"loss": 0.2294,
"step": 228
},
{
"epoch": 0.6248294679399727,
"grad_norm": 0.48778635263442993,
"learning_rate": 1.929260536736198e-05,
"loss": 0.2357,
"step": 229
},
{
"epoch": 0.6275579809004093,
"grad_norm": 0.5128819942474365,
"learning_rate": 1.9280811618137486e-05,
"loss": 0.2283,
"step": 230
},
{
"epoch": 0.6302864938608458,
"grad_norm": 0.49600908160209656,
"learning_rate": 1.926892403248925e-05,
"loss": 0.2225,
"step": 231
},
{
"epoch": 0.6330150068212824,
"grad_norm": 0.49010199308395386,
"learning_rate": 1.9256942730610268e-05,
"loss": 0.2301,
"step": 232
},
{
"epoch": 0.635743519781719,
"grad_norm": 0.5124602913856506,
"learning_rate": 1.9244867833641078e-05,
"loss": 0.2334,
"step": 233
},
{
"epoch": 0.6384720327421555,
"grad_norm": 0.4958963394165039,
"learning_rate": 1.9232699463668543e-05,
"loss": 0.2314,
"step": 234
},
{
"epoch": 0.6412005457025921,
"grad_norm": 0.4773724675178528,
"learning_rate": 1.9220437743724605e-05,
"loss": 0.2318,
"step": 235
},
{
"epoch": 0.6439290586630286,
"grad_norm": 0.4998438060283661,
"learning_rate": 1.9208082797785057e-05,
"loss": 0.22,
"step": 236
},
{
"epoch": 0.6466575716234653,
"grad_norm": 0.48424261808395386,
"learning_rate": 1.9195634750768276e-05,
"loss": 0.2156,
"step": 237
},
{
"epoch": 0.6493860845839018,
"grad_norm": 0.5186326503753662,
"learning_rate": 1.9183093728533966e-05,
"loss": 0.2338,
"step": 238
},
{
"epoch": 0.6521145975443383,
"grad_norm": 0.49726244807243347,
"learning_rate": 1.9170459857881888e-05,
"loss": 0.2256,
"step": 239
},
{
"epoch": 0.654843110504775,
"grad_norm": 0.4714222550392151,
"learning_rate": 1.9157733266550577e-05,
"loss": 0.2259,
"step": 240
},
{
"epoch": 0.6575716234652115,
"grad_norm": 0.5003750324249268,
"learning_rate": 1.9144914083216036e-05,
"loss": 0.2253,
"step": 241
},
{
"epoch": 0.660300136425648,
"grad_norm": 0.4727269411087036,
"learning_rate": 1.913200243749046e-05,
"loss": 0.2258,
"step": 242
},
{
"epoch": 0.6630286493860846,
"grad_norm": 0.5212213397026062,
"learning_rate": 1.91189984599209e-05,
"loss": 0.2322,
"step": 243
},
{
"epoch": 0.6657571623465212,
"grad_norm": 0.5002415776252747,
"learning_rate": 1.910590228198798e-05,
"loss": 0.2277,
"step": 244
},
{
"epoch": 0.6684856753069577,
"grad_norm": 0.4715561270713806,
"learning_rate": 1.9092714036104508e-05,
"loss": 0.2317,
"step": 245
},
{
"epoch": 0.6712141882673943,
"grad_norm": 0.47772514820098877,
"learning_rate": 1.9079433855614203e-05,
"loss": 0.2247,
"step": 246
},
{
"epoch": 0.6739427012278308,
"grad_norm": 0.47400856018066406,
"learning_rate": 1.9066061874790302e-05,
"loss": 0.2254,
"step": 247
},
{
"epoch": 0.6766712141882674,
"grad_norm": 0.4679079055786133,
"learning_rate": 1.9052598228834217e-05,
"loss": 0.2167,
"step": 248
},
{
"epoch": 0.679399727148704,
"grad_norm": 0.48590168356895447,
"learning_rate": 1.9039043053874175e-05,
"loss": 0.2216,
"step": 249
},
{
"epoch": 0.6821282401091405,
"grad_norm": 0.4846552610397339,
"learning_rate": 1.9025396486963827e-05,
"loss": 0.2247,
"step": 250
},
{
"epoch": 0.684856753069577,
"grad_norm": 0.4776105582714081,
"learning_rate": 1.9011658666080873e-05,
"loss": 0.2278,
"step": 251
},
{
"epoch": 0.6875852660300137,
"grad_norm": 0.4800094664096832,
"learning_rate": 1.8997829730125662e-05,
"loss": 0.2276,
"step": 252
},
{
"epoch": 0.6903137789904502,
"grad_norm": 0.47760075330734253,
"learning_rate": 1.898390981891979e-05,
"loss": 0.2189,
"step": 253
},
{
"epoch": 0.6930422919508867,
"grad_norm": 0.4844151735305786,
"learning_rate": 1.8969899073204687e-05,
"loss": 0.2236,
"step": 254
},
{
"epoch": 0.6957708049113234,
"grad_norm": 0.475306898355484,
"learning_rate": 1.895579763464019e-05,
"loss": 0.224,
"step": 255
},
{
"epoch": 0.6984993178717599,
"grad_norm": 0.4959429204463959,
"learning_rate": 1.8941605645803115e-05,
"loss": 0.2237,
"step": 256
},
{
"epoch": 0.7012278308321964,
"grad_norm": 0.4867900609970093,
"learning_rate": 1.8927323250185815e-05,
"loss": 0.2319,
"step": 257
},
{
"epoch": 0.703956343792633,
"grad_norm": 0.4814487099647522,
"learning_rate": 1.891295059219472e-05,
"loss": 0.2186,
"step": 258
},
{
"epoch": 0.7066848567530696,
"grad_norm": 0.4731517732143402,
"learning_rate": 1.88984878171489e-05,
"loss": 0.2205,
"step": 259
},
{
"epoch": 0.7094133697135061,
"grad_norm": 0.45691797137260437,
"learning_rate": 1.888393507127856e-05,
"loss": 0.2107,
"step": 260
},
{
"epoch": 0.7121418826739427,
"grad_norm": 0.5063825249671936,
"learning_rate": 1.8869292501723602e-05,
"loss": 0.2269,
"step": 261
},
{
"epoch": 0.7148703956343793,
"grad_norm": 0.4973117411136627,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.2248,
"step": 262
},
{
"epoch": 0.7175989085948158,
"grad_norm": 0.49024084210395813,
"learning_rate": 1.8839738484658835e-05,
"loss": 0.2268,
"step": 263
},
{
"epoch": 0.7203274215552524,
"grad_norm": 0.4630759060382843,
"learning_rate": 1.8824827335963767e-05,
"loss": 0.2109,
"step": 264
},
{
"epoch": 0.723055934515689,
"grad_norm": 0.477979451417923,
"learning_rate": 1.8809826961210527e-05,
"loss": 0.2216,
"step": 265
},
{
"epoch": 0.7257844474761255,
"grad_norm": 0.4574492871761322,
"learning_rate": 1.879473751206489e-05,
"loss": 0.2182,
"step": 266
},
{
"epoch": 0.7285129604365621,
"grad_norm": 0.5049831867218018,
"learning_rate": 1.8779559141093256e-05,
"loss": 0.2238,
"step": 267
},
{
"epoch": 0.7312414733969986,
"grad_norm": 0.46383431553840637,
"learning_rate": 1.876429200176108e-05,
"loss": 0.2202,
"step": 268
},
{
"epoch": 0.7339699863574352,
"grad_norm": 0.48974180221557617,
"learning_rate": 1.8748936248431353e-05,
"loss": 0.2184,
"step": 269
},
{
"epoch": 0.7366984993178718,
"grad_norm": 0.4690982699394226,
"learning_rate": 1.8733492036363007e-05,
"loss": 0.2219,
"step": 270
},
{
"epoch": 0.7394270122783083,
"grad_norm": 0.48219865560531616,
"learning_rate": 1.871795952170937e-05,
"loss": 0.2209,
"step": 271
},
{
"epoch": 0.7421555252387448,
"grad_norm": 0.4637184739112854,
"learning_rate": 1.8702338861516587e-05,
"loss": 0.2131,
"step": 272
},
{
"epoch": 0.7448840381991815,
"grad_norm": 0.45264101028442383,
"learning_rate": 1.8686630213722015e-05,
"loss": 0.2167,
"step": 273
},
{
"epoch": 0.747612551159618,
"grad_norm": 0.4602806270122528,
"learning_rate": 1.867083373715264e-05,
"loss": 0.2194,
"step": 274
},
{
"epoch": 0.7503410641200545,
"grad_norm": 0.47461724281311035,
"learning_rate": 1.8654949591523467e-05,
"loss": 0.2195,
"step": 275
},
{
"epoch": 0.7530695770804912,
"grad_norm": 0.4658590257167816,
"learning_rate": 1.86389779374359e-05,
"loss": 0.2285,
"step": 276
},
{
"epoch": 0.7557980900409277,
"grad_norm": 0.4593490660190582,
"learning_rate": 1.8622918936376133e-05,
"loss": 0.2113,
"step": 277
},
{
"epoch": 0.7585266030013642,
"grad_norm": 0.4694429636001587,
"learning_rate": 1.8606772750713503e-05,
"loss": 0.2222,
"step": 278
},
{
"epoch": 0.7612551159618008,
"grad_norm": 0.4493769407272339,
"learning_rate": 1.8590539543698852e-05,
"loss": 0.2151,
"step": 279
},
{
"epoch": 0.7639836289222374,
"grad_norm": 0.4646337628364563,
"learning_rate": 1.857421947946288e-05,
"loss": 0.2208,
"step": 280
},
{
"epoch": 0.7667121418826739,
"grad_norm": 0.4551270008087158,
"learning_rate": 1.8557812723014476e-05,
"loss": 0.2113,
"step": 281
},
{
"epoch": 0.7694406548431105,
"grad_norm": 0.46589139103889465,
"learning_rate": 1.8541319440239066e-05,
"loss": 0.2207,
"step": 282
},
{
"epoch": 0.772169167803547,
"grad_norm": 0.4452350437641144,
"learning_rate": 1.8524739797896924e-05,
"loss": 0.2169,
"step": 283
},
{
"epoch": 0.7748976807639836,
"grad_norm": 0.49259626865386963,
"learning_rate": 1.8508073963621482e-05,
"loss": 0.2192,
"step": 284
},
{
"epoch": 0.7776261937244202,
"grad_norm": 0.450286328792572,
"learning_rate": 1.8491322105917645e-05,
"loss": 0.2187,
"step": 285
},
{
"epoch": 0.7803547066848567,
"grad_norm": 0.4535258710384369,
"learning_rate": 1.847448439416009e-05,
"loss": 0.218,
"step": 286
},
{
"epoch": 0.7830832196452933,
"grad_norm": 0.44347622990608215,
"learning_rate": 1.845756099859154e-05,
"loss": 0.2154,
"step": 287
},
{
"epoch": 0.7858117326057299,
"grad_norm": 0.45118656754493713,
"learning_rate": 1.8440552090321047e-05,
"loss": 0.21,
"step": 288
},
{
"epoch": 0.7885402455661664,
"grad_norm": 0.4592891335487366,
"learning_rate": 1.842345784132227e-05,
"loss": 0.2165,
"step": 289
},
{
"epoch": 0.791268758526603,
"grad_norm": 0.499129056930542,
"learning_rate": 1.8406278424431737e-05,
"loss": 0.2165,
"step": 290
},
{
"epoch": 0.7939972714870396,
"grad_norm": 0.4674071669578552,
"learning_rate": 1.838901401334708e-05,
"loss": 0.2183,
"step": 291
},
{
"epoch": 0.7967257844474761,
"grad_norm": 0.4629931151866913,
"learning_rate": 1.8371664782625287e-05,
"loss": 0.2164,
"step": 292
},
{
"epoch": 0.7994542974079127,
"grad_norm": 0.45491263270378113,
"learning_rate": 1.835423090768096e-05,
"loss": 0.2201,
"step": 293
},
{
"epoch": 0.8021828103683493,
"grad_norm": 0.4651404619216919,
"learning_rate": 1.8336712564784506e-05,
"loss": 0.2182,
"step": 294
},
{
"epoch": 0.8049113233287858,
"grad_norm": 0.4813602566719055,
"learning_rate": 1.8319109931060367e-05,
"loss": 0.2211,
"step": 295
},
{
"epoch": 0.8076398362892224,
"grad_norm": 0.4485262334346771,
"learning_rate": 1.8301423184485253e-05,
"loss": 0.2095,
"step": 296
},
{
"epoch": 0.810368349249659,
"grad_norm": 0.4614250361919403,
"learning_rate": 1.82836525038863e-05,
"loss": 0.2111,
"step": 297
},
{
"epoch": 0.8130968622100955,
"grad_norm": 0.47213491797447205,
"learning_rate": 1.8265798068939295e-05,
"loss": 0.216,
"step": 298
},
{
"epoch": 0.8158253751705321,
"grad_norm": 0.44635578989982605,
"learning_rate": 1.824786006016685e-05,
"loss": 0.208,
"step": 299
},
{
"epoch": 0.8185538881309686,
"grad_norm": 0.4536116123199463,
"learning_rate": 1.8229838658936566e-05,
"loss": 0.2105,
"step": 300
},
{
"epoch": 0.8212824010914052,
"grad_norm": 0.4396938681602478,
"learning_rate": 1.821173404745922e-05,
"loss": 0.2116,
"step": 301
},
{
"epoch": 0.8240109140518418,
"grad_norm": 0.4701569080352783,
"learning_rate": 1.81935464087869e-05,
"loss": 0.2143,
"step": 302
},
{
"epoch": 0.8267394270122783,
"grad_norm": 0.46065476536750793,
"learning_rate": 1.8175275926811173e-05,
"loss": 0.2163,
"step": 303
},
{
"epoch": 0.8294679399727148,
"grad_norm": 0.444499135017395,
"learning_rate": 1.815692278626122e-05,
"loss": 0.2109,
"step": 304
},
{
"epoch": 0.8321964529331515,
"grad_norm": 0.4454192519187927,
"learning_rate": 1.813848717270195e-05,
"loss": 0.2154,
"step": 305
},
{
"epoch": 0.834924965893588,
"grad_norm": 0.44666001200675964,
"learning_rate": 1.8119969272532164e-05,
"loss": 0.2158,
"step": 306
},
{
"epoch": 0.8376534788540245,
"grad_norm": 0.43889233469963074,
"learning_rate": 1.8101369272982633e-05,
"loss": 0.2121,
"step": 307
},
{
"epoch": 0.8403819918144612,
"grad_norm": 0.47276782989501953,
"learning_rate": 1.808268736211421e-05,
"loss": 0.215,
"step": 308
},
{
"epoch": 0.8431105047748977,
"grad_norm": 0.4451788365840912,
"learning_rate": 1.806392372881596e-05,
"loss": 0.217,
"step": 309
},
{
"epoch": 0.8458390177353342,
"grad_norm": 0.43395736813545227,
"learning_rate": 1.8045078562803203e-05,
"loss": 0.2137,
"step": 310
},
{
"epoch": 0.8485675306957708,
"grad_norm": 0.4520686864852905,
"learning_rate": 1.8026152054615633e-05,
"loss": 0.2133,
"step": 311
},
{
"epoch": 0.8512960436562074,
"grad_norm": 0.42937785387039185,
"learning_rate": 1.800714439561538e-05,
"loss": 0.2133,
"step": 312
},
{
"epoch": 0.8540245566166439,
"grad_norm": 0.4670831859111786,
"learning_rate": 1.7988055777985066e-05,
"loss": 0.2113,
"step": 313
},
{
"epoch": 0.8567530695770805,
"grad_norm": 0.4341495931148529,
"learning_rate": 1.7968886394725876e-05,
"loss": 0.2072,
"step": 314
},
{
"epoch": 0.859481582537517,
"grad_norm": 0.44559651613235474,
"learning_rate": 1.7949636439655592e-05,
"loss": 0.2173,
"step": 315
},
{
"epoch": 0.8622100954979536,
"grad_norm": 0.4362984299659729,
"learning_rate": 1.793030610740665e-05,
"loss": 0.2092,
"step": 316
},
{
"epoch": 0.8649386084583902,
"grad_norm": 0.4457165598869324,
"learning_rate": 1.7910895593424166e-05,
"loss": 0.2043,
"step": 317
},
{
"epoch": 0.8676671214188267,
"grad_norm": 0.4453994333744049,
"learning_rate": 1.789140509396394e-05,
"loss": 0.2149,
"step": 318
},
{
"epoch": 0.8703956343792633,
"grad_norm": 0.42760151624679565,
"learning_rate": 1.7871834806090502e-05,
"loss": 0.2081,
"step": 319
},
{
"epoch": 0.8731241473396999,
"grad_norm": 0.4355124533176422,
"learning_rate": 1.7852184927675113e-05,
"loss": 0.2087,
"step": 320
},
{
"epoch": 0.8758526603001364,
"grad_norm": 0.4316492974758148,
"learning_rate": 1.7832455657393745e-05,
"loss": 0.2062,
"step": 321
},
{
"epoch": 0.878581173260573,
"grad_norm": 0.43141260743141174,
"learning_rate": 1.7812647194725093e-05,
"loss": 0.2117,
"step": 322
},
{
"epoch": 0.8813096862210096,
"grad_norm": 0.44423919916152954,
"learning_rate": 1.7792759739948546e-05,
"loss": 0.2086,
"step": 323
},
{
"epoch": 0.8840381991814461,
"grad_norm": 0.4427170753479004,
"learning_rate": 1.777279349414217e-05,
"loss": 0.2065,
"step": 324
},
{
"epoch": 0.8867667121418826,
"grad_norm": 0.4190024733543396,
"learning_rate": 1.7752748659180662e-05,
"loss": 0.2104,
"step": 325
},
{
"epoch": 0.8894952251023193,
"grad_norm": 0.42245593667030334,
"learning_rate": 1.7732625437733338e-05,
"loss": 0.211,
"step": 326
},
{
"epoch": 0.8922237380627558,
"grad_norm": 0.4226483702659607,
"learning_rate": 1.771242403326204e-05,
"loss": 0.2093,
"step": 327
},
{
"epoch": 0.8949522510231923,
"grad_norm": 0.4086417853832245,
"learning_rate": 1.7692144650019125e-05,
"loss": 0.2046,
"step": 328
},
{
"epoch": 0.897680763983629,
"grad_norm": 0.432595431804657,
"learning_rate": 1.767178749304536e-05,
"loss": 0.2073,
"step": 329
},
{
"epoch": 0.9004092769440655,
"grad_norm": 0.44269153475761414,
"learning_rate": 1.765135276816787e-05,
"loss": 0.2129,
"step": 330
},
{
"epoch": 0.903137789904502,
"grad_norm": 0.42717134952545166,
"learning_rate": 1.7630840681998068e-05,
"loss": 0.2071,
"step": 331
},
{
"epoch": 0.9058663028649386,
"grad_norm": 0.4242989420890808,
"learning_rate": 1.7610251441929532e-05,
"loss": 0.2078,
"step": 332
},
{
"epoch": 0.9085948158253752,
"grad_norm": 0.4230920970439911,
"learning_rate": 1.758958525613594e-05,
"loss": 0.2064,
"step": 333
},
{
"epoch": 0.9113233287858117,
"grad_norm": 0.45626723766326904,
"learning_rate": 1.7568842333568952e-05,
"loss": 0.2147,
"step": 334
},
{
"epoch": 0.9140518417462483,
"grad_norm": 0.4308648109436035,
"learning_rate": 1.754802288395609e-05,
"loss": 0.2084,
"step": 335
},
{
"epoch": 0.9167803547066848,
"grad_norm": 0.4306204617023468,
"learning_rate": 1.7527127117798635e-05,
"loss": 0.2122,
"step": 336
},
{
"epoch": 0.9195088676671214,
"grad_norm": 0.4448958933353424,
"learning_rate": 1.750615524636948e-05,
"loss": 0.2056,
"step": 337
},
{
"epoch": 0.922237380627558,
"grad_norm": 0.4293544590473175,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.2073,
"step": 338
},
{
"epoch": 0.9249658935879945,
"grad_norm": 0.45121076703071594,
"learning_rate": 1.7463984036632956e-05,
"loss": 0.2105,
"step": 339
},
{
"epoch": 0.927694406548431,
"grad_norm": 0.4212591350078583,
"learning_rate": 1.7442785124710227e-05,
"loss": 0.2065,
"step": 340
},
{
"epoch": 0.9304229195088677,
"grad_norm": 0.417019784450531,
"learning_rate": 1.742151096028076e-05,
"loss": 0.2117,
"step": 341
},
{
"epoch": 0.9331514324693042,
"grad_norm": 0.44269001483917236,
"learning_rate": 1.7400161758443377e-05,
"loss": 0.2098,
"step": 342
},
{
"epoch": 0.9358799454297408,
"grad_norm": 0.45144540071487427,
"learning_rate": 1.7378737735055562e-05,
"loss": 0.2031,
"step": 343
},
{
"epoch": 0.9386084583901774,
"grad_norm": 0.4351907968521118,
"learning_rate": 1.735723910673132e-05,
"loss": 0.2104,
"step": 344
},
{
"epoch": 0.9413369713506139,
"grad_norm": 0.42139601707458496,
"learning_rate": 1.7335666090838965e-05,
"loss": 0.2109,
"step": 345
},
{
"epoch": 0.9440654843110505,
"grad_norm": 0.42321038246154785,
"learning_rate": 1.7314018905498932e-05,
"loss": 0.207,
"step": 346
},
{
"epoch": 0.946793997271487,
"grad_norm": 0.409960001707077,
"learning_rate": 1.729229776958157e-05,
"loss": 0.2022,
"step": 347
},
{
"epoch": 0.9495225102319236,
"grad_norm": 0.42684659361839294,
"learning_rate": 1.7270502902704925e-05,
"loss": 0.2122,
"step": 348
},
{
"epoch": 0.9522510231923602,
"grad_norm": 0.4144516587257385,
"learning_rate": 1.7248634525232523e-05,
"loss": 0.2083,
"step": 349
},
{
"epoch": 0.9549795361527967,
"grad_norm": 0.431145578622818,
"learning_rate": 1.7226692858271133e-05,
"loss": 0.2113,
"step": 350
},
{
"epoch": 0.9577080491132333,
"grad_norm": 0.41966184973716736,
"learning_rate": 1.7204678123668556e-05,
"loss": 0.2064,
"step": 351
},
{
"epoch": 0.9604365620736699,
"grad_norm": 0.41143694519996643,
"learning_rate": 1.718259054401135e-05,
"loss": 0.2042,
"step": 352
},
{
"epoch": 0.9631650750341064,
"grad_norm": 0.40677252411842346,
"learning_rate": 1.71604303426226e-05,
"loss": 0.2079,
"step": 353
},
{
"epoch": 0.965893587994543,
"grad_norm": 0.41135165095329285,
"learning_rate": 1.7138197743559656e-05,
"loss": 0.2062,
"step": 354
},
{
"epoch": 0.9686221009549796,
"grad_norm": 0.42349115014076233,
"learning_rate": 1.7115892971611864e-05,
"loss": 0.2112,
"step": 355
},
{
"epoch": 0.9713506139154161,
"grad_norm": 0.4229361414909363,
"learning_rate": 1.7093516252298296e-05,
"loss": 0.2064,
"step": 356
},
{
"epoch": 0.9740791268758526,
"grad_norm": 0.4161188304424286,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.2053,
"step": 357
},
{
"epoch": 0.9768076398362893,
"grad_norm": 0.4185996651649475,
"learning_rate": 1.7048547877285078e-05,
"loss": 0.2021,
"step": 358
},
{
"epoch": 0.9795361527967258,
"grad_norm": 0.4161181151866913,
"learning_rate": 1.7025956676251636e-05,
"loss": 0.2104,
"step": 359
},
{
"epoch": 0.9822646657571623,
"grad_norm": 0.41911646723747253,
"learning_rate": 1.7003294437180254e-05,
"loss": 0.2043,
"step": 360
},
{
"epoch": 0.984993178717599,
"grad_norm": 0.4234631061553955,
"learning_rate": 1.6980561389204285e-05,
"loss": 0.203,
"step": 361
},
{
"epoch": 0.9877216916780355,
"grad_norm": 0.43413975834846497,
"learning_rate": 1.695775776217301e-05,
"loss": 0.2091,
"step": 362
},
{
"epoch": 0.990450204638472,
"grad_norm": 0.4408881962299347,
"learning_rate": 1.6934883786649333e-05,
"loss": 0.205,
"step": 363
},
{
"epoch": 0.9931787175989086,
"grad_norm": 0.4064873456954956,
"learning_rate": 1.6911939693907422e-05,
"loss": 0.2018,
"step": 364
},
{
"epoch": 0.9959072305593452,
"grad_norm": 0.42045751214027405,
"learning_rate": 1.6888925715930396e-05,
"loss": 0.206,
"step": 365
},
{
"epoch": 0.9986357435197817,
"grad_norm": 0.42600393295288086,
"learning_rate": 1.686584208540797e-05,
"loss": 0.2106,
"step": 366
},
{
"epoch": 1.0013642564802183,
"grad_norm": 0.4242832064628601,
"learning_rate": 1.68426890357341e-05,
"loss": 0.1921,
"step": 367
},
{
"epoch": 1.004092769440655,
"grad_norm": 0.4603778123855591,
"learning_rate": 1.6819466801004622e-05,
"loss": 0.1769,
"step": 368
},
{
"epoch": 1.0068212824010914,
"grad_norm": 0.4325129985809326,
"learning_rate": 1.6796175616014894e-05,
"loss": 0.1739,
"step": 369
},
{
"epoch": 1.009549795361528,
"grad_norm": 0.4734160602092743,
"learning_rate": 1.6772815716257414e-05,
"loss": 0.172,
"step": 370
},
{
"epoch": 1.0122783083219646,
"grad_norm": 0.4947826862335205,
"learning_rate": 1.6749387337919434e-05,
"loss": 0.1713,
"step": 371
},
{
"epoch": 1.015006821282401,
"grad_norm": 0.5005810260772705,
"learning_rate": 1.672589071788059e-05,
"loss": 0.1732,
"step": 372
},
{
"epoch": 1.0177353342428377,
"grad_norm": 0.4360048472881317,
"learning_rate": 1.6702326093710493e-05,
"loss": 0.174,
"step": 373
},
{
"epoch": 1.0204638472032743,
"grad_norm": 0.44616055488586426,
"learning_rate": 1.6678693703666327e-05,
"loss": 0.1817,
"step": 374
},
{
"epoch": 1.0231923601637107,
"grad_norm": 0.42599302530288696,
"learning_rate": 1.6654993786690445e-05,
"loss": 0.1714,
"step": 375
},
{
"epoch": 1.0259208731241474,
"grad_norm": 0.42496803402900696,
"learning_rate": 1.6631226582407954e-05,
"loss": 0.1752,
"step": 376
},
{
"epoch": 1.028649386084584,
"grad_norm": 0.4142780005931854,
"learning_rate": 1.6607392331124282e-05,
"loss": 0.1689,
"step": 377
},
{
"epoch": 1.0313778990450204,
"grad_norm": 0.4593532681465149,
"learning_rate": 1.6583491273822763e-05,
"loss": 0.1788,
"step": 378
},
{
"epoch": 1.034106412005457,
"grad_norm": 0.44729867577552795,
"learning_rate": 1.6559523652162192e-05,
"loss": 0.1741,
"step": 379
},
{
"epoch": 1.0368349249658937,
"grad_norm": 0.46795180439949036,
"learning_rate": 1.653548970847438e-05,
"loss": 0.176,
"step": 380
},
{
"epoch": 1.03956343792633,
"grad_norm": 0.40044230222702026,
"learning_rate": 1.651138968576171e-05,
"loss": 0.1716,
"step": 381
},
{
"epoch": 1.0422919508867667,
"grad_norm": 0.42729461193084717,
"learning_rate": 1.6487223827694673e-05,
"loss": 0.1728,
"step": 382
},
{
"epoch": 1.0450204638472034,
"grad_norm": 0.41781216859817505,
"learning_rate": 1.646299237860941e-05,
"loss": 0.1789,
"step": 383
},
{
"epoch": 1.0477489768076398,
"grad_norm": 0.4189138114452362,
"learning_rate": 1.643869558350524e-05,
"loss": 0.1772,
"step": 384
},
{
"epoch": 1.0504774897680764,
"grad_norm": 0.4230622947216034,
"learning_rate": 1.6414333688042186e-05,
"loss": 0.1752,
"step": 385
},
{
"epoch": 1.053206002728513,
"grad_norm": 0.41642749309539795,
"learning_rate": 1.638990693853848e-05,
"loss": 0.1764,
"step": 386
},
{
"epoch": 1.0559345156889495,
"grad_norm": 0.4181545376777649,
"learning_rate": 1.6365415581968086e-05,
"loss": 0.1725,
"step": 387
},
{
"epoch": 1.058663028649386,
"grad_norm": 0.437956839799881,
"learning_rate": 1.6340859865958193e-05,
"loss": 0.1768,
"step": 388
},
{
"epoch": 1.0613915416098227,
"grad_norm": 0.4101565480232239,
"learning_rate": 1.631624003878672e-05,
"loss": 0.1706,
"step": 389
},
{
"epoch": 1.0641200545702592,
"grad_norm": 0.41338515281677246,
"learning_rate": 1.6291556349379794e-05,
"loss": 0.1729,
"step": 390
},
{
"epoch": 1.0668485675306958,
"grad_norm": 0.4137190580368042,
"learning_rate": 1.6266809047309253e-05,
"loss": 0.1797,
"step": 391
},
{
"epoch": 1.0695770804911324,
"grad_norm": 0.4079779088497162,
"learning_rate": 1.6241998382790095e-05,
"loss": 0.1741,
"step": 392
},
{
"epoch": 1.0723055934515688,
"grad_norm": 0.4103091061115265,
"learning_rate": 1.6217124606677973e-05,
"loss": 0.17,
"step": 393
},
{
"epoch": 1.0750341064120055,
"grad_norm": 0.3932867646217346,
"learning_rate": 1.6192187970466646e-05,
"loss": 0.1727,
"step": 394
},
{
"epoch": 1.077762619372442,
"grad_norm": 0.4191529154777527,
"learning_rate": 1.6167188726285433e-05,
"loss": 0.1747,
"step": 395
},
{
"epoch": 1.0804911323328785,
"grad_norm": 0.4195106327533722,
"learning_rate": 1.6142127126896682e-05,
"loss": 0.1716,
"step": 396
},
{
"epoch": 1.0832196452933152,
"grad_norm": 0.403487890958786,
"learning_rate": 1.611700342569319e-05,
"loss": 0.1757,
"step": 397
},
{
"epoch": 1.0859481582537518,
"grad_norm": 0.41124963760375977,
"learning_rate": 1.6091817876695655e-05,
"loss": 0.1729,
"step": 398
},
{
"epoch": 1.0886766712141882,
"grad_norm": 0.4118644595146179,
"learning_rate": 1.606657073455012e-05,
"loss": 0.169,
"step": 399
},
{
"epoch": 1.0914051841746248,
"grad_norm": 0.4055117666721344,
"learning_rate": 1.6041262254525362e-05,
"loss": 0.1741,
"step": 400
},
{
"epoch": 1.0941336971350615,
"grad_norm": 0.39282718300819397,
"learning_rate": 1.601589269251035e-05,
"loss": 0.174,
"step": 401
},
{
"epoch": 1.096862210095498,
"grad_norm": 0.41776013374328613,
"learning_rate": 1.599046230501163e-05,
"loss": 0.1801,
"step": 402
},
{
"epoch": 1.0995907230559345,
"grad_norm": 0.40125736594200134,
"learning_rate": 1.5964971349150746e-05,
"loss": 0.173,
"step": 403
},
{
"epoch": 1.1023192360163712,
"grad_norm": 0.40280765295028687,
"learning_rate": 1.593942008266164e-05,
"loss": 0.1727,
"step": 404
},
{
"epoch": 1.1050477489768076,
"grad_norm": 0.4229116141796112,
"learning_rate": 1.591380876388804e-05,
"loss": 0.1787,
"step": 405
},
{
"epoch": 1.1077762619372442,
"grad_norm": 0.393923819065094,
"learning_rate": 1.5888137651780847e-05,
"loss": 0.1707,
"step": 406
},
{
"epoch": 1.1105047748976808,
"grad_norm": 0.40177997946739197,
"learning_rate": 1.5862407005895524e-05,
"loss": 0.1696,
"step": 407
},
{
"epoch": 1.1132332878581173,
"grad_norm": 0.385484516620636,
"learning_rate": 1.583661708638947e-05,
"loss": 0.1698,
"step": 408
},
{
"epoch": 1.115961800818554,
"grad_norm": 0.41319334506988525,
"learning_rate": 1.5810768154019386e-05,
"loss": 0.1708,
"step": 409
},
{
"epoch": 1.1186903137789905,
"grad_norm": 0.4051019251346588,
"learning_rate": 1.5784860470138633e-05,
"loss": 0.1725,
"step": 410
},
{
"epoch": 1.121418826739427,
"grad_norm": 0.4432063102722168,
"learning_rate": 1.5758894296694614e-05,
"loss": 0.1802,
"step": 411
},
{
"epoch": 1.1241473396998636,
"grad_norm": 0.4144032597541809,
"learning_rate": 1.573286989622609e-05,
"loss": 0.1768,
"step": 412
},
{
"epoch": 1.1268758526603002,
"grad_norm": 0.40015333890914917,
"learning_rate": 1.5706787531860557e-05,
"loss": 0.1737,
"step": 413
},
{
"epoch": 1.1296043656207366,
"grad_norm": 0.40774694085121155,
"learning_rate": 1.568064746731156e-05,
"loss": 0.1764,
"step": 414
},
{
"epoch": 1.1323328785811733,
"grad_norm": 0.41893133521080017,
"learning_rate": 1.565444996687605e-05,
"loss": 0.1751,
"step": 415
},
{
"epoch": 1.13506139154161,
"grad_norm": 0.41963663697242737,
"learning_rate": 1.5628195295431696e-05,
"loss": 0.1752,
"step": 416
},
{
"epoch": 1.1377899045020463,
"grad_norm": 0.41315096616744995,
"learning_rate": 1.5601883718434207e-05,
"loss": 0.1736,
"step": 417
},
{
"epoch": 1.140518417462483,
"grad_norm": 0.40985772013664246,
"learning_rate": 1.557551550191467e-05,
"loss": 0.1725,
"step": 418
},
{
"epoch": 1.1432469304229196,
"grad_norm": 0.39127951860427856,
"learning_rate": 1.554909091247682e-05,
"loss": 0.1713,
"step": 419
},
{
"epoch": 1.145975443383356,
"grad_norm": 0.42498472332954407,
"learning_rate": 1.5522610217294377e-05,
"loss": 0.1713,
"step": 420
},
{
"epoch": 1.1487039563437926,
"grad_norm": 0.39464524388313293,
"learning_rate": 1.549607368410834e-05,
"loss": 0.1697,
"step": 421
},
{
"epoch": 1.1514324693042293,
"grad_norm": 0.3958282172679901,
"learning_rate": 1.5469481581224274e-05,
"loss": 0.1744,
"step": 422
},
{
"epoch": 1.1541609822646657,
"grad_norm": 0.40584421157836914,
"learning_rate": 1.544283417750958e-05,
"loss": 0.1738,
"step": 423
},
{
"epoch": 1.1568894952251023,
"grad_norm": 0.4232189953327179,
"learning_rate": 1.5416131742390827e-05,
"loss": 0.1755,
"step": 424
},
{
"epoch": 1.159618008185539,
"grad_norm": 0.46699848771095276,
"learning_rate": 1.5389374545850973e-05,
"loss": 0.1742,
"step": 425
},
{
"epoch": 1.1623465211459754,
"grad_norm": 0.41914746165275574,
"learning_rate": 1.5362562858426655e-05,
"loss": 0.1741,
"step": 426
},
{
"epoch": 1.165075034106412,
"grad_norm": 0.3851917088031769,
"learning_rate": 1.533569695120547e-05,
"loss": 0.1722,
"step": 427
},
{
"epoch": 1.1678035470668486,
"grad_norm": 0.4126559793949127,
"learning_rate": 1.530877709582321e-05,
"loss": 0.1731,
"step": 428
},
{
"epoch": 1.170532060027285,
"grad_norm": 0.3970852196216583,
"learning_rate": 1.5281803564461135e-05,
"loss": 0.1674,
"step": 429
},
{
"epoch": 1.1732605729877217,
"grad_norm": 0.409260094165802,
"learning_rate": 1.5254776629843204e-05,
"loss": 0.177,
"step": 430
},
{
"epoch": 1.1759890859481583,
"grad_norm": 0.4050799608230591,
"learning_rate": 1.522769656523333e-05,
"loss": 0.173,
"step": 431
},
{
"epoch": 1.1787175989085947,
"grad_norm": 0.39923250675201416,
"learning_rate": 1.5200563644432614e-05,
"loss": 0.1685,
"step": 432
},
{
"epoch": 1.1814461118690314,
"grad_norm": 0.39861583709716797,
"learning_rate": 1.5173378141776569e-05,
"loss": 0.1755,
"step": 433
},
{
"epoch": 1.184174624829468,
"grad_norm": 0.39778876304626465,
"learning_rate": 1.5146140332132359e-05,
"loss": 0.1734,
"step": 434
},
{
"epoch": 1.1869031377899044,
"grad_norm": 0.383896142244339,
"learning_rate": 1.5118850490896012e-05,
"loss": 0.1709,
"step": 435
},
{
"epoch": 1.189631650750341,
"grad_norm": 0.3906187117099762,
"learning_rate": 1.5091508893989633e-05,
"loss": 0.1713,
"step": 436
},
{
"epoch": 1.1923601637107777,
"grad_norm": 0.3931368589401245,
"learning_rate": 1.5064115817858622e-05,
"loss": 0.1773,
"step": 437
},
{
"epoch": 1.195088676671214,
"grad_norm": 0.4080300033092499,
"learning_rate": 1.5036671539468879e-05,
"loss": 0.1748,
"step": 438
},
{
"epoch": 1.1978171896316507,
"grad_norm": 0.4110572040081024,
"learning_rate": 1.5009176336303987e-05,
"loss": 0.1758,
"step": 439
},
{
"epoch": 1.2005457025920874,
"grad_norm": 0.38402611017227173,
"learning_rate": 1.4981630486362435e-05,
"loss": 0.1706,
"step": 440
},
{
"epoch": 1.2032742155525238,
"grad_norm": 0.4152999222278595,
"learning_rate": 1.4954034268154777e-05,
"loss": 0.1748,
"step": 441
},
{
"epoch": 1.2060027285129604,
"grad_norm": 0.3882010877132416,
"learning_rate": 1.4926387960700843e-05,
"loss": 0.1732,
"step": 442
},
{
"epoch": 1.208731241473397,
"grad_norm": 0.40996554493904114,
"learning_rate": 1.4898691843526897e-05,
"loss": 0.1773,
"step": 443
},
{
"epoch": 1.2114597544338335,
"grad_norm": 0.41083115339279175,
"learning_rate": 1.4870946196662822e-05,
"loss": 0.1727,
"step": 444
},
{
"epoch": 1.21418826739427,
"grad_norm": 0.39368975162506104,
"learning_rate": 1.4843151300639282e-05,
"loss": 0.1726,
"step": 445
},
{
"epoch": 1.2169167803547067,
"grad_norm": 0.4009232223033905,
"learning_rate": 1.4815307436484898e-05,
"loss": 0.1724,
"step": 446
},
{
"epoch": 1.2196452933151432,
"grad_norm": 0.3860257565975189,
"learning_rate": 1.4787414885723386e-05,
"loss": 0.1712,
"step": 447
},
{
"epoch": 1.2223738062755798,
"grad_norm": 0.37950554490089417,
"learning_rate": 1.4759473930370738e-05,
"loss": 0.1723,
"step": 448
},
{
"epoch": 1.2251023192360164,
"grad_norm": 0.3731255829334259,
"learning_rate": 1.4731484852932338e-05,
"loss": 0.1682,
"step": 449
},
{
"epoch": 1.2278308321964528,
"grad_norm": 0.39131075143814087,
"learning_rate": 1.4703447936400135e-05,
"loss": 0.169,
"step": 450
},
{
"epoch": 1.2305593451568895,
"grad_norm": 0.4061223566532135,
"learning_rate": 1.4675363464249763e-05,
"loss": 0.1769,
"step": 451
},
{
"epoch": 1.233287858117326,
"grad_norm": 0.3892337679862976,
"learning_rate": 1.4647231720437687e-05,
"loss": 0.173,
"step": 452
},
{
"epoch": 1.2360163710777625,
"grad_norm": 0.396017462015152,
"learning_rate": 1.461905298939832e-05,
"loss": 0.1737,
"step": 453
},
{
"epoch": 1.2387448840381992,
"grad_norm": 0.3907962739467621,
"learning_rate": 1.4590827556041158e-05,
"loss": 0.1699,
"step": 454
},
{
"epoch": 1.2414733969986358,
"grad_norm": 0.3987230658531189,
"learning_rate": 1.4562555705747894e-05,
"loss": 0.1755,
"step": 455
},
{
"epoch": 1.2442019099590724,
"grad_norm": 0.38845115900039673,
"learning_rate": 1.4534237724369534e-05,
"loss": 0.1743,
"step": 456
},
{
"epoch": 1.2469304229195088,
"grad_norm": 0.3965071737766266,
"learning_rate": 1.4505873898223498e-05,
"loss": 0.1729,
"step": 457
},
{
"epoch": 1.2496589358799455,
"grad_norm": 0.38698264956474304,
"learning_rate": 1.4477464514090745e-05,
"loss": 0.1711,
"step": 458
},
{
"epoch": 1.252387448840382,
"grad_norm": 0.39602330327033997,
"learning_rate": 1.4449009859212857e-05,
"loss": 0.1773,
"step": 459
},
{
"epoch": 1.2551159618008185,
"grad_norm": 0.3965539336204529,
"learning_rate": 1.4420510221289137e-05,
"loss": 0.1731,
"step": 460
},
{
"epoch": 1.2578444747612552,
"grad_norm": 0.3928743600845337,
"learning_rate": 1.4391965888473705e-05,
"loss": 0.1688,
"step": 461
},
{
"epoch": 1.2605729877216918,
"grad_norm": 0.3851865828037262,
"learning_rate": 1.4363377149372584e-05,
"loss": 0.1726,
"step": 462
},
{
"epoch": 1.2633015006821282,
"grad_norm": 0.3963593542575836,
"learning_rate": 1.4334744293040773e-05,
"loss": 0.1715,
"step": 463
},
{
"epoch": 1.2660300136425648,
"grad_norm": 0.37571755051612854,
"learning_rate": 1.430606760897934e-05,
"loss": 0.1723,
"step": 464
},
{
"epoch": 1.2687585266030013,
"grad_norm": 0.3996609151363373,
"learning_rate": 1.4277347387132482e-05,
"loss": 0.1706,
"step": 465
},
{
"epoch": 1.271487039563438,
"grad_norm": 0.403372585773468,
"learning_rate": 1.4248583917884595e-05,
"loss": 0.1714,
"step": 466
},
{
"epoch": 1.2742155525238745,
"grad_norm": 0.4006504714488983,
"learning_rate": 1.4219777492057349e-05,
"loss": 0.1712,
"step": 467
},
{
"epoch": 1.2769440654843112,
"grad_norm": 0.3809853494167328,
"learning_rate": 1.4190928400906731e-05,
"loss": 0.1652,
"step": 468
},
{
"epoch": 1.2796725784447476,
"grad_norm": 0.39101433753967285,
"learning_rate": 1.4162036936120115e-05,
"loss": 0.1725,
"step": 469
},
{
"epoch": 1.2824010914051842,
"grad_norm": 0.4025862514972687,
"learning_rate": 1.4133103389813302e-05,
"loss": 0.1736,
"step": 470
},
{
"epoch": 1.2851296043656206,
"grad_norm": 0.3864237666130066,
"learning_rate": 1.410412805452757e-05,
"loss": 0.1708,
"step": 471
},
{
"epoch": 1.2878581173260573,
"grad_norm": 0.37752318382263184,
"learning_rate": 1.4075111223226721e-05,
"loss": 0.1742,
"step": 472
},
{
"epoch": 1.290586630286494,
"grad_norm": 0.38657379150390625,
"learning_rate": 1.4046053189294114e-05,
"loss": 0.175,
"step": 473
},
{
"epoch": 1.2933151432469305,
"grad_norm": 0.3829413652420044,
"learning_rate": 1.4016954246529697e-05,
"loss": 0.1702,
"step": 474
},
{
"epoch": 1.296043656207367,
"grad_norm": 0.3982481360435486,
"learning_rate": 1.3987814689147041e-05,
"loss": 0.1708,
"step": 475
},
{
"epoch": 1.2987721691678036,
"grad_norm": 0.3672139048576355,
"learning_rate": 1.3958634811770361e-05,
"loss": 0.1673,
"step": 476
},
{
"epoch": 1.30150068212824,
"grad_norm": 0.39407840371131897,
"learning_rate": 1.3929414909431544e-05,
"loss": 0.174,
"step": 477
},
{
"epoch": 1.3042291950886766,
"grad_norm": 0.4047017991542816,
"learning_rate": 1.3900155277567157e-05,
"loss": 0.1721,
"step": 478
},
{
"epoch": 1.3069577080491133,
"grad_norm": 0.3893299102783203,
"learning_rate": 1.3870856212015468e-05,
"loss": 0.1757,
"step": 479
},
{
"epoch": 1.30968622100955,
"grad_norm": 0.38834965229034424,
"learning_rate": 1.3841518009013446e-05,
"loss": 0.1762,
"step": 480
},
{
"epoch": 1.3124147339699863,
"grad_norm": 0.39646992087364197,
"learning_rate": 1.3812140965193775e-05,
"loss": 0.1771,
"step": 481
},
{
"epoch": 1.315143246930423,
"grad_norm": 0.37843289971351624,
"learning_rate": 1.378272537758185e-05,
"loss": 0.1746,
"step": 482
},
{
"epoch": 1.3178717598908594,
"grad_norm": 0.378549724817276,
"learning_rate": 1.3753271543592772e-05,
"loss": 0.1711,
"step": 483
},
{
"epoch": 1.320600272851296,
"grad_norm": 0.3854270279407501,
"learning_rate": 1.3723779761028349e-05,
"loss": 0.1747,
"step": 484
},
{
"epoch": 1.3233287858117326,
"grad_norm": 0.38288038969039917,
"learning_rate": 1.3694250328074072e-05,
"loss": 0.1783,
"step": 485
},
{
"epoch": 1.3260572987721693,
"grad_norm": 0.3755180537700653,
"learning_rate": 1.3664683543296114e-05,
"loss": 0.1706,
"step": 486
},
{
"epoch": 1.3287858117326057,
"grad_norm": 0.38838502764701843,
"learning_rate": 1.3635079705638298e-05,
"loss": 0.1734,
"step": 487
},
{
"epoch": 1.3315143246930423,
"grad_norm": 0.3818054497241974,
"learning_rate": 1.3605439114419095e-05,
"loss": 0.1694,
"step": 488
},
{
"epoch": 1.3342428376534787,
"grad_norm": 0.3814438283443451,
"learning_rate": 1.3575762069328567e-05,
"loss": 0.1744,
"step": 489
},
{
"epoch": 1.3369713506139154,
"grad_norm": 0.39339813590049744,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.1722,
"step": 490
},
{
"epoch": 1.339699863574352,
"grad_norm": 0.380854070186615,
"learning_rate": 1.3516299818133664e-05,
"loss": 0.1695,
"step": 491
},
{
"epoch": 1.3424283765347886,
"grad_norm": 0.3817788064479828,
"learning_rate": 1.3486515213240188e-05,
"loss": 0.1712,
"step": 492
},
{
"epoch": 1.345156889495225,
"grad_norm": 0.38205572962760925,
"learning_rate": 1.3456695356891079e-05,
"loss": 0.1709,
"step": 493
},
{
"epoch": 1.3478854024556617,
"grad_norm": 0.3941291570663452,
"learning_rate": 1.3426840550588933e-05,
"loss": 0.1731,
"step": 494
},
{
"epoch": 1.350613915416098,
"grad_norm": 0.3804328143596649,
"learning_rate": 1.33969510961897e-05,
"loss": 0.1723,
"step": 495
},
{
"epoch": 1.3533424283765347,
"grad_norm": 0.3907669186592102,
"learning_rate": 1.3367027295899652e-05,
"loss": 0.1737,
"step": 496
},
{
"epoch": 1.3560709413369714,
"grad_norm": 0.3885650038719177,
"learning_rate": 1.3337069452272332e-05,
"loss": 0.1726,
"step": 497
},
{
"epoch": 1.358799454297408,
"grad_norm": 0.38135266304016113,
"learning_rate": 1.3307077868205487e-05,
"loss": 0.1757,
"step": 498
},
{
"epoch": 1.3615279672578444,
"grad_norm": 0.36993685364723206,
"learning_rate": 1.3277052846937997e-05,
"loss": 0.1718,
"step": 499
},
{
"epoch": 1.364256480218281,
"grad_norm": 0.37970247864723206,
"learning_rate": 1.3246994692046837e-05,
"loss": 0.1746,
"step": 500
},
{
"epoch": 1.3669849931787175,
"grad_norm": 0.3805428445339203,
"learning_rate": 1.321690370744397e-05,
"loss": 0.1782,
"step": 501
},
{
"epoch": 1.369713506139154,
"grad_norm": 0.3677096962928772,
"learning_rate": 1.3186780197373306e-05,
"loss": 0.1691,
"step": 502
},
{
"epoch": 1.3724420190995907,
"grad_norm": 0.38988423347473145,
"learning_rate": 1.3156624466407607e-05,
"loss": 0.1708,
"step": 503
},
{
"epoch": 1.3751705320600274,
"grad_norm": 0.38532352447509766,
"learning_rate": 1.3126436819445423e-05,
"loss": 0.174,
"step": 504
},
{
"epoch": 1.3778990450204638,
"grad_norm": 0.3830656409263611,
"learning_rate": 1.309621756170799e-05,
"loss": 0.1737,
"step": 505
},
{
"epoch": 1.3806275579809004,
"grad_norm": 0.3703681230545044,
"learning_rate": 1.3065966998736155e-05,
"loss": 0.1687,
"step": 506
},
{
"epoch": 1.3833560709413368,
"grad_norm": 0.3779120445251465,
"learning_rate": 1.3035685436387297e-05,
"loss": 0.1687,
"step": 507
},
{
"epoch": 1.3860845839017735,
"grad_norm": 0.3700374364852905,
"learning_rate": 1.300537318083221e-05,
"loss": 0.173,
"step": 508
},
{
"epoch": 1.38881309686221,
"grad_norm": 0.38560089468955994,
"learning_rate": 1.297503053855203e-05,
"loss": 0.1706,
"step": 509
},
{
"epoch": 1.3915416098226467,
"grad_norm": 0.37079334259033203,
"learning_rate": 1.2944657816335124e-05,
"loss": 0.1715,
"step": 510
},
{
"epoch": 1.3942701227830832,
"grad_norm": 0.37695810198783875,
"learning_rate": 1.2914255321273987e-05,
"loss": 0.1746,
"step": 511
},
{
"epoch": 1.3969986357435198,
"grad_norm": 0.37229034304618835,
"learning_rate": 1.2883823360762149e-05,
"loss": 0.1692,
"step": 512
},
{
"epoch": 1.3997271487039564,
"grad_norm": 0.3717256486415863,
"learning_rate": 1.2853362242491054e-05,
"loss": 0.1722,
"step": 513
},
{
"epoch": 1.4024556616643928,
"grad_norm": 0.3814941346645355,
"learning_rate": 1.2822872274446958e-05,
"loss": 0.1692,
"step": 514
},
{
"epoch": 1.4051841746248295,
"grad_norm": 0.40019774436950684,
"learning_rate": 1.2792353764907803e-05,
"loss": 0.1736,
"step": 515
},
{
"epoch": 1.407912687585266,
"grad_norm": 0.3744332790374756,
"learning_rate": 1.276180702244012e-05,
"loss": 0.1698,
"step": 516
},
{
"epoch": 1.4106412005457025,
"grad_norm": 0.3784146010875702,
"learning_rate": 1.273123235589589e-05,
"loss": 0.1701,
"step": 517
},
{
"epoch": 1.4133697135061392,
"grad_norm": 0.371044784784317,
"learning_rate": 1.2700630074409427e-05,
"loss": 0.1661,
"step": 518
},
{
"epoch": 1.4160982264665758,
"grad_norm": 0.37950316071510315,
"learning_rate": 1.2670000487394268e-05,
"loss": 0.167,
"step": 519
},
{
"epoch": 1.4188267394270122,
"grad_norm": 0.3809194564819336,
"learning_rate": 1.2639343904540008e-05,
"loss": 0.172,
"step": 520
},
{
"epoch": 1.4215552523874488,
"grad_norm": 0.3829333186149597,
"learning_rate": 1.260866063580921e-05,
"loss": 0.1715,
"step": 521
},
{
"epoch": 1.4242837653478855,
"grad_norm": 0.4065547585487366,
"learning_rate": 1.2577950991434249e-05,
"loss": 0.1681,
"step": 522
},
{
"epoch": 1.427012278308322,
"grad_norm": 0.39221319556236267,
"learning_rate": 1.254721528191417e-05,
"loss": 0.1721,
"step": 523
},
{
"epoch": 1.4297407912687585,
"grad_norm": 0.3714672327041626,
"learning_rate": 1.2516453818011567e-05,
"loss": 0.1698,
"step": 524
},
{
"epoch": 1.4324693042291952,
"grad_norm": 0.38517430424690247,
"learning_rate": 1.2485666910749427e-05,
"loss": 0.1718,
"step": 525
},
{
"epoch": 1.4351978171896316,
"grad_norm": 0.3771488070487976,
"learning_rate": 1.2454854871407993e-05,
"loss": 0.1705,
"step": 526
},
{
"epoch": 1.4379263301500682,
"grad_norm": 0.39493903517723083,
"learning_rate": 1.242401801152161e-05,
"loss": 0.1725,
"step": 527
},
{
"epoch": 1.4406548431105048,
"grad_norm": 0.36978915333747864,
"learning_rate": 1.2393156642875579e-05,
"loss": 0.1746,
"step": 528
},
{
"epoch": 1.4433833560709413,
"grad_norm": 0.37171533703804016,
"learning_rate": 1.2362271077503007e-05,
"loss": 0.1681,
"step": 529
},
{
"epoch": 1.446111869031378,
"grad_norm": 0.36533740162849426,
"learning_rate": 1.2331361627681645e-05,
"loss": 0.1663,
"step": 530
},
{
"epoch": 1.4488403819918145,
"grad_norm": 0.3751925826072693,
"learning_rate": 1.2300428605930736e-05,
"loss": 0.1731,
"step": 531
},
{
"epoch": 1.451568894952251,
"grad_norm": 0.3715348541736603,
"learning_rate": 1.2269472325007858e-05,
"loss": 0.1699,
"step": 532
},
{
"epoch": 1.4542974079126876,
"grad_norm": 0.3716941773891449,
"learning_rate": 1.2238493097905754e-05,
"loss": 0.1698,
"step": 533
},
{
"epoch": 1.4570259208731242,
"grad_norm": 0.3676636517047882,
"learning_rate": 1.2207491237849174e-05,
"loss": 0.1678,
"step": 534
},
{
"epoch": 1.4597544338335606,
"grad_norm": 0.3839435875415802,
"learning_rate": 1.2176467058291699e-05,
"loss": 0.1699,
"step": 535
},
{
"epoch": 1.4624829467939973,
"grad_norm": 0.3931127190589905,
"learning_rate": 1.2145420872912586e-05,
"loss": 0.1773,
"step": 536
},
{
"epoch": 1.465211459754434,
"grad_norm": 0.3639325499534607,
"learning_rate": 1.2114352995613582e-05,
"loss": 0.1671,
"step": 537
},
{
"epoch": 1.4679399727148703,
"grad_norm": 0.3724636137485504,
"learning_rate": 1.2083263740515764e-05,
"loss": 0.168,
"step": 538
},
{
"epoch": 1.470668485675307,
"grad_norm": 0.3660443425178528,
"learning_rate": 1.2052153421956343e-05,
"loss": 0.1661,
"step": 539
},
{
"epoch": 1.4733969986357436,
"grad_norm": 0.36283448338508606,
"learning_rate": 1.2021022354485514e-05,
"loss": 0.1656,
"step": 540
},
{
"epoch": 1.4761255115961802,
"grad_norm": 0.3686031103134155,
"learning_rate": 1.1989870852863254e-05,
"loss": 0.1693,
"step": 541
},
{
"epoch": 1.4788540245566166,
"grad_norm": 0.3700789511203766,
"learning_rate": 1.1958699232056135e-05,
"loss": 0.1681,
"step": 542
},
{
"epoch": 1.4815825375170533,
"grad_norm": 0.37259745597839355,
"learning_rate": 1.1927507807234169e-05,
"loss": 0.1706,
"step": 543
},
{
"epoch": 1.4843110504774897,
"grad_norm": 0.3932602107524872,
"learning_rate": 1.1896296893767588e-05,
"loss": 0.1702,
"step": 544
},
{
"epoch": 1.4870395634379263,
"grad_norm": 0.3860296308994293,
"learning_rate": 1.186506680722367e-05,
"loss": 0.1713,
"step": 545
},
{
"epoch": 1.489768076398363,
"grad_norm": 0.37699058651924133,
"learning_rate": 1.1833817863363563e-05,
"loss": 0.1683,
"step": 546
},
{
"epoch": 1.4924965893587996,
"grad_norm": 0.37668129801750183,
"learning_rate": 1.180255037813906e-05,
"loss": 0.1698,
"step": 547
},
{
"epoch": 1.495225102319236,
"grad_norm": 0.3677523732185364,
"learning_rate": 1.1771264667689428e-05,
"loss": 0.1678,
"step": 548
},
{
"epoch": 1.4979536152796726,
"grad_norm": 0.3719595670700073,
"learning_rate": 1.1739961048338213e-05,
"loss": 0.1636,
"step": 549
},
{
"epoch": 1.500682128240109,
"grad_norm": 0.3700611889362335,
"learning_rate": 1.1708639836590024e-05,
"loss": 0.1673,
"step": 550
},
{
"epoch": 1.5034106412005457,
"grad_norm": 0.37999942898750305,
"learning_rate": 1.1677301349127349e-05,
"loss": 0.1706,
"step": 551
},
{
"epoch": 1.5061391541609823,
"grad_norm": 0.37487849593162537,
"learning_rate": 1.164594590280734e-05,
"loss": 0.1715,
"step": 552
},
{
"epoch": 1.508867667121419,
"grad_norm": 0.37367403507232666,
"learning_rate": 1.161457381465863e-05,
"loss": 0.1698,
"step": 553
},
{
"epoch": 1.5115961800818554,
"grad_norm": 0.39386484026908875,
"learning_rate": 1.15831854018781e-05,
"loss": 0.1699,
"step": 554
},
{
"epoch": 1.514324693042292,
"grad_norm": 0.37812918424606323,
"learning_rate": 1.1551780981827699e-05,
"loss": 0.1684,
"step": 555
},
{
"epoch": 1.5170532060027284,
"grad_norm": 0.3666611611843109,
"learning_rate": 1.1520360872031208e-05,
"loss": 0.1679,
"step": 556
},
{
"epoch": 1.519781718963165,
"grad_norm": 0.36553552746772766,
"learning_rate": 1.148892539017106e-05,
"loss": 0.1645,
"step": 557
},
{
"epoch": 1.5225102319236017,
"grad_norm": 0.3726014792919159,
"learning_rate": 1.1457474854085095e-05,
"loss": 0.1746,
"step": 558
},
{
"epoch": 1.5252387448840383,
"grad_norm": 0.3792051672935486,
"learning_rate": 1.1426009581763377e-05,
"loss": 0.1682,
"step": 559
},
{
"epoch": 1.5279672578444747,
"grad_norm": 0.3550755977630615,
"learning_rate": 1.139452989134496e-05,
"loss": 0.1668,
"step": 560
},
{
"epoch": 1.5306957708049114,
"grad_norm": 0.3755435049533844,
"learning_rate": 1.1363036101114671e-05,
"loss": 0.165,
"step": 561
},
{
"epoch": 1.5334242837653478,
"grad_norm": 0.3794499337673187,
"learning_rate": 1.1331528529499909e-05,
"loss": 0.1677,
"step": 562
},
{
"epoch": 1.5361527967257844,
"grad_norm": 0.3994825780391693,
"learning_rate": 1.1300007495067403e-05,
"loss": 0.173,
"step": 563
},
{
"epoch": 1.538881309686221,
"grad_norm": 0.3852636516094208,
"learning_rate": 1.1268473316520007e-05,
"loss": 0.171,
"step": 564
},
{
"epoch": 1.5416098226466577,
"grad_norm": 0.3853999674320221,
"learning_rate": 1.123692631269348e-05,
"loss": 0.1719,
"step": 565
},
{
"epoch": 1.544338335607094,
"grad_norm": 0.3695131540298462,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.171,
"step": 566
},
{
"epoch": 1.5470668485675307,
"grad_norm": 0.3661561906337738,
"learning_rate": 1.1173795105191146e-05,
"loss": 0.166,
"step": 567
},
{
"epoch": 1.5497953615279672,
"grad_norm": 0.38201916217803955,
"learning_rate": 1.1142211539822318e-05,
"loss": 0.1673,
"step": 568
},
{
"epoch": 1.5525238744884038,
"grad_norm": 0.37060391902923584,
"learning_rate": 1.1110616425781833e-05,
"loss": 0.1675,
"step": 569
},
{
"epoch": 1.5552523874488404,
"grad_norm": 0.3723720908164978,
"learning_rate": 1.1079010082521557e-05,
"loss": 0.1688,
"step": 570
},
{
"epoch": 1.557980900409277,
"grad_norm": 0.3776531219482422,
"learning_rate": 1.1047392829606876e-05,
"loss": 0.1675,
"step": 571
},
{
"epoch": 1.5607094133697135,
"grad_norm": 0.38429945707321167,
"learning_rate": 1.101576498671349e-05,
"loss": 0.1671,
"step": 572
},
{
"epoch": 1.56343792633015,
"grad_norm": 0.3864508271217346,
"learning_rate": 1.098412687362418e-05,
"loss": 0.1721,
"step": 573
},
{
"epoch": 1.5661664392905865,
"grad_norm": 0.3664016127586365,
"learning_rate": 1.095247881022555e-05,
"loss": 0.1684,
"step": 574
},
{
"epoch": 1.5688949522510232,
"grad_norm": 0.3717040717601776,
"learning_rate": 1.0920821116504816e-05,
"loss": 0.1722,
"step": 575
},
{
"epoch": 1.5716234652114598,
"grad_norm": 0.3679288625717163,
"learning_rate": 1.0889154112546569e-05,
"loss": 0.1696,
"step": 576
},
{
"epoch": 1.5743519781718964,
"grad_norm": 0.3595617413520813,
"learning_rate": 1.0857478118529534e-05,
"loss": 0.1618,
"step": 577
},
{
"epoch": 1.5770804911323328,
"grad_norm": 0.36247459053993225,
"learning_rate": 1.0825793454723325e-05,
"loss": 0.1702,
"step": 578
},
{
"epoch": 1.5798090040927695,
"grad_norm": 0.36833876371383667,
"learning_rate": 1.079410044148522e-05,
"loss": 0.1684,
"step": 579
},
{
"epoch": 1.5825375170532059,
"grad_norm": 0.3675031065940857,
"learning_rate": 1.0762399399256917e-05,
"loss": 0.1635,
"step": 580
},
{
"epoch": 1.5852660300136425,
"grad_norm": 0.3601101040840149,
"learning_rate": 1.0730690648561293e-05,
"loss": 0.1659,
"step": 581
},
{
"epoch": 1.5879945429740792,
"grad_norm": 0.372632771730423,
"learning_rate": 1.0698974509999159e-05,
"loss": 0.1686,
"step": 582
},
{
"epoch": 1.5907230559345158,
"grad_norm": 0.37733328342437744,
"learning_rate": 1.0667251304246028e-05,
"loss": 0.1711,
"step": 583
},
{
"epoch": 1.5934515688949522,
"grad_norm": 0.36464056372642517,
"learning_rate": 1.0635521352048873e-05,
"loss": 0.1681,
"step": 584
},
{
"epoch": 1.5961800818553888,
"grad_norm": 0.37589231133461,
"learning_rate": 1.0603784974222862e-05,
"loss": 0.1688,
"step": 585
},
{
"epoch": 1.5989085948158253,
"grad_norm": 0.3540600836277008,
"learning_rate": 1.057204249164815e-05,
"loss": 0.1685,
"step": 586
},
{
"epoch": 1.601637107776262,
"grad_norm": 0.36046862602233887,
"learning_rate": 1.0540294225266608e-05,
"loss": 0.1642,
"step": 587
},
{
"epoch": 1.6043656207366985,
"grad_norm": 0.36235296726226807,
"learning_rate": 1.0508540496078582e-05,
"loss": 0.1686,
"step": 588
},
{
"epoch": 1.6070941336971352,
"grad_norm": 0.35960090160369873,
"learning_rate": 1.0476781625139655e-05,
"loss": 0.169,
"step": 589
},
{
"epoch": 1.6098226466575716,
"grad_norm": 0.37027955055236816,
"learning_rate": 1.0445017933557404e-05,
"loss": 0.1695,
"step": 590
},
{
"epoch": 1.6125511596180082,
"grad_norm": 0.37015828490257263,
"learning_rate": 1.0413249742488132e-05,
"loss": 0.1641,
"step": 591
},
{
"epoch": 1.6152796725784446,
"grad_norm": 0.36656126379966736,
"learning_rate": 1.0381477373133652e-05,
"loss": 0.1667,
"step": 592
},
{
"epoch": 1.6180081855388813,
"grad_norm": 0.364135205745697,
"learning_rate": 1.0349701146738007e-05,
"loss": 0.1652,
"step": 593
},
{
"epoch": 1.620736698499318,
"grad_norm": 0.36405521631240845,
"learning_rate": 1.0317921384584245e-05,
"loss": 0.1692,
"step": 594
},
{
"epoch": 1.6234652114597545,
"grad_norm": 0.36980974674224854,
"learning_rate": 1.0286138407991171e-05,
"loss": 0.1689,
"step": 595
},
{
"epoch": 1.626193724420191,
"grad_norm": 0.37447232007980347,
"learning_rate": 1.0254352538310075e-05,
"loss": 0.1729,
"step": 596
},
{
"epoch": 1.6289222373806276,
"grad_norm": 0.3681443929672241,
"learning_rate": 1.0222564096921505e-05,
"loss": 0.1714,
"step": 597
},
{
"epoch": 1.631650750341064,
"grad_norm": 0.36758869886398315,
"learning_rate": 1.0190773405232024e-05,
"loss": 0.1649,
"step": 598
},
{
"epoch": 1.6343792633015006,
"grad_norm": 0.36638176441192627,
"learning_rate": 1.0158980784670927e-05,
"loss": 0.1677,
"step": 599
},
{
"epoch": 1.6371077762619373,
"grad_norm": 0.35130685567855835,
"learning_rate": 1.012718655668702e-05,
"loss": 0.1643,
"step": 600
},
{
"epoch": 1.639836289222374,
"grad_norm": 0.3654751777648926,
"learning_rate": 1.0095391042745362e-05,
"loss": 0.1643,
"step": 601
},
{
"epoch": 1.6425648021828103,
"grad_norm": 0.3721977472305298,
"learning_rate": 1.0063594564324014e-05,
"loss": 0.1655,
"step": 602
},
{
"epoch": 1.645293315143247,
"grad_norm": 0.3742852509021759,
"learning_rate": 1.0031797442910788e-05,
"loss": 0.1694,
"step": 603
},
{
"epoch": 1.6480218281036834,
"grad_norm": 0.37592101097106934,
"learning_rate": 1e-05,
"loss": 0.1667,
"step": 604
},
{
"epoch": 1.65075034106412,
"grad_norm": 0.36879512667655945,
"learning_rate": 9.968202557089213e-06,
"loss": 0.1687,
"step": 605
},
{
"epoch": 1.6534788540245566,
"grad_norm": 0.36528080701828003,
"learning_rate": 9.936405435675991e-06,
"loss": 0.1679,
"step": 606
},
{
"epoch": 1.6562073669849933,
"grad_norm": 0.37423449754714966,
"learning_rate": 9.904608957254643e-06,
"loss": 0.1719,
"step": 607
},
{
"epoch": 1.65893587994543,
"grad_norm": 0.3570186495780945,
"learning_rate": 9.872813443312984e-06,
"loss": 0.1664,
"step": 608
},
{
"epoch": 1.6616643929058663,
"grad_norm": 0.357686847448349,
"learning_rate": 9.84101921532908e-06,
"loss": 0.1641,
"step": 609
},
{
"epoch": 1.6643929058663027,
"grad_norm": 0.35586750507354736,
"learning_rate": 9.809226594767979e-06,
"loss": 0.1618,
"step": 610
},
{
"epoch": 1.6671214188267394,
"grad_norm": 0.35907599329948425,
"learning_rate": 9.777435903078493e-06,
"loss": 0.1652,
"step": 611
},
{
"epoch": 1.669849931787176,
"grad_norm": 0.3660561144351959,
"learning_rate": 9.745647461689932e-06,
"loss": 0.1667,
"step": 612
},
{
"epoch": 1.6725784447476126,
"grad_norm": 0.36144155263900757,
"learning_rate": 9.713861592008834e-06,
"loss": 0.1639,
"step": 613
},
{
"epoch": 1.6753069577080493,
"grad_norm": 0.37603816390037537,
"learning_rate": 9.682078615415755e-06,
"loss": 0.1659,
"step": 614
},
{
"epoch": 1.6780354706684857,
"grad_norm": 0.3800089359283447,
"learning_rate": 9.650298853261998e-06,
"loss": 0.1673,
"step": 615
},
{
"epoch": 1.680763983628922,
"grad_norm": 0.3737815320491791,
"learning_rate": 9.618522626866351e-06,
"loss": 0.1654,
"step": 616
},
{
"epoch": 1.6834924965893587,
"grad_norm": 0.38408902287483215,
"learning_rate": 9.586750257511868e-06,
"loss": 0.1707,
"step": 617
},
{
"epoch": 1.6862210095497954,
"grad_norm": 0.36487987637519836,
"learning_rate": 9.554982066442601e-06,
"loss": 0.1689,
"step": 618
},
{
"epoch": 1.688949522510232,
"grad_norm": 0.35596176981925964,
"learning_rate": 9.523218374860348e-06,
"loss": 0.1664,
"step": 619
},
{
"epoch": 1.6916780354706686,
"grad_norm": 0.3624376654624939,
"learning_rate": 9.49145950392142e-06,
"loss": 0.1668,
"step": 620
},
{
"epoch": 1.694406548431105,
"grad_norm": 0.368150532245636,
"learning_rate": 9.459705774733397e-06,
"loss": 0.1707,
"step": 621
},
{
"epoch": 1.6971350613915415,
"grad_norm": 0.3632226586341858,
"learning_rate": 9.427957508351852e-06,
"loss": 0.1696,
"step": 622
},
{
"epoch": 1.699863574351978,
"grad_norm": 0.359633207321167,
"learning_rate": 9.39621502577714e-06,
"loss": 0.1668,
"step": 623
},
{
"epoch": 1.7025920873124147,
"grad_norm": 0.36725008487701416,
"learning_rate": 9.364478647951132e-06,
"loss": 0.1652,
"step": 624
},
{
"epoch": 1.7053206002728514,
"grad_norm": 0.3794236481189728,
"learning_rate": 9.332748695753973e-06,
"loss": 0.1701,
"step": 625
},
{
"epoch": 1.708049113233288,
"grad_norm": 0.3663654327392578,
"learning_rate": 9.301025490000843e-06,
"loss": 0.169,
"step": 626
},
{
"epoch": 1.7107776261937244,
"grad_norm": 0.3565012812614441,
"learning_rate": 9.26930935143871e-06,
"loss": 0.1691,
"step": 627
},
{
"epoch": 1.7135061391541608,
"grad_norm": 0.3741627633571625,
"learning_rate": 9.237600600743086e-06,
"loss": 0.1688,
"step": 628
},
{
"epoch": 1.7162346521145975,
"grad_norm": 0.36277323961257935,
"learning_rate": 9.20589955851478e-06,
"loss": 0.1675,
"step": 629
},
{
"epoch": 1.718963165075034,
"grad_norm": 0.35570815205574036,
"learning_rate": 9.174206545276678e-06,
"loss": 0.1673,
"step": 630
},
{
"epoch": 1.7216916780354707,
"grad_norm": 0.3580315411090851,
"learning_rate": 9.14252188147047e-06,
"loss": 0.1615,
"step": 631
},
{
"epoch": 1.7244201909959074,
"grad_norm": 0.3664811849594116,
"learning_rate": 9.11084588745343e-06,
"loss": 0.1677,
"step": 632
},
{
"epoch": 1.7271487039563438,
"grad_norm": 0.37821367383003235,
"learning_rate": 9.07917888349519e-06,
"loss": 0.1671,
"step": 633
},
{
"epoch": 1.7298772169167802,
"grad_norm": 0.35077428817749023,
"learning_rate": 9.047521189774456e-06,
"loss": 0.1653,
"step": 634
},
{
"epoch": 1.7326057298772168,
"grad_norm": 0.3558577597141266,
"learning_rate": 9.015873126375822e-06,
"loss": 0.1638,
"step": 635
},
{
"epoch": 1.7353342428376535,
"grad_norm": 0.34969979524612427,
"learning_rate": 8.984235013286512e-06,
"loss": 0.1628,
"step": 636
},
{
"epoch": 1.73806275579809,
"grad_norm": 0.37003517150878906,
"learning_rate": 8.952607170393126e-06,
"loss": 0.1691,
"step": 637
},
{
"epoch": 1.7407912687585267,
"grad_norm": 0.36189723014831543,
"learning_rate": 8.920989917478446e-06,
"loss": 0.1624,
"step": 638
},
{
"epoch": 1.7435197817189632,
"grad_norm": 0.3659520745277405,
"learning_rate": 8.88938357421817e-06,
"loss": 0.1669,
"step": 639
},
{
"epoch": 1.7462482946793996,
"grad_norm": 0.37601613998413086,
"learning_rate": 8.857788460177685e-06,
"loss": 0.1691,
"step": 640
},
{
"epoch": 1.7489768076398362,
"grad_norm": 0.36393433809280396,
"learning_rate": 8.826204894808856e-06,
"loss": 0.1643,
"step": 641
},
{
"epoch": 1.7517053206002728,
"grad_norm": 0.3598528802394867,
"learning_rate": 8.79463319744677e-06,
"loss": 0.1686,
"step": 642
},
{
"epoch": 1.7544338335607095,
"grad_norm": 0.3672749102115631,
"learning_rate": 8.763073687306523e-06,
"loss": 0.1661,
"step": 643
},
{
"epoch": 1.7571623465211461,
"grad_norm": 0.3553427457809448,
"learning_rate": 8.731526683479991e-06,
"loss": 0.168,
"step": 644
},
{
"epoch": 1.7598908594815825,
"grad_norm": 0.36217623949050903,
"learning_rate": 8.699992504932599e-06,
"loss": 0.1667,
"step": 645
},
{
"epoch": 1.762619372442019,
"grad_norm": 0.3513911962509155,
"learning_rate": 8.668471470500094e-06,
"loss": 0.1646,
"step": 646
},
{
"epoch": 1.7653478854024556,
"grad_norm": 0.36681661009788513,
"learning_rate": 8.63696389888533e-06,
"loss": 0.1702,
"step": 647
},
{
"epoch": 1.7680763983628922,
"grad_norm": 0.35328978300094604,
"learning_rate": 8.605470108655046e-06,
"loss": 0.1675,
"step": 648
},
{
"epoch": 1.7708049113233288,
"grad_norm": 0.3467901349067688,
"learning_rate": 8.573990418236626e-06,
"loss": 0.1684,
"step": 649
},
{
"epoch": 1.7735334242837655,
"grad_norm": 0.355685293674469,
"learning_rate": 8.542525145914907e-06,
"loss": 0.1637,
"step": 650
},
{
"epoch": 1.776261937244202,
"grad_norm": 0.393255352973938,
"learning_rate": 8.511074609828944e-06,
"loss": 0.1665,
"step": 651
},
{
"epoch": 1.7789904502046383,
"grad_norm": 0.34803488850593567,
"learning_rate": 8.479639127968793e-06,
"loss": 0.1643,
"step": 652
},
{
"epoch": 1.781718963165075,
"grad_norm": 0.37034231424331665,
"learning_rate": 8.448219018172303e-06,
"loss": 0.1684,
"step": 653
},
{
"epoch": 1.7844474761255116,
"grad_norm": 0.3687577247619629,
"learning_rate": 8.416814598121901e-06,
"loss": 0.1657,
"step": 654
},
{
"epoch": 1.7871759890859482,
"grad_norm": 0.3742063045501709,
"learning_rate": 8.385426185341374e-06,
"loss": 0.161,
"step": 655
},
{
"epoch": 1.7899045020463848,
"grad_norm": 0.36773720383644104,
"learning_rate": 8.35405409719266e-06,
"loss": 0.1648,
"step": 656
},
{
"epoch": 1.7926330150068213,
"grad_norm": 0.3640764653682709,
"learning_rate": 8.322698650872656e-06,
"loss": 0.1662,
"step": 657
},
{
"epoch": 1.795361527967258,
"grad_norm": 0.3551373779773712,
"learning_rate": 8.291360163409978e-06,
"loss": 0.1612,
"step": 658
},
{
"epoch": 1.7980900409276943,
"grad_norm": 0.3683207631111145,
"learning_rate": 8.260038951661787e-06,
"loss": 0.1691,
"step": 659
},
{
"epoch": 1.800818553888131,
"grad_norm": 0.37494397163391113,
"learning_rate": 8.228735332310575e-06,
"loss": 0.1682,
"step": 660
},
{
"epoch": 1.8035470668485676,
"grad_norm": 0.35092827677726746,
"learning_rate": 8.197449621860944e-06,
"loss": 0.1669,
"step": 661
},
{
"epoch": 1.8062755798090042,
"grad_norm": 0.342723548412323,
"learning_rate": 8.16618213663644e-06,
"loss": 0.1617,
"step": 662
},
{
"epoch": 1.8090040927694406,
"grad_norm": 0.3787606656551361,
"learning_rate": 8.134933192776333e-06,
"loss": 0.1714,
"step": 663
},
{
"epoch": 1.8117326057298773,
"grad_norm": 0.36189302802085876,
"learning_rate": 8.103703106232416e-06,
"loss": 0.1658,
"step": 664
},
{
"epoch": 1.8144611186903137,
"grad_norm": 0.3539157807826996,
"learning_rate": 8.072492192765833e-06,
"loss": 0.1609,
"step": 665
},
{
"epoch": 1.8171896316507503,
"grad_norm": 0.35902145504951477,
"learning_rate": 8.041300767943867e-06,
"loss": 0.1636,
"step": 666
},
{
"epoch": 1.819918144611187,
"grad_norm": 0.3730703890323639,
"learning_rate": 8.010129147136749e-06,
"loss": 0.1665,
"step": 667
},
{
"epoch": 1.8226466575716236,
"grad_norm": 0.3461478650569916,
"learning_rate": 7.978977645514488e-06,
"loss": 0.1615,
"step": 668
},
{
"epoch": 1.82537517053206,
"grad_norm": 0.36380550265312195,
"learning_rate": 7.947846578043658e-06,
"loss": 0.1667,
"step": 669
},
{
"epoch": 1.8281036834924966,
"grad_norm": 0.3588198125362396,
"learning_rate": 7.916736259484239e-06,
"loss": 0.1634,
"step": 670
},
{
"epoch": 1.830832196452933,
"grad_norm": 0.3543216586112976,
"learning_rate": 7.885647004386421e-06,
"loss": 0.1637,
"step": 671
},
{
"epoch": 1.8335607094133697,
"grad_norm": 0.3527906835079193,
"learning_rate": 7.854579127087418e-06,
"loss": 0.1636,
"step": 672
},
{
"epoch": 1.8362892223738063,
"grad_norm": 0.3463229537010193,
"learning_rate": 7.823532941708305e-06,
"loss": 0.1617,
"step": 673
},
{
"epoch": 1.839017735334243,
"grad_norm": 0.36647194623947144,
"learning_rate": 7.792508762150833e-06,
"loss": 0.1672,
"step": 674
},
{
"epoch": 1.8417462482946794,
"grad_norm": 0.35416069626808167,
"learning_rate": 7.761506902094248e-06,
"loss": 0.1633,
"step": 675
},
{
"epoch": 1.844474761255116,
"grad_norm": 0.3427625000476837,
"learning_rate": 7.730527674992143e-06,
"loss": 0.1662,
"step": 676
},
{
"epoch": 1.8472032742155524,
"grad_norm": 0.3429381251335144,
"learning_rate": 7.699571394069269e-06,
"loss": 0.1594,
"step": 677
},
{
"epoch": 1.849931787175989,
"grad_norm": 0.3431464433670044,
"learning_rate": 7.668638372318359e-06,
"loss": 0.1596,
"step": 678
},
{
"epoch": 1.8526603001364257,
"grad_norm": 0.36854469776153564,
"learning_rate": 7.637728922496996e-06,
"loss": 0.1686,
"step": 679
},
{
"epoch": 1.8553888130968623,
"grad_norm": 0.3657079339027405,
"learning_rate": 7.606843357124426e-06,
"loss": 0.1684,
"step": 680
},
{
"epoch": 1.8581173260572987,
"grad_norm": 0.3696242868900299,
"learning_rate": 7.575981988478393e-06,
"loss": 0.1693,
"step": 681
},
{
"epoch": 1.8608458390177354,
"grad_norm": 0.36820054054260254,
"learning_rate": 7.545145128592009e-06,
"loss": 0.169,
"step": 682
},
{
"epoch": 1.8635743519781718,
"grad_norm": 0.35837873816490173,
"learning_rate": 7.514333089250577e-06,
"loss": 0.1659,
"step": 683
},
{
"epoch": 1.8663028649386084,
"grad_norm": 0.35923945903778076,
"learning_rate": 7.483546181988437e-06,
"loss": 0.1683,
"step": 684
},
{
"epoch": 1.869031377899045,
"grad_norm": 0.35063204169273376,
"learning_rate": 7.452784718085834e-06,
"loss": 0.1611,
"step": 685
},
{
"epoch": 1.8717598908594817,
"grad_norm": 0.3463192582130432,
"learning_rate": 7.422049008565757e-06,
"loss": 0.1648,
"step": 686
},
{
"epoch": 1.874488403819918,
"grad_norm": 0.34152331948280334,
"learning_rate": 7.391339364190794e-06,
"loss": 0.1602,
"step": 687
},
{
"epoch": 1.8772169167803547,
"grad_norm": 0.34842997789382935,
"learning_rate": 7.360656095459995e-06,
"loss": 0.1644,
"step": 688
},
{
"epoch": 1.8799454297407912,
"grad_norm": 0.3455371856689453,
"learning_rate": 7.329999512605738e-06,
"loss": 0.1631,
"step": 689
},
{
"epoch": 1.8826739427012278,
"grad_norm": 0.34828466176986694,
"learning_rate": 7.299369925590575e-06,
"loss": 0.1618,
"step": 690
},
{
"epoch": 1.8854024556616644,
"grad_norm": 0.36350730061531067,
"learning_rate": 7.268767644104114e-06,
"loss": 0.1653,
"step": 691
},
{
"epoch": 1.888130968622101,
"grad_norm": 0.35511481761932373,
"learning_rate": 7.2381929775598835e-06,
"loss": 0.1598,
"step": 692
},
{
"epoch": 1.8908594815825375,
"grad_norm": 0.34677204489707947,
"learning_rate": 7.207646235092201e-06,
"loss": 0.1603,
"step": 693
},
{
"epoch": 1.893587994542974,
"grad_norm": 0.3588501811027527,
"learning_rate": 7.1771277255530456e-06,
"loss": 0.1638,
"step": 694
},
{
"epoch": 1.8963165075034105,
"grad_norm": 0.3464423716068268,
"learning_rate": 7.14663775750895e-06,
"loss": 0.1635,
"step": 695
},
{
"epoch": 1.8990450204638472,
"grad_norm": 0.3492993116378784,
"learning_rate": 7.116176639237853e-06,
"loss": 0.1629,
"step": 696
},
{
"epoch": 1.9017735334242838,
"grad_norm": 0.35050979256629944,
"learning_rate": 7.085744678726013e-06,
"loss": 0.1661,
"step": 697
},
{
"epoch": 1.9045020463847204,
"grad_norm": 0.3559916317462921,
"learning_rate": 7.05534218366488e-06,
"loss": 0.165,
"step": 698
},
{
"epoch": 1.9072305593451568,
"grad_norm": 0.34668344259262085,
"learning_rate": 7.024969461447973e-06,
"loss": 0.1624,
"step": 699
},
{
"epoch": 1.9099590723055935,
"grad_norm": 0.3400524854660034,
"learning_rate": 6.994626819167789e-06,
"loss": 0.1631,
"step": 700
},
{
"epoch": 1.9126875852660299,
"grad_norm": 0.3597893714904785,
"learning_rate": 6.964314563612709e-06,
"loss": 0.166,
"step": 701
},
{
"epoch": 1.9154160982264665,
"grad_norm": 0.35054320096969604,
"learning_rate": 6.934033001263847e-06,
"loss": 0.1667,
"step": 702
},
{
"epoch": 1.9181446111869032,
"grad_norm": 0.3557494878768921,
"learning_rate": 6.9037824382920145e-06,
"loss": 0.1644,
"step": 703
},
{
"epoch": 1.9208731241473398,
"grad_norm": 0.34000909328460693,
"learning_rate": 6.873563180554583e-06,
"loss": 0.1603,
"step": 704
},
{
"epoch": 1.9236016371077762,
"grad_norm": 0.35631030797958374,
"learning_rate": 6.843375533592395e-06,
"loss": 0.1661,
"step": 705
},
{
"epoch": 1.9263301500682128,
"grad_norm": 0.3655133843421936,
"learning_rate": 6.813219802626698e-06,
"loss": 0.1639,
"step": 706
},
{
"epoch": 1.9290586630286493,
"grad_norm": 0.3506717085838318,
"learning_rate": 6.783096292556035e-06,
"loss": 0.1674,
"step": 707
},
{
"epoch": 1.931787175989086,
"grad_norm": 0.34663382172584534,
"learning_rate": 6.7530053079531664e-06,
"loss": 0.1652,
"step": 708
},
{
"epoch": 1.9345156889495225,
"grad_norm": 0.34880849719047546,
"learning_rate": 6.722947153062003e-06,
"loss": 0.1647,
"step": 709
},
{
"epoch": 1.9372442019099592,
"grad_norm": 0.3882060945034027,
"learning_rate": 6.692922131794517e-06,
"loss": 0.1693,
"step": 710
},
{
"epoch": 1.9399727148703958,
"grad_norm": 0.35033705830574036,
"learning_rate": 6.662930547727668e-06,
"loss": 0.1648,
"step": 711
},
{
"epoch": 1.9427012278308322,
"grad_norm": 0.35732313990592957,
"learning_rate": 6.632972704100349e-06,
"loss": 0.1621,
"step": 712
},
{
"epoch": 1.9454297407912686,
"grad_norm": 0.33822888135910034,
"learning_rate": 6.603048903810305e-06,
"loss": 0.1598,
"step": 713
},
{
"epoch": 1.9481582537517053,
"grad_norm": 0.3523855209350586,
"learning_rate": 6.573159449411071e-06,
"loss": 0.164,
"step": 714
},
{
"epoch": 1.950886766712142,
"grad_norm": 0.38246482610702515,
"learning_rate": 6.5433046431089205e-06,
"loss": 0.1668,
"step": 715
},
{
"epoch": 1.9536152796725785,
"grad_norm": 0.3540845811367035,
"learning_rate": 6.513484786759818e-06,
"loss": 0.1646,
"step": 716
},
{
"epoch": 1.9563437926330152,
"grad_norm": 0.34382006525993347,
"learning_rate": 6.483700181866337e-06,
"loss": 0.1621,
"step": 717
},
{
"epoch": 1.9590723055934516,
"grad_norm": 0.35310429334640503,
"learning_rate": 6.453951129574644e-06,
"loss": 0.1678,
"step": 718
},
{
"epoch": 1.961800818553888,
"grad_norm": 0.3334411680698395,
"learning_rate": 6.42423793067144e-06,
"loss": 0.1572,
"step": 719
},
{
"epoch": 1.9645293315143246,
"grad_norm": 0.3550149202346802,
"learning_rate": 6.39456088558091e-06,
"loss": 0.1668,
"step": 720
},
{
"epoch": 1.9672578444747613,
"grad_norm": 0.35232090950012207,
"learning_rate": 6.364920294361701e-06,
"loss": 0.1621,
"step": 721
},
{
"epoch": 1.969986357435198,
"grad_norm": 0.3518090844154358,
"learning_rate": 6.335316456703891e-06,
"loss": 0.1589,
"step": 722
},
{
"epoch": 1.9727148703956345,
"grad_norm": 0.34866175055503845,
"learning_rate": 6.3057496719259314e-06,
"loss": 0.1664,
"step": 723
},
{
"epoch": 1.975443383356071,
"grad_norm": 0.3425263464450836,
"learning_rate": 6.276220238971653e-06,
"loss": 0.1606,
"step": 724
},
{
"epoch": 1.9781718963165074,
"grad_norm": 0.35735857486724854,
"learning_rate": 6.2467284564072294e-06,
"loss": 0.1632,
"step": 725
},
{
"epoch": 1.980900409276944,
"grad_norm": 0.342393159866333,
"learning_rate": 6.2172746224181524e-06,
"loss": 0.1604,
"step": 726
},
{
"epoch": 1.9836289222373806,
"grad_norm": 0.34849631786346436,
"learning_rate": 6.187859034806225e-06,
"loss": 0.1619,
"step": 727
},
{
"epoch": 1.9863574351978173,
"grad_norm": 0.34562066197395325,
"learning_rate": 6.158481990986558e-06,
"loss": 0.1637,
"step": 728
},
{
"epoch": 1.989085948158254,
"grad_norm": 0.3466956913471222,
"learning_rate": 6.1291437879845335e-06,
"loss": 0.1628,
"step": 729
},
{
"epoch": 1.9918144611186903,
"grad_norm": 0.3530554175376892,
"learning_rate": 6.099844722432844e-06,
"loss": 0.1608,
"step": 730
},
{
"epoch": 1.9945429740791267,
"grad_norm": 0.33960825204849243,
"learning_rate": 6.07058509056846e-06,
"loss": 0.1602,
"step": 731
},
{
"epoch": 1.9972714870395634,
"grad_norm": 0.3492552936077118,
"learning_rate": 6.041365188229641e-06,
"loss": 0.1669,
"step": 732
},
{
"epoch": 2.0,
"grad_norm": 0.3499084413051605,
"learning_rate": 6.012185310852962e-06,
"loss": 0.1638,
"step": 733
},
{
"epoch": 2.0027285129604366,
"grad_norm": 0.3995380997657776,
"learning_rate": 5.983045753470308e-06,
"loss": 0.134,
"step": 734
},
{
"epoch": 2.0054570259208733,
"grad_norm": 0.4106523096561432,
"learning_rate": 5.9539468107058885e-06,
"loss": 0.1382,
"step": 735
},
{
"epoch": 2.00818553888131,
"grad_norm": 0.3607160151004791,
"learning_rate": 5.924888776773281e-06,
"loss": 0.137,
"step": 736
},
{
"epoch": 2.010914051841746,
"grad_norm": 0.3377709686756134,
"learning_rate": 5.895871945472434e-06,
"loss": 0.1303,
"step": 737
},
{
"epoch": 2.0136425648021827,
"grad_norm": 0.33374232053756714,
"learning_rate": 5.866896610186701e-06,
"loss": 0.1324,
"step": 738
},
{
"epoch": 2.0163710777626194,
"grad_norm": 0.3930217921733856,
"learning_rate": 5.8379630638798845e-06,
"loss": 0.1324,
"step": 739
},
{
"epoch": 2.019099590723056,
"grad_norm": 0.44687938690185547,
"learning_rate": 5.809071599093272e-06,
"loss": 0.1345,
"step": 740
},
{
"epoch": 2.0218281036834926,
"grad_norm": 0.4330204129219055,
"learning_rate": 5.780222507942654e-06,
"loss": 0.1288,
"step": 741
},
{
"epoch": 2.0245566166439293,
"grad_norm": 0.41968345642089844,
"learning_rate": 5.7514160821154085e-06,
"loss": 0.1313,
"step": 742
},
{
"epoch": 2.0272851296043655,
"grad_norm": 0.3952399790287018,
"learning_rate": 5.7226526128675234e-06,
"loss": 0.1326,
"step": 743
},
{
"epoch": 2.030013642564802,
"grad_norm": 0.37134867906570435,
"learning_rate": 5.693932391020664e-06,
"loss": 0.1299,
"step": 744
},
{
"epoch": 2.0327421555252387,
"grad_norm": 0.39517930150032043,
"learning_rate": 5.665255706959231e-06,
"loss": 0.1331,
"step": 745
},
{
"epoch": 2.0354706684856754,
"grad_norm": 0.3553452491760254,
"learning_rate": 5.63662285062742e-06,
"loss": 0.1293,
"step": 746
},
{
"epoch": 2.038199181446112,
"grad_norm": 0.3541392683982849,
"learning_rate": 5.608034111526298e-06,
"loss": 0.1329,
"step": 747
},
{
"epoch": 2.0409276944065486,
"grad_norm": 0.3674290180206299,
"learning_rate": 5.579489778710867e-06,
"loss": 0.1323,
"step": 748
},
{
"epoch": 2.043656207366985,
"grad_norm": 0.3576567471027374,
"learning_rate": 5.550990140787147e-06,
"loss": 0.1314,
"step": 749
},
{
"epoch": 2.0463847203274215,
"grad_norm": 0.34979933500289917,
"learning_rate": 5.522535485909258e-06,
"loss": 0.1288,
"step": 750
},
{
"epoch": 2.049113233287858,
"grad_norm": 0.3478044867515564,
"learning_rate": 5.494126101776505e-06,
"loss": 0.1308,
"step": 751
},
{
"epoch": 2.0518417462482947,
"grad_norm": 0.35146066546440125,
"learning_rate": 5.465762275630471e-06,
"loss": 0.1301,
"step": 752
},
{
"epoch": 2.0545702592087314,
"grad_norm": 0.36018314957618713,
"learning_rate": 5.437444294252108e-06,
"loss": 0.1306,
"step": 753
},
{
"epoch": 2.057298772169168,
"grad_norm": 0.37735897302627563,
"learning_rate": 5.409172443958844e-06,
"loss": 0.1353,
"step": 754
},
{
"epoch": 2.060027285129604,
"grad_norm": 0.36527469754219055,
"learning_rate": 5.380947010601681e-06,
"loss": 0.1338,
"step": 755
},
{
"epoch": 2.062755798090041,
"grad_norm": 0.3785851001739502,
"learning_rate": 5.352768279562315e-06,
"loss": 0.1299,
"step": 756
},
{
"epoch": 2.0654843110504775,
"grad_norm": 0.3725038468837738,
"learning_rate": 5.324636535750238e-06,
"loss": 0.1312,
"step": 757
},
{
"epoch": 2.068212824010914,
"grad_norm": 0.3610471487045288,
"learning_rate": 5.2965520635998676e-06,
"loss": 0.1299,
"step": 758
},
{
"epoch": 2.0709413369713507,
"grad_norm": 0.3752604126930237,
"learning_rate": 5.268515147067666e-06,
"loss": 0.1322,
"step": 759
},
{
"epoch": 2.0736698499317874,
"grad_norm": 0.3641640245914459,
"learning_rate": 5.240526069629265e-06,
"loss": 0.1314,
"step": 760
},
{
"epoch": 2.0763983628922236,
"grad_norm": 0.3561566472053528,
"learning_rate": 5.212585114276614e-06,
"loss": 0.1301,
"step": 761
},
{
"epoch": 2.07912687585266,
"grad_norm": 0.3664422333240509,
"learning_rate": 5.184692563515104e-06,
"loss": 0.1315,
"step": 762
},
{
"epoch": 2.081855388813097,
"grad_norm": 0.3476559519767761,
"learning_rate": 5.156848699360719e-06,
"loss": 0.1293,
"step": 763
},
{
"epoch": 2.0845839017735335,
"grad_norm": 0.35268449783325195,
"learning_rate": 5.129053803337181e-06,
"loss": 0.1313,
"step": 764
},
{
"epoch": 2.08731241473397,
"grad_norm": 0.36053764820098877,
"learning_rate": 5.101308156473104e-06,
"loss": 0.1304,
"step": 765
},
{
"epoch": 2.0900409276944067,
"grad_norm": 0.3567509949207306,
"learning_rate": 5.073612039299157e-06,
"loss": 0.1309,
"step": 766
},
{
"epoch": 2.092769440654843,
"grad_norm": 0.35490044951438904,
"learning_rate": 5.045965731845223e-06,
"loss": 0.132,
"step": 767
},
{
"epoch": 2.0954979536152796,
"grad_norm": 0.3639454245567322,
"learning_rate": 5.018369513637567e-06,
"loss": 0.1311,
"step": 768
},
{
"epoch": 2.098226466575716,
"grad_norm": 0.3668364882469177,
"learning_rate": 4.990823663696013e-06,
"loss": 0.1301,
"step": 769
},
{
"epoch": 2.100954979536153,
"grad_norm": 0.3602748215198517,
"learning_rate": 4.963328460531127e-06,
"loss": 0.1316,
"step": 770
},
{
"epoch": 2.1036834924965895,
"grad_norm": 0.3594209551811218,
"learning_rate": 4.9358841821413775e-06,
"loss": 0.1292,
"step": 771
},
{
"epoch": 2.106412005457026,
"grad_norm": 0.36325401067733765,
"learning_rate": 4.908491106010368e-06,
"loss": 0.1333,
"step": 772
},
{
"epoch": 2.1091405184174623,
"grad_norm": 0.36963802576065063,
"learning_rate": 4.881149509103993e-06,
"loss": 0.1297,
"step": 773
},
{
"epoch": 2.111869031377899,
"grad_norm": 0.36008724570274353,
"learning_rate": 4.853859667867641e-06,
"loss": 0.1299,
"step": 774
},
{
"epoch": 2.1145975443383356,
"grad_norm": 0.3590433895587921,
"learning_rate": 4.826621858223431e-06,
"loss": 0.1315,
"step": 775
},
{
"epoch": 2.117326057298772,
"grad_norm": 0.3663657009601593,
"learning_rate": 4.799436355567391e-06,
"loss": 0.1314,
"step": 776
},
{
"epoch": 2.120054570259209,
"grad_norm": 0.3599216938018799,
"learning_rate": 4.772303434766669e-06,
"loss": 0.1307,
"step": 777
},
{
"epoch": 2.1227830832196455,
"grad_norm": 0.36558717489242554,
"learning_rate": 4.745223370156797e-06,
"loss": 0.1323,
"step": 778
},
{
"epoch": 2.1255115961800817,
"grad_norm": 0.3608526289463043,
"learning_rate": 4.7181964355388695e-06,
"loss": 0.1319,
"step": 779
},
{
"epoch": 2.1282401091405183,
"grad_norm": 0.35885030031204224,
"learning_rate": 4.691222904176791e-06,
"loss": 0.1323,
"step": 780
},
{
"epoch": 2.130968622100955,
"grad_norm": 0.3633890151977539,
"learning_rate": 4.664303048794533e-06,
"loss": 0.1323,
"step": 781
},
{
"epoch": 2.1336971350613916,
"grad_norm": 0.36755603551864624,
"learning_rate": 4.63743714157335e-06,
"loss": 0.1342,
"step": 782
},
{
"epoch": 2.136425648021828,
"grad_norm": 0.3608950674533844,
"learning_rate": 4.610625454149033e-06,
"loss": 0.1319,
"step": 783
},
{
"epoch": 2.139154160982265,
"grad_norm": 0.35920849442481995,
"learning_rate": 4.583868257609171e-06,
"loss": 0.1312,
"step": 784
},
{
"epoch": 2.141882673942701,
"grad_norm": 0.36166754364967346,
"learning_rate": 4.55716582249042e-06,
"loss": 0.1323,
"step": 785
},
{
"epoch": 2.1446111869031377,
"grad_norm": 0.37216901779174805,
"learning_rate": 4.530518418775734e-06,
"loss": 0.1304,
"step": 786
},
{
"epoch": 2.1473396998635743,
"grad_norm": 0.3564962148666382,
"learning_rate": 4.50392631589166e-06,
"loss": 0.1324,
"step": 787
},
{
"epoch": 2.150068212824011,
"grad_norm": 0.35313740372657776,
"learning_rate": 4.477389782705628e-06,
"loss": 0.1297,
"step": 788
},
{
"epoch": 2.1527967257844476,
"grad_norm": 0.35927248001098633,
"learning_rate": 4.4509090875231865e-06,
"loss": 0.1331,
"step": 789
},
{
"epoch": 2.155525238744884,
"grad_norm": 0.3569164276123047,
"learning_rate": 4.424484498085335e-06,
"loss": 0.1328,
"step": 790
},
{
"epoch": 2.1582537517053204,
"grad_norm": 0.36442428827285767,
"learning_rate": 4.398116281565794e-06,
"loss": 0.1313,
"step": 791
},
{
"epoch": 2.160982264665757,
"grad_norm": 0.35785555839538574,
"learning_rate": 4.371804704568309e-06,
"loss": 0.1296,
"step": 792
},
{
"epoch": 2.1637107776261937,
"grad_norm": 0.3561733663082123,
"learning_rate": 4.345550033123954e-06,
"loss": 0.1306,
"step": 793
},
{
"epoch": 2.1664392905866303,
"grad_norm": 0.36286094784736633,
"learning_rate": 4.319352532688444e-06,
"loss": 0.1324,
"step": 794
},
{
"epoch": 2.169167803547067,
"grad_norm": 0.35654789209365845,
"learning_rate": 4.293212468139447e-06,
"loss": 0.1292,
"step": 795
},
{
"epoch": 2.1718963165075036,
"grad_norm": 0.3664185404777527,
"learning_rate": 4.267130103773911e-06,
"loss": 0.1304,
"step": 796
},
{
"epoch": 2.17462482946794,
"grad_norm": 0.3567107021808624,
"learning_rate": 4.241105703305388e-06,
"loss": 0.1297,
"step": 797
},
{
"epoch": 2.1773533424283764,
"grad_norm": 0.3579164743423462,
"learning_rate": 4.2151395298613675e-06,
"loss": 0.1305,
"step": 798
},
{
"epoch": 2.180081855388813,
"grad_norm": 0.35019171237945557,
"learning_rate": 4.189231845980618e-06,
"loss": 0.131,
"step": 799
},
{
"epoch": 2.1828103683492497,
"grad_norm": 0.3644610047340393,
"learning_rate": 4.163382913610533e-06,
"loss": 0.1314,
"step": 800
},
{
"epoch": 2.1855388813096863,
"grad_norm": 0.3550662398338318,
"learning_rate": 4.137592994104479e-06,
"loss": 0.1297,
"step": 801
},
{
"epoch": 2.188267394270123,
"grad_norm": 0.37268707156181335,
"learning_rate": 4.111862348219158e-06,
"loss": 0.1311,
"step": 802
},
{
"epoch": 2.190995907230559,
"grad_norm": 0.35408931970596313,
"learning_rate": 4.086191236111964e-06,
"loss": 0.13,
"step": 803
},
{
"epoch": 2.193724420190996,
"grad_norm": 0.36723992228507996,
"learning_rate": 4.060579917338362e-06,
"loss": 0.1345,
"step": 804
},
{
"epoch": 2.1964529331514324,
"grad_norm": 0.3593861758708954,
"learning_rate": 4.035028650849255e-06,
"loss": 0.1322,
"step": 805
},
{
"epoch": 2.199181446111869,
"grad_norm": 0.3648458421230316,
"learning_rate": 4.009537694988372e-06,
"loss": 0.1341,
"step": 806
},
{
"epoch": 2.2019099590723057,
"grad_norm": 0.361258327960968,
"learning_rate": 3.984107307489652e-06,
"loss": 0.1305,
"step": 807
},
{
"epoch": 2.2046384720327423,
"grad_norm": 0.35904020071029663,
"learning_rate": 3.958737745474638e-06,
"loss": 0.1279,
"step": 808
},
{
"epoch": 2.2073669849931785,
"grad_norm": 0.3607647120952606,
"learning_rate": 3.933429265449882e-06,
"loss": 0.1299,
"step": 809
},
{
"epoch": 2.210095497953615,
"grad_norm": 0.3666648268699646,
"learning_rate": 3.908182123304344e-06,
"loss": 0.1309,
"step": 810
},
{
"epoch": 2.212824010914052,
"grad_norm": 0.3656477928161621,
"learning_rate": 3.882996574306818e-06,
"loss": 0.1334,
"step": 811
},
{
"epoch": 2.2155525238744884,
"grad_norm": 0.35920169949531555,
"learning_rate": 3.857872873103322e-06,
"loss": 0.1328,
"step": 812
},
{
"epoch": 2.218281036834925,
"grad_norm": 0.3563489317893982,
"learning_rate": 3.832811273714569e-06,
"loss": 0.1327,
"step": 813
},
{
"epoch": 2.2210095497953617,
"grad_norm": 0.363343209028244,
"learning_rate": 3.807812029533362e-06,
"loss": 0.13,
"step": 814
},
{
"epoch": 2.223738062755798,
"grad_norm": 0.36663034558296204,
"learning_rate": 3.78287539332203e-06,
"loss": 0.1319,
"step": 815
},
{
"epoch": 2.2264665757162345,
"grad_norm": 0.35951149463653564,
"learning_rate": 3.7580016172099067e-06,
"loss": 0.1306,
"step": 816
},
{
"epoch": 2.229195088676671,
"grad_norm": 0.35489046573638916,
"learning_rate": 3.7331909526907527e-06,
"loss": 0.1293,
"step": 817
},
{
"epoch": 2.231923601637108,
"grad_norm": 0.3771286904811859,
"learning_rate": 3.708443650620206e-06,
"loss": 0.1338,
"step": 818
},
{
"epoch": 2.2346521145975444,
"grad_norm": 0.35692450404167175,
"learning_rate": 3.6837599612132826e-06,
"loss": 0.1314,
"step": 819
},
{
"epoch": 2.237380627557981,
"grad_norm": 0.3585570454597473,
"learning_rate": 3.659140134041812e-06,
"loss": 0.1319,
"step": 820
},
{
"epoch": 2.2401091405184177,
"grad_norm": 0.3692927658557892,
"learning_rate": 3.6345844180319157e-06,
"loss": 0.1355,
"step": 821
},
{
"epoch": 2.242837653478854,
"grad_norm": 0.3518622815608978,
"learning_rate": 3.6100930614615204e-06,
"loss": 0.1298,
"step": 822
},
{
"epoch": 2.2455661664392905,
"grad_norm": 0.34735947847366333,
"learning_rate": 3.5856663119578174e-06,
"loss": 0.1315,
"step": 823
},
{
"epoch": 2.248294679399727,
"grad_norm": 0.339832603931427,
"learning_rate": 3.5613044164947617e-06,
"loss": 0.1267,
"step": 824
},
{
"epoch": 2.251023192360164,
"grad_norm": 0.3616288900375366,
"learning_rate": 3.5370076213905904e-06,
"loss": 0.1294,
"step": 825
},
{
"epoch": 2.2537517053206004,
"grad_norm": 0.35375145077705383,
"learning_rate": 3.5127761723053313e-06,
"loss": 0.1299,
"step": 826
},
{
"epoch": 2.2564802182810366,
"grad_norm": 0.35924383997917175,
"learning_rate": 3.4886103142382944e-06,
"loss": 0.1306,
"step": 827
},
{
"epoch": 2.2592087312414733,
"grad_norm": 0.3569202423095703,
"learning_rate": 3.46451029152562e-06,
"loss": 0.1303,
"step": 828
},
{
"epoch": 2.26193724420191,
"grad_norm": 0.36816731095314026,
"learning_rate": 3.440476347837811e-06,
"loss": 0.1317,
"step": 829
},
{
"epoch": 2.2646657571623465,
"grad_norm": 0.3677736520767212,
"learning_rate": 3.41650872617724e-06,
"loss": 0.1323,
"step": 830
},
{
"epoch": 2.267394270122783,
"grad_norm": 0.35778266191482544,
"learning_rate": 3.392607668875718e-06,
"loss": 0.1316,
"step": 831
},
{
"epoch": 2.27012278308322,
"grad_norm": 0.35744157433509827,
"learning_rate": 3.3687734175920505e-06,
"loss": 0.1296,
"step": 832
},
{
"epoch": 2.2728512960436564,
"grad_norm": 0.37661606073379517,
"learning_rate": 3.3450062133095572e-06,
"loss": 0.1326,
"step": 833
},
{
"epoch": 2.2755798090040926,
"grad_norm": 0.36716973781585693,
"learning_rate": 3.321306296333673e-06,
"loss": 0.1325,
"step": 834
},
{
"epoch": 2.2783083219645293,
"grad_norm": 0.3580392301082611,
"learning_rate": 3.29767390628951e-06,
"loss": 0.1317,
"step": 835
},
{
"epoch": 2.281036834924966,
"grad_norm": 0.36350300908088684,
"learning_rate": 3.274109282119413e-06,
"loss": 0.1311,
"step": 836
},
{
"epoch": 2.2837653478854025,
"grad_norm": 0.36548492312431335,
"learning_rate": 3.2506126620805666e-06,
"loss": 0.1325,
"step": 837
},
{
"epoch": 2.286493860845839,
"grad_norm": 0.3625844717025757,
"learning_rate": 3.2271842837425917e-06,
"loss": 0.1297,
"step": 838
},
{
"epoch": 2.2892223738062754,
"grad_norm": 0.3575722277164459,
"learning_rate": 3.203824383985108e-06,
"loss": 0.1315,
"step": 839
},
{
"epoch": 2.291950886766712,
"grad_norm": 0.3623971939086914,
"learning_rate": 3.180533198995379e-06,
"loss": 0.132,
"step": 840
},
{
"epoch": 2.2946793997271486,
"grad_norm": 0.3681948184967041,
"learning_rate": 3.157310964265903e-06,
"loss": 0.1308,
"step": 841
},
{
"epoch": 2.2974079126875853,
"grad_norm": 0.36443206667900085,
"learning_rate": 3.134157914592032e-06,
"loss": 0.1328,
"step": 842
},
{
"epoch": 2.300136425648022,
"grad_norm": 0.3680581748485565,
"learning_rate": 3.1110742840696063e-06,
"loss": 0.1314,
"step": 843
},
{
"epoch": 2.3028649386084585,
"grad_norm": 0.3506981134414673,
"learning_rate": 3.088060306092582e-06,
"loss": 0.1268,
"step": 844
},
{
"epoch": 2.305593451568895,
"grad_norm": 0.3596420884132385,
"learning_rate": 3.0651162133506707e-06,
"loss": 0.1317,
"step": 845
},
{
"epoch": 2.3083219645293314,
"grad_norm": 0.359647274017334,
"learning_rate": 3.042242237826991e-06,
"loss": 0.1278,
"step": 846
},
{
"epoch": 2.311050477489768,
"grad_norm": 0.36838117241859436,
"learning_rate": 3.0194386107957175e-06,
"loss": 0.1337,
"step": 847
},
{
"epoch": 2.3137789904502046,
"grad_norm": 0.3791147768497467,
"learning_rate": 2.996705562819747e-06,
"loss": 0.1325,
"step": 848
},
{
"epoch": 2.3165075034106413,
"grad_norm": 0.3608884811401367,
"learning_rate": 2.9740433237483667e-06,
"loss": 0.1299,
"step": 849
},
{
"epoch": 2.319236016371078,
"grad_norm": 0.35787829756736755,
"learning_rate": 2.951452122714926e-06,
"loss": 0.131,
"step": 850
},
{
"epoch": 2.321964529331514,
"grad_norm": 0.3690457046031952,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.1327,
"step": 851
},
{
"epoch": 2.3246930422919507,
"grad_norm": 0.3544095456600189,
"learning_rate": 2.906483747701705e-06,
"loss": 0.1268,
"step": 852
},
{
"epoch": 2.3274215552523874,
"grad_norm": 0.3589238226413727,
"learning_rate": 2.88410702838814e-06,
"loss": 0.1298,
"step": 853
},
{
"epoch": 2.330150068212824,
"grad_norm": 0.34518349170684814,
"learning_rate": 2.861802256440348e-06,
"loss": 0.1286,
"step": 854
},
{
"epoch": 2.3328785811732606,
"grad_norm": 0.35874345898628235,
"learning_rate": 2.8395696573774034e-06,
"loss": 0.1296,
"step": 855
},
{
"epoch": 2.3356070941336973,
"grad_norm": 0.35607776045799255,
"learning_rate": 2.8174094559886535e-06,
"loss": 0.1282,
"step": 856
},
{
"epoch": 2.338335607094134,
"grad_norm": 0.36033570766448975,
"learning_rate": 2.795321876331446e-06,
"loss": 0.1327,
"step": 857
},
{
"epoch": 2.34106412005457,
"grad_norm": 0.3491268455982208,
"learning_rate": 2.773307141728867e-06,
"loss": 0.1279,
"step": 858
},
{
"epoch": 2.3437926330150067,
"grad_norm": 0.3684820234775543,
"learning_rate": 2.751365474767479e-06,
"loss": 0.1331,
"step": 859
},
{
"epoch": 2.3465211459754434,
"grad_norm": 0.3787662088871002,
"learning_rate": 2.729497097295075e-06,
"loss": 0.1365,
"step": 860
},
{
"epoch": 2.34924965893588,
"grad_norm": 0.3510342538356781,
"learning_rate": 2.70770223041843e-06,
"loss": 0.1286,
"step": 861
},
{
"epoch": 2.3519781718963166,
"grad_norm": 0.36617404222488403,
"learning_rate": 2.6859810945010687e-06,
"loss": 0.1328,
"step": 862
},
{
"epoch": 2.354706684856753,
"grad_norm": 0.3623158633708954,
"learning_rate": 2.6643339091610376e-06,
"loss": 0.1311,
"step": 863
},
{
"epoch": 2.3574351978171895,
"grad_norm": 0.3640574812889099,
"learning_rate": 2.642760893268684e-06,
"loss": 0.1289,
"step": 864
},
{
"epoch": 2.360163710777626,
"grad_norm": 0.3659525513648987,
"learning_rate": 2.621262264944444e-06,
"loss": 0.132,
"step": 865
},
{
"epoch": 2.3628922237380627,
"grad_norm": 0.3544803857803345,
"learning_rate": 2.5998382415566258e-06,
"loss": 0.1305,
"step": 866
},
{
"epoch": 2.3656207366984994,
"grad_norm": 0.35486936569213867,
"learning_rate": 2.5784890397192395e-06,
"loss": 0.1312,
"step": 867
},
{
"epoch": 2.368349249658936,
"grad_norm": 0.35350891947746277,
"learning_rate": 2.55721487528978e-06,
"loss": 0.1319,
"step": 868
},
{
"epoch": 2.3710777626193726,
"grad_norm": 0.35865113139152527,
"learning_rate": 2.5360159633670456e-06,
"loss": 0.1305,
"step": 869
},
{
"epoch": 2.373806275579809,
"grad_norm": 0.3628556430339813,
"learning_rate": 2.514892518288988e-06,
"loss": 0.1299,
"step": 870
},
{
"epoch": 2.3765347885402455,
"grad_norm": 0.34977447986602783,
"learning_rate": 2.4938447536305243e-06,
"loss": 0.1311,
"step": 871
},
{
"epoch": 2.379263301500682,
"grad_norm": 0.3562906086444855,
"learning_rate": 2.4728728822013683e-06,
"loss": 0.1299,
"step": 872
},
{
"epoch": 2.3819918144611187,
"grad_norm": 0.3648015558719635,
"learning_rate": 2.451977116043911e-06,
"loss": 0.1311,
"step": 873
},
{
"epoch": 2.3847203274215554,
"grad_norm": 0.3597666025161743,
"learning_rate": 2.431157666431052e-06,
"loss": 0.1307,
"step": 874
},
{
"epoch": 2.3874488403819916,
"grad_norm": 0.35988011956214905,
"learning_rate": 2.410414743864059e-06,
"loss": 0.1317,
"step": 875
},
{
"epoch": 2.390177353342428,
"grad_norm": 0.37350034713745117,
"learning_rate": 2.3897485580704684e-06,
"loss": 0.1307,
"step": 876
},
{
"epoch": 2.392905866302865,
"grad_norm": 0.36840546131134033,
"learning_rate": 2.369159318001937e-06,
"loss": 0.1318,
"step": 877
},
{
"epoch": 2.3956343792633015,
"grad_norm": 0.36420130729675293,
"learning_rate": 2.348647231832131e-06,
"loss": 0.1304,
"step": 878
},
{
"epoch": 2.398362892223738,
"grad_norm": 0.3545297086238861,
"learning_rate": 2.3282125069546437e-06,
"loss": 0.1269,
"step": 879
},
{
"epoch": 2.4010914051841747,
"grad_norm": 0.36249154806137085,
"learning_rate": 2.30785534998088e-06,
"loss": 0.133,
"step": 880
},
{
"epoch": 2.4038199181446114,
"grad_norm": 0.3670268952846527,
"learning_rate": 2.2875759667379616e-06,
"loss": 0.1292,
"step": 881
},
{
"epoch": 2.4065484311050476,
"grad_norm": 0.36384454369544983,
"learning_rate": 2.267374562266662e-06,
"loss": 0.1285,
"step": 882
},
{
"epoch": 2.409276944065484,
"grad_norm": 0.35740095376968384,
"learning_rate": 2.2472513408193385e-06,
"loss": 0.1305,
"step": 883
},
{
"epoch": 2.412005457025921,
"grad_norm": 0.3598315119743347,
"learning_rate": 2.227206505857834e-06,
"loss": 0.1319,
"step": 884
},
{
"epoch": 2.4147339699863575,
"grad_norm": 0.36150842905044556,
"learning_rate": 2.207240260051453e-06,
"loss": 0.1325,
"step": 885
},
{
"epoch": 2.417462482946794,
"grad_norm": 0.3574983477592468,
"learning_rate": 2.1873528052749094e-06,
"loss": 0.131,
"step": 886
},
{
"epoch": 2.4201909959072307,
"grad_norm": 0.35541966557502747,
"learning_rate": 2.167544342606256e-06,
"loss": 0.1276,
"step": 887
},
{
"epoch": 2.422919508867667,
"grad_norm": 0.3616240322589874,
"learning_rate": 2.147815072324886e-06,
"loss": 0.1328,
"step": 888
},
{
"epoch": 2.4256480218281036,
"grad_norm": 0.35617539286613464,
"learning_rate": 2.1281651939094996e-06,
"loss": 0.1289,
"step": 889
},
{
"epoch": 2.42837653478854,
"grad_norm": 0.3464270830154419,
"learning_rate": 2.1085949060360654e-06,
"loss": 0.1298,
"step": 890
},
{
"epoch": 2.431105047748977,
"grad_norm": 0.35398560762405396,
"learning_rate": 2.089104406575837e-06,
"loss": 0.1334,
"step": 891
},
{
"epoch": 2.4338335607094135,
"grad_norm": 0.36852145195007324,
"learning_rate": 2.0696938925933505e-06,
"loss": 0.1304,
"step": 892
},
{
"epoch": 2.43656207366985,
"grad_norm": 0.3582163155078888,
"learning_rate": 2.0503635603444094e-06,
"loss": 0.1307,
"step": 893
},
{
"epoch": 2.4392905866302863,
"grad_norm": 0.3476228713989258,
"learning_rate": 2.0311136052741274e-06,
"loss": 0.1265,
"step": 894
},
{
"epoch": 2.442019099590723,
"grad_norm": 0.3634602129459381,
"learning_rate": 2.0119442220149356e-06,
"loss": 0.1327,
"step": 895
},
{
"epoch": 2.4447476125511596,
"grad_norm": 0.363454133272171,
"learning_rate": 1.9928556043846215e-06,
"loss": 0.134,
"step": 896
},
{
"epoch": 2.447476125511596,
"grad_norm": 0.3611748516559601,
"learning_rate": 1.9738479453843685e-06,
"loss": 0.1305,
"step": 897
},
{
"epoch": 2.450204638472033,
"grad_norm": 0.34824034571647644,
"learning_rate": 1.9549214371968008e-06,
"loss": 0.1285,
"step": 898
},
{
"epoch": 2.4529331514324695,
"grad_norm": 0.3596573770046234,
"learning_rate": 1.936076271184044e-06,
"loss": 0.1281,
"step": 899
},
{
"epoch": 2.4556616643929057,
"grad_norm": 0.35721272230148315,
"learning_rate": 1.917312637885791e-06,
"loss": 0.1305,
"step": 900
},
{
"epoch": 2.4583901773533423,
"grad_norm": 0.36631426215171814,
"learning_rate": 1.898630727017371e-06,
"loss": 0.134,
"step": 901
},
{
"epoch": 2.461118690313779,
"grad_norm": 0.36680275201797485,
"learning_rate": 1.8800307274678364e-06,
"loss": 0.1325,
"step": 902
},
{
"epoch": 2.4638472032742156,
"grad_norm": 0.3748106062412262,
"learning_rate": 1.861512827298051e-06,
"loss": 0.1309,
"step": 903
},
{
"epoch": 2.466575716234652,
"grad_norm": 0.37186112999916077,
"learning_rate": 1.8430772137387853e-06,
"loss": 0.1333,
"step": 904
},
{
"epoch": 2.469304229195089,
"grad_norm": 0.36231905221939087,
"learning_rate": 1.8247240731888293e-06,
"loss": 0.13,
"step": 905
},
{
"epoch": 2.472032742155525,
"grad_norm": 0.35124829411506653,
"learning_rate": 1.8064535912131032e-06,
"loss": 0.1274,
"step": 906
},
{
"epoch": 2.4747612551159617,
"grad_norm": 0.35125380754470825,
"learning_rate": 1.7882659525407842e-06,
"loss": 0.1278,
"step": 907
},
{
"epoch": 2.4774897680763983,
"grad_norm": 0.3610575497150421,
"learning_rate": 1.7701613410634367e-06,
"loss": 0.1288,
"step": 908
},
{
"epoch": 2.480218281036835,
"grad_norm": 0.35818085074424744,
"learning_rate": 1.752139939833154e-06,
"loss": 0.1305,
"step": 909
},
{
"epoch": 2.4829467939972716,
"grad_norm": 0.36310091614723206,
"learning_rate": 1.7342019310607062e-06,
"loss": 0.1288,
"step": 910
},
{
"epoch": 2.485675306957708,
"grad_norm": 0.36673158407211304,
"learning_rate": 1.7163474961137029e-06,
"loss": 0.1307,
"step": 911
},
{
"epoch": 2.488403819918145,
"grad_norm": 0.3627110719680786,
"learning_rate": 1.6985768155147498e-06,
"loss": 0.1296,
"step": 912
},
{
"epoch": 2.491132332878581,
"grad_norm": 0.3594669997692108,
"learning_rate": 1.6808900689396334e-06,
"loss": 0.1281,
"step": 913
},
{
"epoch": 2.4938608458390177,
"grad_norm": 0.35554230213165283,
"learning_rate": 1.6632874352154982e-06,
"loss": 0.1273,
"step": 914
},
{
"epoch": 2.4965893587994543,
"grad_norm": 0.3572724163532257,
"learning_rate": 1.645769092319045e-06,
"loss": 0.1299,
"step": 915
},
{
"epoch": 2.499317871759891,
"grad_norm": 0.3588331639766693,
"learning_rate": 1.6283352173747148e-06,
"loss": 0.1308,
"step": 916
},
{
"epoch": 2.5020463847203276,
"grad_norm": 0.36220690608024597,
"learning_rate": 1.6109859866529253e-06,
"loss": 0.1297,
"step": 917
},
{
"epoch": 2.504774897680764,
"grad_norm": 0.35867708921432495,
"learning_rate": 1.5937215755682667e-06,
"loss": 0.1298,
"step": 918
},
{
"epoch": 2.5075034106412004,
"grad_norm": 0.3584694266319275,
"learning_rate": 1.5765421586777285e-06,
"loss": 0.1293,
"step": 919
},
{
"epoch": 2.510231923601637,
"grad_norm": 0.3651615381240845,
"learning_rate": 1.559447909678954e-06,
"loss": 0.1301,
"step": 920
},
{
"epoch": 2.5129604365620737,
"grad_norm": 0.3454124331474304,
"learning_rate": 1.5424390014084644e-06,
"loss": 0.1261,
"step": 921
},
{
"epoch": 2.5156889495225103,
"grad_norm": 0.36464953422546387,
"learning_rate": 1.5255156058399124e-06,
"loss": 0.1309,
"step": 922
},
{
"epoch": 2.518417462482947,
"grad_norm": 0.3527681827545166,
"learning_rate": 1.5086778940823544e-06,
"loss": 0.1271,
"step": 923
},
{
"epoch": 2.5211459754433836,
"grad_norm": 0.3712303042411804,
"learning_rate": 1.4919260363785215e-06,
"loss": 0.1311,
"step": 924
},
{
"epoch": 2.52387448840382,
"grad_norm": 0.3462398052215576,
"learning_rate": 1.4752602021030794e-06,
"loss": 0.1274,
"step": 925
},
{
"epoch": 2.5266030013642564,
"grad_norm": 0.36644744873046875,
"learning_rate": 1.4586805597609333e-06,
"loss": 0.1292,
"step": 926
},
{
"epoch": 2.529331514324693,
"grad_norm": 0.3596111536026001,
"learning_rate": 1.4421872769855262e-06,
"loss": 0.1311,
"step": 927
},
{
"epoch": 2.5320600272851297,
"grad_norm": 0.3601299822330475,
"learning_rate": 1.4257805205371233e-06,
"loss": 0.1306,
"step": 928
},
{
"epoch": 2.5347885402455663,
"grad_norm": 0.3638714551925659,
"learning_rate": 1.409460456301147e-06,
"loss": 0.1295,
"step": 929
},
{
"epoch": 2.5375170532060025,
"grad_norm": 0.3720818758010864,
"learning_rate": 1.3932272492864984e-06,
"loss": 0.1281,
"step": 930
},
{
"epoch": 2.540245566166439,
"grad_norm": 0.3580470085144043,
"learning_rate": 1.3770810636238685e-06,
"loss": 0.1282,
"step": 931
},
{
"epoch": 2.542974079126876,
"grad_norm": 0.3591982126235962,
"learning_rate": 1.3610220625641002e-06,
"loss": 0.1292,
"step": 932
},
{
"epoch": 2.5457025920873124,
"grad_norm": 0.3552996516227722,
"learning_rate": 1.3450504084765381e-06,
"loss": 0.1294,
"step": 933
},
{
"epoch": 2.548431105047749,
"grad_norm": 0.3630991578102112,
"learning_rate": 1.3291662628473634e-06,
"loss": 0.1296,
"step": 934
},
{
"epoch": 2.5511596180081857,
"grad_norm": 0.34676593542099,
"learning_rate": 1.313369786277987e-06,
"loss": 0.1281,
"step": 935
},
{
"epoch": 2.5538881309686223,
"grad_norm": 0.35284459590911865,
"learning_rate": 1.2976611384834148e-06,
"loss": 0.1285,
"step": 936
},
{
"epoch": 2.5566166439290585,
"grad_norm": 0.3661840856075287,
"learning_rate": 1.2820404782906315e-06,
"loss": 0.1304,
"step": 937
},
{
"epoch": 2.559345156889495,
"grad_norm": 0.34972235560417175,
"learning_rate": 1.266507963636997e-06,
"loss": 0.1268,
"step": 938
},
{
"epoch": 2.562073669849932,
"grad_norm": 0.34828442335128784,
"learning_rate": 1.2510637515686497e-06,
"loss": 0.1252,
"step": 939
},
{
"epoch": 2.5648021828103684,
"grad_norm": 0.3637445569038391,
"learning_rate": 1.2357079982389197e-06,
"loss": 0.1308,
"step": 940
},
{
"epoch": 2.567530695770805,
"grad_norm": 0.3512468934059143,
"learning_rate": 1.2204408589067462e-06,
"loss": 0.1287,
"step": 941
},
{
"epoch": 2.5702592087312413,
"grad_norm": 0.36545827984809875,
"learning_rate": 1.2052624879351105e-06,
"loss": 0.1306,
"step": 942
},
{
"epoch": 2.572987721691678,
"grad_norm": 0.3595888018608093,
"learning_rate": 1.190173038789476e-06,
"loss": 0.1291,
"step": 943
},
{
"epoch": 2.5757162346521145,
"grad_norm": 0.34953707456588745,
"learning_rate": 1.175172664036235e-06,
"loss": 0.1271,
"step": 944
},
{
"epoch": 2.578444747612551,
"grad_norm": 0.35142770409584045,
"learning_rate": 1.1602615153411666e-06,
"loss": 0.1296,
"step": 945
},
{
"epoch": 2.581173260572988,
"grad_norm": 0.3567049205303192,
"learning_rate": 1.1454397434679022e-06,
"loss": 0.1268,
"step": 946
},
{
"epoch": 2.5839017735334244,
"grad_norm": 0.3721398413181305,
"learning_rate": 1.1307074982764022e-06,
"loss": 0.1334,
"step": 947
},
{
"epoch": 2.586630286493861,
"grad_norm": 0.36237263679504395,
"learning_rate": 1.116064928721442e-06,
"loss": 0.1299,
"step": 948
},
{
"epoch": 2.5893587994542973,
"grad_norm": 0.356036514043808,
"learning_rate": 1.1015121828511033e-06,
"loss": 0.13,
"step": 949
},
{
"epoch": 2.592087312414734,
"grad_norm": 0.36223796010017395,
"learning_rate": 1.0870494078052796e-06,
"loss": 0.1302,
"step": 950
},
{
"epoch": 2.5948158253751705,
"grad_norm": 0.3553941547870636,
"learning_rate": 1.0726767498141877e-06,
"loss": 0.1283,
"step": 951
},
{
"epoch": 2.597544338335607,
"grad_norm": 0.3495614230632782,
"learning_rate": 1.0583943541968856e-06,
"loss": 0.1285,
"step": 952
},
{
"epoch": 2.600272851296044,
"grad_norm": 0.3539895713329315,
"learning_rate": 1.044202365359811e-06,
"loss": 0.1288,
"step": 953
},
{
"epoch": 2.60300136425648,
"grad_norm": 0.35513320565223694,
"learning_rate": 1.0301009267953145e-06,
"loss": 0.1281,
"step": 954
},
{
"epoch": 2.6057298772169166,
"grad_norm": 0.36777645349502563,
"learning_rate": 1.0160901810802114e-06,
"loss": 0.131,
"step": 955
},
{
"epoch": 2.6084583901773533,
"grad_norm": 0.3543359637260437,
"learning_rate": 1.0021702698743408e-06,
"loss": 0.126,
"step": 956
},
{
"epoch": 2.61118690313779,
"grad_norm": 0.35314980149269104,
"learning_rate": 9.883413339191295e-07,
"loss": 0.1289,
"step": 957
},
{
"epoch": 2.6139154160982265,
"grad_norm": 0.36196306347846985,
"learning_rate": 9.746035130361741e-07,
"loss": 0.13,
"step": 958
},
{
"epoch": 2.616643929058663,
"grad_norm": 0.3647453486919403,
"learning_rate": 9.609569461258262e-07,
"loss": 0.1306,
"step": 959
},
{
"epoch": 2.6193724420191,
"grad_norm": 0.35779696702957153,
"learning_rate": 9.474017711657835e-07,
"loss": 0.1284,
"step": 960
},
{
"epoch": 2.622100954979536,
"grad_norm": 0.36195996403694153,
"learning_rate": 9.339381252097001e-07,
"loss": 0.1313,
"step": 961
},
{
"epoch": 2.6248294679399726,
"grad_norm": 0.3700987696647644,
"learning_rate": 9.205661443857994e-07,
"loss": 0.1299,
"step": 962
},
{
"epoch": 2.6275579809004093,
"grad_norm": 0.35640883445739746,
"learning_rate": 9.072859638954956e-07,
"loss": 0.1281,
"step": 963
},
{
"epoch": 2.630286493860846,
"grad_norm": 0.35907936096191406,
"learning_rate": 8.940977180120247e-07,
"loss": 0.1272,
"step": 964
},
{
"epoch": 2.6330150068212825,
"grad_norm": 0.3632573187351227,
"learning_rate": 8.810015400790994e-07,
"loss": 0.1308,
"step": 965
},
{
"epoch": 2.6357435197817187,
"grad_norm": 0.3489466607570648,
"learning_rate": 8.67997562509546e-07,
"loss": 0.1283,
"step": 966
},
{
"epoch": 2.6384720327421554,
"grad_norm": 0.35598084330558777,
"learning_rate": 8.550859167839665e-07,
"loss": 0.1292,
"step": 967
},
{
"epoch": 2.641200545702592,
"grad_norm": 0.3620053827762604,
"learning_rate": 8.42266733449425e-07,
"loss": 0.1278,
"step": 968
},
{
"epoch": 2.6439290586630286,
"grad_norm": 0.36157307028770447,
"learning_rate": 8.295401421181126e-07,
"loss": 0.13,
"step": 969
},
{
"epoch": 2.6466575716234653,
"grad_norm": 0.3444899916648865,
"learning_rate": 8.169062714660347e-07,
"loss": 0.127,
"step": 970
},
{
"epoch": 2.649386084583902,
"grad_norm": 0.35807591676712036,
"learning_rate": 8.043652492317256e-07,
"loss": 0.1297,
"step": 971
},
{
"epoch": 2.6521145975443385,
"grad_norm": 0.3608627915382385,
"learning_rate": 7.919172022149458e-07,
"loss": 0.1281,
"step": 972
},
{
"epoch": 2.6548431105047747,
"grad_norm": 0.35188964009284973,
"learning_rate": 7.795622562753957e-07,
"loss": 0.1272,
"step": 973
},
{
"epoch": 2.6575716234652114,
"grad_norm": 0.3593100309371948,
"learning_rate": 7.673005363314578e-07,
"loss": 0.1302,
"step": 974
},
{
"epoch": 2.660300136425648,
"grad_norm": 0.35982316732406616,
"learning_rate": 7.551321663589229e-07,
"loss": 0.1304,
"step": 975
},
{
"epoch": 2.6630286493860846,
"grad_norm": 0.3450589179992676,
"learning_rate": 7.430572693897342e-07,
"loss": 0.128,
"step": 976
},
{
"epoch": 2.6657571623465213,
"grad_norm": 0.34911608695983887,
"learning_rate": 7.310759675107515e-07,
"loss": 0.1282,
"step": 977
},
{
"epoch": 2.6684856753069575,
"grad_norm": 0.36377087235450745,
"learning_rate": 7.19188381862519e-07,
"loss": 0.1338,
"step": 978
},
{
"epoch": 2.6712141882673945,
"grad_norm": 0.34635117650032043,
"learning_rate": 7.073946326380243e-07,
"loss": 0.1276,
"step": 979
},
{
"epoch": 2.6739427012278307,
"grad_norm": 0.3502279818058014,
"learning_rate": 6.956948390814977e-07,
"loss": 0.1291,
"step": 980
},
{
"epoch": 2.6766712141882674,
"grad_norm": 0.36715587973594666,
"learning_rate": 6.840891194872112e-07,
"loss": 0.1344,
"step": 981
},
{
"epoch": 2.679399727148704,
"grad_norm": 0.35529640316963196,
"learning_rate": 6.725775911982602e-07,
"loss": 0.1302,
"step": 982
},
{
"epoch": 2.6821282401091406,
"grad_norm": 0.35221484303474426,
"learning_rate": 6.61160370605397e-07,
"loss": 0.1265,
"step": 983
},
{
"epoch": 2.6848567530695773,
"grad_norm": 0.34295523166656494,
"learning_rate": 6.498375731458529e-07,
"loss": 0.1275,
"step": 984
},
{
"epoch": 2.6875852660300135,
"grad_norm": 0.36550015211105347,
"learning_rate": 6.386093133021554e-07,
"loss": 0.1277,
"step": 985
},
{
"epoch": 2.69031377899045,
"grad_norm": 0.3569300174713135,
"learning_rate": 6.274757046009871e-07,
"loss": 0.1281,
"step": 986
},
{
"epoch": 2.6930422919508867,
"grad_norm": 0.35022974014282227,
"learning_rate": 6.164368596120351e-07,
"loss": 0.1263,
"step": 987
},
{
"epoch": 2.6957708049113234,
"grad_norm": 0.35484209656715393,
"learning_rate": 6.054928899468427e-07,
"loss": 0.1278,
"step": 988
},
{
"epoch": 2.69849931787176,
"grad_norm": 0.3532489836215973,
"learning_rate": 5.946439062576903e-07,
"loss": 0.1284,
"step": 989
},
{
"epoch": 2.701227830832196,
"grad_norm": 0.3485071659088135,
"learning_rate": 5.83890018236476e-07,
"loss": 0.1272,
"step": 990
},
{
"epoch": 2.7039563437926333,
"grad_norm": 0.3542632758617401,
"learning_rate": 5.732313346136032e-07,
"loss": 0.1265,
"step": 991
},
{
"epoch": 2.7066848567530695,
"grad_norm": 0.3596004843711853,
"learning_rate": 5.626679631568832e-07,
"loss": 0.1282,
"step": 992
},
{
"epoch": 2.709413369713506,
"grad_norm": 0.36018607020378113,
"learning_rate": 5.52200010670444e-07,
"loss": 0.1277,
"step": 993
},
{
"epoch": 2.7121418826739427,
"grad_norm": 0.35304713249206543,
"learning_rate": 5.418275829936537e-07,
"loss": 0.1307,
"step": 994
},
{
"epoch": 2.7148703956343794,
"grad_norm": 0.3582627475261688,
"learning_rate": 5.315507850000456e-07,
"loss": 0.1284,
"step": 995
},
{
"epoch": 2.717598908594816,
"grad_norm": 0.35284796357154846,
"learning_rate": 5.213697205962631e-07,
"loss": 0.1273,
"step": 996
},
{
"epoch": 2.720327421555252,
"grad_norm": 0.35549217462539673,
"learning_rate": 5.112844927210048e-07,
"loss": 0.1283,
"step": 997
},
{
"epoch": 2.723055934515689,
"grad_norm": 0.35494813323020935,
"learning_rate": 5.012952033439844e-07,
"loss": 0.1243,
"step": 998
},
{
"epoch": 2.7257844474761255,
"grad_norm": 0.3586711585521698,
"learning_rate": 4.914019534649039e-07,
"loss": 0.1304,
"step": 999
},
{
"epoch": 2.728512960436562,
"grad_norm": 0.3513246774673462,
"learning_rate": 4.816048431124265e-07,
"loss": 0.1262,
"step": 1000
},
{
"epoch": 2.7312414733969987,
"grad_norm": 0.36048534512519836,
"learning_rate": 4.7190397134316946e-07,
"loss": 0.1298,
"step": 1001
},
{
"epoch": 2.733969986357435,
"grad_norm": 0.3561542332172394,
"learning_rate": 4.6229943624069963e-07,
"loss": 0.1304,
"step": 1002
},
{
"epoch": 2.736698499317872,
"grad_norm": 0.35859954357147217,
"learning_rate": 4.5279133491454406e-07,
"loss": 0.129,
"step": 1003
},
{
"epoch": 2.739427012278308,
"grad_norm": 0.35953471064567566,
"learning_rate": 4.4337976349920763e-07,
"loss": 0.1294,
"step": 1004
},
{
"epoch": 2.742155525238745,
"grad_norm": 0.3629186451435089,
"learning_rate": 4.3406481715319916e-07,
"loss": 0.1305,
"step": 1005
},
{
"epoch": 2.7448840381991815,
"grad_norm": 0.36121776700019836,
"learning_rate": 4.248465900580734e-07,
"loss": 0.1299,
"step": 1006
},
{
"epoch": 2.747612551159618,
"grad_norm": 0.3531065583229065,
"learning_rate": 4.1572517541747294e-07,
"loss": 0.1304,
"step": 1007
},
{
"epoch": 2.7503410641200547,
"grad_norm": 0.35587796568870544,
"learning_rate": 4.0670066545619224e-07,
"loss": 0.1289,
"step": 1008
},
{
"epoch": 2.753069577080491,
"grad_norm": 0.35462117195129395,
"learning_rate": 3.9777315141923847e-07,
"loss": 0.1286,
"step": 1009
},
{
"epoch": 2.7557980900409276,
"grad_norm": 0.34857916831970215,
"learning_rate": 3.889427235709153e-07,
"loss": 0.1282,
"step": 1010
},
{
"epoch": 2.758526603001364,
"grad_norm": 0.35415270924568176,
"learning_rate": 3.802094711939075e-07,
"loss": 0.1275,
"step": 1011
},
{
"epoch": 2.761255115961801,
"grad_norm": 0.37276458740234375,
"learning_rate": 3.715734825883766e-07,
"loss": 0.1338,
"step": 1012
},
{
"epoch": 2.7639836289222375,
"grad_norm": 0.3584205210208893,
"learning_rate": 3.6303484507106965e-07,
"loss": 0.1276,
"step": 1013
},
{
"epoch": 2.7667121418826737,
"grad_norm": 0.360146164894104,
"learning_rate": 3.5459364497443696e-07,
"loss": 0.1275,
"step": 1014
},
{
"epoch": 2.7694406548431107,
"grad_norm": 0.3654356896877289,
"learning_rate": 3.462499676457598e-07,
"loss": 0.1277,
"step": 1015
},
{
"epoch": 2.772169167803547,
"grad_norm": 0.3602968752384186,
"learning_rate": 3.38003897446284e-07,
"loss": 0.1319,
"step": 1016
},
{
"epoch": 2.7748976807639836,
"grad_norm": 0.3531845808029175,
"learning_rate": 3.298555177503726e-07,
"loss": 0.1311,
"step": 1017
},
{
"epoch": 2.77762619372442,
"grad_norm": 0.35511648654937744,
"learning_rate": 3.2180491094465414e-07,
"loss": 0.1292,
"step": 1018
},
{
"epoch": 2.780354706684857,
"grad_norm": 0.35548272728919983,
"learning_rate": 3.138521584272003e-07,
"loss": 0.1299,
"step": 1019
},
{
"epoch": 2.7830832196452935,
"grad_norm": 0.36467936635017395,
"learning_rate": 3.059973406066963e-07,
"loss": 0.1318,
"step": 1020
},
{
"epoch": 2.7858117326057297,
"grad_norm": 0.36441510915756226,
"learning_rate": 2.982405369016272e-07,
"loss": 0.1305,
"step": 1021
},
{
"epoch": 2.7885402455661663,
"grad_norm": 0.35912269353866577,
"learning_rate": 2.905818257394799e-07,
"loss": 0.1266,
"step": 1022
},
{
"epoch": 2.791268758526603,
"grad_norm": 0.3510645925998688,
"learning_rate": 2.830212845559466e-07,
"loss": 0.1292,
"step": 1023
},
{
"epoch": 2.7939972714870396,
"grad_norm": 0.38425615429878235,
"learning_rate": 2.7555898979413796e-07,
"loss": 0.1261,
"step": 1024
},
{
"epoch": 2.796725784447476,
"grad_norm": 0.36103349924087524,
"learning_rate": 2.6819501690382275e-07,
"loss": 0.131,
"step": 1025
},
{
"epoch": 2.799454297407913,
"grad_norm": 0.3564034104347229,
"learning_rate": 2.609294403406537e-07,
"loss": 0.1285,
"step": 1026
},
{
"epoch": 2.8021828103683495,
"grad_norm": 0.35618945956230164,
"learning_rate": 2.537623335654127e-07,
"loss": 0.1303,
"step": 1027
},
{
"epoch": 2.8049113233287857,
"grad_norm": 0.3517012298107147,
"learning_rate": 2.4669376904328244e-07,
"loss": 0.128,
"step": 1028
},
{
"epoch": 2.8076398362892223,
"grad_norm": 0.35787534713745117,
"learning_rate": 2.397238182430994e-07,
"loss": 0.1287,
"step": 1029
},
{
"epoch": 2.810368349249659,
"grad_norm": 0.3706185221672058,
"learning_rate": 2.3285255163663535e-07,
"loss": 0.1316,
"step": 1030
},
{
"epoch": 2.8130968622100956,
"grad_norm": 0.3420504331588745,
"learning_rate": 2.2608003869788786e-07,
"loss": 0.1248,
"step": 1031
},
{
"epoch": 2.815825375170532,
"grad_norm": 0.3569451868534088,
"learning_rate": 2.1940634790238003e-07,
"loss": 0.1279,
"step": 1032
},
{
"epoch": 2.8185538881309684,
"grad_norm": 0.3604009747505188,
"learning_rate": 2.1283154672645522e-07,
"loss": 0.1291,
"step": 1033
},
{
"epoch": 2.821282401091405,
"grad_norm": 0.3657495975494385,
"learning_rate": 2.063557016466111e-07,
"loss": 0.1295,
"step": 1034
},
{
"epoch": 2.8240109140518417,
"grad_norm": 0.3558668792247772,
"learning_rate": 1.999788781388201e-07,
"loss": 0.1268,
"step": 1035
},
{
"epoch": 2.8267394270122783,
"grad_norm": 0.36042869091033936,
"learning_rate": 1.9370114067785995e-07,
"loss": 0.129,
"step": 1036
},
{
"epoch": 2.829467939972715,
"grad_norm": 0.3488910496234894,
"learning_rate": 1.8752255273667752e-07,
"loss": 0.1274,
"step": 1037
},
{
"epoch": 2.8321964529331516,
"grad_norm": 0.35946106910705566,
"learning_rate": 1.8144317678573497e-07,
"loss": 0.1276,
"step": 1038
},
{
"epoch": 2.8349249658935882,
"grad_norm": 0.35392606258392334,
"learning_rate": 1.7546307429238129e-07,
"loss": 0.1289,
"step": 1039
},
{
"epoch": 2.8376534788540244,
"grad_norm": 0.35671889781951904,
"learning_rate": 1.6958230572023504e-07,
"loss": 0.1288,
"step": 1040
},
{
"epoch": 2.840381991814461,
"grad_norm": 0.3586263656616211,
"learning_rate": 1.6380093052856482e-07,
"loss": 0.1332,
"step": 1041
},
{
"epoch": 2.8431105047748977,
"grad_norm": 0.34798717498779297,
"learning_rate": 1.5811900717169537e-07,
"loss": 0.128,
"step": 1042
},
{
"epoch": 2.8458390177353343,
"grad_norm": 0.369842529296875,
"learning_rate": 1.5253659309841463e-07,
"loss": 0.1337,
"step": 1043
},
{
"epoch": 2.848567530695771,
"grad_norm": 0.3586486577987671,
"learning_rate": 1.4705374475138978e-07,
"loss": 0.1281,
"step": 1044
},
{
"epoch": 2.851296043656207,
"grad_norm": 0.3524888753890991,
"learning_rate": 1.416705175666e-07,
"loss": 0.1283,
"step": 1045
},
{
"epoch": 2.854024556616644,
"grad_norm": 0.3938431739807129,
"learning_rate": 1.3638696597277678e-07,
"loss": 0.1287,
"step": 1046
},
{
"epoch": 2.8567530695770804,
"grad_norm": 0.3598809540271759,
"learning_rate": 1.3120314339084782e-07,
"loss": 0.1289,
"step": 1047
},
{
"epoch": 2.859481582537517,
"grad_norm": 0.3557843565940857,
"learning_rate": 1.2611910223340408e-07,
"loss": 0.1287,
"step": 1048
},
{
"epoch": 2.8622100954979537,
"grad_norm": 0.3524629473686218,
"learning_rate": 1.2113489390416565e-07,
"loss": 0.1274,
"step": 1049
},
{
"epoch": 2.8649386084583903,
"grad_norm": 0.36476588249206543,
"learning_rate": 1.1625056879746133e-07,
"loss": 0.1313,
"step": 1050
},
{
"epoch": 2.867667121418827,
"grad_norm": 0.3563539683818817,
"learning_rate": 1.1146617629772316e-07,
"loss": 0.128,
"step": 1051
},
{
"epoch": 2.870395634379263,
"grad_norm": 0.36203011870384216,
"learning_rate": 1.0678176477898372e-07,
"loss": 0.1299,
"step": 1052
},
{
"epoch": 2.8731241473397,
"grad_norm": 0.3595244288444519,
"learning_rate": 1.0219738160438753e-07,
"loss": 0.1296,
"step": 1053
},
{
"epoch": 2.8758526603001364,
"grad_norm": 0.3583170473575592,
"learning_rate": 9.771307312571254e-08,
"loss": 0.1283,
"step": 1054
},
{
"epoch": 2.878581173260573,
"grad_norm": 0.3593871593475342,
"learning_rate": 9.332888468290168e-08,
"loss": 0.1298,
"step": 1055
},
{
"epoch": 2.8813096862210097,
"grad_norm": 0.3553028702735901,
"learning_rate": 8.90448606036054e-08,
"loss": 0.1286,
"step": 1056
},
{
"epoch": 2.884038199181446,
"grad_norm": 0.34894633293151855,
"learning_rate": 8.486104420272979e-08,
"loss": 0.1249,
"step": 1057
},
{
"epoch": 2.8867667121418825,
"grad_norm": 0.35654279589653015,
"learning_rate": 8.077747778200474e-08,
"loss": 0.1293,
"step": 1058
},
{
"epoch": 2.889495225102319,
"grad_norm": 0.3499641716480255,
"learning_rate": 7.679420262954984e-08,
"loss": 0.1293,
"step": 1059
},
{
"epoch": 2.892223738062756,
"grad_norm": 0.3609579801559448,
"learning_rate": 7.291125901946027e-08,
"loss": 0.1303,
"step": 1060
},
{
"epoch": 2.8949522510231924,
"grad_norm": 0.3622332215309143,
"learning_rate": 6.912868621140045e-08,
"loss": 0.1294,
"step": 1061
},
{
"epoch": 2.897680763983629,
"grad_norm": 0.348332941532135,
"learning_rate": 6.544652245020433e-08,
"loss": 0.1281,
"step": 1062
},
{
"epoch": 2.9004092769440657,
"grad_norm": 0.3568393588066101,
"learning_rate": 6.18648049654913e-08,
"loss": 0.1284,
"step": 1063
},
{
"epoch": 2.903137789904502,
"grad_norm": 0.3721940815448761,
"learning_rate": 5.838356997128869e-08,
"loss": 0.1287,
"step": 1064
},
{
"epoch": 2.9058663028649385,
"grad_norm": 0.3597653806209564,
"learning_rate": 5.500285266566319e-08,
"loss": 0.1314,
"step": 1065
},
{
"epoch": 2.908594815825375,
"grad_norm": 0.364513635635376,
"learning_rate": 5.1722687230369995e-08,
"loss": 0.1294,
"step": 1066
},
{
"epoch": 2.911323328785812,
"grad_norm": 0.36251965165138245,
"learning_rate": 4.854310683050312e-08,
"loss": 0.1288,
"step": 1067
},
{
"epoch": 2.9140518417462484,
"grad_norm": 0.35791829228401184,
"learning_rate": 4.5464143614162294e-08,
"loss": 0.1305,
"step": 1068
},
{
"epoch": 2.9167803547066846,
"grad_norm": 0.347622275352478,
"learning_rate": 4.2485828712126584e-08,
"loss": 0.1281,
"step": 1069
},
{
"epoch": 2.9195088676671213,
"grad_norm": 0.35234710574150085,
"learning_rate": 3.96081922375402e-08,
"loss": 0.1269,
"step": 1070
},
{
"epoch": 2.922237380627558,
"grad_norm": 0.3629789352416992,
"learning_rate": 3.683126328560826e-08,
"loss": 0.1298,
"step": 1071
},
{
"epoch": 2.9249658935879945,
"grad_norm": 0.35642895102500916,
"learning_rate": 3.4155069933301535e-08,
"loss": 0.1292,
"step": 1072
},
{
"epoch": 2.927694406548431,
"grad_norm": 0.3609231114387512,
"learning_rate": 3.1579639239074364e-08,
"loss": 0.131,
"step": 1073
},
{
"epoch": 2.930422919508868,
"grad_norm": 0.36312758922576904,
"learning_rate": 2.9104997242590528e-08,
"loss": 0.1284,
"step": 1074
},
{
"epoch": 2.9331514324693044,
"grad_norm": 0.36244162917137146,
"learning_rate": 2.673116896445671e-08,
"loss": 0.1286,
"step": 1075
},
{
"epoch": 2.9358799454297406,
"grad_norm": 0.35929951071739197,
"learning_rate": 2.4458178405974974e-08,
"loss": 0.13,
"step": 1076
},
{
"epoch": 2.9386084583901773,
"grad_norm": 0.3554922342300415,
"learning_rate": 2.2286048548897378e-08,
"loss": 0.1286,
"step": 1077
},
{
"epoch": 2.941336971350614,
"grad_norm": 0.3549968898296356,
"learning_rate": 2.0214801355192826e-08,
"loss": 0.1286,
"step": 1078
},
{
"epoch": 2.9440654843110505,
"grad_norm": 0.34757283329963684,
"learning_rate": 1.824445776682504e-08,
"loss": 0.127,
"step": 1079
},
{
"epoch": 2.946793997271487,
"grad_norm": 0.36166948080062866,
"learning_rate": 1.6375037705543827e-08,
"loss": 0.129,
"step": 1080
},
{
"epoch": 2.9495225102319234,
"grad_norm": 0.3565935492515564,
"learning_rate": 1.4606560072679687e-08,
"loss": 0.1273,
"step": 1081
},
{
"epoch": 2.9522510231923604,
"grad_norm": 0.3589683473110199,
"learning_rate": 1.2939042748955078e-08,
"loss": 0.1284,
"step": 1082
},
{
"epoch": 2.9549795361527966,
"grad_norm": 0.35582536458969116,
"learning_rate": 1.1372502594303448e-08,
"loss": 0.1289,
"step": 1083
},
{
"epoch": 2.9577080491132333,
"grad_norm": 0.3483884930610657,
"learning_rate": 9.906955447697153e-09,
"loss": 0.1262,
"step": 1084
},
{
"epoch": 2.96043656207367,
"grad_norm": 0.35305240750312805,
"learning_rate": 8.542416126989805e-09,
"loss": 0.1257,
"step": 1085
},
{
"epoch": 2.9631650750341065,
"grad_norm": 0.3558148145675659,
"learning_rate": 7.278898428764169e-09,
"loss": 0.1287,
"step": 1086
},
{
"epoch": 2.965893587994543,
"grad_norm": 0.35575419664382935,
"learning_rate": 6.1164151281944974e-09,
"loss": 0.1281,
"step": 1087
},
{
"epoch": 2.9686221009549794,
"grad_norm": 0.3598300814628601,
"learning_rate": 5.054977978916631e-09,
"loss": 0.1282,
"step": 1088
},
{
"epoch": 2.971350613915416,
"grad_norm": 0.35684746503829956,
"learning_rate": 4.094597712908099e-09,
"loss": 0.1276,
"step": 1089
},
{
"epoch": 2.9740791268758526,
"grad_norm": 0.3472525477409363,
"learning_rate": 3.2352840403804264e-09,
"loss": 0.1277,
"step": 1090
},
{
"epoch": 2.9768076398362893,
"grad_norm": 0.3728668689727783,
"learning_rate": 2.477045649681431e-09,
"loss": 0.1335,
"step": 1091
},
{
"epoch": 2.979536152796726,
"grad_norm": 0.3551958203315735,
"learning_rate": 1.8198902072097402e-09,
"loss": 0.1292,
"step": 1092
},
{
"epoch": 2.982264665757162,
"grad_norm": 0.3531115651130676,
"learning_rate": 1.2638243573293019e-09,
"loss": 0.1297,
"step": 1093
},
{
"epoch": 2.984993178717599,
"grad_norm": 0.3493654727935791,
"learning_rate": 8.088537223116533e-10,
"loss": 0.1267,
"step": 1094
},
{
"epoch": 2.9877216916780354,
"grad_norm": 0.3511788547039032,
"learning_rate": 4.549829022748586e-10,
"loss": 0.1254,
"step": 1095
},
{
"epoch": 2.990450204638472,
"grad_norm": 0.36361077427864075,
"learning_rate": 2.02215475132439e-10,
"loss": 0.1318,
"step": 1096
},
{
"epoch": 2.9931787175989086,
"grad_norm": 0.36402377486228943,
"learning_rate": 5.0553996568947216e-11,
"loss": 0.1314,
"step": 1097
},
{
"epoch": 2.9959072305593453,
"grad_norm": 0.3575246036052704,
"learning_rate": 0.0,
"loss": 0.1289,
"step": 1098
},
{
"epoch": 2.9959072305593453,
"step": 1098,
"total_flos": 3.496351998494638e+18,
"train_loss": 0.2091554065246834,
"train_runtime": 8168.8424,
"train_samples_per_second": 17.224,
"train_steps_per_second": 0.134
}
],
"logging_steps": 1,
"max_steps": 1098,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.496351998494638e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}