baseline-gemma-1.1-7b-it-sft / trainer_state.json
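What follows is the raw Hugging Face Trainer state for this SFT run: a few top-level run fields (epoch, global_step, and so on) followed by a `log_history` list with one entry per logged step, each carrying `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`. As a minimal sketch (assuming the file has been downloaded locally as `trainer_state.json` and that `matplotlib` is installed, neither of which this page states), the loss curve can be read back out of `log_history` like this:

```python
import json

import matplotlib.pyplot as plt

# Load the Trainer state written by transformers during training.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the entries that report a training loss
# (eval entries, if any, use different keys).
train_logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"{state['global_step']} steps, {state['epoch']:.2f} epochs")
plt.savefig("loss_curve.png")
```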
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9959072305593453,
"eval_steps": 500,
"global_step": 1098,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002728512960436562,
"grad_norm": 74.70013427734375,
"learning_rate": 1.8181818181818183e-07,
"loss": 9.0455,
"step": 1
},
{
"epoch": 0.005457025920873124,
"grad_norm": 72.38750457763672,
"learning_rate": 3.6363636363636366e-07,
"loss": 9.1792,
"step": 2
},
{
"epoch": 0.008185538881309686,
"grad_norm": 72.11083984375,
"learning_rate": 5.454545454545455e-07,
"loss": 9.189,
"step": 3
},
{
"epoch": 0.010914051841746248,
"grad_norm": 70.28816986083984,
"learning_rate": 7.272727272727273e-07,
"loss": 9.1017,
"step": 4
},
{
"epoch": 0.013642564802182811,
"grad_norm": 64.50697326660156,
"learning_rate": 9.090909090909091e-07,
"loss": 8.9227,
"step": 5
},
{
"epoch": 0.01637107776261937,
"grad_norm": 63.397464752197266,
"learning_rate": 1.090909090909091e-06,
"loss": 8.9681,
"step": 6
},
{
"epoch": 0.019099590723055934,
"grad_norm": 52.970638275146484,
"learning_rate": 1.2727272727272728e-06,
"loss": 8.4263,
"step": 7
},
{
"epoch": 0.021828103683492497,
"grad_norm": 53.22758483886719,
"learning_rate": 1.4545454545454546e-06,
"loss": 8.2943,
"step": 8
},
{
"epoch": 0.02455661664392906,
"grad_norm": 42.63364791870117,
"learning_rate": 1.6363636363636365e-06,
"loss": 7.8594,
"step": 9
},
{
"epoch": 0.027285129604365622,
"grad_norm": 44.645076751708984,
"learning_rate": 1.8181818181818183e-06,
"loss": 8.0175,
"step": 10
},
{
"epoch": 0.030013642564802184,
"grad_norm": 76.23380279541016,
"learning_rate": 2.0000000000000003e-06,
"loss": 7.6523,
"step": 11
},
{
"epoch": 0.03274215552523874,
"grad_norm": 62.988075256347656,
"learning_rate": 2.181818181818182e-06,
"loss": 7.4509,
"step": 12
},
{
"epoch": 0.03547066848567531,
"grad_norm": 62.151546478271484,
"learning_rate": 2.363636363636364e-06,
"loss": 7.4096,
"step": 13
},
{
"epoch": 0.03819918144611187,
"grad_norm": 66.8625717163086,
"learning_rate": 2.5454545454545456e-06,
"loss": 7.2394,
"step": 14
},
{
"epoch": 0.040927694406548434,
"grad_norm": 84.82785034179688,
"learning_rate": 2.7272727272727272e-06,
"loss": 6.6904,
"step": 15
},
{
"epoch": 0.04365620736698499,
"grad_norm": 96.20341491699219,
"learning_rate": 2.9090909090909093e-06,
"loss": 6.0718,
"step": 16
},
{
"epoch": 0.04638472032742155,
"grad_norm": 104.0381088256836,
"learning_rate": 3.090909090909091e-06,
"loss": 5.4693,
"step": 17
},
{
"epoch": 0.04911323328785812,
"grad_norm": 163.81932067871094,
"learning_rate": 3.272727272727273e-06,
"loss": 4.8046,
"step": 18
},
{
"epoch": 0.05184174624829468,
"grad_norm": 100.85552215576172,
"learning_rate": 3.454545454545455e-06,
"loss": 3.8863,
"step": 19
},
{
"epoch": 0.054570259208731244,
"grad_norm": 94.65142822265625,
"learning_rate": 3.6363636363636366e-06,
"loss": 3.2934,
"step": 20
},
{
"epoch": 0.0572987721691678,
"grad_norm": 86.24637603759766,
"learning_rate": 3.818181818181819e-06,
"loss": 2.7249,
"step": 21
},
{
"epoch": 0.06002728512960437,
"grad_norm": 69.26914978027344,
"learning_rate": 4.000000000000001e-06,
"loss": 1.9717,
"step": 22
},
{
"epoch": 0.06275579809004093,
"grad_norm": 58.77188491821289,
"learning_rate": 4.181818181818182e-06,
"loss": 1.4078,
"step": 23
},
{
"epoch": 0.06548431105047749,
"grad_norm": 39.297298431396484,
"learning_rate": 4.363636363636364e-06,
"loss": 0.9179,
"step": 24
},
{
"epoch": 0.06821282401091405,
"grad_norm": 9.521720886230469,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.8258,
"step": 25
},
{
"epoch": 0.07094133697135062,
"grad_norm": 7.111854553222656,
"learning_rate": 4.727272727272728e-06,
"loss": 0.7448,
"step": 26
},
{
"epoch": 0.07366984993178717,
"grad_norm": 5.886301517486572,
"learning_rate": 4.90909090909091e-06,
"loss": 0.7132,
"step": 27
},
{
"epoch": 0.07639836289222374,
"grad_norm": 6.183082103729248,
"learning_rate": 5.090909090909091e-06,
"loss": 0.6925,
"step": 28
},
{
"epoch": 0.0791268758526603,
"grad_norm": 3.947871446609497,
"learning_rate": 5.272727272727273e-06,
"loss": 0.6724,
"step": 29
},
{
"epoch": 0.08185538881309687,
"grad_norm": 3.117551565170288,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.6551,
"step": 30
},
{
"epoch": 0.08458390177353342,
"grad_norm": 2.5701775550842285,
"learning_rate": 5.636363636363636e-06,
"loss": 0.6278,
"step": 31
},
{
"epoch": 0.08731241473396999,
"grad_norm": 2.8870959281921387,
"learning_rate": 5.8181818181818185e-06,
"loss": 0.6113,
"step": 32
},
{
"epoch": 0.09004092769440655,
"grad_norm": 4.249965667724609,
"learning_rate": 6e-06,
"loss": 0.6114,
"step": 33
},
{
"epoch": 0.0927694406548431,
"grad_norm": 2.521073341369629,
"learning_rate": 6.181818181818182e-06,
"loss": 0.5882,
"step": 34
},
{
"epoch": 0.09549795361527967,
"grad_norm": 2.4046638011932373,
"learning_rate": 6.363636363636364e-06,
"loss": 0.5772,
"step": 35
},
{
"epoch": 0.09822646657571624,
"grad_norm": 2.6150600910186768,
"learning_rate": 6.545454545454546e-06,
"loss": 0.5681,
"step": 36
},
{
"epoch": 0.1009549795361528,
"grad_norm": 3.809873580932617,
"learning_rate": 6.7272727272727275e-06,
"loss": 0.5616,
"step": 37
},
{
"epoch": 0.10368349249658936,
"grad_norm": 2.455195665359497,
"learning_rate": 6.90909090909091e-06,
"loss": 0.5595,
"step": 38
},
{
"epoch": 0.10641200545702592,
"grad_norm": 1.6467875242233276,
"learning_rate": 7.0909090909090916e-06,
"loss": 0.5632,
"step": 39
},
{
"epoch": 0.10914051841746249,
"grad_norm": 1.7783292531967163,
"learning_rate": 7.272727272727273e-06,
"loss": 0.5377,
"step": 40
},
{
"epoch": 0.11186903137789904,
"grad_norm": 2.9395012855529785,
"learning_rate": 7.454545454545456e-06,
"loss": 0.5424,
"step": 41
},
{
"epoch": 0.1145975443383356,
"grad_norm": 4.087269306182861,
"learning_rate": 7.636363636363638e-06,
"loss": 0.5374,
"step": 42
},
{
"epoch": 0.11732605729877217,
"grad_norm": 1.7641241550445557,
"learning_rate": 7.81818181818182e-06,
"loss": 0.5343,
"step": 43
},
{
"epoch": 0.12005457025920874,
"grad_norm": 1.872262954711914,
"learning_rate": 8.000000000000001e-06,
"loss": 0.522,
"step": 44
},
{
"epoch": 0.12278308321964529,
"grad_norm": 2.4888625144958496,
"learning_rate": 8.181818181818183e-06,
"loss": 0.516,
"step": 45
},
{
"epoch": 0.12551159618008187,
"grad_norm": 2.895923614501953,
"learning_rate": 8.363636363636365e-06,
"loss": 0.5191,
"step": 46
},
{
"epoch": 0.12824010914051842,
"grad_norm": 2.1216979026794434,
"learning_rate": 8.545454545454546e-06,
"loss": 0.4997,
"step": 47
},
{
"epoch": 0.13096862210095497,
"grad_norm": 4.029631614685059,
"learning_rate": 8.727272727272728e-06,
"loss": 0.5045,
"step": 48
},
{
"epoch": 0.13369713506139155,
"grad_norm": 2.3348028659820557,
"learning_rate": 8.90909090909091e-06,
"loss": 0.5104,
"step": 49
},
{
"epoch": 0.1364256480218281,
"grad_norm": 2.498213291168213,
"learning_rate": 9.090909090909091e-06,
"loss": 0.4862,
"step": 50
},
{
"epoch": 0.13915416098226466,
"grad_norm": 3.932359218597412,
"learning_rate": 9.272727272727273e-06,
"loss": 0.4811,
"step": 51
},
{
"epoch": 0.14188267394270124,
"grad_norm": 2.6025686264038086,
"learning_rate": 9.454545454545456e-06,
"loss": 0.4873,
"step": 52
},
{
"epoch": 0.1446111869031378,
"grad_norm": 4.504275321960449,
"learning_rate": 9.636363636363638e-06,
"loss": 0.4705,
"step": 53
},
{
"epoch": 0.14733969986357434,
"grad_norm": 2.8336246013641357,
"learning_rate": 9.81818181818182e-06,
"loss": 0.4734,
"step": 54
},
{
"epoch": 0.15006821282401092,
"grad_norm": 3.5705838203430176,
"learning_rate": 1e-05,
"loss": 0.464,
"step": 55
},
{
"epoch": 0.15279672578444747,
"grad_norm": 4.8748931884765625,
"learning_rate": 1.0181818181818182e-05,
"loss": 0.4677,
"step": 56
},
{
"epoch": 0.15552523874488403,
"grad_norm": 3.273179292678833,
"learning_rate": 1.0363636363636364e-05,
"loss": 0.4663,
"step": 57
},
{
"epoch": 0.1582537517053206,
"grad_norm": 2.9470977783203125,
"learning_rate": 1.0545454545454546e-05,
"loss": 0.4544,
"step": 58
},
{
"epoch": 0.16098226466575716,
"grad_norm": 5.8234171867370605,
"learning_rate": 1.0727272727272729e-05,
"loss": 0.4569,
"step": 59
},
{
"epoch": 0.16371077762619374,
"grad_norm": 2.682898759841919,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.4523,
"step": 60
},
{
"epoch": 0.1664392905866303,
"grad_norm": 9.162665367126465,
"learning_rate": 1.1090909090909092e-05,
"loss": 0.4428,
"step": 61
},
{
"epoch": 0.16916780354706684,
"grad_norm": 8.336901664733887,
"learning_rate": 1.1272727272727272e-05,
"loss": 0.4493,
"step": 62
},
{
"epoch": 0.17189631650750342,
"grad_norm": 3.5765390396118164,
"learning_rate": 1.1454545454545455e-05,
"loss": 0.4405,
"step": 63
},
{
"epoch": 0.17462482946793997,
"grad_norm": 6.3450026512146,
"learning_rate": 1.1636363636363637e-05,
"loss": 0.4349,
"step": 64
},
{
"epoch": 0.17735334242837653,
"grad_norm": 3.6071555614471436,
"learning_rate": 1.181818181818182e-05,
"loss": 0.4272,
"step": 65
},
{
"epoch": 0.1800818553888131,
"grad_norm": 3.7100143432617188,
"learning_rate": 1.2e-05,
"loss": 0.4161,
"step": 66
},
{
"epoch": 0.18281036834924966,
"grad_norm": 3.773010492324829,
"learning_rate": 1.2181818181818184e-05,
"loss": 0.4142,
"step": 67
},
{
"epoch": 0.1855388813096862,
"grad_norm": 2.8710012435913086,
"learning_rate": 1.2363636363636364e-05,
"loss": 0.4193,
"step": 68
},
{
"epoch": 0.1882673942701228,
"grad_norm": 1.9093347787857056,
"learning_rate": 1.2545454545454547e-05,
"loss": 0.417,
"step": 69
},
{
"epoch": 0.19099590723055934,
"grad_norm": 2.574664831161499,
"learning_rate": 1.2727272727272728e-05,
"loss": 0.4023,
"step": 70
},
{
"epoch": 0.1937244201909959,
"grad_norm": 2.875770330429077,
"learning_rate": 1.2909090909090912e-05,
"loss": 0.4032,
"step": 71
},
{
"epoch": 0.19645293315143247,
"grad_norm": 3.4898297786712646,
"learning_rate": 1.3090909090909092e-05,
"loss": 0.3789,
"step": 72
},
{
"epoch": 0.19918144611186903,
"grad_norm": 2.9872593879699707,
"learning_rate": 1.3272727272727275e-05,
"loss": 0.3955,
"step": 73
},
{
"epoch": 0.2019099590723056,
"grad_norm": 2.160285472869873,
"learning_rate": 1.3454545454545455e-05,
"loss": 0.3881,
"step": 74
},
{
"epoch": 0.20463847203274216,
"grad_norm": 2.384871006011963,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.3921,
"step": 75
},
{
"epoch": 0.2073669849931787,
"grad_norm": 3.565929412841797,
"learning_rate": 1.381818181818182e-05,
"loss": 0.377,
"step": 76
},
{
"epoch": 0.2100954979536153,
"grad_norm": 2.4952895641326904,
"learning_rate": 1.4e-05,
"loss": 0.3795,
"step": 77
},
{
"epoch": 0.21282401091405184,
"grad_norm": 4.278308868408203,
"learning_rate": 1.4181818181818183e-05,
"loss": 0.3807,
"step": 78
},
{
"epoch": 0.2155525238744884,
"grad_norm": 2.777308464050293,
"learning_rate": 1.4363636363636365e-05,
"loss": 0.378,
"step": 79
},
{
"epoch": 0.21828103683492497,
"grad_norm": 2.3112456798553467,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.3726,
"step": 80
},
{
"epoch": 0.22100954979536153,
"grad_norm": 3.0835683345794678,
"learning_rate": 1.4727272727272728e-05,
"loss": 0.3741,
"step": 81
},
{
"epoch": 0.22373806275579808,
"grad_norm": 3.618603467941284,
"learning_rate": 1.4909090909090911e-05,
"loss": 0.3631,
"step": 82
},
{
"epoch": 0.22646657571623466,
"grad_norm": 4.083821773529053,
"learning_rate": 1.5090909090909091e-05,
"loss": 0.3708,
"step": 83
},
{
"epoch": 0.2291950886766712,
"grad_norm": 1.9423736333847046,
"learning_rate": 1.5272727272727276e-05,
"loss": 0.3627,
"step": 84
},
{
"epoch": 0.23192360163710776,
"grad_norm": 4.401960372924805,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.362,
"step": 85
},
{
"epoch": 0.23465211459754434,
"grad_norm": 3.1123781204223633,
"learning_rate": 1.563636363636364e-05,
"loss": 0.3578,
"step": 86
},
{
"epoch": 0.2373806275579809,
"grad_norm": 2.0257530212402344,
"learning_rate": 1.5818181818181818e-05,
"loss": 0.356,
"step": 87
},
{
"epoch": 0.24010914051841747,
"grad_norm": 3.943086624145508,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.3503,
"step": 88
},
{
"epoch": 0.24283765347885403,
"grad_norm": 4.66082763671875,
"learning_rate": 1.6181818181818184e-05,
"loss": 0.347,
"step": 89
},
{
"epoch": 0.24556616643929058,
"grad_norm": 2.2836050987243652,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.3516,
"step": 90
},
{
"epoch": 0.24829467939972716,
"grad_norm": 6.001435279846191,
"learning_rate": 1.6545454545454548e-05,
"loss": 0.3421,
"step": 91
},
{
"epoch": 0.25102319236016374,
"grad_norm": 3.366344451904297,
"learning_rate": 1.672727272727273e-05,
"loss": 0.3506,
"step": 92
},
{
"epoch": 0.25375170532060026,
"grad_norm": 6.4907379150390625,
"learning_rate": 1.690909090909091e-05,
"loss": 0.3514,
"step": 93
},
{
"epoch": 0.25648021828103684,
"grad_norm": 3.615786075592041,
"learning_rate": 1.7090909090909092e-05,
"loss": 0.3589,
"step": 94
},
{
"epoch": 0.2592087312414734,
"grad_norm": 9.323090553283691,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.3548,
"step": 95
},
{
"epoch": 0.26193724420190995,
"grad_norm": 5.748571395874023,
"learning_rate": 1.7454545454545456e-05,
"loss": 0.345,
"step": 96
},
{
"epoch": 0.2646657571623465,
"grad_norm": 6.471901893615723,
"learning_rate": 1.7636363636363637e-05,
"loss": 0.3495,
"step": 97
},
{
"epoch": 0.2673942701227831,
"grad_norm": 6.451484680175781,
"learning_rate": 1.781818181818182e-05,
"loss": 0.3305,
"step": 98
},
{
"epoch": 0.27012278308321963,
"grad_norm": 5.596473693847656,
"learning_rate": 1.8e-05,
"loss": 0.3377,
"step": 99
},
{
"epoch": 0.2728512960436562,
"grad_norm": 4.3230695724487305,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.3288,
"step": 100
},
{
"epoch": 0.2755798090040928,
"grad_norm": 5.818665981292725,
"learning_rate": 1.8363636363636367e-05,
"loss": 0.3232,
"step": 101
},
{
"epoch": 0.2783083219645293,
"grad_norm": 4.472134113311768,
"learning_rate": 1.8545454545454545e-05,
"loss": 0.3373,
"step": 102
},
{
"epoch": 0.2810368349249659,
"grad_norm": 7.769753932952881,
"learning_rate": 1.872727272727273e-05,
"loss": 0.3182,
"step": 103
},
{
"epoch": 0.2837653478854025,
"grad_norm": 7.661220550537109,
"learning_rate": 1.8909090909090912e-05,
"loss": 0.3243,
"step": 104
},
{
"epoch": 0.286493860845839,
"grad_norm": 3.2343485355377197,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.3192,
"step": 105
},
{
"epoch": 0.2892223738062756,
"grad_norm": 3.323853015899658,
"learning_rate": 1.9272727272727275e-05,
"loss": 0.3075,
"step": 106
},
{
"epoch": 0.29195088676671216,
"grad_norm": 5.918493270874023,
"learning_rate": 1.9454545454545457e-05,
"loss": 0.3103,
"step": 107
},
{
"epoch": 0.2946793997271487,
"grad_norm": 3.1720244884490967,
"learning_rate": 1.963636363636364e-05,
"loss": 0.305,
"step": 108
},
{
"epoch": 0.29740791268758526,
"grad_norm": 6.2536821365356445,
"learning_rate": 1.981818181818182e-05,
"loss": 0.3066,
"step": 109
},
{
"epoch": 0.30013642564802184,
"grad_norm": 5.35992431640625,
"learning_rate": 2e-05,
"loss": 0.3027,
"step": 110
},
{
"epoch": 0.30286493860845837,
"grad_norm": 5.9867262840271,
"learning_rate": 1.9999949446003432e-05,
"loss": 0.3101,
"step": 111
},
{
"epoch": 0.30559345156889495,
"grad_norm": 5.217441082000732,
"learning_rate": 1.9999797784524866e-05,
"loss": 0.2974,
"step": 112
},
{
"epoch": 0.3083219645293315,
"grad_norm": 5.573215484619141,
"learning_rate": 1.9999545017097726e-05,
"loss": 0.3021,
"step": 113
},
{
"epoch": 0.31105047748976805,
"grad_norm": 5.682995796203613,
"learning_rate": 1.999919114627769e-05,
"loss": 0.305,
"step": 114
},
{
"epoch": 0.31377899045020463,
"grad_norm": 5.977804660797119,
"learning_rate": 1.9998736175642674e-05,
"loss": 0.3014,
"step": 115
},
{
"epoch": 0.3165075034106412,
"grad_norm": 5.056419372558594,
"learning_rate": 1.9998180109792793e-05,
"loss": 0.2918,
"step": 116
},
{
"epoch": 0.31923601637107774,
"grad_norm": 4.191486835479736,
"learning_rate": 1.999752295435032e-05,
"loss": 0.2859,
"step": 117
},
{
"epoch": 0.3219645293315143,
"grad_norm": 3.9454257488250732,
"learning_rate": 1.999676471595962e-05,
"loss": 0.2853,
"step": 118
},
{
"epoch": 0.3246930422919509,
"grad_norm": 6.6568922996521,
"learning_rate": 1.9995905402287094e-05,
"loss": 0.2906,
"step": 119
},
{
"epoch": 0.3274215552523875,
"grad_norm": 6.2630133628845215,
"learning_rate": 1.9994945022021085e-05,
"loss": 0.2945,
"step": 120
},
{
"epoch": 0.330150068212824,
"grad_norm": 3.6698672771453857,
"learning_rate": 1.9993883584871807e-05,
"loss": 0.2856,
"step": 121
},
{
"epoch": 0.3328785811732606,
"grad_norm": 4.083488464355469,
"learning_rate": 1.9992721101571238e-05,
"loss": 0.2897,
"step": 122
},
{
"epoch": 0.33560709413369716,
"grad_norm": 5.033275127410889,
"learning_rate": 1.999145758387301e-05,
"loss": 0.2853,
"step": 123
},
{
"epoch": 0.3383356070941337,
"grad_norm": 3.358428955078125,
"learning_rate": 1.9990093044552304e-05,
"loss": 0.2848,
"step": 124
},
{
"epoch": 0.34106412005457026,
"grad_norm": 6.593973159790039,
"learning_rate": 1.9988627497405696e-05,
"loss": 0.2868,
"step": 125
},
{
"epoch": 0.34379263301500684,
"grad_norm": 5.836681842803955,
"learning_rate": 1.9987060957251047e-05,
"loss": 0.2785,
"step": 126
},
{
"epoch": 0.34652114597544337,
"grad_norm": 2.7933859825134277,
"learning_rate": 1.9985393439927325e-05,
"loss": 0.273,
"step": 127
},
{
"epoch": 0.34924965893587995,
"grad_norm": 3.4475436210632324,
"learning_rate": 1.998362496229446e-05,
"loss": 0.2745,
"step": 128
},
{
"epoch": 0.3519781718963165,
"grad_norm": 4.551499366760254,
"learning_rate": 1.9981755542233175e-05,
"loss": 0.2792,
"step": 129
},
{
"epoch": 0.35470668485675305,
"grad_norm": 3.666813611984253,
"learning_rate": 1.997978519864481e-05,
"loss": 0.2711,
"step": 130
},
{
"epoch": 0.35743519781718963,
"grad_norm": 5.33425235748291,
"learning_rate": 1.9977713951451102e-05,
"loss": 0.27,
"step": 131
},
{
"epoch": 0.3601637107776262,
"grad_norm": 4.913830280303955,
"learning_rate": 1.9975541821594028e-05,
"loss": 0.2732,
"step": 132
},
{
"epoch": 0.36289222373806274,
"grad_norm": 3.9749391078948975,
"learning_rate": 1.9973268831035547e-05,
"loss": 0.2693,
"step": 133
},
{
"epoch": 0.3656207366984993,
"grad_norm": 3.055846691131592,
"learning_rate": 1.9970895002757413e-05,
"loss": 0.2692,
"step": 134
},
{
"epoch": 0.3683492496589359,
"grad_norm": 6.07874059677124,
"learning_rate": 1.996842036076093e-05,
"loss": 0.273,
"step": 135
},
{
"epoch": 0.3710777626193724,
"grad_norm": 4.657766819000244,
"learning_rate": 1.99658449300667e-05,
"loss": 0.2819,
"step": 136
},
{
"epoch": 0.373806275579809,
"grad_norm": 4.3276801109313965,
"learning_rate": 1.9963168736714395e-05,
"loss": 0.2737,
"step": 137
},
{
"epoch": 0.3765347885402456,
"grad_norm": 4.3239569664001465,
"learning_rate": 1.9960391807762462e-05,
"loss": 0.268,
"step": 138
},
{
"epoch": 0.3792633015006821,
"grad_norm": 3.855254888534546,
"learning_rate": 1.9957514171287875e-05,
"loss": 0.2607,
"step": 139
},
{
"epoch": 0.3819918144611187,
"grad_norm": 3.0584146976470947,
"learning_rate": 1.995453585638584e-05,
"loss": 0.2652,
"step": 140
},
{
"epoch": 0.38472032742155526,
"grad_norm": 4.666026592254639,
"learning_rate": 1.9951456893169497e-05,
"loss": 0.2614,
"step": 141
},
{
"epoch": 0.3874488403819918,
"grad_norm": 3.672959566116333,
"learning_rate": 1.994827731276963e-05,
"loss": 0.2725,
"step": 142
},
{
"epoch": 0.39017735334242837,
"grad_norm": 5.299169063568115,
"learning_rate": 1.994499714733434e-05,
"loss": 0.2629,
"step": 143
},
{
"epoch": 0.39290586630286495,
"grad_norm": 4.301833629608154,
"learning_rate": 1.9941616430028713e-05,
"loss": 0.2597,
"step": 144
},
{
"epoch": 0.3956343792633015,
"grad_norm": 4.388802528381348,
"learning_rate": 1.993813519503451e-05,
"loss": 0.2702,
"step": 145
},
{
"epoch": 0.39836289222373805,
"grad_norm": 3.4200570583343506,
"learning_rate": 1.9934553477549795e-05,
"loss": 0.2608,
"step": 146
},
{
"epoch": 0.40109140518417463,
"grad_norm": 6.084020614624023,
"learning_rate": 1.99308713137886e-05,
"loss": 0.2665,
"step": 147
},
{
"epoch": 0.4038199181446112,
"grad_norm": 4.747646331787109,
"learning_rate": 1.992708874098054e-05,
"loss": 0.2605,
"step": 148
},
{
"epoch": 0.40654843110504774,
"grad_norm": 3.8556838035583496,
"learning_rate": 1.992320579737045e-05,
"loss": 0.2562,
"step": 149
},
{
"epoch": 0.4092769440654843,
"grad_norm": 3.480994462966919,
"learning_rate": 1.9919222522217998e-05,
"loss": 0.2614,
"step": 150
},
{
"epoch": 0.4120054570259209,
"grad_norm": 3.6695926189422607,
"learning_rate": 1.9915138955797272e-05,
"loss": 0.2593,
"step": 151
},
{
"epoch": 0.4147339699863574,
"grad_norm": 2.5881447792053223,
"learning_rate": 1.9910955139396395e-05,
"loss": 0.2598,
"step": 152
},
{
"epoch": 0.417462482946794,
"grad_norm": 5.744887351989746,
"learning_rate": 1.99066711153171e-05,
"loss": 0.2553,
"step": 153
},
{
"epoch": 0.4201909959072306,
"grad_norm": 5.279797554016113,
"learning_rate": 1.990228692687429e-05,
"loss": 0.2616,
"step": 154
},
{
"epoch": 0.4229195088676671,
"grad_norm": 4.670532703399658,
"learning_rate": 1.9897802618395614e-05,
"loss": 0.2636,
"step": 155
},
{
"epoch": 0.4256480218281037,
"grad_norm": 4.182051658630371,
"learning_rate": 1.9893218235221016e-05,
"loss": 0.2604,
"step": 156
},
{
"epoch": 0.42837653478854026,
"grad_norm": 3.6281673908233643,
"learning_rate": 1.988853382370228e-05,
"loss": 0.2571,
"step": 157
},
{
"epoch": 0.4311050477489768,
"grad_norm": 3.440207004547119,
"learning_rate": 1.988374943120254e-05,
"loss": 0.2569,
"step": 158
},
{
"epoch": 0.43383356070941337,
"grad_norm": 4.212845802307129,
"learning_rate": 1.9878865106095838e-05,
"loss": 0.2482,
"step": 159
},
{
"epoch": 0.43656207366984995,
"grad_norm": 4.241349220275879,
"learning_rate": 1.9873880897766597e-05,
"loss": 0.2549,
"step": 160
},
{
"epoch": 0.4392905866302865,
"grad_norm": 3.784503221511841,
"learning_rate": 1.9868796856609154e-05,
"loss": 0.2641,
"step": 161
},
{
"epoch": 0.44201909959072305,
"grad_norm": 2.801694631576538,
"learning_rate": 1.9863613034027224e-05,
"loss": 0.2501,
"step": 162
},
{
"epoch": 0.44474761255115963,
"grad_norm": 5.349376678466797,
"learning_rate": 1.9858329482433404e-05,
"loss": 0.2552,
"step": 163
},
{
"epoch": 0.44747612551159616,
"grad_norm": 4.352358341217041,
"learning_rate": 1.985294625524861e-05,
"loss": 0.2534,
"step": 164
},
{
"epoch": 0.45020463847203274,
"grad_norm": 4.777315616607666,
"learning_rate": 1.984746340690159e-05,
"loss": 0.2555,
"step": 165
},
{
"epoch": 0.4529331514324693,
"grad_norm": 4.782495498657227,
"learning_rate": 1.9841880992828306e-05,
"loss": 0.2544,
"step": 166
},
{
"epoch": 0.45566166439290584,
"grad_norm": 2.6630942821502686,
"learning_rate": 1.983619906947144e-05,
"loss": 0.2542,
"step": 167
},
{
"epoch": 0.4583901773533424,
"grad_norm": 2.5042455196380615,
"learning_rate": 1.9830417694279766e-05,
"loss": 0.2511,
"step": 168
},
{
"epoch": 0.461118690313779,
"grad_norm": 4.9226579666137695,
"learning_rate": 1.9824536925707622e-05,
"loss": 0.2561,
"step": 169
},
{
"epoch": 0.4638472032742155,
"grad_norm": 4.228565216064453,
"learning_rate": 1.981855682321427e-05,
"loss": 0.2497,
"step": 170
},
{
"epoch": 0.4665757162346521,
"grad_norm": 4.8918280601501465,
"learning_rate": 1.9812477447263324e-05,
"loss": 0.2509,
"step": 171
},
{
"epoch": 0.4693042291950887,
"grad_norm": 4.446721076965332,
"learning_rate": 1.9806298859322143e-05,
"loss": 0.2519,
"step": 172
},
{
"epoch": 0.47203274215552526,
"grad_norm": 3.0315163135528564,
"learning_rate": 1.980002112186118e-05,
"loss": 0.247,
"step": 173
},
{
"epoch": 0.4747612551159618,
"grad_norm": 3.33701229095459,
"learning_rate": 1.979364429835339e-05,
"loss": 0.2516,
"step": 174
},
{
"epoch": 0.47748976807639837,
"grad_norm": 5.234941482543945,
"learning_rate": 1.9787168453273546e-05,
"loss": 0.2538,
"step": 175
},
{
"epoch": 0.48021828103683495,
"grad_norm": 3.7416799068450928,
"learning_rate": 1.978059365209762e-05,
"loss": 0.2578,
"step": 176
},
{
"epoch": 0.4829467939972715,
"grad_norm": 4.0507330894470215,
"learning_rate": 1.9773919961302113e-05,
"loss": 0.2515,
"step": 177
},
{
"epoch": 0.48567530695770805,
"grad_norm": 4.139606952667236,
"learning_rate": 1.9767147448363366e-05,
"loss": 0.2502,
"step": 178
},
{
"epoch": 0.48840381991814463,
"grad_norm": 3.8178112506866455,
"learning_rate": 1.9760276181756905e-05,
"loss": 0.2508,
"step": 179
},
{
"epoch": 0.49113233287858116,
"grad_norm": 3.926447629928589,
"learning_rate": 1.975330623095672e-05,
"loss": 0.2475,
"step": 180
},
{
"epoch": 0.49386084583901774,
"grad_norm": 3.354318141937256,
"learning_rate": 1.9746237666434588e-05,
"loss": 0.2502,
"step": 181
},
{
"epoch": 0.4965893587994543,
"grad_norm": 3.2970614433288574,
"learning_rate": 1.9739070559659347e-05,
"loss": 0.2472,
"step": 182
},
{
"epoch": 0.49931787175989084,
"grad_norm": 4.805551052093506,
"learning_rate": 1.973180498309618e-05,
"loss": 0.2427,
"step": 183
},
{
"epoch": 0.5020463847203275,
"grad_norm": 3.856684446334839,
"learning_rate": 1.9724441010205865e-05,
"loss": 0.2441,
"step": 184
},
{
"epoch": 0.504774897680764,
"grad_norm": 3.8979854583740234,
"learning_rate": 1.9716978715444056e-05,
"loss": 0.2463,
"step": 185
},
{
"epoch": 0.5075034106412005,
"grad_norm": 3.709199905395508,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.246,
"step": 186
},
{
"epoch": 0.5102319236016372,
"grad_norm": 4.20219612121582,
"learning_rate": 1.9701759463098377e-05,
"loss": 0.2512,
"step": 187
},
{
"epoch": 0.5129604365620737,
"grad_norm": 3.796937942504883,
"learning_rate": 1.9694002659393306e-05,
"loss": 0.2482,
"step": 188
},
{
"epoch": 0.5156889495225102,
"grad_norm": 3.9981391429901123,
"learning_rate": 1.9686147841572803e-05,
"loss": 0.2359,
"step": 189
},
{
"epoch": 0.5184174624829468,
"grad_norm": 3.802412986755371,
"learning_rate": 1.9678195089055347e-05,
"loss": 0.2417,
"step": 190
},
{
"epoch": 0.5211459754433834,
"grad_norm": 4.142465114593506,
"learning_rate": 1.967014448224963e-05,
"loss": 0.2392,
"step": 191
},
{
"epoch": 0.5238744884038199,
"grad_norm": 3.554514169692993,
"learning_rate": 1.9661996102553716e-05,
"loss": 0.2399,
"step": 192
},
{
"epoch": 0.5266030013642565,
"grad_norm": 3.4873900413513184,
"learning_rate": 1.965375003235424e-05,
"loss": 0.2481,
"step": 193
},
{
"epoch": 0.529331514324693,
"grad_norm": 3.5542309284210205,
"learning_rate": 1.9645406355025565e-05,
"loss": 0.2423,
"step": 194
},
{
"epoch": 0.5320600272851296,
"grad_norm": 3.76362943649292,
"learning_rate": 1.9636965154928932e-05,
"loss": 0.2383,
"step": 195
},
{
"epoch": 0.5347885402455662,
"grad_norm": 2.883169651031494,
"learning_rate": 1.9628426517411625e-05,
"loss": 0.2383,
"step": 196
},
{
"epoch": 0.5375170532060027,
"grad_norm": 3.5655651092529297,
"learning_rate": 1.9619790528806092e-05,
"loss": 0.2387,
"step": 197
},
{
"epoch": 0.5402455661664393,
"grad_norm": 3.3741378784179688,
"learning_rate": 1.9611057276429085e-05,
"loss": 0.2444,
"step": 198
},
{
"epoch": 0.5429740791268759,
"grad_norm": 4.598501682281494,
"learning_rate": 1.9602226848580762e-05,
"loss": 0.2406,
"step": 199
},
{
"epoch": 0.5457025920873124,
"grad_norm": 4.26082181930542,
"learning_rate": 1.959329933454381e-05,
"loss": 0.2508,
"step": 200
},
{
"epoch": 0.548431105047749,
"grad_norm": 3.1739697456359863,
"learning_rate": 1.958427482458253e-05,
"loss": 0.242,
"step": 201
},
{
"epoch": 0.5511596180081856,
"grad_norm": 2.8693127632141113,
"learning_rate": 1.957515340994193e-05,
"loss": 0.2367,
"step": 202
},
{
"epoch": 0.5538881309686221,
"grad_norm": 4.578694820404053,
"learning_rate": 1.95659351828468e-05,
"loss": 0.2378,
"step": 203
},
{
"epoch": 0.5566166439290586,
"grad_norm": 3.6820926666259766,
"learning_rate": 1.9556620236500794e-05,
"loss": 0.2413,
"step": 204
},
{
"epoch": 0.5593451568894953,
"grad_norm": 4.010608196258545,
"learning_rate": 1.954720866508546e-05,
"loss": 0.2477,
"step": 205
},
{
"epoch": 0.5620736698499318,
"grad_norm": 3.723583221435547,
"learning_rate": 1.9537700563759303e-05,
"loss": 0.2376,
"step": 206
},
{
"epoch": 0.5648021828103683,
"grad_norm": 3.185204029083252,
"learning_rate": 1.9528096028656835e-05,
"loss": 0.2402,
"step": 207
},
{
"epoch": 0.567530695770805,
"grad_norm": 3.1054224967956543,
"learning_rate": 1.9518395156887574e-05,
"loss": 0.2401,
"step": 208
},
{
"epoch": 0.5702592087312415,
"grad_norm": 4.654784202575684,
"learning_rate": 1.9508598046535095e-05,
"loss": 0.2515,
"step": 209
},
{
"epoch": 0.572987721691678,
"grad_norm": 4.25405216217041,
"learning_rate": 1.949870479665602e-05,
"loss": 0.2442,
"step": 210
},
{
"epoch": 0.5757162346521146,
"grad_norm": 3.365250587463379,
"learning_rate": 1.9488715507279e-05,
"loss": 0.2379,
"step": 211
},
{
"epoch": 0.5784447476125512,
"grad_norm": 2.708874464035034,
"learning_rate": 1.9478630279403737e-05,
"loss": 0.2289,
"step": 212
},
{
"epoch": 0.5811732605729877,
"grad_norm": 4.695353031158447,
"learning_rate": 1.9468449214999956e-05,
"loss": 0.2449,
"step": 213
},
{
"epoch": 0.5839017735334243,
"grad_norm": 3.746852159500122,
"learning_rate": 1.9458172417006347e-05,
"loss": 0.2339,
"step": 214
},
{
"epoch": 0.5866302864938608,
"grad_norm": 3.6476454734802246,
"learning_rate": 1.9447799989329557e-05,
"loss": 0.2382,
"step": 215
},
{
"epoch": 0.5893587994542974,
"grad_norm": 3.3392837047576904,
"learning_rate": 1.943733203684312e-05,
"loss": 0.2406,
"step": 216
},
{
"epoch": 0.592087312414734,
"grad_norm": 3.441448211669922,
"learning_rate": 1.9426768665386397e-05,
"loss": 0.2446,
"step": 217
},
{
"epoch": 0.5948158253751705,
"grad_norm": 2.790771722793579,
"learning_rate": 1.9416109981763526e-05,
"loss": 0.2325,
"step": 218
},
{
"epoch": 0.597544338335607,
"grad_norm": 5.028257369995117,
"learning_rate": 1.9405356093742314e-05,
"loss": 0.2312,
"step": 219
},
{
"epoch": 0.6002728512960437,
"grad_norm": 4.487533092498779,
"learning_rate": 1.939450711005316e-05,
"loss": 0.2465,
"step": 220
},
{
"epoch": 0.6030013642564802,
"grad_norm": 3.343076467514038,
"learning_rate": 1.9383563140387966e-05,
"loss": 0.234,
"step": 221
},
{
"epoch": 0.6057298772169167,
"grad_norm": 3.0385632514953613,
"learning_rate": 1.9372524295399014e-05,
"loss": 0.233,
"step": 222
},
{
"epoch": 0.6084583901773534,
"grad_norm": 3.32714581489563,
"learning_rate": 1.9361390686697847e-05,
"loss": 0.2385,
"step": 223
},
{
"epoch": 0.6111869031377899,
"grad_norm": 2.715806245803833,
"learning_rate": 1.9350162426854152e-05,
"loss": 0.2317,
"step": 224
},
{
"epoch": 0.6139154160982264,
"grad_norm": 4.670190334320068,
"learning_rate": 1.9338839629394606e-05,
"loss": 0.2349,
"step": 225
},
{
"epoch": 0.616643929058663,
"grad_norm": 3.5198147296905518,
"learning_rate": 1.9327422408801744e-05,
"loss": 0.2249,
"step": 226
},
{
"epoch": 0.6193724420190996,
"grad_norm": 3.903116464614868,
"learning_rate": 1.9315910880512792e-05,
"loss": 0.2383,
"step": 227
},
{
"epoch": 0.6221009549795361,
"grad_norm": 3.9901745319366455,
"learning_rate": 1.93043051609185e-05,
"loss": 0.2412,
"step": 228
},
{
"epoch": 0.6248294679399727,
"grad_norm": 2.8431413173675537,
"learning_rate": 1.929260536736198e-05,
"loss": 0.2393,
"step": 229
},
{
"epoch": 0.6275579809004093,
"grad_norm": 3.0009732246398926,
"learning_rate": 1.9280811618137486e-05,
"loss": 0.2332,
"step": 230
},
{
"epoch": 0.6302864938608458,
"grad_norm": 4.375698566436768,
"learning_rate": 1.926892403248925e-05,
"loss": 0.2394,
"step": 231
},
{
"epoch": 0.6330150068212824,
"grad_norm": 3.7204086780548096,
"learning_rate": 1.9256942730610268e-05,
"loss": 0.2366,
"step": 232
},
{
"epoch": 0.635743519781719,
"grad_norm": 3.1521670818328857,
"learning_rate": 1.9244867833641078e-05,
"loss": 0.2355,
"step": 233
},
{
"epoch": 0.6384720327421555,
"grad_norm": 2.801316499710083,
"learning_rate": 1.9232699463668543e-05,
"loss": 0.2345,
"step": 234
},
{
"epoch": 0.6412005457025921,
"grad_norm": 4.499333381652832,
"learning_rate": 1.9220437743724605e-05,
"loss": 0.2311,
"step": 235
},
{
"epoch": 0.6439290586630286,
"grad_norm": 3.6295053958892822,
"learning_rate": 1.9208082797785057e-05,
"loss": 0.2304,
"step": 236
},
{
"epoch": 0.6466575716234653,
"grad_norm": 3.377734899520874,
"learning_rate": 1.9195634750768276e-05,
"loss": 0.2304,
"step": 237
},
{
"epoch": 0.6493860845839018,
"grad_norm": 3.2624452114105225,
"learning_rate": 1.9183093728533966e-05,
"loss": 0.2275,
"step": 238
},
{
"epoch": 0.6521145975443383,
"grad_norm": 3.3896477222442627,
"learning_rate": 1.9170459857881888e-05,
"loss": 0.2292,
"step": 239
},
{
"epoch": 0.654843110504775,
"grad_norm": 2.768524646759033,
"learning_rate": 1.9157733266550577e-05,
"loss": 0.2371,
"step": 240
},
{
"epoch": 0.6575716234652115,
"grad_norm": 3.741811513900757,
"learning_rate": 1.9144914083216036e-05,
"loss": 0.2302,
"step": 241
},
{
"epoch": 0.660300136425648,
"grad_norm": 3.910012722015381,
"learning_rate": 1.913200243749046e-05,
"loss": 0.2306,
"step": 242
},
{
"epoch": 0.6630286493860846,
"grad_norm": 3.4035255908966064,
"learning_rate": 1.91189984599209e-05,
"loss": 0.2302,
"step": 243
},
{
"epoch": 0.6657571623465212,
"grad_norm": 2.929786205291748,
"learning_rate": 1.910590228198798e-05,
"loss": 0.2316,
"step": 244
},
{
"epoch": 0.6684856753069577,
"grad_norm": 3.5022189617156982,
"learning_rate": 1.9092714036104508e-05,
"loss": 0.2387,
"step": 245
},
{
"epoch": 0.6712141882673943,
"grad_norm": 2.599740982055664,
"learning_rate": 1.9079433855614203e-05,
"loss": 0.2284,
"step": 246
},
{
"epoch": 0.6739427012278308,
"grad_norm": 4.416684627532959,
"learning_rate": 1.9066061874790302e-05,
"loss": 0.2323,
"step": 247
},
{
"epoch": 0.6766712141882674,
"grad_norm": 4.048118591308594,
"learning_rate": 1.9052598228834217e-05,
"loss": 0.2318,
"step": 248
},
{
"epoch": 0.679399727148704,
"grad_norm": 3.1929619312286377,
"learning_rate": 1.9039043053874175e-05,
"loss": 0.2375,
"step": 249
},
{
"epoch": 0.6821282401091405,
"grad_norm": 2.838665246963501,
"learning_rate": 1.9025396486963827e-05,
"loss": 0.2309,
"step": 250
},
{
"epoch": 0.684856753069577,
"grad_norm": 3.510965585708618,
"learning_rate": 1.9011658666080873e-05,
"loss": 0.226,
"step": 251
},
{
"epoch": 0.6875852660300137,
"grad_norm": 3.3126542568206787,
"learning_rate": 1.8997829730125662e-05,
"loss": 0.2255,
"step": 252
},
{
"epoch": 0.6903137789904502,
"grad_norm": 2.888857126235962,
"learning_rate": 1.898390981891979e-05,
"loss": 0.2244,
"step": 253
},
{
"epoch": 0.6930422919508867,
"grad_norm": 3.039818286895752,
"learning_rate": 1.8969899073204687e-05,
"loss": 0.219,
"step": 254
},
{
"epoch": 0.6957708049113234,
"grad_norm": 3.580467939376831,
"learning_rate": 1.895579763464019e-05,
"loss": 0.2317,
"step": 255
},
{
"epoch": 0.6984993178717599,
"grad_norm": 2.6989052295684814,
"learning_rate": 1.8941605645803115e-05,
"loss": 0.2276,
"step": 256
},
{
"epoch": 0.7012278308321964,
"grad_norm": 4.139655590057373,
"learning_rate": 1.8927323250185815e-05,
"loss": 0.2352,
"step": 257
},
{
"epoch": 0.703956343792633,
"grad_norm": 3.3270514011383057,
"learning_rate": 1.891295059219472e-05,
"loss": 0.2325,
"step": 258
},
{
"epoch": 0.7066848567530696,
"grad_norm": 3.4230055809020996,
"learning_rate": 1.88984878171489e-05,
"loss": 0.2363,
"step": 259
},
{
"epoch": 0.7094133697135061,
"grad_norm": 3.38492751121521,
"learning_rate": 1.888393507127856e-05,
"loss": 0.2186,
"step": 260
},
{
"epoch": 0.7121418826739427,
"grad_norm": 3.371267080307007,
"learning_rate": 1.8869292501723602e-05,
"loss": 0.2287,
"step": 261
},
{
"epoch": 0.7148703956343793,
"grad_norm": 3.399115800857544,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.2267,
"step": 262
},
{
"epoch": 0.7175989085948158,
"grad_norm": 3.724271774291992,
"learning_rate": 1.8839738484658835e-05,
"loss": 0.2264,
"step": 263
},
{
"epoch": 0.7203274215552524,
"grad_norm": 3.2149219512939453,
"learning_rate": 1.8824827335963767e-05,
"loss": 0.2331,
"step": 264
},
{
"epoch": 0.723055934515689,
"grad_norm": 3.3156564235687256,
"learning_rate": 1.8809826961210527e-05,
"loss": 0.2304,
"step": 265
},
{
"epoch": 0.7257844474761255,
"grad_norm": 2.95145845413208,
"learning_rate": 1.879473751206489e-05,
"loss": 0.228,
"step": 266
},
{
"epoch": 0.7285129604365621,
"grad_norm": 4.918845176696777,
"learning_rate": 1.8779559141093256e-05,
"loss": 0.2338,
"step": 267
},
{
"epoch": 0.7312414733969986,
"grad_norm": 4.510799884796143,
"learning_rate": 1.876429200176108e-05,
"loss": 0.2257,
"step": 268
},
{
"epoch": 0.7339699863574352,
"grad_norm": 2.117758274078369,
"learning_rate": 1.8748936248431353e-05,
"loss": 0.2303,
"step": 269
},
{
"epoch": 0.7366984993178718,
"grad_norm": 2.579289197921753,
"learning_rate": 1.8733492036363007e-05,
"loss": 0.2362,
"step": 270
},
{
"epoch": 0.7394270122783083,
"grad_norm": 2.8581790924072266,
"learning_rate": 1.871795952170937e-05,
"loss": 0.2297,
"step": 271
},
{
"epoch": 0.7421555252387448,
"grad_norm": 1.994896650314331,
"learning_rate": 1.8702338861516587e-05,
"loss": 0.2326,
"step": 272
},
{
"epoch": 0.7448840381991815,
"grad_norm": 3.054471492767334,
"learning_rate": 1.8686630213722015e-05,
"loss": 0.2257,
"step": 273
},
{
"epoch": 0.747612551159618,
"grad_norm": 2.0378928184509277,
"learning_rate": 1.867083373715264e-05,
"loss": 0.2278,
"step": 274
},
{
"epoch": 0.7503410641200545,
"grad_norm": 4.272954940795898,
"learning_rate": 1.8654949591523467e-05,
"loss": 0.2295,
"step": 275
},
{
"epoch": 0.7530695770804912,
"grad_norm": 3.0166015625,
"learning_rate": 1.86389779374359e-05,
"loss": 0.2276,
"step": 276
},
{
"epoch": 0.7557980900409277,
"grad_norm": 2.997028350830078,
"learning_rate": 1.8622918936376133e-05,
"loss": 0.227,
"step": 277
},
{
"epoch": 0.7585266030013642,
"grad_norm": 3.245464563369751,
"learning_rate": 1.8606772750713503e-05,
"loss": 0.2324,
"step": 278
},
{
"epoch": 0.7612551159618008,
"grad_norm": 3.697631597518921,
"learning_rate": 1.8590539543698852e-05,
"loss": 0.2284,
"step": 279
},
{
"epoch": 0.7639836289222374,
"grad_norm": 3.2018115520477295,
"learning_rate": 1.857421947946288e-05,
"loss": 0.2317,
"step": 280
},
{
"epoch": 0.7667121418826739,
"grad_norm": 2.5883381366729736,
"learning_rate": 1.8557812723014476e-05,
"loss": 0.2286,
"step": 281
},
{
"epoch": 0.7694406548431105,
"grad_norm": 2.716522216796875,
"learning_rate": 1.8541319440239066e-05,
"loss": 0.2318,
"step": 282
},
{
"epoch": 0.772169167803547,
"grad_norm": 1.4050514698028564,
"learning_rate": 1.8524739797896924e-05,
"loss": 0.2274,
"step": 283
},
{
"epoch": 0.7748976807639836,
"grad_norm": 3.013352394104004,
"learning_rate": 1.8508073963621482e-05,
"loss": 0.2234,
"step": 284
},
{
"epoch": 0.7776261937244202,
"grad_norm": 1.8967962265014648,
"learning_rate": 1.8491322105917645e-05,
"loss": 0.2283,
"step": 285
},
{
"epoch": 0.7803547066848567,
"grad_norm": 5.349843978881836,
"learning_rate": 1.847448439416009e-05,
"loss": 0.2304,
"step": 286
},
{
"epoch": 0.7830832196452933,
"grad_norm": 3.769728422164917,
"learning_rate": 1.845756099859154e-05,
"loss": 0.235,
"step": 287
},
{
"epoch": 0.7858117326057299,
"grad_norm": 2.657892942428589,
"learning_rate": 1.8440552090321047e-05,
"loss": 0.2328,
"step": 288
},
{
"epoch": 0.7885402455661664,
"grad_norm": 2.9830169677734375,
"learning_rate": 1.842345784132227e-05,
"loss": 0.2344,
"step": 289
},
{
"epoch": 0.791268758526603,
"grad_norm": 3.0973758697509766,
"learning_rate": 1.8406278424431737e-05,
"loss": 0.2361,
"step": 290
},
{
"epoch": 0.7939972714870396,
"grad_norm": 2.921966791152954,
"learning_rate": 1.838901401334708e-05,
"loss": 0.236,
"step": 291
},
{
"epoch": 0.7967257844474761,
"grad_norm": 2.3755767345428467,
"learning_rate": 1.8371664782625287e-05,
"loss": 0.232,
"step": 292
},
{
"epoch": 0.7994542974079127,
"grad_norm": 2.2962570190429688,
"learning_rate": 1.835423090768096e-05,
"loss": 0.2279,
"step": 293
},
{
"epoch": 0.8021828103683493,
"grad_norm": 2.22548246383667,
"learning_rate": 1.8336712564784506e-05,
"loss": 0.2356,
"step": 294
},
{
"epoch": 0.8049113233287858,
"grad_norm": 2.092400074005127,
"learning_rate": 1.8319109931060367e-05,
"loss": 0.2333,
"step": 295
},
{
"epoch": 0.8076398362892224,
"grad_norm": 1.5649902820587158,
"learning_rate": 1.8301423184485253e-05,
"loss": 0.2291,
"step": 296
},
{
"epoch": 0.810368349249659,
"grad_norm": 3.5106163024902344,
"learning_rate": 1.82836525038863e-05,
"loss": 0.2312,
"step": 297
},
{
"epoch": 0.8130968622100955,
"grad_norm": 2.0999836921691895,
"learning_rate": 1.8265798068939295e-05,
"loss": 0.2274,
"step": 298
},
{
"epoch": 0.8158253751705321,
"grad_norm": 2.878789186477661,
"learning_rate": 1.824786006016685e-05,
"loss": 0.2202,
"step": 299
},
{
"epoch": 0.8185538881309686,
"grad_norm": 2.4010396003723145,
"learning_rate": 1.8229838658936566e-05,
"loss": 0.2348,
"step": 300
},
{
"epoch": 0.8212824010914052,
"grad_norm": 3.518007278442383,
"learning_rate": 1.821173404745922e-05,
"loss": 0.2347,
"step": 301
},
{
"epoch": 0.8240109140518418,
"grad_norm": 2.7652671337127686,
"learning_rate": 1.81935464087869e-05,
"loss": 0.2309,
"step": 302
},
{
"epoch": 0.8267394270122783,
"grad_norm": 2.9426369667053223,
"learning_rate": 1.8175275926811173e-05,
"loss": 0.2313,
"step": 303
},
{
"epoch": 0.8294679399727148,
"grad_norm": 2.348344326019287,
"learning_rate": 1.815692278626122e-05,
"loss": 0.2321,
"step": 304
},
{
"epoch": 0.8321964529331515,
"grad_norm": 2.599593162536621,
"learning_rate": 1.813848717270195e-05,
"loss": 0.2312,
"step": 305
},
{
"epoch": 0.834924965893588,
"grad_norm": 1.9701220989227295,
"learning_rate": 1.8119969272532164e-05,
"loss": 0.2301,
"step": 306
},
{
"epoch": 0.8376534788540245,
"grad_norm": 1.4868361949920654,
"learning_rate": 1.8101369272982633e-05,
"loss": 0.2269,
"step": 307
},
{
"epoch": 0.8403819918144612,
"grad_norm": 2.4426674842834473,
"learning_rate": 1.808268736211421e-05,
"loss": 0.2313,
"step": 308
},
{
"epoch": 0.8431105047748977,
"grad_norm": 1.3095004558563232,
"learning_rate": 1.806392372881596e-05,
"loss": 0.2309,
"step": 309
},
{
"epoch": 0.8458390177353342,
"grad_norm": 2.470839738845825,
"learning_rate": 1.8045078562803203e-05,
"loss": 0.2347,
"step": 310
},
{
"epoch": 0.8485675306957708,
"grad_norm": 2.4233086109161377,
"learning_rate": 1.8026152054615633e-05,
"loss": 0.2305,
"step": 311
},
{
"epoch": 0.8512960436562074,
"grad_norm": 1.4290592670440674,
"learning_rate": 1.800714439561538e-05,
"loss": 0.2353,
"step": 312
},
{
"epoch": 0.8540245566166439,
"grad_norm": 1.4730360507965088,
"learning_rate": 1.7988055777985066e-05,
"loss": 0.2284,
"step": 313
},
{
"epoch": 0.8567530695770805,
"grad_norm": 1.795615315437317,
"learning_rate": 1.7968886394725876e-05,
"loss": 0.2258,
"step": 314
},
{
"epoch": 0.859481582537517,
"grad_norm": 1.8772125244140625,
"learning_rate": 1.7949636439655592e-05,
"loss": 0.2246,
"step": 315
},
{
"epoch": 0.8622100954979536,
"grad_norm": 1.611531376838684,
"learning_rate": 1.793030610740665e-05,
"loss": 0.2355,
"step": 316
},
{
"epoch": 0.8649386084583902,
"grad_norm": 1.9092748165130615,
"learning_rate": 1.7910895593424166e-05,
"loss": 0.2287,
"step": 317
},
{
"epoch": 0.8676671214188267,
"grad_norm": 1.8194537162780762,
"learning_rate": 1.789140509396394e-05,
"loss": 0.2302,
"step": 318
},
{
"epoch": 0.8703956343792633,
"grad_norm": 3.0680384635925293,
"learning_rate": 1.7871834806090502e-05,
"loss": 0.2357,
"step": 319
},
{
"epoch": 0.8731241473396999,
"grad_norm": 1.7811588048934937,
"learning_rate": 1.7852184927675113e-05,
"loss": 0.2322,
"step": 320
},
{
"epoch": 0.8758526603001364,
"grad_norm": 2.6121044158935547,
"learning_rate": 1.7832455657393745e-05,
"loss": 0.2408,
"step": 321
},
{
"epoch": 0.878581173260573,
"grad_norm": 2.011620044708252,
"learning_rate": 1.7812647194725093e-05,
"loss": 0.2383,
"step": 322
},
{
"epoch": 0.8813096862210096,
"grad_norm": 2.1114919185638428,
"learning_rate": 1.7792759739948546e-05,
"loss": 0.2349,
"step": 323
},
{
"epoch": 0.8840381991814461,
"grad_norm": 2.098647117614746,
"learning_rate": 1.777279349414217e-05,
"loss": 0.2359,
"step": 324
},
{
"epoch": 0.8867667121418826,
"grad_norm": 2.1539065837860107,
"learning_rate": 1.7752748659180662e-05,
"loss": 0.2375,
"step": 325
},
{
"epoch": 0.8894952251023193,
"grad_norm": 1.4913593530654907,
"learning_rate": 1.7732625437733338e-05,
"loss": 0.23,
"step": 326
},
{
"epoch": 0.8922237380627558,
"grad_norm": 2.4666330814361572,
"learning_rate": 1.771242403326204e-05,
"loss": 0.2284,
"step": 327
},
{
"epoch": 0.8949522510231923,
"grad_norm": 2.010523796081543,
"learning_rate": 1.7692144650019125e-05,
"loss": 0.2322,
"step": 328
},
{
"epoch": 0.897680763983629,
"grad_norm": 2.6557087898254395,
"learning_rate": 1.767178749304536e-05,
"loss": 0.2346,
"step": 329
},
{
"epoch": 0.9004092769440655,
"grad_norm": 2.222409725189209,
"learning_rate": 1.765135276816787e-05,
"loss": 0.2422,
"step": 330
},
{
"epoch": 0.903137789904502,
"grad_norm": 2.119662284851074,
"learning_rate": 1.7630840681998068e-05,
"loss": 0.2309,
"step": 331
},
{
"epoch": 0.9058663028649386,
"grad_norm": 2.8383290767669678,
"learning_rate": 1.7610251441929532e-05,
"loss": 0.2387,
"step": 332
},
{
"epoch": 0.9085948158253752,
"grad_norm": 1.5416494607925415,
"learning_rate": 1.758958525613594e-05,
"loss": 0.2316,
"step": 333
},
{
"epoch": 0.9113233287858117,
"grad_norm": 3.2457337379455566,
"learning_rate": 1.7568842333568952e-05,
"loss": 0.2318,
"step": 334
},
{
"epoch": 0.9140518417462483,
"grad_norm": 2.664722442626953,
"learning_rate": 1.754802288395609e-05,
"loss": 0.2369,
"step": 335
},
{
"epoch": 0.9167803547066848,
"grad_norm": 3.379852294921875,
"learning_rate": 1.7527127117798635e-05,
"loss": 0.2386,
"step": 336
},
{
"epoch": 0.9195088676671214,
"grad_norm": 2.4600136280059814,
"learning_rate": 1.750615524636948e-05,
"loss": 0.242,
"step": 337
},
{
"epoch": 0.922237380627558,
"grad_norm": 3.1573469638824463,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.2283,
"step": 338
},
{
"epoch": 0.9249658935879945,
"grad_norm": 2.4754116535186768,
"learning_rate": 1.7463984036632956e-05,
"loss": 0.2298,
"step": 339
},
{
"epoch": 0.927694406548431,
"grad_norm": 4.697962760925293,
"learning_rate": 1.7442785124710227e-05,
"loss": 0.2349,
"step": 340
},
{
"epoch": 0.9304229195088677,
"grad_norm": 4.781267166137695,
"learning_rate": 1.742151096028076e-05,
"loss": 0.2323,
"step": 341
},
{
"epoch": 0.9331514324693042,
"grad_norm": 1.7259125709533691,
"learning_rate": 1.7400161758443377e-05,
"loss": 0.2335,
"step": 342
},
{
"epoch": 0.9358799454297408,
"grad_norm": 3.0204477310180664,
"learning_rate": 1.7378737735055562e-05,
"loss": 0.2313,
"step": 343
},
{
"epoch": 0.9386084583901774,
"grad_norm": 2.2070934772491455,
"learning_rate": 1.735723910673132e-05,
"loss": 0.2284,
"step": 344
},
{
"epoch": 0.9413369713506139,
"grad_norm": 1.3308905363082886,
"learning_rate": 1.7335666090838965e-05,
"loss": 0.2295,
"step": 345
},
{
"epoch": 0.9440654843110505,
"grad_norm": 2.530228853225708,
"learning_rate": 1.7314018905498932e-05,
"loss": 0.2233,
"step": 346
},
{
"epoch": 0.946793997271487,
"grad_norm": 1.7885509729385376,
"learning_rate": 1.729229776958157e-05,
"loss": 0.2296,
"step": 347
},
{
"epoch": 0.9495225102319236,
"grad_norm": 3.9306282997131348,
"learning_rate": 1.7270502902704925e-05,
"loss": 0.2308,
"step": 348
},
{
"epoch": 0.9522510231923602,
"grad_norm": 2.548013210296631,
"learning_rate": 1.7248634525232523e-05,
"loss": 0.2327,
"step": 349
},
{
"epoch": 0.9549795361527967,
"grad_norm": 2.808361291885376,
"learning_rate": 1.7226692858271133e-05,
"loss": 0.2297,
"step": 350
},
{
"epoch": 0.9577080491132333,
"grad_norm": 2.629054069519043,
"learning_rate": 1.7204678123668556e-05,
"loss": 0.2215,
"step": 351
},
{
"epoch": 0.9604365620736699,
"grad_norm": 3.761340379714966,
"learning_rate": 1.718259054401135e-05,
"loss": 0.2293,
"step": 352
},
{
"epoch": 0.9631650750341064,
"grad_norm": 3.36629581451416,
"learning_rate": 1.71604303426226e-05,
"loss": 0.2269,
"step": 353
},
{
"epoch": 0.965893587994543,
"grad_norm": 3.0692992210388184,
"learning_rate": 1.7138197743559656e-05,
"loss": 0.2276,
"step": 354
},
{
"epoch": 0.9686221009549796,
"grad_norm": 2.2692136764526367,
"learning_rate": 1.7115892971611864e-05,
"loss": 0.2264,
"step": 355
},
{
"epoch": 0.9713506139154161,
"grad_norm": 3.0690739154815674,
"learning_rate": 1.7093516252298296e-05,
"loss": 0.2242,
"step": 356
},
{
"epoch": 0.9740791268758526,
"grad_norm": 2.2217397689819336,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.2202,
"step": 357
},
{
"epoch": 0.9768076398362893,
"grad_norm": 3.9510974884033203,
"learning_rate": 1.7048547877285078e-05,
"loss": 0.2206,
"step": 358
},
{
"epoch": 0.9795361527967258,
"grad_norm": 4.3253254890441895,
"learning_rate": 1.7025956676251636e-05,
"loss": 0.2294,
"step": 359
},
{
"epoch": 0.9822646657571623,
"grad_norm": 2.0930545330047607,
"learning_rate": 1.7003294437180254e-05,
"loss": 0.2228,
"step": 360
},
{
"epoch": 0.984993178717599,
"grad_norm": 2.5518953800201416,
"learning_rate": 1.6980561389204285e-05,
"loss": 0.2288,
"step": 361
},
{
"epoch": 0.9877216916780355,
"grad_norm": 1.6454943418502808,
"learning_rate": 1.695775776217301e-05,
"loss": 0.2165,
"step": 362
},
{
"epoch": 0.990450204638472,
"grad_norm": 1.9209353923797607,
"learning_rate": 1.6934883786649333e-05,
"loss": 0.2144,
"step": 363
},
{
"epoch": 0.9931787175989086,
"grad_norm": 2.2938175201416016,
"learning_rate": 1.6911939693907422e-05,
"loss": 0.2274,
"step": 364
},
{
"epoch": 0.9959072305593452,
"grad_norm": 2.0856289863586426,
"learning_rate": 1.6888925715930396e-05,
"loss": 0.2185,
"step": 365
},
{
"epoch": 0.9986357435197817,
"grad_norm": 3.4801177978515625,
"learning_rate": 1.686584208540797e-05,
"loss": 0.2224,
"step": 366
},
{
"epoch": 1.0013642564802183,
"grad_norm": 2.19926118850708,
"learning_rate": 1.68426890357341e-05,
"loss": 0.2037,
"step": 367
},
{
"epoch": 1.004092769440655,
"grad_norm": 2.824136972427368,
"learning_rate": 1.6819466801004622e-05,
"loss": 0.1922,
"step": 368
},
{
"epoch": 1.0068212824010914,
"grad_norm": 1.9411588907241821,
"learning_rate": 1.6796175616014894e-05,
"loss": 0.19,
"step": 369
},
{
"epoch": 1.009549795361528,
"grad_norm": 2.6989634037017822,
"learning_rate": 1.6772815716257414e-05,
"loss": 0.1906,
"step": 370
},
{
"epoch": 1.0122783083219646,
"grad_norm": 2.432159662246704,
"learning_rate": 1.6749387337919434e-05,
"loss": 0.1814,
"step": 371
},
{
"epoch": 1.015006821282401,
"grad_norm": 1.962867259979248,
"learning_rate": 1.672589071788059e-05,
"loss": 0.1855,
"step": 372
},
{
"epoch": 1.0177353342428377,
"grad_norm": 2.1539595127105713,
"learning_rate": 1.6702326093710493e-05,
"loss": 0.1831,
"step": 373
},
{
"epoch": 1.0204638472032743,
"grad_norm": 2.6385738849639893,
"learning_rate": 1.6678693703666327e-05,
"loss": 0.1873,
"step": 374
},
{
"epoch": 1.0231923601637107,
"grad_norm": 2.0655276775360107,
"learning_rate": 1.6654993786690445e-05,
"loss": 0.1843,
"step": 375
},
{
"epoch": 1.0259208731241474,
"grad_norm": 2.4418952465057373,
"learning_rate": 1.6631226582407954e-05,
"loss": 0.1855,
"step": 376
},
{
"epoch": 1.028649386084584,
"grad_norm": 2.3700835704803467,
"learning_rate": 1.6607392331124282e-05,
"loss": 0.1784,
"step": 377
},
{
"epoch": 1.0313778990450204,
"grad_norm": 2.478952646255493,
"learning_rate": 1.6583491273822763e-05,
"loss": 0.1858,
"step": 378
},
{
"epoch": 1.034106412005457,
"grad_norm": 2.238147020339966,
"learning_rate": 1.6559523652162192e-05,
"loss": 0.185,
"step": 379
},
{
"epoch": 1.0368349249658937,
"grad_norm": 2.578995943069458,
"learning_rate": 1.653548970847438e-05,
"loss": 0.1854,
"step": 380
},
{
"epoch": 1.03956343792633,
"grad_norm": 2.7153942584991455,
"learning_rate": 1.651138968576171e-05,
"loss": 0.1859,
"step": 381
},
{
"epoch": 1.0422919508867667,
"grad_norm": 1.802567720413208,
"learning_rate": 1.6487223827694673e-05,
"loss": 0.1844,
"step": 382
},
{
"epoch": 1.0450204638472034,
"grad_norm": 1.6883435249328613,
"learning_rate": 1.646299237860941e-05,
"loss": 0.1858,
"step": 383
},
{
"epoch": 1.0477489768076398,
"grad_norm": 2.673168897628784,
"learning_rate": 1.643869558350524e-05,
"loss": 0.1855,
"step": 384
},
{
"epoch": 1.0504774897680764,
"grad_norm": 2.126051187515259,
"learning_rate": 1.6414333688042186e-05,
"loss": 0.1875,
"step": 385
},
{
"epoch": 1.053206002728513,
"grad_norm": 2.813554048538208,
"learning_rate": 1.638990693853848e-05,
"loss": 0.1842,
"step": 386
},
{
"epoch": 1.0559345156889495,
"grad_norm": 2.9675509929656982,
"learning_rate": 1.6365415581968086e-05,
"loss": 0.1899,
"step": 387
},
{
"epoch": 1.058663028649386,
"grad_norm": 1.7976346015930176,
"learning_rate": 1.6340859865958193e-05,
"loss": 0.1865,
"step": 388
},
{
"epoch": 1.0613915416098227,
"grad_norm": 2.076399326324463,
"learning_rate": 1.631624003878672e-05,
"loss": 0.1848,
"step": 389
},
{
"epoch": 1.0641200545702592,
"grad_norm": 2.5090761184692383,
"learning_rate": 1.6291556349379794e-05,
"loss": 0.1858,
"step": 390
},
{
"epoch": 1.0668485675306958,
"grad_norm": 1.3025147914886475,
"learning_rate": 1.6266809047309253e-05,
"loss": 0.1876,
"step": 391
},
{
"epoch": 1.0695770804911324,
"grad_norm": 2.8731820583343506,
"learning_rate": 1.6241998382790095e-05,
"loss": 0.1841,
"step": 392
},
{
"epoch": 1.0723055934515688,
"grad_norm": 2.52224063873291,
"learning_rate": 1.6217124606677973e-05,
"loss": 0.1817,
"step": 393
},
{
"epoch": 1.0750341064120055,
"grad_norm": 2.2980759143829346,
"learning_rate": 1.6192187970466646e-05,
"loss": 0.1783,
"step": 394
},
{
"epoch": 1.077762619372442,
"grad_norm": 2.4828219413757324,
"learning_rate": 1.6167188726285433e-05,
"loss": 0.1835,
"step": 395
},
{
"epoch": 1.0804911323328785,
"grad_norm": 1.777030348777771,
"learning_rate": 1.6142127126896682e-05,
"loss": 0.1876,
"step": 396
},
{
"epoch": 1.0832196452933152,
"grad_norm": 1.924090027809143,
"learning_rate": 1.611700342569319e-05,
"loss": 0.1844,
"step": 397
},
{
"epoch": 1.0859481582537518,
"grad_norm": 2.529407501220703,
"learning_rate": 1.6091817876695655e-05,
"loss": 0.184,
"step": 398
},
{
"epoch": 1.0886766712141882,
"grad_norm": 2.5993223190307617,
"learning_rate": 1.606657073455012e-05,
"loss": 0.1854,
"step": 399
},
{
"epoch": 1.0914051841746248,
"grad_norm": 2.47813081741333,
"learning_rate": 1.6041262254525362e-05,
"loss": 0.1913,
"step": 400
},
{
"epoch": 1.0941336971350615,
"grad_norm": 1.9955610036849976,
"learning_rate": 1.601589269251035e-05,
"loss": 0.1836,
"step": 401
},
{
"epoch": 1.096862210095498,
"grad_norm": 2.6094555854797363,
"learning_rate": 1.599046230501163e-05,
"loss": 0.1886,
"step": 402
},
{
"epoch": 1.0995907230559345,
"grad_norm": 1.9604135751724243,
"learning_rate": 1.5964971349150746e-05,
"loss": 0.1851,
"step": 403
},
{
"epoch": 1.1023192360163712,
"grad_norm": 2.7384026050567627,
"learning_rate": 1.593942008266164e-05,
"loss": 0.1861,
"step": 404
},
{
"epoch": 1.1050477489768076,
"grad_norm": 3.150024890899658,
"learning_rate": 1.591380876388804e-05,
"loss": 0.1898,
"step": 405
},
{
"epoch": 1.1077762619372442,
"grad_norm": 1.3892546892166138,
"learning_rate": 1.5888137651780847e-05,
"loss": 0.1837,
"step": 406
},
{
"epoch": 1.1105047748976808,
"grad_norm": 2.159607410430908,
"learning_rate": 1.5862407005895524e-05,
"loss": 0.1866,
"step": 407
},
{
"epoch": 1.1132332878581173,
"grad_norm": 1.919472336769104,
"learning_rate": 1.583661708638947e-05,
"loss": 0.1885,
"step": 408
},
{
"epoch": 1.115961800818554,
"grad_norm": 1.2415019273757935,
"learning_rate": 1.5810768154019386e-05,
"loss": 0.1855,
"step": 409
},
{
"epoch": 1.1186903137789905,
"grad_norm": 1.8879867792129517,
"learning_rate": 1.5784860470138633e-05,
"loss": 0.1873,
"step": 410
},
{
"epoch": 1.121418826739427,
"grad_norm": 1.4997961521148682,
"learning_rate": 1.5758894296694614e-05,
"loss": 0.1871,
"step": 411
},
{
"epoch": 1.1241473396998636,
"grad_norm": 2.5156009197235107,
"learning_rate": 1.573286989622609e-05,
"loss": 0.183,
"step": 412
},
{
"epoch": 1.1268758526603002,
"grad_norm": 1.9973162412643433,
"learning_rate": 1.5706787531860557e-05,
"loss": 0.1795,
"step": 413
},
{
"epoch": 1.1296043656207366,
"grad_norm": 2.4125733375549316,
"learning_rate": 1.568064746731156e-05,
"loss": 0.1903,
"step": 414
},
{
"epoch": 1.1323328785811733,
"grad_norm": 3.0142152309417725,
"learning_rate": 1.565444996687605e-05,
"loss": 0.1818,
"step": 415
},
{
"epoch": 1.13506139154161,
"grad_norm": 1.368192434310913,
"learning_rate": 1.5628195295431696e-05,
"loss": 0.1858,
"step": 416
},
{
"epoch": 1.1377899045020463,
"grad_norm": 2.020052433013916,
"learning_rate": 1.5601883718434207e-05,
"loss": 0.1912,
"step": 417
},
{
"epoch": 1.140518417462483,
"grad_norm": 1.513268232345581,
"learning_rate": 1.557551550191467e-05,
"loss": 0.1843,
"step": 418
},
{
"epoch": 1.1432469304229196,
"grad_norm": 0.989016592502594,
"learning_rate": 1.554909091247682e-05,
"loss": 0.1802,
"step": 419
},
{
"epoch": 1.145975443383356,
"grad_norm": 1.345131754875183,
"learning_rate": 1.5522610217294377e-05,
"loss": 0.187,
"step": 420
},
{
"epoch": 1.1487039563437926,
"grad_norm": 1.432015061378479,
"learning_rate": 1.549607368410834e-05,
"loss": 0.1799,
"step": 421
},
{
"epoch": 1.1514324693042293,
"grad_norm": 1.2019281387329102,
"learning_rate": 1.5469481581224274e-05,
"loss": 0.1881,
"step": 422
},
{
"epoch": 1.1541609822646657,
"grad_norm": 1.5989677906036377,
"learning_rate": 1.544283417750958e-05,
"loss": 0.1875,
"step": 423
},
{
"epoch": 1.1568894952251023,
"grad_norm": 1.4773874282836914,
"learning_rate": 1.5416131742390827e-05,
"loss": 0.1861,
"step": 424
},
{
"epoch": 1.159618008185539,
"grad_norm": 1.039644479751587,
"learning_rate": 1.5389374545850973e-05,
"loss": 0.191,
"step": 425
},
{
"epoch": 1.1623465211459754,
"grad_norm": 1.808279275894165,
"learning_rate": 1.5362562858426655e-05,
"loss": 0.1894,
"step": 426
},
{
"epoch": 1.165075034106412,
"grad_norm": 1.3765541315078735,
"learning_rate": 1.533569695120547e-05,
"loss": 0.1967,
"step": 427
},
{
"epoch": 1.1678035470668486,
"grad_norm": 1.5166419744491577,
"learning_rate": 1.530877709582321e-05,
"loss": 0.1916,
"step": 428
},
{
"epoch": 1.170532060027285,
"grad_norm": 2.17692232131958,
"learning_rate": 1.5281803564461135e-05,
"loss": 0.1911,
"step": 429
},
{
"epoch": 1.1732605729877217,
"grad_norm": 1.4779309034347534,
"learning_rate": 1.5254776629843204e-05,
"loss": 0.192,
"step": 430
},
{
"epoch": 1.1759890859481583,
"grad_norm": 2.124163866043091,
"learning_rate": 1.522769656523333e-05,
"loss": 0.1899,
"step": 431
},
{
"epoch": 1.1787175989085947,
"grad_norm": 1.3838238716125488,
"learning_rate": 1.5200563644432614e-05,
"loss": 0.1896,
"step": 432
},
{
"epoch": 1.1814461118690314,
"grad_norm": 1.1780169010162354,
"learning_rate": 1.5173378141776569e-05,
"loss": 0.1963,
"step": 433
},
{
"epoch": 1.184174624829468,
"grad_norm": 0.9128435850143433,
"learning_rate": 1.5146140332132359e-05,
"loss": 0.1831,
"step": 434
},
{
"epoch": 1.1869031377899044,
"grad_norm": 1.093888521194458,
"learning_rate": 1.5118850490896012e-05,
"loss": 0.1902,
"step": 435
},
{
"epoch": 1.189631650750341,
"grad_norm": 1.3495734930038452,
"learning_rate": 1.5091508893989633e-05,
"loss": 0.1886,
"step": 436
},
{
"epoch": 1.1923601637107777,
"grad_norm": 2.244813919067383,
"learning_rate": 1.5064115817858622e-05,
"loss": 0.195,
"step": 437
},
{
"epoch": 1.195088676671214,
"grad_norm": 1.4227724075317383,
"learning_rate": 1.5036671539468879e-05,
"loss": 0.1931,
"step": 438
},
{
"epoch": 1.1978171896316507,
"grad_norm": 2.971980333328247,
"learning_rate": 1.5009176336303987e-05,
"loss": 0.1945,
"step": 439
},
{
"epoch": 1.2005457025920874,
"grad_norm": 2.9744460582733154,
"learning_rate": 1.4981630486362435e-05,
"loss": 0.1886,
"step": 440
},
{
"epoch": 1.2032742155525238,
"grad_norm": 2.5313005447387695,
"learning_rate": 1.4954034268154777e-05,
"loss": 0.1961,
"step": 441
},
{
"epoch": 1.2060027285129604,
"grad_norm": 2.4875779151916504,
"learning_rate": 1.4926387960700843e-05,
"loss": 0.1964,
"step": 442
},
{
"epoch": 1.208731241473397,
"grad_norm": 1.7058812379837036,
"learning_rate": 1.4898691843526897e-05,
"loss": 0.1937,
"step": 443
},
{
"epoch": 1.2114597544338335,
"grad_norm": 2.4829766750335693,
"learning_rate": 1.4870946196662822e-05,
"loss": 0.1988,
"step": 444
},
{
"epoch": 1.21418826739427,
"grad_norm": 1.777729868888855,
"learning_rate": 1.4843151300639282e-05,
"loss": 0.2021,
"step": 445
},
{
"epoch": 1.2169167803547067,
"grad_norm": 1.580891728401184,
"learning_rate": 1.4815307436484898e-05,
"loss": 0.197,
"step": 446
},
{
"epoch": 1.2196452933151432,
"grad_norm": 1.710062861442566,
"learning_rate": 1.4787414885723386e-05,
"loss": 0.1934,
"step": 447
},
{
"epoch": 1.2223738062755798,
"grad_norm": 1.6073284149169922,
"learning_rate": 1.4759473930370738e-05,
"loss": 0.1927,
"step": 448
},
{
"epoch": 1.2251023192360164,
"grad_norm": 1.4678465127944946,
"learning_rate": 1.4731484852932338e-05,
"loss": 0.1946,
"step": 449
},
{
"epoch": 1.2278308321964528,
"grad_norm": 1.3270896673202515,
"learning_rate": 1.4703447936400135e-05,
"loss": 0.1877,
"step": 450
},
{
"epoch": 1.2305593451568895,
"grad_norm": 2.5790812969207764,
"learning_rate": 1.4675363464249763e-05,
"loss": 0.195,
"step": 451
},
{
"epoch": 1.233287858117326,
"grad_norm": 1.3712650537490845,
"learning_rate": 1.4647231720437687e-05,
"loss": 0.194,
"step": 452
},
{
"epoch": 1.2360163710777625,
"grad_norm": 3.243920087814331,
"learning_rate": 1.461905298939832e-05,
"loss": 0.1874,
"step": 453
},
{
"epoch": 1.2387448840381992,
"grad_norm": 2.803710460662842,
"learning_rate": 1.4590827556041158e-05,
"loss": 0.1948,
"step": 454
},
{
"epoch": 1.2414733969986358,
"grad_norm": 2.0171213150024414,
"learning_rate": 1.4562555705747894e-05,
"loss": 0.1946,
"step": 455
},
{
"epoch": 1.2442019099590724,
"grad_norm": 1.85854172706604,
"learning_rate": 1.4534237724369534e-05,
"loss": 0.192,
"step": 456
},
{
"epoch": 1.2469304229195088,
"grad_norm": 1.7220263481140137,
"learning_rate": 1.4505873898223498e-05,
"loss": 0.1926,
"step": 457
},
{
"epoch": 1.2496589358799455,
"grad_norm": 2.4301505088806152,
"learning_rate": 1.4477464514090745e-05,
"loss": 0.1874,
"step": 458
},
{
"epoch": 1.252387448840382,
"grad_norm": 1.6035195589065552,
"learning_rate": 1.4449009859212857e-05,
"loss": 0.1961,
"step": 459
},
{
"epoch": 1.2551159618008185,
"grad_norm": 1.4066811800003052,
"learning_rate": 1.4420510221289137e-05,
"loss": 0.1905,
"step": 460
},
{
"epoch": 1.2578444747612552,
"grad_norm": 1.3481764793395996,
"learning_rate": 1.4391965888473705e-05,
"loss": 0.1869,
"step": 461
},
{
"epoch": 1.2605729877216918,
"grad_norm": 1.235824704170227,
"learning_rate": 1.4363377149372584e-05,
"loss": 0.1894,
"step": 462
},
{
"epoch": 1.2633015006821282,
"grad_norm": 2.259446144104004,
"learning_rate": 1.4334744293040773e-05,
"loss": 0.1935,
"step": 463
},
{
"epoch": 1.2660300136425648,
"grad_norm": 1.5957938432693481,
"learning_rate": 1.430606760897934e-05,
"loss": 0.1947,
"step": 464
},
{
"epoch": 1.2687585266030013,
"grad_norm": 1.1070665121078491,
"learning_rate": 1.4277347387132482e-05,
"loss": 0.1793,
"step": 465
},
{
"epoch": 1.271487039563438,
"grad_norm": 1.6437128782272339,
"learning_rate": 1.4248583917884595e-05,
"loss": 0.1883,
"step": 466
},
{
"epoch": 1.2742155525238745,
"grad_norm": 1.2046825885772705,
"learning_rate": 1.4219777492057349e-05,
"loss": 0.1862,
"step": 467
},
{
"epoch": 1.2769440654843112,
"grad_norm": 1.8802250623703003,
"learning_rate": 1.4190928400906731e-05,
"loss": 0.1845,
"step": 468
},
{
"epoch": 1.2796725784447476,
"grad_norm": 1.2976617813110352,
"learning_rate": 1.4162036936120115e-05,
"loss": 0.1942,
"step": 469
},
{
"epoch": 1.2824010914051842,
"grad_norm": 2.1956803798675537,
"learning_rate": 1.4133103389813302e-05,
"loss": 0.1935,
"step": 470
},
{
"epoch": 1.2851296043656206,
"grad_norm": 1.6689682006835938,
"learning_rate": 1.410412805452757e-05,
"loss": 0.191,
"step": 471
},
{
"epoch": 1.2878581173260573,
"grad_norm": 2.384645938873291,
"learning_rate": 1.4075111223226721e-05,
"loss": 0.1898,
"step": 472
},
{
"epoch": 1.290586630286494,
"grad_norm": 1.9162484407424927,
"learning_rate": 1.4046053189294114e-05,
"loss": 0.1881,
"step": 473
},
{
"epoch": 1.2933151432469305,
"grad_norm": 2.4748995304107666,
"learning_rate": 1.4016954246529697e-05,
"loss": 0.1876,
"step": 474
},
{
"epoch": 1.296043656207367,
"grad_norm": 1.9592957496643066,
"learning_rate": 1.3987814689147041e-05,
"loss": 0.1958,
"step": 475
},
{
"epoch": 1.2987721691678036,
"grad_norm": 2.6773533821105957,
"learning_rate": 1.3958634811770361e-05,
"loss": 0.1967,
"step": 476
},
{
"epoch": 1.30150068212824,
"grad_norm": 2.139191150665283,
"learning_rate": 1.3929414909431544e-05,
"loss": 0.1917,
"step": 477
},
{
"epoch": 1.3042291950886766,
"grad_norm": 1.8918670415878296,
"learning_rate": 1.3900155277567157e-05,
"loss": 0.1923,
"step": 478
},
{
"epoch": 1.3069577080491133,
"grad_norm": 1.8260009288787842,
"learning_rate": 1.3870856212015468e-05,
"loss": 0.1912,
"step": 479
},
{
"epoch": 1.30968622100955,
"grad_norm": 1.858114242553711,
"learning_rate": 1.3841518009013446e-05,
"loss": 0.1912,
"step": 480
},
{
"epoch": 1.3124147339699863,
"grad_norm": 1.8641937971115112,
"learning_rate": 1.3812140965193775e-05,
"loss": 0.1906,
"step": 481
},
{
"epoch": 1.315143246930423,
"grad_norm": 2.4870729446411133,
"learning_rate": 1.378272537758185e-05,
"loss": 0.1901,
"step": 482
},
{
"epoch": 1.3178717598908594,
"grad_norm": 1.7273850440979004,
"learning_rate": 1.3753271543592772e-05,
"loss": 0.1898,
"step": 483
},
{
"epoch": 1.320600272851296,
"grad_norm": 1.6276838779449463,
"learning_rate": 1.3723779761028349e-05,
"loss": 0.1851,
"step": 484
},
{
"epoch": 1.3233287858117326,
"grad_norm": 1.5349172353744507,
"learning_rate": 1.3694250328074072e-05,
"loss": 0.19,
"step": 485
},
{
"epoch": 1.3260572987721693,
"grad_norm": 1.9674981832504272,
"learning_rate": 1.3664683543296114e-05,
"loss": 0.1858,
"step": 486
},
{
"epoch": 1.3287858117326057,
"grad_norm": 1.7096377611160278,
"learning_rate": 1.3635079705638298e-05,
"loss": 0.1853,
"step": 487
},
{
"epoch": 1.3315143246930423,
"grad_norm": 2.1643528938293457,
"learning_rate": 1.3605439114419095e-05,
"loss": 0.1803,
"step": 488
},
{
"epoch": 1.3342428376534787,
"grad_norm": 2.1609997749328613,
"learning_rate": 1.3575762069328567e-05,
"loss": 0.1888,
"step": 489
},
{
"epoch": 1.3369713506139154,
"grad_norm": 2.171285390853882,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.1887,
"step": 490
},
{
"epoch": 1.339699863574352,
"grad_norm": 1.9047596454620361,
"learning_rate": 1.3516299818133664e-05,
"loss": 0.1844,
"step": 491
},
{
"epoch": 1.3424283765347886,
"grad_norm": 1.5691167116165161,
"learning_rate": 1.3486515213240188e-05,
"loss": 0.1889,
"step": 492
},
{
"epoch": 1.345156889495225,
"grad_norm": 1.809037208557129,
"learning_rate": 1.3456695356891079e-05,
"loss": 0.1868,
"step": 493
},
{
"epoch": 1.3478854024556617,
"grad_norm": 1.375952959060669,
"learning_rate": 1.3426840550588933e-05,
"loss": 0.1857,
"step": 494
},
{
"epoch": 1.350613915416098,
"grad_norm": 1.5163205862045288,
"learning_rate": 1.33969510961897e-05,
"loss": 0.1853,
"step": 495
},
{
"epoch": 1.3533424283765347,
"grad_norm": 1.8282253742218018,
"learning_rate": 1.3367027295899652e-05,
"loss": 0.1852,
"step": 496
},
{
"epoch": 1.3560709413369714,
"grad_norm": 1.6411131620407104,
"learning_rate": 1.3337069452272332e-05,
"loss": 0.185,
"step": 497
},
{
"epoch": 1.358799454297408,
"grad_norm": 1.8116145133972168,
"learning_rate": 1.3307077868205487e-05,
"loss": 0.1888,
"step": 498
},
{
"epoch": 1.3615279672578444,
"grad_norm": 1.7081019878387451,
"learning_rate": 1.3277052846937997e-05,
"loss": 0.1922,
"step": 499
},
{
"epoch": 1.364256480218281,
"grad_norm": 1.9334781169891357,
"learning_rate": 1.3246994692046837e-05,
"loss": 0.1837,
"step": 500
},
{
"epoch": 1.3669849931787175,
"grad_norm": 1.9487545490264893,
"learning_rate": 1.321690370744397e-05,
"loss": 0.183,
"step": 501
},
{
"epoch": 1.369713506139154,
"grad_norm": 1.4426416158676147,
"learning_rate": 1.3186780197373306e-05,
"loss": 0.1847,
"step": 502
},
{
"epoch": 1.3724420190995907,
"grad_norm": 1.153779149055481,
"learning_rate": 1.3156624466407607e-05,
"loss": 0.1818,
"step": 503
},
{
"epoch": 1.3751705320600274,
"grad_norm": 1.7035014629364014,
"learning_rate": 1.3126436819445423e-05,
"loss": 0.1889,
"step": 504
},
{
"epoch": 1.3778990450204638,
"grad_norm": 0.9309306740760803,
"learning_rate": 1.309621756170799e-05,
"loss": 0.1839,
"step": 505
},
{
"epoch": 1.3806275579809004,
"grad_norm": 1.7719669342041016,
"learning_rate": 1.3065966998736155e-05,
"loss": 0.1803,
"step": 506
},
{
"epoch": 1.3833560709413368,
"grad_norm": 1.2695763111114502,
"learning_rate": 1.3035685436387297e-05,
"loss": 0.1872,
"step": 507
},
{
"epoch": 1.3860845839017735,
"grad_norm": 1.6255990266799927,
"learning_rate": 1.300537318083221e-05,
"loss": 0.1815,
"step": 508
},
{
"epoch": 1.38881309686221,
"grad_norm": 1.109044075012207,
"learning_rate": 1.297503053855203e-05,
"loss": 0.1844,
"step": 509
},
{
"epoch": 1.3915416098226467,
"grad_norm": 1.4328869581222534,
"learning_rate": 1.2944657816335124e-05,
"loss": 0.1866,
"step": 510
},
{
"epoch": 1.3942701227830832,
"grad_norm": 1.5600212812423706,
"learning_rate": 1.2914255321273987e-05,
"loss": 0.1883,
"step": 511
},
{
"epoch": 1.3969986357435198,
"grad_norm": 1.095321774482727,
"learning_rate": 1.2883823360762149e-05,
"loss": 0.1875,
"step": 512
},
{
"epoch": 1.3997271487039564,
"grad_norm": 1.7407549619674683,
"learning_rate": 1.2853362242491054e-05,
"loss": 0.1819,
"step": 513
},
{
"epoch": 1.4024556616643928,
"grad_norm": 1.4621182680130005,
"learning_rate": 1.2822872274446958e-05,
"loss": 0.1869,
"step": 514
},
{
"epoch": 1.4051841746248295,
"grad_norm": 1.9369522333145142,
"learning_rate": 1.2792353764907803e-05,
"loss": 0.1879,
"step": 515
},
{
"epoch": 1.407912687585266,
"grad_norm": 1.6838245391845703,
"learning_rate": 1.276180702244012e-05,
"loss": 0.1916,
"step": 516
},
{
"epoch": 1.4106412005457025,
"grad_norm": 1.968902349472046,
"learning_rate": 1.273123235589589e-05,
"loss": 0.1865,
"step": 517
},
{
"epoch": 1.4133697135061392,
"grad_norm": 1.254606008529663,
"learning_rate": 1.2700630074409427e-05,
"loss": 0.1813,
"step": 518
},
{
"epoch": 1.4160982264665758,
"grad_norm": 1.8816652297973633,
"learning_rate": 1.2670000487394268e-05,
"loss": 0.1835,
"step": 519
},
{
"epoch": 1.4188267394270122,
"grad_norm": 1.258792757987976,
"learning_rate": 1.2639343904540008e-05,
"loss": 0.1821,
"step": 520
},
{
"epoch": 1.4215552523874488,
"grad_norm": 1.609707236289978,
"learning_rate": 1.260866063580921e-05,
"loss": 0.1875,
"step": 521
},
{
"epoch": 1.4242837653478855,
"grad_norm": 1.041623830795288,
"learning_rate": 1.2577950991434249e-05,
"loss": 0.1843,
"step": 522
},
{
"epoch": 1.427012278308322,
"grad_norm": 1.3459815979003906,
"learning_rate": 1.254721528191417e-05,
"loss": 0.1862,
"step": 523
},
{
"epoch": 1.4297407912687585,
"grad_norm": 1.082248330116272,
"learning_rate": 1.2516453818011567e-05,
"loss": 0.1862,
"step": 524
},
{
"epoch": 1.4324693042291952,
"grad_norm": 1.1448614597320557,
"learning_rate": 1.2485666910749427e-05,
"loss": 0.1854,
"step": 525
},
{
"epoch": 1.4351978171896316,
"grad_norm": 0.9976285099983215,
"learning_rate": 1.2454854871407993e-05,
"loss": 0.1905,
"step": 526
},
{
"epoch": 1.4379263301500682,
"grad_norm": 1.1316357851028442,
"learning_rate": 1.242401801152161e-05,
"loss": 0.1874,
"step": 527
},
{
"epoch": 1.4406548431105048,
"grad_norm": 1.0879647731781006,
"learning_rate": 1.2393156642875579e-05,
"loss": 0.188,
"step": 528
},
{
"epoch": 1.4433833560709413,
"grad_norm": 1.6636865139007568,
"learning_rate": 1.2362271077503007e-05,
"loss": 0.1877,
"step": 529
},
{
"epoch": 1.446111869031378,
"grad_norm": 1.0785863399505615,
"learning_rate": 1.2331361627681645e-05,
"loss": 0.1805,
"step": 530
},
{
"epoch": 1.4488403819918145,
"grad_norm": 1.1955676078796387,
"learning_rate": 1.2300428605930736e-05,
"loss": 0.186,
"step": 531
},
{
"epoch": 1.451568894952251,
"grad_norm": 1.0076512098312378,
"learning_rate": 1.2269472325007858e-05,
"loss": 0.1843,
"step": 532
},
{
"epoch": 1.4542974079126876,
"grad_norm": 0.9785951375961304,
"learning_rate": 1.2238493097905754e-05,
"loss": 0.1865,
"step": 533
},
{
"epoch": 1.4570259208731242,
"grad_norm": 1.2281177043914795,
"learning_rate": 1.2207491237849174e-05,
"loss": 0.1817,
"step": 534
},
{
"epoch": 1.4597544338335606,
"grad_norm": 1.263206958770752,
"learning_rate": 1.2176467058291699e-05,
"loss": 0.1816,
"step": 535
},
{
"epoch": 1.4624829467939973,
"grad_norm": 1.1939092874526978,
"learning_rate": 1.2145420872912586e-05,
"loss": 0.1842,
"step": 536
},
{
"epoch": 1.465211459754434,
"grad_norm": 1.162329912185669,
"learning_rate": 1.2114352995613582e-05,
"loss": 0.1878,
"step": 537
},
{
"epoch": 1.4679399727148703,
"grad_norm": 0.8624812960624695,
"learning_rate": 1.2083263740515764e-05,
"loss": 0.1839,
"step": 538
},
{
"epoch": 1.470668485675307,
"grad_norm": 1.0390948057174683,
"learning_rate": 1.2052153421956343e-05,
"loss": 0.189,
"step": 539
},
{
"epoch": 1.4733969986357436,
"grad_norm": 1.2229454517364502,
"learning_rate": 1.2021022354485514e-05,
"loss": 0.1896,
"step": 540
},
{
"epoch": 1.4761255115961802,
"grad_norm": 1.5394783020019531,
"learning_rate": 1.1989870852863254e-05,
"loss": 0.1839,
"step": 541
},
{
"epoch": 1.4788540245566166,
"grad_norm": 0.9282181859016418,
"learning_rate": 1.1958699232056135e-05,
"loss": 0.1847,
"step": 542
},
{
"epoch": 1.4815825375170533,
"grad_norm": 1.620680809020996,
"learning_rate": 1.1927507807234169e-05,
"loss": 0.1839,
"step": 543
},
{
"epoch": 1.4843110504774897,
"grad_norm": 1.1609814167022705,
"learning_rate": 1.1896296893767588e-05,
"loss": 0.187,
"step": 544
},
{
"epoch": 1.4870395634379263,
"grad_norm": 1.349696159362793,
"learning_rate": 1.186506680722367e-05,
"loss": 0.1873,
"step": 545
},
{
"epoch": 1.489768076398363,
"grad_norm": 0.9885646104812622,
"learning_rate": 1.1833817863363563e-05,
"loss": 0.1831,
"step": 546
},
{
"epoch": 1.4924965893587996,
"grad_norm": 1.7727011442184448,
"learning_rate": 1.180255037813906e-05,
"loss": 0.1911,
"step": 547
},
{
"epoch": 1.495225102319236,
"grad_norm": 1.5409637689590454,
"learning_rate": 1.1771264667689428e-05,
"loss": 0.1861,
"step": 548
},
{
"epoch": 1.4979536152796726,
"grad_norm": 1.6905938386917114,
"learning_rate": 1.1739961048338213e-05,
"loss": 0.1902,
"step": 549
},
{
"epoch": 1.500682128240109,
"grad_norm": 1.2861829996109009,
"learning_rate": 1.1708639836590024e-05,
"loss": 0.1886,
"step": 550
},
{
"epoch": 1.5034106412005457,
"grad_norm": 1.4801191091537476,
"learning_rate": 1.1677301349127349e-05,
"loss": 0.1868,
"step": 551
},
{
"epoch": 1.5061391541609823,
"grad_norm": 1.0299503803253174,
"learning_rate": 1.164594590280734e-05,
"loss": 0.193,
"step": 552
},
{
"epoch": 1.508867667121419,
"grad_norm": 1.705196738243103,
"learning_rate": 1.161457381465863e-05,
"loss": 0.19,
"step": 553
},
{
"epoch": 1.5115961800818554,
"grad_norm": 1.2943288087844849,
"learning_rate": 1.15831854018781e-05,
"loss": 0.1955,
"step": 554
},
{
"epoch": 1.514324693042292,
"grad_norm": 1.1207996606826782,
"learning_rate": 1.1551780981827699e-05,
"loss": 0.1867,
"step": 555
},
{
"epoch": 1.5170532060027284,
"grad_norm": 1.675868034362793,
"learning_rate": 1.1520360872031208e-05,
"loss": 0.1821,
"step": 556
},
{
"epoch": 1.519781718963165,
"grad_norm": 1.2020845413208008,
"learning_rate": 1.148892539017106e-05,
"loss": 0.1909,
"step": 557
},
{
"epoch": 1.5225102319236017,
"grad_norm": 0.9670946002006531,
"learning_rate": 1.1457474854085095e-05,
"loss": 0.1868,
"step": 558
},
{
"epoch": 1.5252387448840383,
"grad_norm": 1.1065406799316406,
"learning_rate": 1.1426009581763377e-05,
"loss": 0.1904,
"step": 559
},
{
"epoch": 1.5279672578444747,
"grad_norm": 1.2990299463272095,
"learning_rate": 1.139452989134496e-05,
"loss": 0.1812,
"step": 560
},
{
"epoch": 1.5306957708049114,
"grad_norm": 1.2565958499908447,
"learning_rate": 1.1363036101114671e-05,
"loss": 0.1896,
"step": 561
},
{
"epoch": 1.5334242837653478,
"grad_norm": 1.460721492767334,
"learning_rate": 1.1331528529499909e-05,
"loss": 0.1857,
"step": 562
},
{
"epoch": 1.5361527967257844,
"grad_norm": 1.0245671272277832,
"learning_rate": 1.1300007495067403e-05,
"loss": 0.1831,
"step": 563
},
{
"epoch": 1.538881309686221,
"grad_norm": 1.3558366298675537,
"learning_rate": 1.1268473316520007e-05,
"loss": 0.1869,
"step": 564
},
{
"epoch": 1.5416098226466577,
"grad_norm": 1.1182175874710083,
"learning_rate": 1.123692631269348e-05,
"loss": 0.1857,
"step": 565
},
{
"epoch": 1.544338335607094,
"grad_norm": 2.172612428665161,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.1857,
"step": 566
},
{
"epoch": 1.5470668485675307,
"grad_norm": 1.591204285621643,
"learning_rate": 1.1173795105191146e-05,
"loss": 0.1849,
"step": 567
},
{
"epoch": 1.5497953615279672,
"grad_norm": 2.5178825855255127,
"learning_rate": 1.1142211539822318e-05,
"loss": 0.1847,
"step": 568
},
{
"epoch": 1.5525238744884038,
"grad_norm": 2.313584089279175,
"learning_rate": 1.1110616425781833e-05,
"loss": 0.1818,
"step": 569
},
{
"epoch": 1.5552523874488404,
"grad_norm": 1.1800787448883057,
"learning_rate": 1.1079010082521557e-05,
"loss": 0.1827,
"step": 570
},
{
"epoch": 1.557980900409277,
"grad_norm": 1.6123071908950806,
"learning_rate": 1.1047392829606876e-05,
"loss": 0.195,
"step": 571
},
{
"epoch": 1.5607094133697135,
"grad_norm": 0.9764700531959534,
"learning_rate": 1.101576498671349e-05,
"loss": 0.1837,
"step": 572
},
{
"epoch": 1.56343792633015,
"grad_norm": 1.6063830852508545,
"learning_rate": 1.098412687362418e-05,
"loss": 0.1856,
"step": 573
},
{
"epoch": 1.5661664392905865,
"grad_norm": 1.6164367198944092,
"learning_rate": 1.095247881022555e-05,
"loss": 0.1791,
"step": 574
},
{
"epoch": 1.5688949522510232,
"grad_norm": 1.005224585533142,
"learning_rate": 1.0920821116504816e-05,
"loss": 0.1865,
"step": 575
},
{
"epoch": 1.5716234652114598,
"grad_norm": 1.6949632167816162,
"learning_rate": 1.0889154112546569e-05,
"loss": 0.1831,
"step": 576
},
{
"epoch": 1.5743519781718964,
"grad_norm": 1.3134765625,
"learning_rate": 1.0857478118529534e-05,
"loss": 0.1853,
"step": 577
},
{
"epoch": 1.5770804911323328,
"grad_norm": 1.852800726890564,
"learning_rate": 1.0825793454723325e-05,
"loss": 0.1826,
"step": 578
},
{
"epoch": 1.5798090040927695,
"grad_norm": 1.9858014583587646,
"learning_rate": 1.079410044148522e-05,
"loss": 0.1843,
"step": 579
},
{
"epoch": 1.5825375170532059,
"grad_norm": 1.0838700532913208,
"learning_rate": 1.0762399399256917e-05,
"loss": 0.1847,
"step": 580
},
{
"epoch": 1.5852660300136425,
"grad_norm": 1.2184253931045532,
"learning_rate": 1.0730690648561293e-05,
"loss": 0.1848,
"step": 581
},
{
"epoch": 1.5879945429740792,
"grad_norm": 0.9322428703308105,
"learning_rate": 1.0698974509999159e-05,
"loss": 0.184,
"step": 582
},
{
"epoch": 1.5907230559345158,
"grad_norm": 2.5623462200164795,
"learning_rate": 1.0667251304246028e-05,
"loss": 0.1823,
"step": 583
},
{
"epoch": 1.5934515688949522,
"grad_norm": 1.9705911874771118,
"learning_rate": 1.0635521352048873e-05,
"loss": 0.1815,
"step": 584
},
{
"epoch": 1.5961800818553888,
"grad_norm": 2.024606704711914,
"learning_rate": 1.0603784974222862e-05,
"loss": 0.1909,
"step": 585
},
{
"epoch": 1.5989085948158253,
"grad_norm": 2.2946503162384033,
"learning_rate": 1.057204249164815e-05,
"loss": 0.1842,
"step": 586
},
{
"epoch": 1.601637107776262,
"grad_norm": 1.239753007888794,
"learning_rate": 1.0540294225266608e-05,
"loss": 0.1827,
"step": 587
},
{
"epoch": 1.6043656207366985,
"grad_norm": 1.7404361963272095,
"learning_rate": 1.0508540496078582e-05,
"loss": 0.1798,
"step": 588
},
{
"epoch": 1.6070941336971352,
"grad_norm": 1.5583375692367554,
"learning_rate": 1.0476781625139655e-05,
"loss": 0.1836,
"step": 589
},
{
"epoch": 1.6098226466575716,
"grad_norm": 1.1195285320281982,
"learning_rate": 1.0445017933557404e-05,
"loss": 0.1843,
"step": 590
},
{
"epoch": 1.6125511596180082,
"grad_norm": 1.6010850667953491,
"learning_rate": 1.0413249742488132e-05,
"loss": 0.1833,
"step": 591
},
{
"epoch": 1.6152796725784446,
"grad_norm": 1.3162715435028076,
"learning_rate": 1.0381477373133652e-05,
"loss": 0.184,
"step": 592
},
{
"epoch": 1.6180081855388813,
"grad_norm": 1.436637282371521,
"learning_rate": 1.0349701146738007e-05,
"loss": 0.1826,
"step": 593
},
{
"epoch": 1.620736698499318,
"grad_norm": 1.3779467344284058,
"learning_rate": 1.0317921384584245e-05,
"loss": 0.1809,
"step": 594
},
{
"epoch": 1.6234652114597545,
"grad_norm": 1.9165902137756348,
"learning_rate": 1.0286138407991171e-05,
"loss": 0.1835,
"step": 595
},
{
"epoch": 1.626193724420191,
"grad_norm": 1.925999402999878,
"learning_rate": 1.0254352538310075e-05,
"loss": 0.1799,
"step": 596
},
{
"epoch": 1.6289222373806276,
"grad_norm": 1.4837428331375122,
"learning_rate": 1.0222564096921505e-05,
"loss": 0.182,
"step": 597
},
{
"epoch": 1.631650750341064,
"grad_norm": 1.3574309349060059,
"learning_rate": 1.0190773405232024e-05,
"loss": 0.1796,
"step": 598
},
{
"epoch": 1.6343792633015006,
"grad_norm": 1.8260838985443115,
"learning_rate": 1.0158980784670927e-05,
"loss": 0.1811,
"step": 599
},
{
"epoch": 1.6371077762619373,
"grad_norm": 1.1332685947418213,
"learning_rate": 1.012718655668702e-05,
"loss": 0.1799,
"step": 600
},
{
"epoch": 1.639836289222374,
"grad_norm": 1.505661964416504,
"learning_rate": 1.0095391042745362e-05,
"loss": 0.1805,
"step": 601
},
{
"epoch": 1.6425648021828103,
"grad_norm": 1.5634633302688599,
"learning_rate": 1.0063594564324014e-05,
"loss": 0.1813,
"step": 602
},
{
"epoch": 1.645293315143247,
"grad_norm": 0.9850923418998718,
"learning_rate": 1.0031797442910788e-05,
"loss": 0.1742,
"step": 603
},
{
"epoch": 1.6480218281036834,
"grad_norm": 1.3787219524383545,
"learning_rate": 1e-05,
"loss": 0.1871,
"step": 604
},
{
"epoch": 1.65075034106412,
"grad_norm": 1.1535123586654663,
"learning_rate": 9.968202557089213e-06,
"loss": 0.1826,
"step": 605
},
{
"epoch": 1.6534788540245566,
"grad_norm": 0.977457582950592,
"learning_rate": 9.936405435675991e-06,
"loss": 0.181,
"step": 606
},
{
"epoch": 1.6562073669849933,
"grad_norm": 0.9296724200248718,
"learning_rate": 9.904608957254643e-06,
"loss": 0.1772,
"step": 607
},
{
"epoch": 1.65893587994543,
"grad_norm": 1.0147004127502441,
"learning_rate": 9.872813443312984e-06,
"loss": 0.178,
"step": 608
},
{
"epoch": 1.6616643929058663,
"grad_norm": 0.9305097460746765,
"learning_rate": 9.84101921532908e-06,
"loss": 0.1777,
"step": 609
},
{
"epoch": 1.6643929058663027,
"grad_norm": 1.123794674873352,
"learning_rate": 9.809226594767979e-06,
"loss": 0.1821,
"step": 610
},
{
"epoch": 1.6671214188267394,
"grad_norm": 1.1139179468154907,
"learning_rate": 9.777435903078493e-06,
"loss": 0.1794,
"step": 611
},
{
"epoch": 1.669849931787176,
"grad_norm": 0.873371422290802,
"learning_rate": 9.745647461689932e-06,
"loss": 0.1728,
"step": 612
},
{
"epoch": 1.6725784447476126,
"grad_norm": 0.8382684588432312,
"learning_rate": 9.713861592008834e-06,
"loss": 0.1741,
"step": 613
},
{
"epoch": 1.6753069577080493,
"grad_norm": 1.2288588285446167,
"learning_rate": 9.682078615415755e-06,
"loss": 0.1769,
"step": 614
},
{
"epoch": 1.6780354706684857,
"grad_norm": 1.0390663146972656,
"learning_rate": 9.650298853261998e-06,
"loss": 0.1747,
"step": 615
},
{
"epoch": 1.680763983628922,
"grad_norm": 1.0253381729125977,
"learning_rate": 9.618522626866351e-06,
"loss": 0.1835,
"step": 616
},
{
"epoch": 1.6834924965893587,
"grad_norm": 0.8118390440940857,
"learning_rate": 9.586750257511868e-06,
"loss": 0.178,
"step": 617
},
{
"epoch": 1.6862210095497954,
"grad_norm": 0.9968405365943909,
"learning_rate": 9.554982066442601e-06,
"loss": 0.1779,
"step": 618
},
{
"epoch": 1.688949522510232,
"grad_norm": 0.9809510111808777,
"learning_rate": 9.523218374860348e-06,
"loss": 0.1806,
"step": 619
},
{
"epoch": 1.6916780354706686,
"grad_norm": 0.9609003663063049,
"learning_rate": 9.49145950392142e-06,
"loss": 0.1825,
"step": 620
},
{
"epoch": 1.694406548431105,
"grad_norm": 1.0416550636291504,
"learning_rate": 9.459705774733397e-06,
"loss": 0.1845,
"step": 621
},
{
"epoch": 1.6971350613915415,
"grad_norm": 1.0884149074554443,
"learning_rate": 9.427957508351852e-06,
"loss": 0.1826,
"step": 622
},
{
"epoch": 1.699863574351978,
"grad_norm": 0.936427652835846,
"learning_rate": 9.39621502577714e-06,
"loss": 0.1814,
"step": 623
},
{
"epoch": 1.7025920873124147,
"grad_norm": 0.9558141827583313,
"learning_rate": 9.364478647951132e-06,
"loss": 0.187,
"step": 624
},
{
"epoch": 1.7053206002728514,
"grad_norm": 0.986827552318573,
"learning_rate": 9.332748695753973e-06,
"loss": 0.1817,
"step": 625
},
{
"epoch": 1.708049113233288,
"grad_norm": 0.9488497376441956,
"learning_rate": 9.301025490000843e-06,
"loss": 0.1807,
"step": 626
},
{
"epoch": 1.7107776261937244,
"grad_norm": 1.1289781332015991,
"learning_rate": 9.26930935143871e-06,
"loss": 0.1816,
"step": 627
},
{
"epoch": 1.7135061391541608,
"grad_norm": 1.031248688697815,
"learning_rate": 9.237600600743086e-06,
"loss": 0.1816,
"step": 628
},
{
"epoch": 1.7162346521145975,
"grad_norm": 0.9620054960250854,
"learning_rate": 9.20589955851478e-06,
"loss": 0.178,
"step": 629
},
{
"epoch": 1.718963165075034,
"grad_norm": 1.0059623718261719,
"learning_rate": 9.174206545276678e-06,
"loss": 0.1821,
"step": 630
},
{
"epoch": 1.7216916780354707,
"grad_norm": 0.8320233821868896,
"learning_rate": 9.14252188147047e-06,
"loss": 0.1787,
"step": 631
},
{
"epoch": 1.7244201909959074,
"grad_norm": 0.8600996732711792,
"learning_rate": 9.11084588745343e-06,
"loss": 0.1775,
"step": 632
},
{
"epoch": 1.7271487039563438,
"grad_norm": 0.7816293835639954,
"learning_rate": 9.07917888349519e-06,
"loss": 0.1716,
"step": 633
},
{
"epoch": 1.7298772169167802,
"grad_norm": 1.091169834136963,
"learning_rate": 9.047521189774456e-06,
"loss": 0.1793,
"step": 634
},
{
"epoch": 1.7326057298772168,
"grad_norm": 1.5603679418563843,
"learning_rate": 9.015873126375822e-06,
"loss": 0.1756,
"step": 635
},
{
"epoch": 1.7353342428376535,
"grad_norm": 1.0447919368743896,
"learning_rate": 8.984235013286512e-06,
"loss": 0.1813,
"step": 636
},
{
"epoch": 1.73806275579809,
"grad_norm": 1.329048991203308,
"learning_rate": 8.952607170393126e-06,
"loss": 0.1786,
"step": 637
},
{
"epoch": 1.7407912687585267,
"grad_norm": 1.0863256454467773,
"learning_rate": 8.920989917478446e-06,
"loss": 0.1763,
"step": 638
},
{
"epoch": 1.7435197817189632,
"grad_norm": 1.4967482089996338,
"learning_rate": 8.88938357421817e-06,
"loss": 0.1748,
"step": 639
},
{
"epoch": 1.7462482946793996,
"grad_norm": 1.3216819763183594,
"learning_rate": 8.857788460177685e-06,
"loss": 0.1829,
"step": 640
},
{
"epoch": 1.7489768076398362,
"grad_norm": 1.3818433284759521,
"learning_rate": 8.826204894808856e-06,
"loss": 0.1824,
"step": 641
},
{
"epoch": 1.7517053206002728,
"grad_norm": 1.4184951782226562,
"learning_rate": 8.79463319744677e-06,
"loss": 0.1778,
"step": 642
},
{
"epoch": 1.7544338335607095,
"grad_norm": 1.4973526000976562,
"learning_rate": 8.763073687306523e-06,
"loss": 0.1775,
"step": 643
},
{
"epoch": 1.7571623465211461,
"grad_norm": 1.6445238590240479,
"learning_rate": 8.731526683479991e-06,
"loss": 0.1803,
"step": 644
},
{
"epoch": 1.7598908594815825,
"grad_norm": 1.743328332901001,
"learning_rate": 8.699992504932599e-06,
"loss": 0.1777,
"step": 645
},
{
"epoch": 1.762619372442019,
"grad_norm": 1.3567850589752197,
"learning_rate": 8.668471470500094e-06,
"loss": 0.1781,
"step": 646
},
{
"epoch": 1.7653478854024556,
"grad_norm": 1.3435090780258179,
"learning_rate": 8.63696389888533e-06,
"loss": 0.1791,
"step": 647
},
{
"epoch": 1.7680763983628922,
"grad_norm": 0.8996968269348145,
"learning_rate": 8.605470108655046e-06,
"loss": 0.1745,
"step": 648
},
{
"epoch": 1.7708049113233288,
"grad_norm": 0.9321063160896301,
"learning_rate": 8.573990418236626e-06,
"loss": 0.1757,
"step": 649
},
{
"epoch": 1.7735334242837655,
"grad_norm": 0.7093449234962463,
"learning_rate": 8.542525145914907e-06,
"loss": 0.1749,
"step": 650
},
{
"epoch": 1.776261937244202,
"grad_norm": 1.155967354774475,
"learning_rate": 8.511074609828944e-06,
"loss": 0.1735,
"step": 651
},
{
"epoch": 1.7789904502046383,
"grad_norm": 0.8285070657730103,
"learning_rate": 8.479639127968793e-06,
"loss": 0.1787,
"step": 652
},
{
"epoch": 1.781718963165075,
"grad_norm": 1.2278356552124023,
"learning_rate": 8.448219018172303e-06,
"loss": 0.1804,
"step": 653
},
{
"epoch": 1.7844474761255116,
"grad_norm": 1.3046550750732422,
"learning_rate": 8.416814598121901e-06,
"loss": 0.1767,
"step": 654
},
{
"epoch": 1.7871759890859482,
"grad_norm": 0.9383738040924072,
"learning_rate": 8.385426185341374e-06,
"loss": 0.1777,
"step": 655
},
{
"epoch": 1.7899045020463848,
"grad_norm": 1.2242978811264038,
"learning_rate": 8.35405409719266e-06,
"loss": 0.1807,
"step": 656
},
{
"epoch": 1.7926330150068213,
"grad_norm": 0.9162175059318542,
"learning_rate": 8.322698650872656e-06,
"loss": 0.1736,
"step": 657
},
{
"epoch": 1.795361527967258,
"grad_norm": 1.3758541345596313,
"learning_rate": 8.291360163409978e-06,
"loss": 0.1747,
"step": 658
},
{
"epoch": 1.7980900409276943,
"grad_norm": 1.1687759160995483,
"learning_rate": 8.260038951661787e-06,
"loss": 0.1767,
"step": 659
},
{
"epoch": 1.800818553888131,
"grad_norm": 1.366279125213623,
"learning_rate": 8.228735332310575e-06,
"loss": 0.1828,
"step": 660
},
{
"epoch": 1.8035470668485676,
"grad_norm": 1.273398756980896,
"learning_rate": 8.197449621860944e-06,
"loss": 0.1755,
"step": 661
},
{
"epoch": 1.8062755798090042,
"grad_norm": 1.1922065019607544,
"learning_rate": 8.16618213663644e-06,
"loss": 0.1781,
"step": 662
},
{
"epoch": 1.8090040927694406,
"grad_norm": 0.9690226912498474,
"learning_rate": 8.134933192776333e-06,
"loss": 0.1778,
"step": 663
},
{
"epoch": 1.8117326057298773,
"grad_norm": 1.3045841455459595,
"learning_rate": 8.103703106232416e-06,
"loss": 0.183,
"step": 664
},
{
"epoch": 1.8144611186903137,
"grad_norm": 0.8391909599304199,
"learning_rate": 8.072492192765833e-06,
"loss": 0.1744,
"step": 665
},
{
"epoch": 1.8171896316507503,
"grad_norm": 0.9879373908042908,
"learning_rate": 8.041300767943867e-06,
"loss": 0.176,
"step": 666
},
{
"epoch": 1.819918144611187,
"grad_norm": 0.8668816685676575,
"learning_rate": 8.010129147136749e-06,
"loss": 0.1771,
"step": 667
},
{
"epoch": 1.8226466575716236,
"grad_norm": 0.8918569684028625,
"learning_rate": 7.978977645514488e-06,
"loss": 0.1782,
"step": 668
},
{
"epoch": 1.82537517053206,
"grad_norm": 1.0004130601882935,
"learning_rate": 7.947846578043658e-06,
"loss": 0.1797,
"step": 669
},
{
"epoch": 1.8281036834924966,
"grad_norm": 1.0414166450500488,
"learning_rate": 7.916736259484239e-06,
"loss": 0.1763,
"step": 670
},
{
"epoch": 1.830832196452933,
"grad_norm": 1.5945677757263184,
"learning_rate": 7.885647004386421e-06,
"loss": 0.1727,
"step": 671
},
{
"epoch": 1.8335607094133697,
"grad_norm": 1.3431050777435303,
"learning_rate": 7.854579127087418e-06,
"loss": 0.1763,
"step": 672
},
{
"epoch": 1.8362892223738063,
"grad_norm": 1.1084147691726685,
"learning_rate": 7.823532941708305e-06,
"loss": 0.1765,
"step": 673
},
{
"epoch": 1.839017735334243,
"grad_norm": 1.1193599700927734,
"learning_rate": 7.792508762150833e-06,
"loss": 0.1766,
"step": 674
},
{
"epoch": 1.8417462482946794,
"grad_norm": 1.06298828125,
"learning_rate": 7.761506902094248e-06,
"loss": 0.1738,
"step": 675
},
{
"epoch": 1.844474761255116,
"grad_norm": 1.2637038230895996,
"learning_rate": 7.730527674992143e-06,
"loss": 0.1798,
"step": 676
},
{
"epoch": 1.8472032742155524,
"grad_norm": 1.1762093305587769,
"learning_rate": 7.699571394069269e-06,
"loss": 0.1769,
"step": 677
},
{
"epoch": 1.849931787175989,
"grad_norm": 0.9763647317886353,
"learning_rate": 7.668638372318359e-06,
"loss": 0.1767,
"step": 678
},
{
"epoch": 1.8526603001364257,
"grad_norm": 0.976860523223877,
"learning_rate": 7.637728922496996e-06,
"loss": 0.1731,
"step": 679
},
{
"epoch": 1.8553888130968623,
"grad_norm": 0.8932291269302368,
"learning_rate": 7.606843357124426e-06,
"loss": 0.1734,
"step": 680
},
{
"epoch": 1.8581173260572987,
"grad_norm": 0.978801429271698,
"learning_rate": 7.575981988478393e-06,
"loss": 0.1758,
"step": 681
},
{
"epoch": 1.8608458390177354,
"grad_norm": 0.9244241714477539,
"learning_rate": 7.545145128592009e-06,
"loss": 0.171,
"step": 682
},
{
"epoch": 1.8635743519781718,
"grad_norm": 0.8267676830291748,
"learning_rate": 7.514333089250577e-06,
"loss": 0.1705,
"step": 683
},
{
"epoch": 1.8663028649386084,
"grad_norm": 1.031118392944336,
"learning_rate": 7.483546181988437e-06,
"loss": 0.1767,
"step": 684
},
{
"epoch": 1.869031377899045,
"grad_norm": 1.0337843894958496,
"learning_rate": 7.452784718085834e-06,
"loss": 0.1707,
"step": 685
},
{
"epoch": 1.8717598908594817,
"grad_norm": 1.016801118850708,
"learning_rate": 7.422049008565757e-06,
"loss": 0.1757,
"step": 686
},
{
"epoch": 1.874488403819918,
"grad_norm": 1.1047598123550415,
"learning_rate": 7.391339364190794e-06,
"loss": 0.1762,
"step": 687
},
{
"epoch": 1.8772169167803547,
"grad_norm": 1.1827868223190308,
"learning_rate": 7.360656095459995e-06,
"loss": 0.1702,
"step": 688
},
{
"epoch": 1.8799454297407912,
"grad_norm": 1.0567187070846558,
"learning_rate": 7.329999512605738e-06,
"loss": 0.1759,
"step": 689
},
{
"epoch": 1.8826739427012278,
"grad_norm": 1.480316400527954,
"learning_rate": 7.299369925590575e-06,
"loss": 0.1731,
"step": 690
},
{
"epoch": 1.8854024556616644,
"grad_norm": 0.9149882793426514,
"learning_rate": 7.268767644104114e-06,
"loss": 0.1786,
"step": 691
},
{
"epoch": 1.888130968622101,
"grad_norm": 1.175398826599121,
"learning_rate": 7.2381929775598835e-06,
"loss": 0.1743,
"step": 692
},
{
"epoch": 1.8908594815825375,
"grad_norm": 1.0491394996643066,
"learning_rate": 7.207646235092201e-06,
"loss": 0.1704,
"step": 693
},
{
"epoch": 1.893587994542974,
"grad_norm": 1.049817681312561,
"learning_rate": 7.1771277255530456e-06,
"loss": 0.175,
"step": 694
},
{
"epoch": 1.8963165075034105,
"grad_norm": 0.8883402347564697,
"learning_rate": 7.14663775750895e-06,
"loss": 0.1766,
"step": 695
},
{
"epoch": 1.8990450204638472,
"grad_norm": 0.8892170190811157,
"learning_rate": 7.116176639237853e-06,
"loss": 0.1764,
"step": 696
},
{
"epoch": 1.9017735334242838,
"grad_norm": 0.8051910400390625,
"learning_rate": 7.085744678726013e-06,
"loss": 0.1755,
"step": 697
},
{
"epoch": 1.9045020463847204,
"grad_norm": 1.0221798419952393,
"learning_rate": 7.05534218366488e-06,
"loss": 0.172,
"step": 698
},
{
"epoch": 1.9072305593451568,
"grad_norm": 0.8057647347450256,
"learning_rate": 7.024969461447973e-06,
"loss": 0.1781,
"step": 699
},
{
"epoch": 1.9099590723055935,
"grad_norm": 1.0149929523468018,
"learning_rate": 6.994626819167789e-06,
"loss": 0.1717,
"step": 700
},
{
"epoch": 1.9126875852660299,
"grad_norm": 0.9378892779350281,
"learning_rate": 6.964314563612709e-06,
"loss": 0.1746,
"step": 701
},
{
"epoch": 1.9154160982264665,
"grad_norm": 0.8830491900444031,
"learning_rate": 6.934033001263847e-06,
"loss": 0.1751,
"step": 702
},
{
"epoch": 1.9181446111869032,
"grad_norm": 1.0347460508346558,
"learning_rate": 6.9037824382920145e-06,
"loss": 0.1744,
"step": 703
},
{
"epoch": 1.9208731241473398,
"grad_norm": 0.8412283062934875,
"learning_rate": 6.873563180554583e-06,
"loss": 0.1702,
"step": 704
},
{
"epoch": 1.9236016371077762,
"grad_norm": 1.1008716821670532,
"learning_rate": 6.843375533592395e-06,
"loss": 0.1721,
"step": 705
},
{
"epoch": 1.9263301500682128,
"grad_norm": 1.1027634143829346,
"learning_rate": 6.813219802626698e-06,
"loss": 0.1697,
"step": 706
},
{
"epoch": 1.9290586630286493,
"grad_norm": 0.9036797285079956,
"learning_rate": 6.783096292556035e-06,
"loss": 0.1742,
"step": 707
},
{
"epoch": 1.931787175989086,
"grad_norm": 0.9932321310043335,
"learning_rate": 6.7530053079531664e-06,
"loss": 0.1695,
"step": 708
},
{
"epoch": 1.9345156889495225,
"grad_norm": 0.9580210447311401,
"learning_rate": 6.722947153062003e-06,
"loss": 0.1732,
"step": 709
},
{
"epoch": 1.9372442019099592,
"grad_norm": 0.7242920994758606,
"learning_rate": 6.692922131794517e-06,
"loss": 0.1676,
"step": 710
},
{
"epoch": 1.9399727148703958,
"grad_norm": 1.1400161981582642,
"learning_rate": 6.662930547727668e-06,
"loss": 0.17,
"step": 711
},
{
"epoch": 1.9427012278308322,
"grad_norm": 0.9288698434829712,
"learning_rate": 6.632972704100349e-06,
"loss": 0.1699,
"step": 712
},
{
"epoch": 1.9454297407912686,
"grad_norm": 1.0455366373062134,
"learning_rate": 6.603048903810305e-06,
"loss": 0.1691,
"step": 713
},
{
"epoch": 1.9481582537517053,
"grad_norm": 0.7870394587516785,
"learning_rate": 6.573159449411071e-06,
"loss": 0.1693,
"step": 714
},
{
"epoch": 1.950886766712142,
"grad_norm": 0.9377486705780029,
"learning_rate": 6.5433046431089205e-06,
"loss": 0.1717,
"step": 715
},
{
"epoch": 1.9536152796725785,
"grad_norm": 0.8685896992683411,
"learning_rate": 6.513484786759818e-06,
"loss": 0.1751,
"step": 716
},
{
"epoch": 1.9563437926330152,
"grad_norm": 1.2274606227874756,
"learning_rate": 6.483700181866337e-06,
"loss": 0.1725,
"step": 717
},
{
"epoch": 1.9590723055934516,
"grad_norm": 1.0122281312942505,
"learning_rate": 6.453951129574644e-06,
"loss": 0.1689,
"step": 718
},
{
"epoch": 1.961800818553888,
"grad_norm": 1.0082857608795166,
"learning_rate": 6.42423793067144e-06,
"loss": 0.174,
"step": 719
},
{
"epoch": 1.9645293315143246,
"grad_norm": 1.1493691205978394,
"learning_rate": 6.39456088558091e-06,
"loss": 0.1703,
"step": 720
},
{
"epoch": 1.9672578444747613,
"grad_norm": 0.7770411968231201,
"learning_rate": 6.364920294361701e-06,
"loss": 0.171,
"step": 721
},
{
"epoch": 1.969986357435198,
"grad_norm": 1.0410341024398804,
"learning_rate": 6.335316456703891e-06,
"loss": 0.1684,
"step": 722
},
{
"epoch": 1.9727148703956345,
"grad_norm": 1.0169074535369873,
"learning_rate": 6.3057496719259314e-06,
"loss": 0.172,
"step": 723
},
{
"epoch": 1.975443383356071,
"grad_norm": 1.2108582258224487,
"learning_rate": 6.276220238971653e-06,
"loss": 0.1687,
"step": 724
},
{
"epoch": 1.9781718963165074,
"grad_norm": 1.1328767538070679,
"learning_rate": 6.2467284564072294e-06,
"loss": 0.1709,
"step": 725
},
{
"epoch": 1.980900409276944,
"grad_norm": 0.8491936326026917,
"learning_rate": 6.2172746224181524e-06,
"loss": 0.1739,
"step": 726
},
{
"epoch": 1.9836289222373806,
"grad_norm": 0.977837860584259,
"learning_rate": 6.187859034806225e-06,
"loss": 0.1678,
"step": 727
},
{
"epoch": 1.9863574351978173,
"grad_norm": 1.0643810033798218,
"learning_rate": 6.158481990986558e-06,
"loss": 0.1698,
"step": 728
},
{
"epoch": 1.989085948158254,
"grad_norm": 1.1140060424804688,
"learning_rate": 6.1291437879845335e-06,
"loss": 0.1725,
"step": 729
},
{
"epoch": 1.9918144611186903,
"grad_norm": 0.7833942174911499,
"learning_rate": 6.099844722432844e-06,
"loss": 0.1718,
"step": 730
},
{
"epoch": 1.9945429740791267,
"grad_norm": 0.7607527375221252,
"learning_rate": 6.07058509056846e-06,
"loss": 0.1696,
"step": 731
},
{
"epoch": 1.9972714870395634,
"grad_norm": 0.6727914810180664,
"learning_rate": 6.041365188229641e-06,
"loss": 0.1733,
"step": 732
},
{
"epoch": 2.0,
"grad_norm": 0.7081233263015747,
"learning_rate": 6.012185310852962e-06,
"loss": 0.1684,
"step": 733
},
{
"epoch": 2.0027285129604366,
"grad_norm": 0.8626097440719604,
"learning_rate": 5.983045753470308e-06,
"loss": 0.133,
"step": 734
},
{
"epoch": 2.0054570259208733,
"grad_norm": 0.8325755596160889,
"learning_rate": 5.9539468107058885e-06,
"loss": 0.1346,
"step": 735
},
{
"epoch": 2.00818553888131,
"grad_norm": 0.6211928129196167,
"learning_rate": 5.924888776773281e-06,
"loss": 0.13,
"step": 736
},
{
"epoch": 2.010914051841746,
"grad_norm": 0.877323567867279,
"learning_rate": 5.895871945472434e-06,
"loss": 0.1327,
"step": 737
},
{
"epoch": 2.0136425648021827,
"grad_norm": 1.7426798343658447,
"learning_rate": 5.866896610186701e-06,
"loss": 0.1304,
"step": 738
},
{
"epoch": 2.0163710777626194,
"grad_norm": 1.0055882930755615,
"learning_rate": 5.8379630638798845e-06,
"loss": 0.1334,
"step": 739
},
{
"epoch": 2.019099590723056,
"grad_norm": 0.7875477075576782,
"learning_rate": 5.809071599093272e-06,
"loss": 0.1319,
"step": 740
},
{
"epoch": 2.0218281036834926,
"grad_norm": 0.6837515234947205,
"learning_rate": 5.780222507942654e-06,
"loss": 0.1277,
"step": 741
},
{
"epoch": 2.0245566166439293,
"grad_norm": 0.8274489045143127,
"learning_rate": 5.7514160821154085e-06,
"loss": 0.126,
"step": 742
},
{
"epoch": 2.0272851296043655,
"grad_norm": 0.6948771476745605,
"learning_rate": 5.7226526128675234e-06,
"loss": 0.1255,
"step": 743
},
{
"epoch": 2.030013642564802,
"grad_norm": 0.7969790101051331,
"learning_rate": 5.693932391020664e-06,
"loss": 0.1269,
"step": 744
},
{
"epoch": 2.0327421555252387,
"grad_norm": 0.7251871824264526,
"learning_rate": 5.665255706959231e-06,
"loss": 0.1268,
"step": 745
},
{
"epoch": 2.0354706684856754,
"grad_norm": 0.635795533657074,
"learning_rate": 5.63662285062742e-06,
"loss": 0.1328,
"step": 746
},
{
"epoch": 2.038199181446112,
"grad_norm": 1.0244261026382446,
"learning_rate": 5.608034111526298e-06,
"loss": 0.1286,
"step": 747
},
{
"epoch": 2.0409276944065486,
"grad_norm": 0.754211962223053,
"learning_rate": 5.579489778710867e-06,
"loss": 0.1251,
"step": 748
},
{
"epoch": 2.043656207366985,
"grad_norm": 0.7149548530578613,
"learning_rate": 5.550990140787147e-06,
"loss": 0.1281,
"step": 749
},
{
"epoch": 2.0463847203274215,
"grad_norm": 0.8640093803405762,
"learning_rate": 5.522535485909258e-06,
"loss": 0.1252,
"step": 750
},
{
"epoch": 2.049113233287858,
"grad_norm": 0.7243211269378662,
"learning_rate": 5.494126101776505e-06,
"loss": 0.1271,
"step": 751
},
{
"epoch": 2.0518417462482947,
"grad_norm": 1.0630416870117188,
"learning_rate": 5.465762275630471e-06,
"loss": 0.1262,
"step": 752
},
{
"epoch": 2.0545702592087314,
"grad_norm": 0.6877619028091431,
"learning_rate": 5.437444294252108e-06,
"loss": 0.1278,
"step": 753
},
{
"epoch": 2.057298772169168,
"grad_norm": 0.7790495157241821,
"learning_rate": 5.409172443958844e-06,
"loss": 0.126,
"step": 754
},
{
"epoch": 2.060027285129604,
"grad_norm": 1.3545631170272827,
"learning_rate": 5.380947010601681e-06,
"loss": 0.1275,
"step": 755
},
{
"epoch": 2.062755798090041,
"grad_norm": 0.723747193813324,
"learning_rate": 5.352768279562315e-06,
"loss": 0.1293,
"step": 756
},
{
"epoch": 2.0654843110504775,
"grad_norm": 0.9228985905647278,
"learning_rate": 5.324636535750238e-06,
"loss": 0.1239,
"step": 757
},
{
"epoch": 2.068212824010914,
"grad_norm": 0.779529869556427,
"learning_rate": 5.2965520635998676e-06,
"loss": 0.1266,
"step": 758
},
{
"epoch": 2.0709413369713507,
"grad_norm": 0.6386115550994873,
"learning_rate": 5.268515147067666e-06,
"loss": 0.1251,
"step": 759
},
{
"epoch": 2.0736698499317874,
"grad_norm": 0.7511923909187317,
"learning_rate": 5.240526069629265e-06,
"loss": 0.1263,
"step": 760
},
{
"epoch": 2.0763983628922236,
"grad_norm": 0.6733880639076233,
"learning_rate": 5.212585114276614e-06,
"loss": 0.1275,
"step": 761
},
{
"epoch": 2.07912687585266,
"grad_norm": 0.6625434160232544,
"learning_rate": 5.184692563515104e-06,
"loss": 0.1284,
"step": 762
},
{
"epoch": 2.081855388813097,
"grad_norm": 0.6756749153137207,
"learning_rate": 5.156848699360719e-06,
"loss": 0.1256,
"step": 763
},
{
"epoch": 2.0845839017735335,
"grad_norm": 0.8829818964004517,
"learning_rate": 5.129053803337181e-06,
"loss": 0.1252,
"step": 764
},
{
"epoch": 2.08731241473397,
"grad_norm": 0.6200577020645142,
"learning_rate": 5.101308156473104e-06,
"loss": 0.1275,
"step": 765
},
{
"epoch": 2.0900409276944067,
"grad_norm": 1.5605920553207397,
"learning_rate": 5.073612039299157e-06,
"loss": 0.1253,
"step": 766
},
{
"epoch": 2.092769440654843,
"grad_norm": 0.7764895558357239,
"learning_rate": 5.045965731845223e-06,
"loss": 0.1324,
"step": 767
},
{
"epoch": 2.0954979536152796,
"grad_norm": 0.6887750029563904,
"learning_rate": 5.018369513637567e-06,
"loss": 0.1315,
"step": 768
},
{
"epoch": 2.098226466575716,
"grad_norm": 1.163913607597351,
"learning_rate": 4.990823663696013e-06,
"loss": 0.1267,
"step": 769
},
{
"epoch": 2.100954979536153,
"grad_norm": 0.9275516271591187,
"learning_rate": 4.963328460531127e-06,
"loss": 0.1247,
"step": 770
},
{
"epoch": 2.1036834924965895,
"grad_norm": 0.8884099721908569,
"learning_rate": 4.9358841821413775e-06,
"loss": 0.1282,
"step": 771
},
{
"epoch": 2.106412005457026,
"grad_norm": 0.791497528553009,
"learning_rate": 4.908491106010368e-06,
"loss": 0.1258,
"step": 772
},
{
"epoch": 2.1091405184174623,
"grad_norm": 0.7595200538635254,
"learning_rate": 4.881149509103993e-06,
"loss": 0.13,
"step": 773
},
{
"epoch": 2.111869031377899,
"grad_norm": 0.7046754956245422,
"learning_rate": 4.853859667867641e-06,
"loss": 0.1247,
"step": 774
},
{
"epoch": 2.1145975443383356,
"grad_norm": 0.8449010848999023,
"learning_rate": 4.826621858223431e-06,
"loss": 0.127,
"step": 775
},
{
"epoch": 2.117326057298772,
"grad_norm": 0.6766776442527771,
"learning_rate": 4.799436355567391e-06,
"loss": 0.1286,
"step": 776
},
{
"epoch": 2.120054570259209,
"grad_norm": 0.8445289134979248,
"learning_rate": 4.772303434766669e-06,
"loss": 0.1266,
"step": 777
},
{
"epoch": 2.1227830832196455,
"grad_norm": 0.6168569326400757,
"learning_rate": 4.745223370156797e-06,
"loss": 0.1261,
"step": 778
},
{
"epoch": 2.1255115961800817,
"grad_norm": 0.8796712756156921,
"learning_rate": 4.7181964355388695e-06,
"loss": 0.1304,
"step": 779
},
{
"epoch": 2.1282401091405183,
"grad_norm": 0.7083600759506226,
"learning_rate": 4.691222904176791e-06,
"loss": 0.1264,
"step": 780
},
{
"epoch": 2.130968622100955,
"grad_norm": 0.8051674962043762,
"learning_rate": 4.664303048794533e-06,
"loss": 0.1262,
"step": 781
},
{
"epoch": 2.1336971350613916,
"grad_norm": 0.6285784840583801,
"learning_rate": 4.63743714157335e-06,
"loss": 0.1274,
"step": 782
},
{
"epoch": 2.136425648021828,
"grad_norm": 0.9793753027915955,
"learning_rate": 4.610625454149033e-06,
"loss": 0.1244,
"step": 783
},
{
"epoch": 2.139154160982265,
"grad_norm": 0.7283981442451477,
"learning_rate": 4.583868257609171e-06,
"loss": 0.1256,
"step": 784
},
{
"epoch": 2.141882673942701,
"grad_norm": 0.8791332244873047,
"learning_rate": 4.55716582249042e-06,
"loss": 0.1239,
"step": 785
},
{
"epoch": 2.1446111869031377,
"grad_norm": 0.7714548110961914,
"learning_rate": 4.530518418775734e-06,
"loss": 0.1269,
"step": 786
},
{
"epoch": 2.1473396998635743,
"grad_norm": 0.7238229513168335,
"learning_rate": 4.50392631589166e-06,
"loss": 0.1304,
"step": 787
},
{
"epoch": 2.150068212824011,
"grad_norm": 0.7066569924354553,
"learning_rate": 4.477389782705628e-06,
"loss": 0.128,
"step": 788
},
{
"epoch": 2.1527967257844476,
"grad_norm": 0.7357354760169983,
"learning_rate": 4.4509090875231865e-06,
"loss": 0.1242,
"step": 789
},
{
"epoch": 2.155525238744884,
"grad_norm": 0.6796067357063293,
"learning_rate": 4.424484498085335e-06,
"loss": 0.1262,
"step": 790
},
{
"epoch": 2.1582537517053204,
"grad_norm": 0.6906223893165588,
"learning_rate": 4.398116281565794e-06,
"loss": 0.1253,
"step": 791
},
{
"epoch": 2.160982264665757,
"grad_norm": 0.7198973894119263,
"learning_rate": 4.371804704568309e-06,
"loss": 0.1269,
"step": 792
},
{
"epoch": 2.1637107776261937,
"grad_norm": 0.6786914467811584,
"learning_rate": 4.345550033123954e-06,
"loss": 0.1252,
"step": 793
},
{
"epoch": 2.1664392905866303,
"grad_norm": 0.6092426776885986,
"learning_rate": 4.319352532688444e-06,
"loss": 0.125,
"step": 794
},
{
"epoch": 2.169167803547067,
"grad_norm": 0.6962175369262695,
"learning_rate": 4.293212468139447e-06,
"loss": 0.1279,
"step": 795
},
{
"epoch": 2.1718963165075036,
"grad_norm": 0.5801219344139099,
"learning_rate": 4.267130103773911e-06,
"loss": 0.1253,
"step": 796
},
{
"epoch": 2.17462482946794,
"grad_norm": 0.7044116258621216,
"learning_rate": 4.241105703305388e-06,
"loss": 0.1269,
"step": 797
},
{
"epoch": 2.1773533424283764,
"grad_norm": 0.8102765083312988,
"learning_rate": 4.2151395298613675e-06,
"loss": 0.1262,
"step": 798
},
{
"epoch": 2.180081855388813,
"grad_norm": 0.6070725321769714,
"learning_rate": 4.189231845980618e-06,
"loss": 0.1231,
"step": 799
},
{
"epoch": 2.1828103683492497,
"grad_norm": 0.7759428024291992,
"learning_rate": 4.163382913610533e-06,
"loss": 0.1281,
"step": 800
},
{
"epoch": 2.1855388813096863,
"grad_norm": 0.8558132648468018,
"learning_rate": 4.137592994104479e-06,
"loss": 0.1263,
"step": 801
},
{
"epoch": 2.188267394270123,
"grad_norm": 0.6967403888702393,
"learning_rate": 4.111862348219158e-06,
"loss": 0.1261,
"step": 802
},
{
"epoch": 2.190995907230559,
"grad_norm": 1.0783143043518066,
"learning_rate": 4.086191236111964e-06,
"loss": 0.1218,
"step": 803
},
{
"epoch": 2.193724420190996,
"grad_norm": 1.0875624418258667,
"learning_rate": 4.060579917338362e-06,
"loss": 0.1274,
"step": 804
},
{
"epoch": 2.1964529331514324,
"grad_norm": 0.8018985390663147,
"learning_rate": 4.035028650849255e-06,
"loss": 0.1224,
"step": 805
},
{
"epoch": 2.199181446111869,
"grad_norm": 1.0255545377731323,
"learning_rate": 4.009537694988372e-06,
"loss": 0.1262,
"step": 806
},
{
"epoch": 2.2019099590723057,
"grad_norm": 0.9814453721046448,
"learning_rate": 3.984107307489652e-06,
"loss": 0.1288,
"step": 807
},
{
"epoch": 2.2046384720327423,
"grad_norm": 0.7587682008743286,
"learning_rate": 3.958737745474638e-06,
"loss": 0.1234,
"step": 808
},
{
"epoch": 2.2073669849931785,
"grad_norm": 0.8585322499275208,
"learning_rate": 3.933429265449882e-06,
"loss": 0.1239,
"step": 809
},
{
"epoch": 2.210095497953615,
"grad_norm": 0.9962207674980164,
"learning_rate": 3.908182123304344e-06,
"loss": 0.1284,
"step": 810
},
{
"epoch": 2.212824010914052,
"grad_norm": 0.7775647044181824,
"learning_rate": 3.882996574306818e-06,
"loss": 0.1263,
"step": 811
},
{
"epoch": 2.2155525238744884,
"grad_norm": 1.0317277908325195,
"learning_rate": 3.857872873103322e-06,
"loss": 0.1285,
"step": 812
},
{
"epoch": 2.218281036834925,
"grad_norm": 0.9878216981887817,
"learning_rate": 3.832811273714569e-06,
"loss": 0.1263,
"step": 813
},
{
"epoch": 2.2210095497953617,
"grad_norm": 0.7432425022125244,
"learning_rate": 3.807812029533362e-06,
"loss": 0.1255,
"step": 814
},
{
"epoch": 2.223738062755798,
"grad_norm": 0.7971869111061096,
"learning_rate": 3.78287539332203e-06,
"loss": 0.1254,
"step": 815
},
{
"epoch": 2.2264665757162345,
"grad_norm": 1.0156506299972534,
"learning_rate": 3.7580016172099067e-06,
"loss": 0.1259,
"step": 816
},
{
"epoch": 2.229195088676671,
"grad_norm": 0.7717655897140503,
"learning_rate": 3.7331909526907527e-06,
"loss": 0.1221,
"step": 817
},
{
"epoch": 2.231923601637108,
"grad_norm": 0.6119164824485779,
"learning_rate": 3.708443650620206e-06,
"loss": 0.1256,
"step": 818
},
{
"epoch": 2.2346521145975444,
"grad_norm": 1.200944423675537,
"learning_rate": 3.6837599612132826e-06,
"loss": 0.1287,
"step": 819
},
{
"epoch": 2.237380627557981,
"grad_norm": 1.009369969367981,
"learning_rate": 3.659140134041812e-06,
"loss": 0.1249,
"step": 820
},
{
"epoch": 2.2401091405184177,
"grad_norm": 0.6488488912582397,
"learning_rate": 3.6345844180319157e-06,
"loss": 0.1231,
"step": 821
},
{
"epoch": 2.242837653478854,
"grad_norm": 0.8771604299545288,
"learning_rate": 3.6100930614615204e-06,
"loss": 0.1248,
"step": 822
},
{
"epoch": 2.2455661664392905,
"grad_norm": 1.1320050954818726,
"learning_rate": 3.5856663119578174e-06,
"loss": 0.1251,
"step": 823
},
{
"epoch": 2.248294679399727,
"grad_norm": 1.1242350339889526,
"learning_rate": 3.5613044164947617e-06,
"loss": 0.1255,
"step": 824
},
{
"epoch": 2.251023192360164,
"grad_norm": 0.7258925437927246,
"learning_rate": 3.5370076213905904e-06,
"loss": 0.1252,
"step": 825
},
{
"epoch": 2.2537517053206004,
"grad_norm": 0.844993531703949,
"learning_rate": 3.5127761723053313e-06,
"loss": 0.1238,
"step": 826
},
{
"epoch": 2.2564802182810366,
"grad_norm": 1.045020341873169,
"learning_rate": 3.4886103142382944e-06,
"loss": 0.1237,
"step": 827
},
{
"epoch": 2.2592087312414733,
"grad_norm": 0.6890770196914673,
"learning_rate": 3.46451029152562e-06,
"loss": 0.1273,
"step": 828
},
{
"epoch": 2.26193724420191,
"grad_norm": 0.7367672324180603,
"learning_rate": 3.440476347837811e-06,
"loss": 0.1264,
"step": 829
},
{
"epoch": 2.2646657571623465,
"grad_norm": 1.0174280405044556,
"learning_rate": 3.41650872617724e-06,
"loss": 0.1255,
"step": 830
},
{
"epoch": 2.267394270122783,
"grad_norm": 0.8533175587654114,
"learning_rate": 3.392607668875718e-06,
"loss": 0.1269,
"step": 831
},
{
"epoch": 2.27012278308322,
"grad_norm": 0.7625150680541992,
"learning_rate": 3.3687734175920505e-06,
"loss": 0.1249,
"step": 832
},
{
"epoch": 2.2728512960436564,
"grad_norm": 0.735122799873352,
"learning_rate": 3.3450062133095572e-06,
"loss": 0.1243,
"step": 833
},
{
"epoch": 2.2755798090040926,
"grad_norm": 0.7749839425086975,
"learning_rate": 3.321306296333673e-06,
"loss": 0.1246,
"step": 834
},
{
"epoch": 2.2783083219645293,
"grad_norm": 0.7999988794326782,
"learning_rate": 3.29767390628951e-06,
"loss": 0.1253,
"step": 835
},
{
"epoch": 2.281036834924966,
"grad_norm": 0.8466626405715942,
"learning_rate": 3.274109282119413e-06,
"loss": 0.1256,
"step": 836
},
{
"epoch": 2.2837653478854025,
"grad_norm": 0.7036960124969482,
"learning_rate": 3.2506126620805666e-06,
"loss": 0.1256,
"step": 837
},
{
"epoch": 2.286493860845839,
"grad_norm": 0.7315667271614075,
"learning_rate": 3.2271842837425917e-06,
"loss": 0.124,
"step": 838
},
{
"epoch": 2.2892223738062754,
"grad_norm": 0.7803946733474731,
"learning_rate": 3.203824383985108e-06,
"loss": 0.1247,
"step": 839
},
{
"epoch": 2.291950886766712,
"grad_norm": 0.6711841821670532,
"learning_rate": 3.180533198995379e-06,
"loss": 0.1247,
"step": 840
},
{
"epoch": 2.2946793997271486,
"grad_norm": 0.6884995698928833,
"learning_rate": 3.157310964265903e-06,
"loss": 0.1248,
"step": 841
},
{
"epoch": 2.2974079126875853,
"grad_norm": 0.6317690014839172,
"learning_rate": 3.134157914592032e-06,
"loss": 0.1233,
"step": 842
},
{
"epoch": 2.300136425648022,
"grad_norm": 0.7348897457122803,
"learning_rate": 3.1110742840696063e-06,
"loss": 0.1223,
"step": 843
},
{
"epoch": 2.3028649386084585,
"grad_norm": 0.8744590282440186,
"learning_rate": 3.088060306092582e-06,
"loss": 0.1242,
"step": 844
},
{
"epoch": 2.305593451568895,
"grad_norm": 0.6979570984840393,
"learning_rate": 3.0651162133506707e-06,
"loss": 0.1229,
"step": 845
},
{
"epoch": 2.3083219645293314,
"grad_norm": 0.6471583843231201,
"learning_rate": 3.042242237826991e-06,
"loss": 0.1249,
"step": 846
},
{
"epoch": 2.311050477489768,
"grad_norm": 0.8014260530471802,
"learning_rate": 3.0194386107957175e-06,
"loss": 0.1248,
"step": 847
},
{
"epoch": 2.3137789904502046,
"grad_norm": 0.676422119140625,
"learning_rate": 2.996705562819747e-06,
"loss": 0.1243,
"step": 848
},
{
"epoch": 2.3165075034106413,
"grad_norm": 0.707206666469574,
"learning_rate": 2.9740433237483667e-06,
"loss": 0.1249,
"step": 849
},
{
"epoch": 2.319236016371078,
"grad_norm": 0.6779247522354126,
"learning_rate": 2.951452122714926e-06,
"loss": 0.1235,
"step": 850
},
{
"epoch": 2.321964529331514,
"grad_norm": 0.6838175654411316,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.1257,
"step": 851
},
{
"epoch": 2.3246930422919507,
"grad_norm": 0.894402027130127,
"learning_rate": 2.906483747701705e-06,
"loss": 0.1229,
"step": 852
},
{
"epoch": 2.3274215552523874,
"grad_norm": 0.6708566546440125,
"learning_rate": 2.88410702838814e-06,
"loss": 0.1202,
"step": 853
},
{
"epoch": 2.330150068212824,
"grad_norm": 0.7092727422714233,
"learning_rate": 2.861802256440348e-06,
"loss": 0.1242,
"step": 854
},
{
"epoch": 2.3328785811732606,
"grad_norm": 0.9429033994674683,
"learning_rate": 2.8395696573774034e-06,
"loss": 0.1258,
"step": 855
},
{
"epoch": 2.3356070941336973,
"grad_norm": 0.799321711063385,
"learning_rate": 2.8174094559886535e-06,
"loss": 0.1222,
"step": 856
},
{
"epoch": 2.338335607094134,
"grad_norm": 0.5677041411399841,
"learning_rate": 2.795321876331446e-06,
"loss": 0.1237,
"step": 857
},
{
"epoch": 2.34106412005457,
"grad_norm": 0.7222699522972107,
"learning_rate": 2.773307141728867e-06,
"loss": 0.1215,
"step": 858
},
{
"epoch": 2.3437926330150067,
"grad_norm": 0.7362144589424133,
"learning_rate": 2.751365474767479e-06,
"loss": 0.1243,
"step": 859
},
{
"epoch": 2.3465211459754434,
"grad_norm": 0.7244157791137695,
"learning_rate": 2.729497097295075e-06,
"loss": 0.1214,
"step": 860
},
{
"epoch": 2.34924965893588,
"grad_norm": 0.6958690881729126,
"learning_rate": 2.70770223041843e-06,
"loss": 0.1239,
"step": 861
},
{
"epoch": 2.3519781718963166,
"grad_norm": 0.6821141839027405,
"learning_rate": 2.6859810945010687e-06,
"loss": 0.1255,
"step": 862
},
{
"epoch": 2.354706684856753,
"grad_norm": 0.5896955132484436,
"learning_rate": 2.6643339091610376e-06,
"loss": 0.1184,
"step": 863
},
{
"epoch": 2.3574351978171895,
"grad_norm": 0.6613571047782898,
"learning_rate": 2.642760893268684e-06,
"loss": 0.1226,
"step": 864
},
{
"epoch": 2.360163710777626,
"grad_norm": 0.6776670813560486,
"learning_rate": 2.621262264944444e-06,
"loss": 0.1229,
"step": 865
},
{
"epoch": 2.3628922237380627,
"grad_norm": 0.6587377786636353,
"learning_rate": 2.5998382415566258e-06,
"loss": 0.1234,
"step": 866
},
{
"epoch": 2.3656207366984994,
"grad_norm": 0.7616446614265442,
"learning_rate": 2.5784890397192395e-06,
"loss": 0.1235,
"step": 867
},
{
"epoch": 2.368349249658936,
"grad_norm": 0.6983022093772888,
"learning_rate": 2.55721487528978e-06,
"loss": 0.1304,
"step": 868
},
{
"epoch": 2.3710777626193726,
"grad_norm": 0.5825099349021912,
"learning_rate": 2.5360159633670456e-06,
"loss": 0.1211,
"step": 869
},
{
"epoch": 2.373806275579809,
"grad_norm": 0.7340635657310486,
"learning_rate": 2.514892518288988e-06,
"loss": 0.123,
"step": 870
},
{
"epoch": 2.3765347885402455,
"grad_norm": 0.6723161339759827,
"learning_rate": 2.4938447536305243e-06,
"loss": 0.1257,
"step": 871
},
{
"epoch": 2.379263301500682,
"grad_norm": 0.6565150618553162,
"learning_rate": 2.4728728822013683e-06,
"loss": 0.1219,
"step": 872
},
{
"epoch": 2.3819918144611187,
"grad_norm": 0.609505295753479,
"learning_rate": 2.451977116043911e-06,
"loss": 0.1239,
"step": 873
},
{
"epoch": 2.3847203274215554,
"grad_norm": 0.6170854568481445,
"learning_rate": 2.431157666431052e-06,
"loss": 0.1265,
"step": 874
},
{
"epoch": 2.3874488403819916,
"grad_norm": 0.6223445534706116,
"learning_rate": 2.410414743864059e-06,
"loss": 0.1235,
"step": 875
},
{
"epoch": 2.390177353342428,
"grad_norm": 0.6010127663612366,
"learning_rate": 2.3897485580704684e-06,
"loss": 0.122,
"step": 876
},
{
"epoch": 2.392905866302865,
"grad_norm": 0.6026211380958557,
"learning_rate": 2.369159318001937e-06,
"loss": 0.1219,
"step": 877
},
{
"epoch": 2.3956343792633015,
"grad_norm": 0.6825677752494812,
"learning_rate": 2.348647231832131e-06,
"loss": 0.1213,
"step": 878
},
{
"epoch": 2.398362892223738,
"grad_norm": 0.6073306202888489,
"learning_rate": 2.3282125069546437e-06,
"loss": 0.1245,
"step": 879
},
{
"epoch": 2.4010914051841747,
"grad_norm": 0.8288139700889587,
"learning_rate": 2.30785534998088e-06,
"loss": 0.1229,
"step": 880
},
{
"epoch": 2.4038199181446114,
"grad_norm": 0.6051532626152039,
"learning_rate": 2.2875759667379616e-06,
"loss": 0.1225,
"step": 881
},
{
"epoch": 2.4065484311050476,
"grad_norm": 0.6381723284721375,
"learning_rate": 2.267374562266662e-06,
"loss": 0.1237,
"step": 882
},
{
"epoch": 2.409276944065484,
"grad_norm": 0.6321126222610474,
"learning_rate": 2.2472513408193385e-06,
"loss": 0.124,
"step": 883
},
{
"epoch": 2.412005457025921,
"grad_norm": 0.793342113494873,
"learning_rate": 2.227206505857834e-06,
"loss": 0.1217,
"step": 884
},
{
"epoch": 2.4147339699863575,
"grad_norm": 0.6574937105178833,
"learning_rate": 2.207240260051453e-06,
"loss": 0.1217,
"step": 885
},
{
"epoch": 2.417462482946794,
"grad_norm": 0.6402091979980469,
"learning_rate": 2.1873528052749094e-06,
"loss": 0.1197,
"step": 886
},
{
"epoch": 2.4201909959072307,
"grad_norm": 0.5669599771499634,
"learning_rate": 2.167544342606256e-06,
"loss": 0.1234,
"step": 887
},
{
"epoch": 2.422919508867667,
"grad_norm": 0.6423009037971497,
"learning_rate": 2.147815072324886e-06,
"loss": 0.122,
"step": 888
},
{
"epoch": 2.4256480218281036,
"grad_norm": 0.9944397807121277,
"learning_rate": 2.1281651939094996e-06,
"loss": 0.1221,
"step": 889
},
{
"epoch": 2.42837653478854,
"grad_norm": 0.7023611068725586,
"learning_rate": 2.1085949060360654e-06,
"loss": 0.1221,
"step": 890
},
{
"epoch": 2.431105047748977,
"grad_norm": 0.6616829037666321,
"learning_rate": 2.089104406575837e-06,
"loss": 0.121,
"step": 891
},
{
"epoch": 2.4338335607094135,
"grad_norm": 0.6649833917617798,
"learning_rate": 2.0696938925933505e-06,
"loss": 0.1244,
"step": 892
},
{
"epoch": 2.43656207366985,
"grad_norm": 0.9376471042633057,
"learning_rate": 2.0503635603444094e-06,
"loss": 0.1228,
"step": 893
},
{
"epoch": 2.4392905866302863,
"grad_norm": 0.6371198892593384,
"learning_rate": 2.0311136052741274e-06,
"loss": 0.1211,
"step": 894
},
{
"epoch": 2.442019099590723,
"grad_norm": 0.6163613796234131,
"learning_rate": 2.0119442220149356e-06,
"loss": 0.1236,
"step": 895
},
{
"epoch": 2.4447476125511596,
"grad_norm": 0.6523067951202393,
"learning_rate": 1.9928556043846215e-06,
"loss": 0.1244,
"step": 896
},
{
"epoch": 2.447476125511596,
"grad_norm": 0.7866846919059753,
"learning_rate": 1.9738479453843685e-06,
"loss": 0.1234,
"step": 897
},
{
"epoch": 2.450204638472033,
"grad_norm": 0.7902270555496216,
"learning_rate": 1.9549214371968008e-06,
"loss": 0.1235,
"step": 898
},
{
"epoch": 2.4529331514324695,
"grad_norm": 0.8805884718894958,
"learning_rate": 1.936076271184044e-06,
"loss": 0.1234,
"step": 899
},
{
"epoch": 2.4556616643929057,
"grad_norm": 0.6005100011825562,
"learning_rate": 1.917312637885791e-06,
"loss": 0.1221,
"step": 900
},
{
"epoch": 2.4583901773533423,
"grad_norm": 0.7518947720527649,
"learning_rate": 1.898630727017371e-06,
"loss": 0.1211,
"step": 901
},
{
"epoch": 2.461118690313779,
"grad_norm": 1.2557408809661865,
"learning_rate": 1.8800307274678364e-06,
"loss": 0.1203,
"step": 902
},
{
"epoch": 2.4638472032742156,
"grad_norm": 0.7856685519218445,
"learning_rate": 1.861512827298051e-06,
"loss": 0.1246,
"step": 903
},
{
"epoch": 2.466575716234652,
"grad_norm": 0.7623482346534729,
"learning_rate": 1.8430772137387853e-06,
"loss": 0.1231,
"step": 904
},
{
"epoch": 2.469304229195089,
"grad_norm": 0.6743024587631226,
"learning_rate": 1.8247240731888293e-06,
"loss": 0.1211,
"step": 905
},
{
"epoch": 2.472032742155525,
"grad_norm": 0.8061301112174988,
"learning_rate": 1.8064535912131032e-06,
"loss": 0.12,
"step": 906
},
{
"epoch": 2.4747612551159617,
"grad_norm": 0.8119410872459412,
"learning_rate": 1.7882659525407842e-06,
"loss": 0.1208,
"step": 907
},
{
"epoch": 2.4774897680763983,
"grad_norm": 0.6645972728729248,
"learning_rate": 1.7701613410634367e-06,
"loss": 0.1204,
"step": 908
},
{
"epoch": 2.480218281036835,
"grad_norm": 0.6861996650695801,
"learning_rate": 1.752139939833154e-06,
"loss": 0.1228,
"step": 909
},
{
"epoch": 2.4829467939972716,
"grad_norm": 0.6746291518211365,
"learning_rate": 1.7342019310607062e-06,
"loss": 0.1232,
"step": 910
},
{
"epoch": 2.485675306957708,
"grad_norm": 0.7837573289871216,
"learning_rate": 1.7163474961137029e-06,
"loss": 0.1226,
"step": 911
},
{
"epoch": 2.488403819918145,
"grad_norm": 0.7806301712989807,
"learning_rate": 1.6985768155147498e-06,
"loss": 0.1196,
"step": 912
},
{
"epoch": 2.491132332878581,
"grad_norm": 0.5961025953292847,
"learning_rate": 1.6808900689396334e-06,
"loss": 0.1207,
"step": 913
},
{
"epoch": 2.4938608458390177,
"grad_norm": 0.5994763374328613,
"learning_rate": 1.6632874352154982e-06,
"loss": 0.1221,
"step": 914
},
{
"epoch": 2.4965893587994543,
"grad_norm": 0.7007748484611511,
"learning_rate": 1.645769092319045e-06,
"loss": 0.1219,
"step": 915
},
{
"epoch": 2.499317871759891,
"grad_norm": 0.6721235513687134,
"learning_rate": 1.6283352173747148e-06,
"loss": 0.1207,
"step": 916
},
{
"epoch": 2.5020463847203276,
"grad_norm": 0.648476779460907,
"learning_rate": 1.6109859866529253e-06,
"loss": 0.1216,
"step": 917
},
{
"epoch": 2.504774897680764,
"grad_norm": 0.6167169213294983,
"learning_rate": 1.5937215755682667e-06,
"loss": 0.1221,
"step": 918
},
{
"epoch": 2.5075034106412004,
"grad_norm": 0.6264228820800781,
"learning_rate": 1.5765421586777285e-06,
"loss": 0.1197,
"step": 919
},
{
"epoch": 2.510231923601637,
"grad_norm": 0.6109753847122192,
"learning_rate": 1.559447909678954e-06,
"loss": 0.1212,
"step": 920
},
{
"epoch": 2.5129604365620737,
"grad_norm": 0.7094171643257141,
"learning_rate": 1.5424390014084644e-06,
"loss": 0.1216,
"step": 921
},
{
"epoch": 2.5156889495225103,
"grad_norm": 0.9089038968086243,
"learning_rate": 1.5255156058399124e-06,
"loss": 0.1206,
"step": 922
},
{
"epoch": 2.518417462482947,
"grad_norm": 0.6656555533409119,
"learning_rate": 1.5086778940823544e-06,
"loss": 0.1211,
"step": 923
},
{
"epoch": 2.5211459754433836,
"grad_norm": 0.6644884943962097,
"learning_rate": 1.4919260363785215e-06,
"loss": 0.1239,
"step": 924
},
{
"epoch": 2.52387448840382,
"grad_norm": 0.5716986060142517,
"learning_rate": 1.4752602021030794e-06,
"loss": 0.1204,
"step": 925
},
{
"epoch": 2.5266030013642564,
"grad_norm": 0.9487411975860596,
"learning_rate": 1.4586805597609333e-06,
"loss": 0.1214,
"step": 926
},
{
"epoch": 2.529331514324693,
"grad_norm": 0.8672340512275696,
"learning_rate": 1.4421872769855262e-06,
"loss": 0.1228,
"step": 927
},
{
"epoch": 2.5320600272851297,
"grad_norm": 0.6485180854797363,
"learning_rate": 1.4257805205371233e-06,
"loss": 0.123,
"step": 928
},
{
"epoch": 2.5347885402455663,
"grad_norm": 0.593885600566864,
"learning_rate": 1.409460456301147e-06,
"loss": 0.1214,
"step": 929
},
{
"epoch": 2.5375170532060025,
"grad_norm": 0.5803414583206177,
"learning_rate": 1.3932272492864984e-06,
"loss": 0.1229,
"step": 930
},
{
"epoch": 2.540245566166439,
"grad_norm": 0.6213256120681763,
"learning_rate": 1.3770810636238685e-06,
"loss": 0.1199,
"step": 931
},
{
"epoch": 2.542974079126876,
"grad_norm": 0.8107399344444275,
"learning_rate": 1.3610220625641002e-06,
"loss": 0.1205,
"step": 932
},
{
"epoch": 2.5457025920873124,
"grad_norm": 0.5576758980751038,
"learning_rate": 1.3450504084765381e-06,
"loss": 0.122,
"step": 933
},
{
"epoch": 2.548431105047749,
"grad_norm": 0.6477549076080322,
"learning_rate": 1.3291662628473634e-06,
"loss": 0.1225,
"step": 934
},
{
"epoch": 2.5511596180081857,
"grad_norm": 0.5818179249763489,
"learning_rate": 1.313369786277987e-06,
"loss": 0.1191,
"step": 935
},
{
"epoch": 2.5538881309686223,
"grad_norm": 0.7482567429542542,
"learning_rate": 1.2976611384834148e-06,
"loss": 0.1205,
"step": 936
},
{
"epoch": 2.5566166439290585,
"grad_norm": 0.6153425574302673,
"learning_rate": 1.2820404782906315e-06,
"loss": 0.1218,
"step": 937
},
{
"epoch": 2.559345156889495,
"grad_norm": 0.6472384333610535,
"learning_rate": 1.266507963636997e-06,
"loss": 0.1216,
"step": 938
},
{
"epoch": 2.562073669849932,
"grad_norm": 0.6243811845779419,
"learning_rate": 1.2510637515686497e-06,
"loss": 0.1187,
"step": 939
},
{
"epoch": 2.5648021828103684,
"grad_norm": 0.643061637878418,
"learning_rate": 1.2357079982389197e-06,
"loss": 0.1197,
"step": 940
},
{
"epoch": 2.567530695770805,
"grad_norm": 0.592107892036438,
"learning_rate": 1.2204408589067462e-06,
"loss": 0.1231,
"step": 941
},
{
"epoch": 2.5702592087312413,
"grad_norm": 0.5936471819877625,
"learning_rate": 1.2052624879351105e-06,
"loss": 0.1199,
"step": 942
},
{
"epoch": 2.572987721691678,
"grad_norm": 0.6120553612709045,
"learning_rate": 1.190173038789476e-06,
"loss": 0.1229,
"step": 943
},
{
"epoch": 2.5757162346521145,
"grad_norm": 0.5819773077964783,
"learning_rate": 1.175172664036235e-06,
"loss": 0.1205,
"step": 944
},
{
"epoch": 2.578444747612551,
"grad_norm": 0.5850197076797485,
"learning_rate": 1.1602615153411666e-06,
"loss": 0.1188,
"step": 945
},
{
"epoch": 2.581173260572988,
"grad_norm": 0.5791446566581726,
"learning_rate": 1.1454397434679022e-06,
"loss": 0.1202,
"step": 946
},
{
"epoch": 2.5839017735334244,
"grad_norm": 0.6168528199195862,
"learning_rate": 1.1307074982764022e-06,
"loss": 0.1233,
"step": 947
},
{
"epoch": 2.586630286493861,
"grad_norm": 0.6764042973518372,
"learning_rate": 1.116064928721442e-06,
"loss": 0.1218,
"step": 948
},
{
"epoch": 2.5893587994542973,
"grad_norm": 0.6462754011154175,
"learning_rate": 1.1015121828511033e-06,
"loss": 0.1189,
"step": 949
},
{
"epoch": 2.592087312414734,
"grad_norm": 0.557019054889679,
"learning_rate": 1.0870494078052796e-06,
"loss": 0.1194,
"step": 950
},
{
"epoch": 2.5948158253751705,
"grad_norm": 0.6844297647476196,
"learning_rate": 1.0726767498141877e-06,
"loss": 0.1252,
"step": 951
},
{
"epoch": 2.597544338335607,
"grad_norm": 0.6181948781013489,
"learning_rate": 1.0583943541968856e-06,
"loss": 0.1204,
"step": 952
},
{
"epoch": 2.600272851296044,
"grad_norm": 0.6468029022216797,
"learning_rate": 1.044202365359811e-06,
"loss": 0.1213,
"step": 953
},
{
"epoch": 2.60300136425648,
"grad_norm": 0.607456386089325,
"learning_rate": 1.0301009267953145e-06,
"loss": 0.119,
"step": 954
},
{
"epoch": 2.6057298772169166,
"grad_norm": 0.6066553592681885,
"learning_rate": 1.0160901810802114e-06,
"loss": 0.1223,
"step": 955
},
{
"epoch": 2.6084583901773533,
"grad_norm": 0.6003243923187256,
"learning_rate": 1.0021702698743408e-06,
"loss": 0.1239,
"step": 956
},
{
"epoch": 2.61118690313779,
"grad_norm": 0.5733383297920227,
"learning_rate": 9.883413339191295e-07,
"loss": 0.1206,
"step": 957
},
{
"epoch": 2.6139154160982265,
"grad_norm": 0.6340444087982178,
"learning_rate": 9.746035130361741e-07,
"loss": 0.1162,
"step": 958
},
{
"epoch": 2.616643929058663,
"grad_norm": 0.5557238459587097,
"learning_rate": 9.609569461258262e-07,
"loss": 0.1197,
"step": 959
},
{
"epoch": 2.6193724420191,
"grad_norm": 0.590971827507019,
"learning_rate": 9.474017711657835e-07,
"loss": 0.1203,
"step": 960
},
{
"epoch": 2.622100954979536,
"grad_norm": 0.6239796280860901,
"learning_rate": 9.339381252097001e-07,
"loss": 0.1248,
"step": 961
},
{
"epoch": 2.6248294679399726,
"grad_norm": 0.6050336360931396,
"learning_rate": 9.205661443857994e-07,
"loss": 0.1213,
"step": 962
},
{
"epoch": 2.6275579809004093,
"grad_norm": 0.564030647277832,
"learning_rate": 9.072859638954956e-07,
"loss": 0.1189,
"step": 963
},
{
"epoch": 2.630286493860846,
"grad_norm": 0.5844472646713257,
"learning_rate": 8.940977180120247e-07,
"loss": 0.121,
"step": 964
},
{
"epoch": 2.6330150068212825,
"grad_norm": 0.6036733984947205,
"learning_rate": 8.810015400790994e-07,
"loss": 0.1203,
"step": 965
},
{
"epoch": 2.6357435197817187,
"grad_norm": 0.619272768497467,
"learning_rate": 8.67997562509546e-07,
"loss": 0.12,
"step": 966
},
{
"epoch": 2.6384720327421554,
"grad_norm": 0.5766704678535461,
"learning_rate": 8.550859167839665e-07,
"loss": 0.1192,
"step": 967
},
{
"epoch": 2.641200545702592,
"grad_norm": 0.6223350763320923,
"learning_rate": 8.42266733449425e-07,
"loss": 0.1218,
"step": 968
},
{
"epoch": 2.6439290586630286,
"grad_norm": 0.569657564163208,
"learning_rate": 8.295401421181126e-07,
"loss": 0.1239,
"step": 969
},
{
"epoch": 2.6466575716234653,
"grad_norm": 0.6090306639671326,
"learning_rate": 8.169062714660347e-07,
"loss": 0.1213,
"step": 970
},
{
"epoch": 2.649386084583902,
"grad_norm": 0.6015664935112,
"learning_rate": 8.043652492317256e-07,
"loss": 0.122,
"step": 971
},
{
"epoch": 2.6521145975443385,
"grad_norm": 0.5519795417785645,
"learning_rate": 7.919172022149458e-07,
"loss": 0.1204,
"step": 972
},
{
"epoch": 2.6548431105047747,
"grad_norm": 0.6051272749900818,
"learning_rate": 7.795622562753957e-07,
"loss": 0.1175,
"step": 973
},
{
"epoch": 2.6575716234652114,
"grad_norm": 0.5712152123451233,
"learning_rate": 7.673005363314578e-07,
"loss": 0.1234,
"step": 974
},
{
"epoch": 2.660300136425648,
"grad_norm": 0.5520964860916138,
"learning_rate": 7.551321663589229e-07,
"loss": 0.1198,
"step": 975
},
{
"epoch": 2.6630286493860846,
"grad_norm": 0.711789608001709,
"learning_rate": 7.430572693897342e-07,
"loss": 0.1188,
"step": 976
},
{
"epoch": 2.6657571623465213,
"grad_norm": 0.6316415667533875,
"learning_rate": 7.310759675107515e-07,
"loss": 0.1197,
"step": 977
},
{
"epoch": 2.6684856753069575,
"grad_norm": 0.6260297298431396,
"learning_rate": 7.19188381862519e-07,
"loss": 0.1195,
"step": 978
},
{
"epoch": 2.6712141882673945,
"grad_norm": 0.5559263229370117,
"learning_rate": 7.073946326380243e-07,
"loss": 0.1211,
"step": 979
},
{
"epoch": 2.6739427012278307,
"grad_norm": 0.5541836619377136,
"learning_rate": 6.956948390814977e-07,
"loss": 0.1206,
"step": 980
},
{
"epoch": 2.6766712141882674,
"grad_norm": 0.5544542074203491,
"learning_rate": 6.840891194872112e-07,
"loss": 0.1212,
"step": 981
},
{
"epoch": 2.679399727148704,
"grad_norm": 0.5722479224205017,
"learning_rate": 6.725775911982602e-07,
"loss": 0.1162,
"step": 982
},
{
"epoch": 2.6821282401091406,
"grad_norm": 0.58185213804245,
"learning_rate": 6.61160370605397e-07,
"loss": 0.1204,
"step": 983
},
{
"epoch": 2.6848567530695773,
"grad_norm": 0.5623180270195007,
"learning_rate": 6.498375731458529e-07,
"loss": 0.1197,
"step": 984
},
{
"epoch": 2.6875852660300135,
"grad_norm": 0.7158675193786621,
"learning_rate": 6.386093133021554e-07,
"loss": 0.12,
"step": 985
},
{
"epoch": 2.69031377899045,
"grad_norm": 0.5737314224243164,
"learning_rate": 6.274757046009871e-07,
"loss": 0.12,
"step": 986
},
{
"epoch": 2.6930422919508867,
"grad_norm": 0.5828775763511658,
"learning_rate": 6.164368596120351e-07,
"loss": 0.1187,
"step": 987
},
{
"epoch": 2.6957708049113234,
"grad_norm": 0.6204085946083069,
"learning_rate": 6.054928899468427e-07,
"loss": 0.1183,
"step": 988
},
{
"epoch": 2.69849931787176,
"grad_norm": 0.6081199645996094,
"learning_rate": 5.946439062576903e-07,
"loss": 0.1198,
"step": 989
},
{
"epoch": 2.701227830832196,
"grad_norm": 0.5731498599052429,
"learning_rate": 5.83890018236476e-07,
"loss": 0.1194,
"step": 990
},
{
"epoch": 2.7039563437926333,
"grad_norm": 0.5686942934989929,
"learning_rate": 5.732313346136032e-07,
"loss": 0.1209,
"step": 991
},
{
"epoch": 2.7066848567530695,
"grad_norm": 0.7291159629821777,
"learning_rate": 5.626679631568832e-07,
"loss": 0.1197,
"step": 992
},
{
"epoch": 2.709413369713506,
"grad_norm": 0.609981119632721,
"learning_rate": 5.52200010670444e-07,
"loss": 0.1171,
"step": 993
},
{
"epoch": 2.7121418826739427,
"grad_norm": 0.5696244239807129,
"learning_rate": 5.418275829936537e-07,
"loss": 0.1223,
"step": 994
},
{
"epoch": 2.7148703956343794,
"grad_norm": 0.5775014162063599,
"learning_rate": 5.315507850000456e-07,
"loss": 0.1208,
"step": 995
},
{
"epoch": 2.717598908594816,
"grad_norm": 0.5481632947921753,
"learning_rate": 5.213697205962631e-07,
"loss": 0.1217,
"step": 996
},
{
"epoch": 2.720327421555252,
"grad_norm": 0.6667028665542603,
"learning_rate": 5.112844927210048e-07,
"loss": 0.1161,
"step": 997
},
{
"epoch": 2.723055934515689,
"grad_norm": 0.5523772835731506,
"learning_rate": 5.012952033439844e-07,
"loss": 0.118,
"step": 998
},
{
"epoch": 2.7257844474761255,
"grad_norm": 0.5393189787864685,
"learning_rate": 4.914019534649039e-07,
"loss": 0.1234,
"step": 999
},
{
"epoch": 2.728512960436562,
"grad_norm": 0.5639728903770447,
"learning_rate": 4.816048431124265e-07,
"loss": 0.1194,
"step": 1000
},
{
"epoch": 2.7312414733969987,
"grad_norm": 0.6052538156509399,
"learning_rate": 4.7190397134316946e-07,
"loss": 0.122,
"step": 1001
},
{
"epoch": 2.733969986357435,
"grad_norm": 0.5383861660957336,
"learning_rate": 4.6229943624069963e-07,
"loss": 0.1183,
"step": 1002
},
{
"epoch": 2.736698499317872,
"grad_norm": 0.5372768640518188,
"learning_rate": 4.5279133491454406e-07,
"loss": 0.1174,
"step": 1003
},
{
"epoch": 2.739427012278308,
"grad_norm": 0.5531887412071228,
"learning_rate": 4.4337976349920763e-07,
"loss": 0.1198,
"step": 1004
},
{
"epoch": 2.742155525238745,
"grad_norm": 0.622898519039154,
"learning_rate": 4.3406481715319916e-07,
"loss": 0.1183,
"step": 1005
},
{
"epoch": 2.7448840381991815,
"grad_norm": 0.5865097045898438,
"learning_rate": 4.248465900580734e-07,
"loss": 0.1235,
"step": 1006
},
{
"epoch": 2.747612551159618,
"grad_norm": 0.5709042549133301,
"learning_rate": 4.1572517541747294e-07,
"loss": 0.1184,
"step": 1007
},
{
"epoch": 2.7503410641200547,
"grad_norm": 0.5316457152366638,
"learning_rate": 4.0670066545619224e-07,
"loss": 0.1208,
"step": 1008
},
{
"epoch": 2.753069577080491,
"grad_norm": 0.5730975866317749,
"learning_rate": 3.9777315141923847e-07,
"loss": 0.1203,
"step": 1009
},
{
"epoch": 2.7557980900409276,
"grad_norm": 0.5913178324699402,
"learning_rate": 3.889427235709153e-07,
"loss": 0.1166,
"step": 1010
},
{
"epoch": 2.758526603001364,
"grad_norm": 0.5924515724182129,
"learning_rate": 3.802094711939075e-07,
"loss": 0.1225,
"step": 1011
},
{
"epoch": 2.761255115961801,
"grad_norm": 0.5406906604766846,
"learning_rate": 3.715734825883766e-07,
"loss": 0.1203,
"step": 1012
},
{
"epoch": 2.7639836289222375,
"grad_norm": 0.5417434573173523,
"learning_rate": 3.6303484507106965e-07,
"loss": 0.1225,
"step": 1013
},
{
"epoch": 2.7667121418826737,
"grad_norm": 0.5310923457145691,
"learning_rate": 3.5459364497443696e-07,
"loss": 0.1223,
"step": 1014
},
{
"epoch": 2.7694406548431107,
"grad_norm": 0.60945725440979,
"learning_rate": 3.462499676457598e-07,
"loss": 0.1172,
"step": 1015
},
{
"epoch": 2.772169167803547,
"grad_norm": 0.5810585021972656,
"learning_rate": 3.38003897446284e-07,
"loss": 0.1197,
"step": 1016
},
{
"epoch": 2.7748976807639836,
"grad_norm": 0.5767588019371033,
"learning_rate": 3.298555177503726e-07,
"loss": 0.1206,
"step": 1017
},
{
"epoch": 2.77762619372442,
"grad_norm": 0.5493221282958984,
"learning_rate": 3.2180491094465414e-07,
"loss": 0.1209,
"step": 1018
},
{
"epoch": 2.780354706684857,
"grad_norm": 0.5434983968734741,
"learning_rate": 3.138521584272003e-07,
"loss": 0.1233,
"step": 1019
},
{
"epoch": 2.7830832196452935,
"grad_norm": 0.6788725256919861,
"learning_rate": 3.059973406066963e-07,
"loss": 0.121,
"step": 1020
},
{
"epoch": 2.7858117326057297,
"grad_norm": 0.5693413615226746,
"learning_rate": 2.982405369016272e-07,
"loss": 0.1204,
"step": 1021
},
{
"epoch": 2.7885402455661663,
"grad_norm": 0.5602209568023682,
"learning_rate": 2.905818257394799e-07,
"loss": 0.1185,
"step": 1022
},
{
"epoch": 2.791268758526603,
"grad_norm": 0.5665656328201294,
"learning_rate": 2.830212845559466e-07,
"loss": 0.1225,
"step": 1023
},
{
"epoch": 2.7939972714870396,
"grad_norm": 0.5789375305175781,
"learning_rate": 2.7555898979413796e-07,
"loss": 0.1199,
"step": 1024
},
{
"epoch": 2.796725784447476,
"grad_norm": 0.5932232141494751,
"learning_rate": 2.6819501690382275e-07,
"loss": 0.1193,
"step": 1025
},
{
"epoch": 2.799454297407913,
"grad_norm": 0.6497074961662292,
"learning_rate": 2.609294403406537e-07,
"loss": 0.1217,
"step": 1026
},
{
"epoch": 2.8021828103683495,
"grad_norm": 0.5614838004112244,
"learning_rate": 2.537623335654127e-07,
"loss": 0.1187,
"step": 1027
},
{
"epoch": 2.8049113233287857,
"grad_norm": 0.5890393257141113,
"learning_rate": 2.4669376904328244e-07,
"loss": 0.1208,
"step": 1028
},
{
"epoch": 2.8076398362892223,
"grad_norm": 0.5420589447021484,
"learning_rate": 2.397238182430994e-07,
"loss": 0.1171,
"step": 1029
},
{
"epoch": 2.810368349249659,
"grad_norm": 0.5810146331787109,
"learning_rate": 2.3285255163663535e-07,
"loss": 0.1207,
"step": 1030
},
{
"epoch": 2.8130968622100956,
"grad_norm": 0.6615646481513977,
"learning_rate": 2.2608003869788786e-07,
"loss": 0.1209,
"step": 1031
},
{
"epoch": 2.815825375170532,
"grad_norm": 0.5754259824752808,
"learning_rate": 2.1940634790238003e-07,
"loss": 0.1208,
"step": 1032
},
{
"epoch": 2.8185538881309684,
"grad_norm": 0.642939567565918,
"learning_rate": 2.1283154672645522e-07,
"loss": 0.1205,
"step": 1033
},
{
"epoch": 2.821282401091405,
"grad_norm": 0.5582534074783325,
"learning_rate": 2.063557016466111e-07,
"loss": 0.1188,
"step": 1034
},
{
"epoch": 2.8240109140518417,
"grad_norm": 0.5194653272628784,
"learning_rate": 1.999788781388201e-07,
"loss": 0.1202,
"step": 1035
},
{
"epoch": 2.8267394270122783,
"grad_norm": 0.5354530215263367,
"learning_rate": 1.9370114067785995e-07,
"loss": 0.1196,
"step": 1036
},
{
"epoch": 2.829467939972715,
"grad_norm": 0.5607388615608215,
"learning_rate": 1.8752255273667752e-07,
"loss": 0.1166,
"step": 1037
},
{
"epoch": 2.8321964529331516,
"grad_norm": 0.5641687512397766,
"learning_rate": 1.8144317678573497e-07,
"loss": 0.1181,
"step": 1038
},
{
"epoch": 2.8349249658935882,
"grad_norm": 0.5822216272354126,
"learning_rate": 1.7546307429238129e-07,
"loss": 0.1195,
"step": 1039
},
{
"epoch": 2.8376534788540244,
"grad_norm": 0.547010600566864,
"learning_rate": 1.6958230572023504e-07,
"loss": 0.1192,
"step": 1040
},
{
"epoch": 2.840381991814461,
"grad_norm": 0.6075326204299927,
"learning_rate": 1.6380093052856482e-07,
"loss": 0.1186,
"step": 1041
},
{
"epoch": 2.8431105047748977,
"grad_norm": 0.5427007675170898,
"learning_rate": 1.5811900717169537e-07,
"loss": 0.122,
"step": 1042
},
{
"epoch": 2.8458390177353343,
"grad_norm": 0.5267951488494873,
"learning_rate": 1.5253659309841463e-07,
"loss": 0.1196,
"step": 1043
},
{
"epoch": 2.848567530695771,
"grad_norm": 0.5560564398765564,
"learning_rate": 1.4705374475138978e-07,
"loss": 0.121,
"step": 1044
},
{
"epoch": 2.851296043656207,
"grad_norm": 0.7646809220314026,
"learning_rate": 1.416705175666e-07,
"loss": 0.12,
"step": 1045
},
{
"epoch": 2.854024556616644,
"grad_norm": 0.5174649953842163,
"learning_rate": 1.3638696597277678e-07,
"loss": 0.1195,
"step": 1046
},
{
"epoch": 2.8567530695770804,
"grad_norm": 0.549839198589325,
"learning_rate": 1.3120314339084782e-07,
"loss": 0.1179,
"step": 1047
},
{
"epoch": 2.859481582537517,
"grad_norm": 0.6386695504188538,
"learning_rate": 1.2611910223340408e-07,
"loss": 0.1194,
"step": 1048
},
{
"epoch": 2.8622100954979537,
"grad_norm": 0.5545169115066528,
"learning_rate": 1.2113489390416565e-07,
"loss": 0.1204,
"step": 1049
},
{
"epoch": 2.8649386084583903,
"grad_norm": 0.5403316617012024,
"learning_rate": 1.1625056879746133e-07,
"loss": 0.1181,
"step": 1050
},
{
"epoch": 2.867667121418827,
"grad_norm": 0.6179401278495789,
"learning_rate": 1.1146617629772316e-07,
"loss": 0.1206,
"step": 1051
},
{
"epoch": 2.870395634379263,
"grad_norm": 0.5648332834243774,
"learning_rate": 1.0678176477898372e-07,
"loss": 0.1189,
"step": 1052
},
{
"epoch": 2.8731241473397,
"grad_norm": 0.6361631751060486,
"learning_rate": 1.0219738160438753e-07,
"loss": 0.1212,
"step": 1053
},
{
"epoch": 2.8758526603001364,
"grad_norm": 0.5742694735527039,
"learning_rate": 9.771307312571254e-08,
"loss": 0.1185,
"step": 1054
},
{
"epoch": 2.878581173260573,
"grad_norm": 0.5447854995727539,
"learning_rate": 9.332888468290168e-08,
"loss": 0.1192,
"step": 1055
},
{
"epoch": 2.8813096862210097,
"grad_norm": 0.5452158451080322,
"learning_rate": 8.90448606036054e-08,
"loss": 0.1182,
"step": 1056
},
{
"epoch": 2.884038199181446,
"grad_norm": 0.5986908674240112,
"learning_rate": 8.486104420272979e-08,
"loss": 0.1213,
"step": 1057
},
{
"epoch": 2.8867667121418825,
"grad_norm": 0.5244796276092529,
"learning_rate": 8.077747778200474e-08,
"loss": 0.1218,
"step": 1058
},
{
"epoch": 2.889495225102319,
"grad_norm": 0.5394319295883179,
"learning_rate": 7.679420262954984e-08,
"loss": 0.1195,
"step": 1059
},
{
"epoch": 2.892223738062756,
"grad_norm": 0.6105839014053345,
"learning_rate": 7.291125901946027e-08,
"loss": 0.1209,
"step": 1060
},
{
"epoch": 2.8949522510231924,
"grad_norm": 0.5574126839637756,
"learning_rate": 6.912868621140045e-08,
"loss": 0.1206,
"step": 1061
},
{
"epoch": 2.897680763983629,
"grad_norm": 0.5388451218605042,
"learning_rate": 6.544652245020433e-08,
"loss": 0.1211,
"step": 1062
},
{
"epoch": 2.9004092769440657,
"grad_norm": 0.5984753370285034,
"learning_rate": 6.18648049654913e-08,
"loss": 0.1201,
"step": 1063
},
{
"epoch": 2.903137789904502,
"grad_norm": 0.5538267493247986,
"learning_rate": 5.838356997128869e-08,
"loss": 0.1198,
"step": 1064
},
{
"epoch": 2.9058663028649385,
"grad_norm": 0.6062244772911072,
"learning_rate": 5.500285266566319e-08,
"loss": 0.1192,
"step": 1065
},
{
"epoch": 2.908594815825375,
"grad_norm": 0.5678040385246277,
"learning_rate": 5.1722687230369995e-08,
"loss": 0.1214,
"step": 1066
},
{
"epoch": 2.911323328785812,
"grad_norm": 0.5567272901535034,
"learning_rate": 4.854310683050312e-08,
"loss": 0.1152,
"step": 1067
},
{
"epoch": 2.9140518417462484,
"grad_norm": 0.551325798034668,
"learning_rate": 4.5464143614162294e-08,
"loss": 0.1211,
"step": 1068
},
{
"epoch": 2.9167803547066846,
"grad_norm": 0.5160740613937378,
"learning_rate": 4.2485828712126584e-08,
"loss": 0.1174,
"step": 1069
},
{
"epoch": 2.9195088676671213,
"grad_norm": 0.5358514189720154,
"learning_rate": 3.96081922375402e-08,
"loss": 0.1187,
"step": 1070
},
{
"epoch": 2.922237380627558,
"grad_norm": 0.5362658500671387,
"learning_rate": 3.683126328560826e-08,
"loss": 0.1174,
"step": 1071
},
{
"epoch": 2.9249658935879945,
"grad_norm": 0.6440585851669312,
"learning_rate": 3.4155069933301535e-08,
"loss": 0.1143,
"step": 1072
},
{
"epoch": 2.927694406548431,
"grad_norm": 0.5408025979995728,
"learning_rate": 3.1579639239074364e-08,
"loss": 0.119,
"step": 1073
},
{
"epoch": 2.930422919508868,
"grad_norm": 0.5744211673736572,
"learning_rate": 2.9104997242590528e-08,
"loss": 0.1212,
"step": 1074
},
{
"epoch": 2.9331514324693044,
"grad_norm": 0.5802960991859436,
"learning_rate": 2.673116896445671e-08,
"loss": 0.1225,
"step": 1075
},
{
"epoch": 2.9358799454297406,
"grad_norm": 0.6244254112243652,
"learning_rate": 2.4458178405974974e-08,
"loss": 0.121,
"step": 1076
},
{
"epoch": 2.9386084583901773,
"grad_norm": 0.5513128638267517,
"learning_rate": 2.2286048548897378e-08,
"loss": 0.121,
"step": 1077
},
{
"epoch": 2.941336971350614,
"grad_norm": 0.5372868776321411,
"learning_rate": 2.0214801355192826e-08,
"loss": 0.1208,
"step": 1078
},
{
"epoch": 2.9440654843110505,
"grad_norm": 0.550727903842926,
"learning_rate": 1.824445776682504e-08,
"loss": 0.1193,
"step": 1079
},
{
"epoch": 2.946793997271487,
"grad_norm": 0.5490522384643555,
"learning_rate": 1.6375037705543827e-08,
"loss": 0.1197,
"step": 1080
},
{
"epoch": 2.9495225102319234,
"grad_norm": 0.5467248558998108,
"learning_rate": 1.4606560072679687e-08,
"loss": 0.1187,
"step": 1081
},
{
"epoch": 2.9522510231923604,
"grad_norm": 0.5518107414245605,
"learning_rate": 1.2939042748955078e-08,
"loss": 0.1192,
"step": 1082
},
{
"epoch": 2.9549795361527966,
"grad_norm": 0.5449725985527039,
"learning_rate": 1.1372502594303448e-08,
"loss": 0.1187,
"step": 1083
},
{
"epoch": 2.9577080491132333,
"grad_norm": 0.546035885810852,
"learning_rate": 9.906955447697153e-09,
"loss": 0.1199,
"step": 1084
},
{
"epoch": 2.96043656207367,
"grad_norm": 0.6331919431686401,
"learning_rate": 8.542416126989805e-09,
"loss": 0.116,
"step": 1085
},
{
"epoch": 2.9631650750341065,
"grad_norm": 0.5184534192085266,
"learning_rate": 7.278898428764169e-09,
"loss": 0.1202,
"step": 1086
},
{
"epoch": 2.965893587994543,
"grad_norm": 0.5950097441673279,
"learning_rate": 6.1164151281944974e-09,
"loss": 0.1163,
"step": 1087
},
{
"epoch": 2.9686221009549794,
"grad_norm": 0.5502780675888062,
"learning_rate": 5.054977978916631e-09,
"loss": 0.1175,
"step": 1088
},
{
"epoch": 2.971350613915416,
"grad_norm": 0.5208196640014648,
"learning_rate": 4.094597712908099e-09,
"loss": 0.1206,
"step": 1089
},
{
"epoch": 2.9740791268758526,
"grad_norm": 0.5595605969429016,
"learning_rate": 3.2352840403804264e-09,
"loss": 0.1185,
"step": 1090
},
{
"epoch": 2.9768076398362893,
"grad_norm": 0.6191928386688232,
"learning_rate": 2.477045649681431e-09,
"loss": 0.1181,
"step": 1091
},
{
"epoch": 2.979536152796726,
"grad_norm": 0.5491606593132019,
"learning_rate": 1.8198902072097402e-09,
"loss": 0.1194,
"step": 1092
},
{
"epoch": 2.982264665757162,
"grad_norm": 0.5842803120613098,
"learning_rate": 1.2638243573293019e-09,
"loss": 0.1201,
"step": 1093
},
{
"epoch": 2.984993178717599,
"grad_norm": 0.5514943599700928,
"learning_rate": 8.088537223116533e-10,
"loss": 0.117,
"step": 1094
},
{
"epoch": 2.9877216916780354,
"grad_norm": 0.603424072265625,
"learning_rate": 4.549829022748586e-10,
"loss": 0.1182,
"step": 1095
},
{
"epoch": 2.990450204638472,
"grad_norm": 0.59501051902771,
"learning_rate": 2.02215475132439e-10,
"loss": 0.1163,
"step": 1096
},
{
"epoch": 2.9931787175989086,
"grad_norm": 0.6325620412826538,
"learning_rate": 5.0553996568947216e-11,
"loss": 0.1229,
"step": 1097
},
{
"epoch": 2.9959072305593453,
"grad_norm": 0.536891520023346,
"learning_rate": 0.0,
"loss": 0.1192,
"step": 1098
},
{
"epoch": 2.9959072305593453,
"step": 1098,
"total_flos": 1.338644246774258e+19,
"train_loss": 0.3336014450614244,
"train_runtime": 19884.0933,
"train_samples_per_second": 7.076,
"train_steps_per_second": 0.055
}
],
"logging_steps": 1,
"max_steps": 1098,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.338644246774258e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}