baseline-gemma-2-9b-it-sft / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9959072305593453,
"eval_steps": 500,
"global_step": 1098,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002728512960436562,
"grad_norm": 13.13153076171875,
"learning_rate": 1.8181818181818183e-07,
"loss": 1.0682,
"step": 1
},
{
"epoch": 0.005457025920873124,
"grad_norm": 12.502574920654297,
"learning_rate": 3.6363636363636366e-07,
"loss": 1.0664,
"step": 2
},
{
"epoch": 0.008185538881309686,
"grad_norm": 12.2980318069458,
"learning_rate": 5.454545454545455e-07,
"loss": 1.0515,
"step": 3
},
{
"epoch": 0.010914051841746248,
"grad_norm": 12.096355438232422,
"learning_rate": 7.272727272727273e-07,
"loss": 1.046,
"step": 4
},
{
"epoch": 0.013642564802182811,
"grad_norm": 11.493118286132812,
"learning_rate": 9.090909090909091e-07,
"loss": 1.0358,
"step": 5
},
{
"epoch": 0.01637107776261937,
"grad_norm": 11.29008960723877,
"learning_rate": 1.090909090909091e-06,
"loss": 1.0308,
"step": 6
},
{
"epoch": 0.019099590723055934,
"grad_norm": 8.374974250793457,
"learning_rate": 1.2727272727272728e-06,
"loss": 0.9522,
"step": 7
},
{
"epoch": 0.021828103683492497,
"grad_norm": 6.757812976837158,
"learning_rate": 1.4545454545454546e-06,
"loss": 0.9036,
"step": 8
},
{
"epoch": 0.02455661664392906,
"grad_norm": 4.820138931274414,
"learning_rate": 1.6363636363636365e-06,
"loss": 0.8463,
"step": 9
},
{
"epoch": 0.027285129604365622,
"grad_norm": 4.44769811630249,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.8134,
"step": 10
},
{
"epoch": 0.030013642564802184,
"grad_norm": 3.9749584197998047,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7435,
"step": 11
},
{
"epoch": 0.03274215552523874,
"grad_norm": 3.9931881427764893,
"learning_rate": 2.181818181818182e-06,
"loss": 0.7355,
"step": 12
},
{
"epoch": 0.03547066848567531,
"grad_norm": 3.2150256633758545,
"learning_rate": 2.363636363636364e-06,
"loss": 0.7128,
"step": 13
},
{
"epoch": 0.03819918144611187,
"grad_norm": 2.37015962600708,
"learning_rate": 2.5454545454545456e-06,
"loss": 0.6894,
"step": 14
},
{
"epoch": 0.040927694406548434,
"grad_norm": 1.3267147541046143,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.6372,
"step": 15
},
{
"epoch": 0.04365620736698499,
"grad_norm": 1.1746413707733154,
"learning_rate": 2.9090909090909093e-06,
"loss": 0.6209,
"step": 16
},
{
"epoch": 0.04638472032742155,
"grad_norm": 1.107882022857666,
"learning_rate": 3.090909090909091e-06,
"loss": 0.6018,
"step": 17
},
{
"epoch": 0.04911323328785812,
"grad_norm": 1.0003585815429688,
"learning_rate": 3.272727272727273e-06,
"loss": 0.5872,
"step": 18
},
{
"epoch": 0.05184174624829468,
"grad_norm": 1.0367988348007202,
"learning_rate": 3.454545454545455e-06,
"loss": 0.5581,
"step": 19
},
{
"epoch": 0.054570259208731244,
"grad_norm": 1.337457299232483,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.5544,
"step": 20
},
{
"epoch": 0.0572987721691678,
"grad_norm": 1.690187692642212,
"learning_rate": 3.818181818181819e-06,
"loss": 0.5362,
"step": 21
},
{
"epoch": 0.06002728512960437,
"grad_norm": 7.317511558532715,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5536,
"step": 22
},
{
"epoch": 0.06275579809004093,
"grad_norm": 2.9221789836883545,
"learning_rate": 4.181818181818182e-06,
"loss": 0.5305,
"step": 23
},
{
"epoch": 0.06548431105047749,
"grad_norm": 0.9176937937736511,
"learning_rate": 4.363636363636364e-06,
"loss": 0.5204,
"step": 24
},
{
"epoch": 0.06821282401091405,
"grad_norm": 0.6119560599327087,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.5218,
"step": 25
},
{
"epoch": 0.07094133697135062,
"grad_norm": 0.5912665724754333,
"learning_rate": 4.727272727272728e-06,
"loss": 0.5083,
"step": 26
},
{
"epoch": 0.07366984993178717,
"grad_norm": 0.6215618848800659,
"learning_rate": 4.90909090909091e-06,
"loss": 0.5121,
"step": 27
},
{
"epoch": 0.07639836289222374,
"grad_norm": 0.6058225631713867,
"learning_rate": 5.090909090909091e-06,
"loss": 0.5078,
"step": 28
},
{
"epoch": 0.0791268758526603,
"grad_norm": 0.5321693420410156,
"learning_rate": 5.272727272727273e-06,
"loss": 0.4971,
"step": 29
},
{
"epoch": 0.08185538881309687,
"grad_norm": 0.5189298391342163,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.4829,
"step": 30
},
{
"epoch": 0.08458390177353342,
"grad_norm": 0.5020465850830078,
"learning_rate": 5.636363636363636e-06,
"loss": 0.4916,
"step": 31
},
{
"epoch": 0.08731241473396999,
"grad_norm": 0.49391138553619385,
"learning_rate": 5.8181818181818185e-06,
"loss": 0.4763,
"step": 32
},
{
"epoch": 0.09004092769440655,
"grad_norm": 0.4882354736328125,
"learning_rate": 6e-06,
"loss": 0.4749,
"step": 33
},
{
"epoch": 0.0927694406548431,
"grad_norm": 0.5190555453300476,
"learning_rate": 6.181818181818182e-06,
"loss": 0.4756,
"step": 34
},
{
"epoch": 0.09549795361527967,
"grad_norm": 0.4930441379547119,
"learning_rate": 6.363636363636364e-06,
"loss": 0.4691,
"step": 35
},
{
"epoch": 0.09822646657571624,
"grad_norm": 0.48273417353630066,
"learning_rate": 6.545454545454546e-06,
"loss": 0.4646,
"step": 36
},
{
"epoch": 0.1009549795361528,
"grad_norm": 0.49655964970588684,
"learning_rate": 6.7272727272727275e-06,
"loss": 0.4572,
"step": 37
},
{
"epoch": 0.10368349249658936,
"grad_norm": 0.47393277287483215,
"learning_rate": 6.90909090909091e-06,
"loss": 0.4474,
"step": 38
},
{
"epoch": 0.10641200545702592,
"grad_norm": 0.5115132331848145,
"learning_rate": 7.0909090909090916e-06,
"loss": 0.4517,
"step": 39
},
{
"epoch": 0.10914051841746249,
"grad_norm": 0.4923647940158844,
"learning_rate": 7.272727272727273e-06,
"loss": 0.4435,
"step": 40
},
{
"epoch": 0.11186903137789904,
"grad_norm": 0.49242180585861206,
"learning_rate": 7.454545454545456e-06,
"loss": 0.4388,
"step": 41
},
{
"epoch": 0.1145975443383356,
"grad_norm": 0.49869081377983093,
"learning_rate": 7.636363636363638e-06,
"loss": 0.4435,
"step": 42
},
{
"epoch": 0.11732605729877217,
"grad_norm": 0.507636547088623,
"learning_rate": 7.81818181818182e-06,
"loss": 0.4307,
"step": 43
},
{
"epoch": 0.12005457025920874,
"grad_norm": 0.511820375919342,
"learning_rate": 8.000000000000001e-06,
"loss": 0.4328,
"step": 44
},
{
"epoch": 0.12278308321964529,
"grad_norm": 0.5347044467926025,
"learning_rate": 8.181818181818183e-06,
"loss": 0.4275,
"step": 45
},
{
"epoch": 0.12551159618008187,
"grad_norm": 0.5053935050964355,
"learning_rate": 8.363636363636365e-06,
"loss": 0.4182,
"step": 46
},
{
"epoch": 0.12824010914051842,
"grad_norm": 0.5245314836502075,
"learning_rate": 8.545454545454546e-06,
"loss": 0.4151,
"step": 47
},
{
"epoch": 0.13096862210095497,
"grad_norm": 0.4985579550266266,
"learning_rate": 8.727272727272728e-06,
"loss": 0.4041,
"step": 48
},
{
"epoch": 0.13369713506139155,
"grad_norm": 0.5282207131385803,
"learning_rate": 8.90909090909091e-06,
"loss": 0.4074,
"step": 49
},
{
"epoch": 0.1364256480218281,
"grad_norm": 0.5185700058937073,
"learning_rate": 9.090909090909091e-06,
"loss": 0.3981,
"step": 50
},
{
"epoch": 0.13915416098226466,
"grad_norm": 0.5206958651542664,
"learning_rate": 9.272727272727273e-06,
"loss": 0.4016,
"step": 51
},
{
"epoch": 0.14188267394270124,
"grad_norm": 0.5600295662879944,
"learning_rate": 9.454545454545456e-06,
"loss": 0.3967,
"step": 52
},
{
"epoch": 0.1446111869031378,
"grad_norm": 0.5325789451599121,
"learning_rate": 9.636363636363638e-06,
"loss": 0.3902,
"step": 53
},
{
"epoch": 0.14733969986357434,
"grad_norm": 0.556331992149353,
"learning_rate": 9.81818181818182e-06,
"loss": 0.3848,
"step": 54
},
{
"epoch": 0.15006821282401092,
"grad_norm": 0.5511519312858582,
"learning_rate": 1e-05,
"loss": 0.3815,
"step": 55
},
{
"epoch": 0.15279672578444747,
"grad_norm": 0.5680494904518127,
"learning_rate": 1.0181818181818182e-05,
"loss": 0.3793,
"step": 56
},
{
"epoch": 0.15552523874488403,
"grad_norm": 0.5566679835319519,
"learning_rate": 1.0363636363636364e-05,
"loss": 0.3712,
"step": 57
},
{
"epoch": 0.1582537517053206,
"grad_norm": 0.5773091912269592,
"learning_rate": 1.0545454545454546e-05,
"loss": 0.3771,
"step": 58
},
{
"epoch": 0.16098226466575716,
"grad_norm": 0.5730243921279907,
"learning_rate": 1.0727272727272729e-05,
"loss": 0.3721,
"step": 59
},
{
"epoch": 0.16371077762619374,
"grad_norm": 0.5875830054283142,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.3643,
"step": 60
},
{
"epoch": 0.1664392905866303,
"grad_norm": 0.6426472663879395,
"learning_rate": 1.1090909090909092e-05,
"loss": 0.3511,
"step": 61
},
{
"epoch": 0.16916780354706684,
"grad_norm": 0.6310497522354126,
"learning_rate": 1.1272727272727272e-05,
"loss": 0.3535,
"step": 62
},
{
"epoch": 0.17189631650750342,
"grad_norm": 0.640872061252594,
"learning_rate": 1.1454545454545455e-05,
"loss": 0.345,
"step": 63
},
{
"epoch": 0.17462482946793997,
"grad_norm": 0.5955975651741028,
"learning_rate": 1.1636363636363637e-05,
"loss": 0.35,
"step": 64
},
{
"epoch": 0.17735334242837653,
"grad_norm": 0.6298477053642273,
"learning_rate": 1.181818181818182e-05,
"loss": 0.3399,
"step": 65
},
{
"epoch": 0.1800818553888131,
"grad_norm": 0.6077655553817749,
"learning_rate": 1.2e-05,
"loss": 0.3456,
"step": 66
},
{
"epoch": 0.18281036834924966,
"grad_norm": 0.5816149115562439,
"learning_rate": 1.2181818181818184e-05,
"loss": 0.3355,
"step": 67
},
{
"epoch": 0.1855388813096862,
"grad_norm": 0.6176887154579163,
"learning_rate": 1.2363636363636364e-05,
"loss": 0.329,
"step": 68
},
{
"epoch": 0.1882673942701228,
"grad_norm": 0.6302605271339417,
"learning_rate": 1.2545454545454547e-05,
"loss": 0.337,
"step": 69
},
{
"epoch": 0.19099590723055934,
"grad_norm": 0.5885477662086487,
"learning_rate": 1.2727272727272728e-05,
"loss": 0.3333,
"step": 70
},
{
"epoch": 0.1937244201909959,
"grad_norm": 0.5948558449745178,
"learning_rate": 1.2909090909090912e-05,
"loss": 0.3207,
"step": 71
},
{
"epoch": 0.19645293315143247,
"grad_norm": 0.6642739772796631,
"learning_rate": 1.3090909090909092e-05,
"loss": 0.3142,
"step": 72
},
{
"epoch": 0.19918144611186903,
"grad_norm": 0.6380135416984558,
"learning_rate": 1.3272727272727275e-05,
"loss": 0.3285,
"step": 73
},
{
"epoch": 0.2019099590723056,
"grad_norm": 0.6566604971885681,
"learning_rate": 1.3454545454545455e-05,
"loss": 0.3154,
"step": 74
},
{
"epoch": 0.20463847203274216,
"grad_norm": 0.5697330236434937,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.3075,
"step": 75
},
{
"epoch": 0.2073669849931787,
"grad_norm": 0.6676266193389893,
"learning_rate": 1.381818181818182e-05,
"loss": 0.3055,
"step": 76
},
{
"epoch": 0.2100954979536153,
"grad_norm": 0.6059987545013428,
"learning_rate": 1.4e-05,
"loss": 0.3022,
"step": 77
},
{
"epoch": 0.21282401091405184,
"grad_norm": 0.6220597624778748,
"learning_rate": 1.4181818181818183e-05,
"loss": 0.2972,
"step": 78
},
{
"epoch": 0.2155525238744884,
"grad_norm": 0.6020926237106323,
"learning_rate": 1.4363636363636365e-05,
"loss": 0.3062,
"step": 79
},
{
"epoch": 0.21828103683492497,
"grad_norm": 0.6444036960601807,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.2982,
"step": 80
},
{
"epoch": 0.22100954979536153,
"grad_norm": 0.5778307914733887,
"learning_rate": 1.4727272727272728e-05,
"loss": 0.2971,
"step": 81
},
{
"epoch": 0.22373806275579808,
"grad_norm": 0.5990443825721741,
"learning_rate": 1.4909090909090911e-05,
"loss": 0.2945,
"step": 82
},
{
"epoch": 0.22646657571623466,
"grad_norm": 0.6038920283317566,
"learning_rate": 1.5090909090909091e-05,
"loss": 0.2953,
"step": 83
},
{
"epoch": 0.2291950886766712,
"grad_norm": 0.6122650504112244,
"learning_rate": 1.5272727272727276e-05,
"loss": 0.2948,
"step": 84
},
{
"epoch": 0.23192360163710776,
"grad_norm": 0.5494102835655212,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.284,
"step": 85
},
{
"epoch": 0.23465211459754434,
"grad_norm": 0.5922354459762573,
"learning_rate": 1.563636363636364e-05,
"loss": 0.2943,
"step": 86
},
{
"epoch": 0.2373806275579809,
"grad_norm": 0.5704284310340881,
"learning_rate": 1.5818181818181818e-05,
"loss": 0.2885,
"step": 87
},
{
"epoch": 0.24010914051841747,
"grad_norm": 0.6150228381156921,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.2839,
"step": 88
},
{
"epoch": 0.24283765347885403,
"grad_norm": 0.5552194118499756,
"learning_rate": 1.6181818181818184e-05,
"loss": 0.278,
"step": 89
},
{
"epoch": 0.24556616643929058,
"grad_norm": 0.5677224397659302,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.2888,
"step": 90
},
{
"epoch": 0.24829467939972716,
"grad_norm": 0.556135356426239,
"learning_rate": 1.6545454545454548e-05,
"loss": 0.2819,
"step": 91
},
{
"epoch": 0.25102319236016374,
"grad_norm": 0.6040939092636108,
"learning_rate": 1.672727272727273e-05,
"loss": 0.2757,
"step": 92
},
{
"epoch": 0.25375170532060026,
"grad_norm": 0.5893986821174622,
"learning_rate": 1.690909090909091e-05,
"loss": 0.2753,
"step": 93
},
{
"epoch": 0.25648021828103684,
"grad_norm": 0.5702283382415771,
"learning_rate": 1.7090909090909092e-05,
"loss": 0.2714,
"step": 94
},
{
"epoch": 0.2592087312414734,
"grad_norm": 0.5491526126861572,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.2774,
"step": 95
},
{
"epoch": 0.26193724420190995,
"grad_norm": 0.557965099811554,
"learning_rate": 1.7454545454545456e-05,
"loss": 0.2679,
"step": 96
},
{
"epoch": 0.2646657571623465,
"grad_norm": 0.6038682460784912,
"learning_rate": 1.7636363636363637e-05,
"loss": 0.2635,
"step": 97
},
{
"epoch": 0.2673942701227831,
"grad_norm": 0.567489743232727,
"learning_rate": 1.781818181818182e-05,
"loss": 0.2709,
"step": 98
},
{
"epoch": 0.27012278308321963,
"grad_norm": 0.5320610404014587,
"learning_rate": 1.8e-05,
"loss": 0.266,
"step": 99
},
{
"epoch": 0.2728512960436562,
"grad_norm": 0.5480786561965942,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.2697,
"step": 100
},
{
"epoch": 0.2755798090040928,
"grad_norm": 0.5415038466453552,
"learning_rate": 1.8363636363636367e-05,
"loss": 0.256,
"step": 101
},
{
"epoch": 0.2783083219645293,
"grad_norm": 0.5506438612937927,
"learning_rate": 1.8545454545454545e-05,
"loss": 0.2558,
"step": 102
},
{
"epoch": 0.2810368349249659,
"grad_norm": 0.5091832280158997,
"learning_rate": 1.872727272727273e-05,
"loss": 0.2567,
"step": 103
},
{
"epoch": 0.2837653478854025,
"grad_norm": 0.529866099357605,
"learning_rate": 1.8909090909090912e-05,
"loss": 0.2604,
"step": 104
},
{
"epoch": 0.286493860845839,
"grad_norm": 0.5355682969093323,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.2564,
"step": 105
},
{
"epoch": 0.2892223738062756,
"grad_norm": 0.5305929183959961,
"learning_rate": 1.9272727272727275e-05,
"loss": 0.2611,
"step": 106
},
{
"epoch": 0.29195088676671216,
"grad_norm": 0.5013212561607361,
"learning_rate": 1.9454545454545457e-05,
"loss": 0.2479,
"step": 107
},
{
"epoch": 0.2946793997271487,
"grad_norm": 0.5442197918891907,
"learning_rate": 1.963636363636364e-05,
"loss": 0.2529,
"step": 108
},
{
"epoch": 0.29740791268758526,
"grad_norm": 0.5186132192611694,
"learning_rate": 1.981818181818182e-05,
"loss": 0.249,
"step": 109
},
{
"epoch": 0.30013642564802184,
"grad_norm": 0.5124782919883728,
"learning_rate": 2e-05,
"loss": 0.2537,
"step": 110
},
{
"epoch": 0.30286493860845837,
"grad_norm": 0.5179018974304199,
"learning_rate": 1.9999949446003432e-05,
"loss": 0.2462,
"step": 111
},
{
"epoch": 0.30559345156889495,
"grad_norm": 0.5417453050613403,
"learning_rate": 1.9999797784524866e-05,
"loss": 0.2502,
"step": 112
},
{
"epoch": 0.3083219645293315,
"grad_norm": 0.5183635950088501,
"learning_rate": 1.9999545017097726e-05,
"loss": 0.2559,
"step": 113
},
{
"epoch": 0.31105047748976805,
"grad_norm": 0.5046340227127075,
"learning_rate": 1.999919114627769e-05,
"loss": 0.2466,
"step": 114
},
{
"epoch": 0.31377899045020463,
"grad_norm": 0.4859924912452698,
"learning_rate": 1.9998736175642674e-05,
"loss": 0.2473,
"step": 115
},
{
"epoch": 0.3165075034106412,
"grad_norm": 0.5244961380958557,
"learning_rate": 1.9998180109792793e-05,
"loss": 0.2421,
"step": 116
},
{
"epoch": 0.31923601637107774,
"grad_norm": 0.4975835978984833,
"learning_rate": 1.999752295435032e-05,
"loss": 0.2411,
"step": 117
},
{
"epoch": 0.3219645293315143,
"grad_norm": 0.500035285949707,
"learning_rate": 1.999676471595962e-05,
"loss": 0.2399,
"step": 118
},
{
"epoch": 0.3246930422919509,
"grad_norm": 0.47836625576019287,
"learning_rate": 1.9995905402287094e-05,
"loss": 0.2525,
"step": 119
},
{
"epoch": 0.3274215552523875,
"grad_norm": 0.4964773952960968,
"learning_rate": 1.9994945022021085e-05,
"loss": 0.2466,
"step": 120
},
{
"epoch": 0.330150068212824,
"grad_norm": 0.4777352809906006,
"learning_rate": 1.9993883584871807e-05,
"loss": 0.2435,
"step": 121
},
{
"epoch": 0.3328785811732606,
"grad_norm": 0.48544883728027344,
"learning_rate": 1.9992721101571238e-05,
"loss": 0.2425,
"step": 122
},
{
"epoch": 0.33560709413369716,
"grad_norm": 0.4984164535999298,
"learning_rate": 1.999145758387301e-05,
"loss": 0.2438,
"step": 123
},
{
"epoch": 0.3383356070941337,
"grad_norm": 0.47539612650871277,
"learning_rate": 1.9990093044552304e-05,
"loss": 0.2327,
"step": 124
},
{
"epoch": 0.34106412005457026,
"grad_norm": 0.4909318685531616,
"learning_rate": 1.9988627497405696e-05,
"loss": 0.2455,
"step": 125
},
{
"epoch": 0.34379263301500684,
"grad_norm": 0.46352440118789673,
"learning_rate": 1.9987060957251047e-05,
"loss": 0.2346,
"step": 126
},
{
"epoch": 0.34652114597544337,
"grad_norm": 0.5029696226119995,
"learning_rate": 1.9985393439927325e-05,
"loss": 0.2417,
"step": 127
},
{
"epoch": 0.34924965893587995,
"grad_norm": 0.4627346396446228,
"learning_rate": 1.998362496229446e-05,
"loss": 0.2381,
"step": 128
},
{
"epoch": 0.3519781718963165,
"grad_norm": 0.4858626127243042,
"learning_rate": 1.9981755542233175e-05,
"loss": 0.2392,
"step": 129
},
{
"epoch": 0.35470668485675305,
"grad_norm": 0.4663969874382019,
"learning_rate": 1.997978519864481e-05,
"loss": 0.2382,
"step": 130
},
{
"epoch": 0.35743519781718963,
"grad_norm": 0.47233930230140686,
"learning_rate": 1.9977713951451102e-05,
"loss": 0.2345,
"step": 131
},
{
"epoch": 0.3601637107776262,
"grad_norm": 0.4638475179672241,
"learning_rate": 1.9975541821594028e-05,
"loss": 0.2278,
"step": 132
},
{
"epoch": 0.36289222373806274,
"grad_norm": 0.49125927686691284,
"learning_rate": 1.9973268831035547e-05,
"loss": 0.237,
"step": 133
},
{
"epoch": 0.3656207366984993,
"grad_norm": 0.46837231516838074,
"learning_rate": 1.9970895002757413e-05,
"loss": 0.2341,
"step": 134
},
{
"epoch": 0.3683492496589359,
"grad_norm": 0.4689665734767914,
"learning_rate": 1.996842036076093e-05,
"loss": 0.2302,
"step": 135
},
{
"epoch": 0.3710777626193724,
"grad_norm": 0.46963638067245483,
"learning_rate": 1.99658449300667e-05,
"loss": 0.2356,
"step": 136
},
{
"epoch": 0.373806275579809,
"grad_norm": 0.45877447724342346,
"learning_rate": 1.9963168736714395e-05,
"loss": 0.2358,
"step": 137
},
{
"epoch": 0.3765347885402456,
"grad_norm": 0.46581560373306274,
"learning_rate": 1.9960391807762462e-05,
"loss": 0.2298,
"step": 138
},
{
"epoch": 0.3792633015006821,
"grad_norm": 0.4520280063152313,
"learning_rate": 1.9957514171287875e-05,
"loss": 0.2265,
"step": 139
},
{
"epoch": 0.3819918144611187,
"grad_norm": 0.4703895151615143,
"learning_rate": 1.995453585638584e-05,
"loss": 0.2307,
"step": 140
},
{
"epoch": 0.38472032742155526,
"grad_norm": 0.45207223296165466,
"learning_rate": 1.9951456893169497e-05,
"loss": 0.2372,
"step": 141
},
{
"epoch": 0.3874488403819918,
"grad_norm": 0.45681706070899963,
"learning_rate": 1.994827731276963e-05,
"loss": 0.2375,
"step": 142
},
{
"epoch": 0.39017735334242837,
"grad_norm": 0.44319695234298706,
"learning_rate": 1.994499714733434e-05,
"loss": 0.2278,
"step": 143
},
{
"epoch": 0.39290586630286495,
"grad_norm": 0.45957088470458984,
"learning_rate": 1.9941616430028713e-05,
"loss": 0.2264,
"step": 144
},
{
"epoch": 0.3956343792633015,
"grad_norm": 0.4694841504096985,
"learning_rate": 1.993813519503451e-05,
"loss": 0.229,
"step": 145
},
{
"epoch": 0.39836289222373805,
"grad_norm": 0.45481976866722107,
"learning_rate": 1.9934553477549795e-05,
"loss": 0.2298,
"step": 146
},
{
"epoch": 0.40109140518417463,
"grad_norm": 0.4374624788761139,
"learning_rate": 1.99308713137886e-05,
"loss": 0.2275,
"step": 147
},
{
"epoch": 0.4038199181446112,
"grad_norm": 0.431749552488327,
"learning_rate": 1.992708874098054e-05,
"loss": 0.2293,
"step": 148
},
{
"epoch": 0.40654843110504774,
"grad_norm": 0.4361872673034668,
"learning_rate": 1.992320579737045e-05,
"loss": 0.2254,
"step": 149
},
{
"epoch": 0.4092769440654843,
"grad_norm": 0.44613024592399597,
"learning_rate": 1.9919222522217998e-05,
"loss": 0.2195,
"step": 150
},
{
"epoch": 0.4120054570259209,
"grad_norm": 0.4328283667564392,
"learning_rate": 1.9915138955797272e-05,
"loss": 0.2255,
"step": 151
},
{
"epoch": 0.4147339699863574,
"grad_norm": 0.4296876788139343,
"learning_rate": 1.9910955139396395e-05,
"loss": 0.2178,
"step": 152
},
{
"epoch": 0.417462482946794,
"grad_norm": 0.43290847539901733,
"learning_rate": 1.99066711153171e-05,
"loss": 0.2255,
"step": 153
},
{
"epoch": 0.4201909959072306,
"grad_norm": 0.43814530968666077,
"learning_rate": 1.990228692687429e-05,
"loss": 0.2216,
"step": 154
},
{
"epoch": 0.4229195088676671,
"grad_norm": 0.42485710978507996,
"learning_rate": 1.9897802618395614e-05,
"loss": 0.2216,
"step": 155
},
{
"epoch": 0.4256480218281037,
"grad_norm": 0.43098196387290955,
"learning_rate": 1.9893218235221016e-05,
"loss": 0.2231,
"step": 156
},
{
"epoch": 0.42837653478854026,
"grad_norm": 0.4232180714607239,
"learning_rate": 1.988853382370228e-05,
"loss": 0.2188,
"step": 157
},
{
"epoch": 0.4311050477489768,
"grad_norm": 0.4411165714263916,
"learning_rate": 1.988374943120254e-05,
"loss": 0.2283,
"step": 158
},
{
"epoch": 0.43383356070941337,
"grad_norm": 0.43496963381767273,
"learning_rate": 1.9878865106095838e-05,
"loss": 0.2193,
"step": 159
},
{
"epoch": 0.43656207366984995,
"grad_norm": 0.43149223923683167,
"learning_rate": 1.9873880897766597e-05,
"loss": 0.2206,
"step": 160
},
{
"epoch": 0.4392905866302865,
"grad_norm": 0.525326669216156,
"learning_rate": 1.9868796856609154e-05,
"loss": 0.2182,
"step": 161
},
{
"epoch": 0.44201909959072305,
"grad_norm": 0.4580979645252228,
"learning_rate": 1.9863613034027224e-05,
"loss": 0.22,
"step": 162
},
{
"epoch": 0.44474761255115963,
"grad_norm": 0.43087807297706604,
"learning_rate": 1.9858329482433404e-05,
"loss": 0.2187,
"step": 163
},
{
"epoch": 0.44747612551159616,
"grad_norm": 0.4238346219062805,
"learning_rate": 1.985294625524861e-05,
"loss": 0.2221,
"step": 164
},
{
"epoch": 0.45020463847203274,
"grad_norm": 0.4487808346748352,
"learning_rate": 1.984746340690159e-05,
"loss": 0.2233,
"step": 165
},
{
"epoch": 0.4529331514324693,
"grad_norm": 0.4480034112930298,
"learning_rate": 1.9841880992828306e-05,
"loss": 0.2266,
"step": 166
},
{
"epoch": 0.45566166439290584,
"grad_norm": 0.417173832654953,
"learning_rate": 1.983619906947144e-05,
"loss": 0.2217,
"step": 167
},
{
"epoch": 0.4583901773533424,
"grad_norm": 0.43056657910346985,
"learning_rate": 1.9830417694279766e-05,
"loss": 0.2185,
"step": 168
},
{
"epoch": 0.461118690313779,
"grad_norm": 0.41167616844177246,
"learning_rate": 1.9824536925707622e-05,
"loss": 0.2247,
"step": 169
},
{
"epoch": 0.4638472032742155,
"grad_norm": 0.42600226402282715,
"learning_rate": 1.981855682321427e-05,
"loss": 0.218,
"step": 170
},
{
"epoch": 0.4665757162346521,
"grad_norm": 0.4478660225868225,
"learning_rate": 1.9812477447263324e-05,
"loss": 0.2171,
"step": 171
},
{
"epoch": 0.4693042291950887,
"grad_norm": 0.4131629168987274,
"learning_rate": 1.9806298859322143e-05,
"loss": 0.2154,
"step": 172
},
{
"epoch": 0.47203274215552526,
"grad_norm": 0.39991888403892517,
"learning_rate": 1.980002112186118e-05,
"loss": 0.2181,
"step": 173
},
{
"epoch": 0.4747612551159618,
"grad_norm": 0.3961407244205475,
"learning_rate": 1.979364429835339e-05,
"loss": 0.2151,
"step": 174
},
{
"epoch": 0.47748976807639837,
"grad_norm": 0.39714792370796204,
"learning_rate": 1.9787168453273546e-05,
"loss": 0.212,
"step": 175
},
{
"epoch": 0.48021828103683495,
"grad_norm": 0.4097214937210083,
"learning_rate": 1.978059365209762e-05,
"loss": 0.2137,
"step": 176
},
{
"epoch": 0.4829467939972715,
"grad_norm": 0.40017515420913696,
"learning_rate": 1.9773919961302113e-05,
"loss": 0.2163,
"step": 177
},
{
"epoch": 0.48567530695770805,
"grad_norm": 0.41595569252967834,
"learning_rate": 1.9767147448363366e-05,
"loss": 0.2171,
"step": 178
},
{
"epoch": 0.48840381991814463,
"grad_norm": 0.3972041606903076,
"learning_rate": 1.9760276181756905e-05,
"loss": 0.2157,
"step": 179
},
{
"epoch": 0.49113233287858116,
"grad_norm": 0.38620567321777344,
"learning_rate": 1.975330623095672e-05,
"loss": 0.2088,
"step": 180
},
{
"epoch": 0.49386084583901774,
"grad_norm": 0.4123203754425049,
"learning_rate": 1.9746237666434588e-05,
"loss": 0.2153,
"step": 181
},
{
"epoch": 0.4965893587994543,
"grad_norm": 0.39745667576789856,
"learning_rate": 1.9739070559659347e-05,
"loss": 0.2127,
"step": 182
},
{
"epoch": 0.49931787175989084,
"grad_norm": 0.40074360370635986,
"learning_rate": 1.973180498309618e-05,
"loss": 0.2162,
"step": 183
},
{
"epoch": 0.5020463847203275,
"grad_norm": 0.40014582872390747,
"learning_rate": 1.9724441010205865e-05,
"loss": 0.2044,
"step": 184
},
{
"epoch": 0.504774897680764,
"grad_norm": 0.40337255597114563,
"learning_rate": 1.9716978715444056e-05,
"loss": 0.2147,
"step": 185
},
{
"epoch": 0.5075034106412005,
"grad_norm": 0.3925221264362335,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.2108,
"step": 186
},
{
"epoch": 0.5102319236016372,
"grad_norm": 0.4033859670162201,
"learning_rate": 1.9701759463098377e-05,
"loss": 0.2153,
"step": 187
},
{
"epoch": 0.5129604365620737,
"grad_norm": 0.4059382975101471,
"learning_rate": 1.9694002659393306e-05,
"loss": 0.2126,
"step": 188
},
{
"epoch": 0.5156889495225102,
"grad_norm": 0.38035619258880615,
"learning_rate": 1.9686147841572803e-05,
"loss": 0.2119,
"step": 189
},
{
"epoch": 0.5184174624829468,
"grad_norm": 0.41206416487693787,
"learning_rate": 1.9678195089055347e-05,
"loss": 0.2119,
"step": 190
},
{
"epoch": 0.5211459754433834,
"grad_norm": 0.3932187855243683,
"learning_rate": 1.967014448224963e-05,
"loss": 0.209,
"step": 191
},
{
"epoch": 0.5238744884038199,
"grad_norm": 0.39542925357818604,
"learning_rate": 1.9661996102553716e-05,
"loss": 0.2115,
"step": 192
},
{
"epoch": 0.5266030013642565,
"grad_norm": 0.40574580430984497,
"learning_rate": 1.965375003235424e-05,
"loss": 0.2074,
"step": 193
},
{
"epoch": 0.529331514324693,
"grad_norm": 0.3841424286365509,
"learning_rate": 1.9645406355025565e-05,
"loss": 0.2116,
"step": 194
},
{
"epoch": 0.5320600272851296,
"grad_norm": 0.3910590410232544,
"learning_rate": 1.9636965154928932e-05,
"loss": 0.2076,
"step": 195
},
{
"epoch": 0.5347885402455662,
"grad_norm": 0.3898683190345764,
"learning_rate": 1.9628426517411625e-05,
"loss": 0.2036,
"step": 196
},
{
"epoch": 0.5375170532060027,
"grad_norm": 0.38541069626808167,
"learning_rate": 1.9619790528806092e-05,
"loss": 0.2076,
"step": 197
},
{
"epoch": 0.5402455661664393,
"grad_norm": 0.39501479268074036,
"learning_rate": 1.9611057276429085e-05,
"loss": 0.2106,
"step": 198
},
{
"epoch": 0.5429740791268759,
"grad_norm": 0.41505786776542664,
"learning_rate": 1.9602226848580762e-05,
"loss": 0.2094,
"step": 199
},
{
"epoch": 0.5457025920873124,
"grad_norm": 0.3798902630805969,
"learning_rate": 1.959329933454381e-05,
"loss": 0.2071,
"step": 200
},
{
"epoch": 0.548431105047749,
"grad_norm": 0.39763155579566956,
"learning_rate": 1.958427482458253e-05,
"loss": 0.2085,
"step": 201
},
{
"epoch": 0.5511596180081856,
"grad_norm": 0.3722373843193054,
"learning_rate": 1.957515340994193e-05,
"loss": 0.2101,
"step": 202
},
{
"epoch": 0.5538881309686221,
"grad_norm": 0.41766637563705444,
"learning_rate": 1.95659351828468e-05,
"loss": 0.218,
"step": 203
},
{
"epoch": 0.5566166439290586,
"grad_norm": 0.3588719964027405,
"learning_rate": 1.9556620236500794e-05,
"loss": 0.199,
"step": 204
},
{
"epoch": 0.5593451568894953,
"grad_norm": 0.372455894947052,
"learning_rate": 1.954720866508546e-05,
"loss": 0.207,
"step": 205
},
{
"epoch": 0.5620736698499318,
"grad_norm": 0.37608572840690613,
"learning_rate": 1.9537700563759303e-05,
"loss": 0.2147,
"step": 206
},
{
"epoch": 0.5648021828103683,
"grad_norm": 0.3798050880432129,
"learning_rate": 1.9528096028656835e-05,
"loss": 0.2087,
"step": 207
},
{
"epoch": 0.567530695770805,
"grad_norm": 0.37626174092292786,
"learning_rate": 1.9518395156887574e-05,
"loss": 0.1992,
"step": 208
},
{
"epoch": 0.5702592087312415,
"grad_norm": 0.3616938292980194,
"learning_rate": 1.9508598046535095e-05,
"loss": 0.202,
"step": 209
},
{
"epoch": 0.572987721691678,
"grad_norm": 0.36245420575141907,
"learning_rate": 1.949870479665602e-05,
"loss": 0.2033,
"step": 210
},
{
"epoch": 0.5757162346521146,
"grad_norm": 0.37910935282707214,
"learning_rate": 1.9488715507279e-05,
"loss": 0.2111,
"step": 211
},
{
"epoch": 0.5784447476125512,
"grad_norm": 0.3634055554866791,
"learning_rate": 1.9478630279403737e-05,
"loss": 0.206,
"step": 212
},
{
"epoch": 0.5811732605729877,
"grad_norm": 0.3632001280784607,
"learning_rate": 1.9468449214999956e-05,
"loss": 0.2043,
"step": 213
},
{
"epoch": 0.5839017735334243,
"grad_norm": 0.3869618773460388,
"learning_rate": 1.9458172417006347e-05,
"loss": 0.2109,
"step": 214
},
{
"epoch": 0.5866302864938608,
"grad_norm": 0.3787137269973755,
"learning_rate": 1.9447799989329557e-05,
"loss": 0.2053,
"step": 215
},
{
"epoch": 0.5893587994542974,
"grad_norm": 0.36317574977874756,
"learning_rate": 1.943733203684312e-05,
"loss": 0.2059,
"step": 216
},
{
"epoch": 0.592087312414734,
"grad_norm": 0.3737640976905823,
"learning_rate": 1.9426768665386397e-05,
"loss": 0.207,
"step": 217
},
{
"epoch": 0.5948158253751705,
"grad_norm": 0.3722177743911743,
"learning_rate": 1.9416109981763526e-05,
"loss": 0.2024,
"step": 218
},
{
"epoch": 0.597544338335607,
"grad_norm": 0.38148534297943115,
"learning_rate": 1.9405356093742314e-05,
"loss": 0.2037,
"step": 219
},
{
"epoch": 0.6002728512960437,
"grad_norm": 0.3735097348690033,
"learning_rate": 1.939450711005316e-05,
"loss": 0.1986,
"step": 220
},
{
"epoch": 0.6030013642564802,
"grad_norm": 0.3758648633956909,
"learning_rate": 1.9383563140387966e-05,
"loss": 0.2011,
"step": 221
},
{
"epoch": 0.6057298772169167,
"grad_norm": 0.3648459315299988,
"learning_rate": 1.9372524295399014e-05,
"loss": 0.2067,
"step": 222
},
{
"epoch": 0.6084583901773534,
"grad_norm": 0.373404860496521,
"learning_rate": 1.9361390686697847e-05,
"loss": 0.199,
"step": 223
},
{
"epoch": 0.6111869031377899,
"grad_norm": 0.38956329226493835,
"learning_rate": 1.9350162426854152e-05,
"loss": 0.1986,
"step": 224
},
{
"epoch": 0.6139154160982264,
"grad_norm": 0.36711621284484863,
"learning_rate": 1.9338839629394606e-05,
"loss": 0.2007,
"step": 225
},
{
"epoch": 0.616643929058663,
"grad_norm": 0.3746264576911926,
"learning_rate": 1.9327422408801744e-05,
"loss": 0.2053,
"step": 226
},
{
"epoch": 0.6193724420190996,
"grad_norm": 0.35744959115982056,
"learning_rate": 1.9315910880512792e-05,
"loss": 0.1953,
"step": 227
},
{
"epoch": 0.6221009549795361,
"grad_norm": 0.3548133969306946,
"learning_rate": 1.93043051609185e-05,
"loss": 0.1989,
"step": 228
},
{
"epoch": 0.6248294679399727,
"grad_norm": 0.3690285384654999,
"learning_rate": 1.929260536736198e-05,
"loss": 0.2028,
"step": 229
},
{
"epoch": 0.6275579809004093,
"grad_norm": 0.35690346360206604,
"learning_rate": 1.9280811618137486e-05,
"loss": 0.198,
"step": 230
},
{
"epoch": 0.6302864938608458,
"grad_norm": 0.35622960329055786,
"learning_rate": 1.926892403248925e-05,
"loss": 0.2026,
"step": 231
},
{
"epoch": 0.6330150068212824,
"grad_norm": 0.35483741760253906,
"learning_rate": 1.9256942730610268e-05,
"loss": 0.2002,
"step": 232
},
{
"epoch": 0.635743519781719,
"grad_norm": 0.3364560604095459,
"learning_rate": 1.9244867833641078e-05,
"loss": 0.1926,
"step": 233
},
{
"epoch": 0.6384720327421555,
"grad_norm": 0.3680644631385803,
"learning_rate": 1.9232699463668543e-05,
"loss": 0.2027,
"step": 234
},
{
"epoch": 0.6412005457025921,
"grad_norm": 0.3371049165725708,
"learning_rate": 1.9220437743724605e-05,
"loss": 0.2031,
"step": 235
},
{
"epoch": 0.6439290586630286,
"grad_norm": 0.3472127616405487,
"learning_rate": 1.9208082797785057e-05,
"loss": 0.2054,
"step": 236
},
{
"epoch": 0.6466575716234653,
"grad_norm": 0.3515476882457733,
"learning_rate": 1.9195634750768276e-05,
"loss": 0.2002,
"step": 237
},
{
"epoch": 0.6493860845839018,
"grad_norm": 0.3557245433330536,
"learning_rate": 1.9183093728533966e-05,
"loss": 0.1988,
"step": 238
},
{
"epoch": 0.6521145975443383,
"grad_norm": 0.34736377000808716,
"learning_rate": 1.9170459857881888e-05,
"loss": 0.201,
"step": 239
},
{
"epoch": 0.654843110504775,
"grad_norm": 0.3408997654914856,
"learning_rate": 1.9157733266550577e-05,
"loss": 0.2006,
"step": 240
},
{
"epoch": 0.6575716234652115,
"grad_norm": 0.3465856909751892,
"learning_rate": 1.9144914083216036e-05,
"loss": 0.1965,
"step": 241
},
{
"epoch": 0.660300136425648,
"grad_norm": 0.3276280462741852,
"learning_rate": 1.913200243749046e-05,
"loss": 0.1951,
"step": 242
},
{
"epoch": 0.6630286493860846,
"grad_norm": 0.3457287847995758,
"learning_rate": 1.91189984599209e-05,
"loss": 0.2048,
"step": 243
},
{
"epoch": 0.6657571623465212,
"grad_norm": 0.3575926721096039,
"learning_rate": 1.910590228198798e-05,
"loss": 0.1966,
"step": 244
},
{
"epoch": 0.6684856753069577,
"grad_norm": 0.34771567583084106,
"learning_rate": 1.9092714036104508e-05,
"loss": 0.1966,
"step": 245
},
{
"epoch": 0.6712141882673943,
"grad_norm": 0.3461324870586395,
"learning_rate": 1.9079433855614203e-05,
"loss": 0.1989,
"step": 246
},
{
"epoch": 0.6739427012278308,
"grad_norm": 0.35530659556388855,
"learning_rate": 1.9066061874790302e-05,
"loss": 0.2034,
"step": 247
},
{
"epoch": 0.6766712141882674,
"grad_norm": 0.34976205229759216,
"learning_rate": 1.9052598228834217e-05,
"loss": 0.1952,
"step": 248
},
{
"epoch": 0.679399727148704,
"grad_norm": 0.34561771154403687,
"learning_rate": 1.9039043053874175e-05,
"loss": 0.1922,
"step": 249
},
{
"epoch": 0.6821282401091405,
"grad_norm": 0.34467610716819763,
"learning_rate": 1.9025396486963827e-05,
"loss": 0.1958,
"step": 250
},
{
"epoch": 0.684856753069577,
"grad_norm": 0.3314482271671295,
"learning_rate": 1.9011658666080873e-05,
"loss": 0.1934,
"step": 251
},
{
"epoch": 0.6875852660300137,
"grad_norm": 0.3351077437400818,
"learning_rate": 1.8997829730125662e-05,
"loss": 0.1952,
"step": 252
},
{
"epoch": 0.6903137789904502,
"grad_norm": 0.3419744372367859,
"learning_rate": 1.898390981891979e-05,
"loss": 0.1974,
"step": 253
},
{
"epoch": 0.6930422919508867,
"grad_norm": 0.33833423256874084,
"learning_rate": 1.8969899073204687e-05,
"loss": 0.1966,
"step": 254
},
{
"epoch": 0.6957708049113234,
"grad_norm": 0.34223422408103943,
"learning_rate": 1.895579763464019e-05,
"loss": 0.1983,
"step": 255
},
{
"epoch": 0.6984993178717599,
"grad_norm": 0.34170493483543396,
"learning_rate": 1.8941605645803115e-05,
"loss": 0.1983,
"step": 256
},
{
"epoch": 0.7012278308321964,
"grad_norm": 0.3345998227596283,
"learning_rate": 1.8927323250185815e-05,
"loss": 0.1962,
"step": 257
},
{
"epoch": 0.703956343792633,
"grad_norm": 0.35899677872657776,
"learning_rate": 1.891295059219472e-05,
"loss": 0.1947,
"step": 258
},
{
"epoch": 0.7066848567530696,
"grad_norm": 0.3411823809146881,
"learning_rate": 1.88984878171489e-05,
"loss": 0.1973,
"step": 259
},
{
"epoch": 0.7094133697135061,
"grad_norm": 0.3441268801689148,
"learning_rate": 1.888393507127856e-05,
"loss": 0.1949,
"step": 260
},
{
"epoch": 0.7121418826739427,
"grad_norm": 0.34265270829200745,
"learning_rate": 1.8869292501723602e-05,
"loss": 0.1977,
"step": 261
},
{
"epoch": 0.7148703956343793,
"grad_norm": 0.3428017795085907,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.1969,
"step": 262
},
{
"epoch": 0.7175989085948158,
"grad_norm": 0.3393175005912781,
"learning_rate": 1.8839738484658835e-05,
"loss": 0.1948,
"step": 263
},
{
"epoch": 0.7203274215552524,
"grad_norm": 0.3410533368587494,
"learning_rate": 1.8824827335963767e-05,
"loss": 0.1963,
"step": 264
},
{
"epoch": 0.723055934515689,
"grad_norm": 0.35079458355903625,
"learning_rate": 1.8809826961210527e-05,
"loss": 0.1936,
"step": 265
},
{
"epoch": 0.7257844474761255,
"grad_norm": 0.3475019931793213,
"learning_rate": 1.879473751206489e-05,
"loss": 0.1911,
"step": 266
},
{
"epoch": 0.7285129604365621,
"grad_norm": 0.3527446687221527,
"learning_rate": 1.8779559141093256e-05,
"loss": 0.1961,
"step": 267
},
{
"epoch": 0.7312414733969986,
"grad_norm": 0.35162854194641113,
"learning_rate": 1.876429200176108e-05,
"loss": 0.1952,
"step": 268
},
{
"epoch": 0.7339699863574352,
"grad_norm": 0.3261922597885132,
"learning_rate": 1.8748936248431353e-05,
"loss": 0.1896,
"step": 269
},
{
"epoch": 0.7366984993178718,
"grad_norm": 0.3384665548801422,
"learning_rate": 1.8733492036363007e-05,
"loss": 0.1918,
"step": 270
},
{
"epoch": 0.7394270122783083,
"grad_norm": 0.3292391002178192,
"learning_rate": 1.871795952170937e-05,
"loss": 0.1941,
"step": 271
},
{
"epoch": 0.7421555252387448,
"grad_norm": 0.332894504070282,
"learning_rate": 1.8702338861516587e-05,
"loss": 0.1901,
"step": 272
},
{
"epoch": 0.7448840381991815,
"grad_norm": 0.44181516766548157,
"learning_rate": 1.8686630213722015e-05,
"loss": 0.1955,
"step": 273
},
{
"epoch": 0.747612551159618,
"grad_norm": 0.3397521674633026,
"learning_rate": 1.867083373715264e-05,
"loss": 0.193,
"step": 274
},
{
"epoch": 0.7503410641200545,
"grad_norm": 0.33744487166404724,
"learning_rate": 1.8654949591523467e-05,
"loss": 0.2003,
"step": 275
},
{
"epoch": 0.7530695770804912,
"grad_norm": 0.3381851017475128,
"learning_rate": 1.86389779374359e-05,
"loss": 0.1949,
"step": 276
},
{
"epoch": 0.7557980900409277,
"grad_norm": 0.3497074842453003,
"learning_rate": 1.8622918936376133e-05,
"loss": 0.2024,
"step": 277
},
{
"epoch": 0.7585266030013642,
"grad_norm": 0.3291502892971039,
"learning_rate": 1.8606772750713503e-05,
"loss": 0.1975,
"step": 278
},
{
"epoch": 0.7612551159618008,
"grad_norm": 0.39153552055358887,
"learning_rate": 1.8590539543698852e-05,
"loss": 0.195,
"step": 279
},
{
"epoch": 0.7639836289222374,
"grad_norm": 0.33644160628318787,
"learning_rate": 1.857421947946288e-05,
"loss": 0.1971,
"step": 280
},
{
"epoch": 0.7667121418826739,
"grad_norm": 0.3286866247653961,
"learning_rate": 1.8557812723014476e-05,
"loss": 0.1922,
"step": 281
},
{
"epoch": 0.7694406548431105,
"grad_norm": 0.33656951785087585,
"learning_rate": 1.8541319440239066e-05,
"loss": 0.1916,
"step": 282
},
{
"epoch": 0.772169167803547,
"grad_norm": 0.36169102787971497,
"learning_rate": 1.8524739797896924e-05,
"loss": 0.1938,
"step": 283
},
{
"epoch": 0.7748976807639836,
"grad_norm": 0.3508145213127136,
"learning_rate": 1.8508073963621482e-05,
"loss": 0.2001,
"step": 284
},
{
"epoch": 0.7776261937244202,
"grad_norm": 0.3326241374015808,
"learning_rate": 1.8491322105917645e-05,
"loss": 0.1935,
"step": 285
},
{
"epoch": 0.7803547066848567,
"grad_norm": 0.3261318504810333,
"learning_rate": 1.847448439416009e-05,
"loss": 0.1917,
"step": 286
},
{
"epoch": 0.7830832196452933,
"grad_norm": 0.3250694274902344,
"learning_rate": 1.845756099859154e-05,
"loss": 0.1944,
"step": 287
},
{
"epoch": 0.7858117326057299,
"grad_norm": 0.3388361632823944,
"learning_rate": 1.8440552090321047e-05,
"loss": 0.1945,
"step": 288
},
{
"epoch": 0.7885402455661664,
"grad_norm": 0.3398139774799347,
"learning_rate": 1.842345784132227e-05,
"loss": 0.1933,
"step": 289
},
{
"epoch": 0.791268758526603,
"grad_norm": 0.32879796624183655,
"learning_rate": 1.8406278424431737e-05,
"loss": 0.1902,
"step": 290
},
{
"epoch": 0.7939972714870396,
"grad_norm": 0.34064918756484985,
"learning_rate": 1.838901401334708e-05,
"loss": 0.1915,
"step": 291
},
{
"epoch": 0.7967257844474761,
"grad_norm": 0.32874321937561035,
"learning_rate": 1.8371664782625287e-05,
"loss": 0.1931,
"step": 292
},
{
"epoch": 0.7994542974079127,
"grad_norm": 0.33242276310920715,
"learning_rate": 1.835423090768096e-05,
"loss": 0.1933,
"step": 293
},
{
"epoch": 0.8021828103683493,
"grad_norm": 0.3419250547885895,
"learning_rate": 1.8336712564784506e-05,
"loss": 0.1941,
"step": 294
},
{
"epoch": 0.8049113233287858,
"grad_norm": 0.32681533694267273,
"learning_rate": 1.8319109931060367e-05,
"loss": 0.1897,
"step": 295
},
{
"epoch": 0.8076398362892224,
"grad_norm": 0.3370327353477478,
"learning_rate": 1.8301423184485253e-05,
"loss": 0.192,
"step": 296
},
{
"epoch": 0.810368349249659,
"grad_norm": 0.33470556139945984,
"learning_rate": 1.82836525038863e-05,
"loss": 0.193,
"step": 297
},
{
"epoch": 0.8130968622100955,
"grad_norm": 0.3526148200035095,
"learning_rate": 1.8265798068939295e-05,
"loss": 0.1971,
"step": 298
},
{
"epoch": 0.8158253751705321,
"grad_norm": 0.32294756174087524,
"learning_rate": 1.824786006016685e-05,
"loss": 0.192,
"step": 299
},
{
"epoch": 0.8185538881309686,
"grad_norm": 0.33643051981925964,
"learning_rate": 1.8229838658936566e-05,
"loss": 0.1891,
"step": 300
},
{
"epoch": 0.8212824010914052,
"grad_norm": 0.32787808775901794,
"learning_rate": 1.821173404745922e-05,
"loss": 0.1896,
"step": 301
},
{
"epoch": 0.8240109140518418,
"grad_norm": 0.3204740285873413,
"learning_rate": 1.81935464087869e-05,
"loss": 0.1893,
"step": 302
},
{
"epoch": 0.8267394270122783,
"grad_norm": 0.3371548056602478,
"learning_rate": 1.8175275926811173e-05,
"loss": 0.1931,
"step": 303
},
{
"epoch": 0.8294679399727148,
"grad_norm": 0.32333609461784363,
"learning_rate": 1.815692278626122e-05,
"loss": 0.1907,
"step": 304
},
{
"epoch": 0.8321964529331515,
"grad_norm": 0.3068220019340515,
"learning_rate": 1.813848717270195e-05,
"loss": 0.1863,
"step": 305
},
{
"epoch": 0.834924965893588,
"grad_norm": 0.317272424697876,
"learning_rate": 1.8119969272532164e-05,
"loss": 0.19,
"step": 306
},
{
"epoch": 0.8376534788540245,
"grad_norm": 0.318190336227417,
"learning_rate": 1.8101369272982633e-05,
"loss": 0.1904,
"step": 307
},
{
"epoch": 0.8403819918144612,
"grad_norm": 0.34059062600135803,
"learning_rate": 1.808268736211421e-05,
"loss": 0.1925,
"step": 308
},
{
"epoch": 0.8431105047748977,
"grad_norm": 0.3187810778617859,
"learning_rate": 1.806392372881596e-05,
"loss": 0.1908,
"step": 309
},
{
"epoch": 0.8458390177353342,
"grad_norm": 0.32425281405448914,
"learning_rate": 1.8045078562803203e-05,
"loss": 0.1898,
"step": 310
},
{
"epoch": 0.8485675306957708,
"grad_norm": 0.32228004932403564,
"learning_rate": 1.8026152054615633e-05,
"loss": 0.1893,
"step": 311
},
{
"epoch": 0.8512960436562074,
"grad_norm": 0.3189632296562195,
"learning_rate": 1.800714439561538e-05,
"loss": 0.1909,
"step": 312
},
{
"epoch": 0.8540245566166439,
"grad_norm": 0.32371801137924194,
"learning_rate": 1.7988055777985066e-05,
"loss": 0.191,
"step": 313
},
{
"epoch": 0.8567530695770805,
"grad_norm": 0.3115307688713074,
"learning_rate": 1.7968886394725876e-05,
"loss": 0.1882,
"step": 314
},
{
"epoch": 0.859481582537517,
"grad_norm": 0.3097411096096039,
"learning_rate": 1.7949636439655592e-05,
"loss": 0.1893,
"step": 315
},
{
"epoch": 0.8622100954979536,
"grad_norm": 0.31214120984077454,
"learning_rate": 1.793030610740665e-05,
"loss": 0.1908,
"step": 316
},
{
"epoch": 0.8649386084583902,
"grad_norm": 0.3025393486022949,
"learning_rate": 1.7910895593424166e-05,
"loss": 0.187,
"step": 317
},
{
"epoch": 0.8676671214188267,
"grad_norm": 0.3117706775665283,
"learning_rate": 1.789140509396394e-05,
"loss": 0.1894,
"step": 318
},
{
"epoch": 0.8703956343792633,
"grad_norm": 0.3168593943119049,
"learning_rate": 1.7871834806090502e-05,
"loss": 0.1892,
"step": 319
},
{
"epoch": 0.8731241473396999,
"grad_norm": 0.3119298219680786,
"learning_rate": 1.7852184927675113e-05,
"loss": 0.1846,
"step": 320
},
{
"epoch": 0.8758526603001364,
"grad_norm": 0.31288179755210876,
"learning_rate": 1.7832455657393745e-05,
"loss": 0.1846,
"step": 321
},
{
"epoch": 0.878581173260573,
"grad_norm": 0.3070971965789795,
"learning_rate": 1.7812647194725093e-05,
"loss": 0.1884,
"step": 322
},
{
"epoch": 0.8813096862210096,
"grad_norm": 0.3243504762649536,
"learning_rate": 1.7792759739948546e-05,
"loss": 0.1922,
"step": 323
},
{
"epoch": 0.8840381991814461,
"grad_norm": 0.311040997505188,
"learning_rate": 1.777279349414217e-05,
"loss": 0.1902,
"step": 324
},
{
"epoch": 0.8867667121418826,
"grad_norm": 0.31191757321357727,
"learning_rate": 1.7752748659180662e-05,
"loss": 0.1834,
"step": 325
},
{
"epoch": 0.8894952251023193,
"grad_norm": 0.3067293167114258,
"learning_rate": 1.7732625437733338e-05,
"loss": 0.1875,
"step": 326
},
{
"epoch": 0.8922237380627558,
"grad_norm": 0.29551970958709717,
"learning_rate": 1.771242403326204e-05,
"loss": 0.1842,
"step": 327
},
{
"epoch": 0.8949522510231923,
"grad_norm": 0.3030517101287842,
"learning_rate": 1.7692144650019125e-05,
"loss": 0.1856,
"step": 328
},
{
"epoch": 0.897680763983629,
"grad_norm": 0.3112403154373169,
"learning_rate": 1.767178749304536e-05,
"loss": 0.1928,
"step": 329
},
{
"epoch": 0.9004092769440655,
"grad_norm": 0.3096674084663391,
"learning_rate": 1.765135276816787e-05,
"loss": 0.1889,
"step": 330
},
{
"epoch": 0.903137789904502,
"grad_norm": 0.3074805736541748,
"learning_rate": 1.7630840681998068e-05,
"loss": 0.191,
"step": 331
},
{
"epoch": 0.9058663028649386,
"grad_norm": 0.3202775716781616,
"learning_rate": 1.7610251441929532e-05,
"loss": 0.1943,
"step": 332
},
{
"epoch": 0.9085948158253752,
"grad_norm": 0.311928927898407,
"learning_rate": 1.758958525613594e-05,
"loss": 0.1878,
"step": 333
},
{
"epoch": 0.9113233287858117,
"grad_norm": 0.3051501512527466,
"learning_rate": 1.7568842333568952e-05,
"loss": 0.1879,
"step": 334
},
{
"epoch": 0.9140518417462483,
"grad_norm": 0.31153738498687744,
"learning_rate": 1.754802288395609e-05,
"loss": 0.1899,
"step": 335
},
{
"epoch": 0.9167803547066848,
"grad_norm": 0.3160760700702667,
"learning_rate": 1.7527127117798635e-05,
"loss": 0.1904,
"step": 336
},
{
"epoch": 0.9195088676671214,
"grad_norm": 0.34473487734794617,
"learning_rate": 1.750615524636948e-05,
"loss": 0.185,
"step": 337
},
{
"epoch": 0.922237380627558,
"grad_norm": 0.3128487765789032,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.1836,
"step": 338
},
{
"epoch": 0.9249658935879945,
"grad_norm": 0.3044068217277527,
"learning_rate": 1.7463984036632956e-05,
"loss": 0.1836,
"step": 339
},
{
"epoch": 0.927694406548431,
"grad_norm": 0.3154110014438629,
"learning_rate": 1.7442785124710227e-05,
"loss": 0.1899,
"step": 340
},
{
"epoch": 0.9304229195088677,
"grad_norm": 0.315696120262146,
"learning_rate": 1.742151096028076e-05,
"loss": 0.1878,
"step": 341
},
{
"epoch": 0.9331514324693042,
"grad_norm": 0.2926492393016815,
"learning_rate": 1.7400161758443377e-05,
"loss": 0.186,
"step": 342
},
{
"epoch": 0.9358799454297408,
"grad_norm": 0.30544963479042053,
"learning_rate": 1.7378737735055562e-05,
"loss": 0.1838,
"step": 343
},
{
"epoch": 0.9386084583901774,
"grad_norm": 0.3120751976966858,
"learning_rate": 1.735723910673132e-05,
"loss": 0.1864,
"step": 344
},
{
"epoch": 0.9413369713506139,
"grad_norm": 0.3031761944293976,
"learning_rate": 1.7335666090838965e-05,
"loss": 0.1881,
"step": 345
},
{
"epoch": 0.9440654843110505,
"grad_norm": 0.3090995252132416,
"learning_rate": 1.7314018905498932e-05,
"loss": 0.1915,
"step": 346
},
{
"epoch": 0.946793997271487,
"grad_norm": 0.30660194158554077,
"learning_rate": 1.729229776958157e-05,
"loss": 0.1847,
"step": 347
},
{
"epoch": 0.9495225102319236,
"grad_norm": 0.3076416254043579,
"learning_rate": 1.7270502902704925e-05,
"loss": 0.1818,
"step": 348
},
{
"epoch": 0.9522510231923602,
"grad_norm": 0.3053886592388153,
"learning_rate": 1.7248634525232523e-05,
"loss": 0.1864,
"step": 349
},
{
"epoch": 0.9549795361527967,
"grad_norm": 0.3136518597602844,
"learning_rate": 1.7226692858271133e-05,
"loss": 0.1853,
"step": 350
},
{
"epoch": 0.9577080491132333,
"grad_norm": 0.30702710151672363,
"learning_rate": 1.7204678123668556e-05,
"loss": 0.1827,
"step": 351
},
{
"epoch": 0.9604365620736699,
"grad_norm": 0.3063594698905945,
"learning_rate": 1.718259054401135e-05,
"loss": 0.1888,
"step": 352
},
{
"epoch": 0.9631650750341064,
"grad_norm": 0.29953381419181824,
"learning_rate": 1.71604303426226e-05,
"loss": 0.1823,
"step": 353
},
{
"epoch": 0.965893587994543,
"grad_norm": 0.326468288898468,
"learning_rate": 1.7138197743559656e-05,
"loss": 0.1899,
"step": 354
},
{
"epoch": 0.9686221009549796,
"grad_norm": 0.30320125818252563,
"learning_rate": 1.7115892971611864e-05,
"loss": 0.1866,
"step": 355
},
{
"epoch": 0.9713506139154161,
"grad_norm": 0.3083033263683319,
"learning_rate": 1.7093516252298296e-05,
"loss": 0.191,
"step": 356
},
{
"epoch": 0.9740791268758526,
"grad_norm": 0.3020968437194824,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.1844,
"step": 357
},
{
"epoch": 0.9768076398362893,
"grad_norm": 0.31210261583328247,
"learning_rate": 1.7048547877285078e-05,
"loss": 0.1823,
"step": 358
},
{
"epoch": 0.9795361527967258,
"grad_norm": 0.3151850700378418,
"learning_rate": 1.7025956676251636e-05,
"loss": 0.1874,
"step": 359
},
{
"epoch": 0.9822646657571623,
"grad_norm": 0.29550716280937195,
"learning_rate": 1.7003294437180254e-05,
"loss": 0.1847,
"step": 360
},
{
"epoch": 0.984993178717599,
"grad_norm": 0.29015323519706726,
"learning_rate": 1.6980561389204285e-05,
"loss": 0.1796,
"step": 361
},
{
"epoch": 0.9877216916780355,
"grad_norm": 0.2936837077140808,
"learning_rate": 1.695775776217301e-05,
"loss": 0.1821,
"step": 362
},
{
"epoch": 0.990450204638472,
"grad_norm": 0.3011324107646942,
"learning_rate": 1.6934883786649333e-05,
"loss": 0.1904,
"step": 363
},
{
"epoch": 0.9931787175989086,
"grad_norm": 0.29194238781929016,
"learning_rate": 1.6911939693907422e-05,
"loss": 0.183,
"step": 364
},
{
"epoch": 0.9959072305593452,
"grad_norm": 0.3032688498497009,
"learning_rate": 1.6888925715930396e-05,
"loss": 0.1832,
"step": 365
},
{
"epoch": 0.9986357435197817,
"grad_norm": 0.31402158737182617,
"learning_rate": 1.686584208540797e-05,
"loss": 0.1868,
"step": 366
},
{
"epoch": 1.0013642564802183,
"grad_norm": 0.2912708818912506,
"learning_rate": 1.68426890357341e-05,
"loss": 0.1716,
"step": 367
},
{
"epoch": 1.004092769440655,
"grad_norm": 0.3310892879962921,
"learning_rate": 1.6819466801004622e-05,
"loss": 0.1559,
"step": 368
},
{
"epoch": 1.0068212824010914,
"grad_norm": 0.28492170572280884,
"learning_rate": 1.6796175616014894e-05,
"loss": 0.1547,
"step": 369
},
{
"epoch": 1.009549795361528,
"grad_norm": 0.32012051343917847,
"learning_rate": 1.6772815716257414e-05,
"loss": 0.1536,
"step": 370
},
{
"epoch": 1.0122783083219646,
"grad_norm": 0.3411386013031006,
"learning_rate": 1.6749387337919434e-05,
"loss": 0.1486,
"step": 371
},
{
"epoch": 1.015006821282401,
"grad_norm": 0.3767143785953522,
"learning_rate": 1.672589071788059e-05,
"loss": 0.1532,
"step": 372
},
{
"epoch": 1.0177353342428377,
"grad_norm": 0.3266505002975464,
"learning_rate": 1.6702326093710493e-05,
"loss": 0.1525,
"step": 373
},
{
"epoch": 1.0204638472032743,
"grad_norm": 0.2949761152267456,
"learning_rate": 1.6678693703666327e-05,
"loss": 0.1502,
"step": 374
},
{
"epoch": 1.0231923601637107,
"grad_norm": 0.3091878294944763,
"learning_rate": 1.6654993786690445e-05,
"loss": 0.148,
"step": 375
},
{
"epoch": 1.0259208731241474,
"grad_norm": 0.3043467104434967,
"learning_rate": 1.6631226582407954e-05,
"loss": 0.1552,
"step": 376
},
{
"epoch": 1.028649386084584,
"grad_norm": 0.7541605830192566,
"learning_rate": 1.6607392331124282e-05,
"loss": 0.1487,
"step": 377
},
{
"epoch": 1.0313778990450204,
"grad_norm": 0.3137814700603485,
"learning_rate": 1.6583491273822763e-05,
"loss": 0.1545,
"step": 378
},
{
"epoch": 1.034106412005457,
"grad_norm": 0.3274117708206177,
"learning_rate": 1.6559523652162192e-05,
"loss": 0.153,
"step": 379
},
{
"epoch": 1.0368349249658937,
"grad_norm": 0.333359956741333,
"learning_rate": 1.653548970847438e-05,
"loss": 0.1531,
"step": 380
},
{
"epoch": 1.03956343792633,
"grad_norm": 0.33492153882980347,
"learning_rate": 1.651138968576171e-05,
"loss": 0.16,
"step": 381
},
{
"epoch": 1.0422919508867667,
"grad_norm": 0.30271798372268677,
"learning_rate": 1.6487223827694673e-05,
"loss": 0.1547,
"step": 382
},
{
"epoch": 1.0450204638472034,
"grad_norm": 0.30128586292266846,
"learning_rate": 1.646299237860941e-05,
"loss": 0.153,
"step": 383
},
{
"epoch": 1.0477489768076398,
"grad_norm": 0.28623437881469727,
"learning_rate": 1.643869558350524e-05,
"loss": 0.153,
"step": 384
},
{
"epoch": 1.0504774897680764,
"grad_norm": 0.29904499650001526,
"learning_rate": 1.6414333688042186e-05,
"loss": 0.1511,
"step": 385
},
{
"epoch": 1.053206002728513,
"grad_norm": 0.29639753699302673,
"learning_rate": 1.638990693853848e-05,
"loss": 0.1491,
"step": 386
},
{
"epoch": 1.0559345156889495,
"grad_norm": 0.31091123819351196,
"learning_rate": 1.6365415581968086e-05,
"loss": 0.1546,
"step": 387
},
{
"epoch": 1.058663028649386,
"grad_norm": 0.3298807740211487,
"learning_rate": 1.6340859865958193e-05,
"loss": 0.1562,
"step": 388
},
{
"epoch": 1.0613915416098227,
"grad_norm": 0.30699941515922546,
"learning_rate": 1.631624003878672e-05,
"loss": 0.1508,
"step": 389
},
{
"epoch": 1.0641200545702592,
"grad_norm": 0.30474844574928284,
"learning_rate": 1.6291556349379794e-05,
"loss": 0.1512,
"step": 390
},
{
"epoch": 1.0668485675306958,
"grad_norm": 0.3141867518424988,
"learning_rate": 1.6266809047309253e-05,
"loss": 0.1583,
"step": 391
},
{
"epoch": 1.0695770804911324,
"grad_norm": 0.2910953462123871,
"learning_rate": 1.6241998382790095e-05,
"loss": 0.1558,
"step": 392
},
{
"epoch": 1.0723055934515688,
"grad_norm": 0.28838658332824707,
"learning_rate": 1.6217124606677973e-05,
"loss": 0.1504,
"step": 393
},
{
"epoch": 1.0750341064120055,
"grad_norm": 0.30053088068962097,
"learning_rate": 1.6192187970466646e-05,
"loss": 0.1551,
"step": 394
},
{
"epoch": 1.077762619372442,
"grad_norm": 0.2822672426700592,
"learning_rate": 1.6167188726285433e-05,
"loss": 0.1473,
"step": 395
},
{
"epoch": 1.0804911323328785,
"grad_norm": 0.30218231678009033,
"learning_rate": 1.6142127126896682e-05,
"loss": 0.15,
"step": 396
},
{
"epoch": 1.0832196452933152,
"grad_norm": 0.2925630807876587,
"learning_rate": 1.611700342569319e-05,
"loss": 0.1527,
"step": 397
},
{
"epoch": 1.0859481582537518,
"grad_norm": 0.29162564873695374,
"learning_rate": 1.6091817876695655e-05,
"loss": 0.1512,
"step": 398
},
{
"epoch": 1.0886766712141882,
"grad_norm": 0.2804575264453888,
"learning_rate": 1.606657073455012e-05,
"loss": 0.1476,
"step": 399
},
{
"epoch": 1.0914051841746248,
"grad_norm": 0.2880861759185791,
"learning_rate": 1.6041262254525362e-05,
"loss": 0.1535,
"step": 400
},
{
"epoch": 1.0941336971350615,
"grad_norm": 0.28040415048599243,
"learning_rate": 1.601589269251035e-05,
"loss": 0.1491,
"step": 401
},
{
"epoch": 1.096862210095498,
"grad_norm": 0.29513758420944214,
"learning_rate": 1.599046230501163e-05,
"loss": 0.1532,
"step": 402
},
{
"epoch": 1.0995907230559345,
"grad_norm": 0.30574843287467957,
"learning_rate": 1.5964971349150746e-05,
"loss": 0.1493,
"step": 403
},
{
"epoch": 1.1023192360163712,
"grad_norm": 0.2899196445941925,
"learning_rate": 1.593942008266164e-05,
"loss": 0.155,
"step": 404
},
{
"epoch": 1.1050477489768076,
"grad_norm": 0.27873891592025757,
"learning_rate": 1.591380876388804e-05,
"loss": 0.1487,
"step": 405
},
{
"epoch": 1.1077762619372442,
"grad_norm": 0.291477233171463,
"learning_rate": 1.5888137651780847e-05,
"loss": 0.1505,
"step": 406
},
{
"epoch": 1.1105047748976808,
"grad_norm": 0.3003198504447937,
"learning_rate": 1.5862407005895524e-05,
"loss": 0.152,
"step": 407
},
{
"epoch": 1.1132332878581173,
"grad_norm": 0.2949116826057434,
"learning_rate": 1.583661708638947e-05,
"loss": 0.156,
"step": 408
},
{
"epoch": 1.115961800818554,
"grad_norm": 0.286874383687973,
"learning_rate": 1.5810768154019386e-05,
"loss": 0.1513,
"step": 409
},
{
"epoch": 1.1186903137789905,
"grad_norm": 0.29310694336891174,
"learning_rate": 1.5784860470138633e-05,
"loss": 0.1541,
"step": 410
},
{
"epoch": 1.121418826739427,
"grad_norm": 0.28464475274086,
"learning_rate": 1.5758894296694614e-05,
"loss": 0.1528,
"step": 411
},
{
"epoch": 1.1241473396998636,
"grad_norm": 0.2829197347164154,
"learning_rate": 1.573286989622609e-05,
"loss": 0.1555,
"step": 412
},
{
"epoch": 1.1268758526603002,
"grad_norm": 0.2770795226097107,
"learning_rate": 1.5706787531860557e-05,
"loss": 0.1513,
"step": 413
},
{
"epoch": 1.1296043656207366,
"grad_norm": 0.28912314772605896,
"learning_rate": 1.568064746731156e-05,
"loss": 0.1583,
"step": 414
},
{
"epoch": 1.1323328785811733,
"grad_norm": 0.2982404828071594,
"learning_rate": 1.565444996687605e-05,
"loss": 0.1554,
"step": 415
},
{
"epoch": 1.13506139154161,
"grad_norm": 0.2901241183280945,
"learning_rate": 1.5628195295431696e-05,
"loss": 0.1536,
"step": 416
},
{
"epoch": 1.1377899045020463,
"grad_norm": 0.311419814825058,
"learning_rate": 1.5601883718434207e-05,
"loss": 0.1545,
"step": 417
},
{
"epoch": 1.140518417462483,
"grad_norm": 0.2851913571357727,
"learning_rate": 1.557551550191467e-05,
"loss": 0.1544,
"step": 418
},
{
"epoch": 1.1432469304229196,
"grad_norm": 0.2762346863746643,
"learning_rate": 1.554909091247682e-05,
"loss": 0.1499,
"step": 419
},
{
"epoch": 1.145975443383356,
"grad_norm": 0.288626492023468,
"learning_rate": 1.5522610217294377e-05,
"loss": 0.151,
"step": 420
},
{
"epoch": 1.1487039563437926,
"grad_norm": 0.28625771403312683,
"learning_rate": 1.549607368410834e-05,
"loss": 0.1533,
"step": 421
},
{
"epoch": 1.1514324693042293,
"grad_norm": 0.28250762820243835,
"learning_rate": 1.5469481581224274e-05,
"loss": 0.1536,
"step": 422
},
{
"epoch": 1.1541609822646657,
"grad_norm": 0.2767171859741211,
"learning_rate": 1.544283417750958e-05,
"loss": 0.1485,
"step": 423
},
{
"epoch": 1.1568894952251023,
"grad_norm": 0.2896566390991211,
"learning_rate": 1.5416131742390827e-05,
"loss": 0.1536,
"step": 424
},
{
"epoch": 1.159618008185539,
"grad_norm": 0.2898276746273041,
"learning_rate": 1.5389374545850973e-05,
"loss": 0.1539,
"step": 425
},
{
"epoch": 1.1623465211459754,
"grad_norm": 0.2844085693359375,
"learning_rate": 1.5362562858426655e-05,
"loss": 0.1505,
"step": 426
},
{
"epoch": 1.165075034106412,
"grad_norm": 0.28003549575805664,
"learning_rate": 1.533569695120547e-05,
"loss": 0.1509,
"step": 427
},
{
"epoch": 1.1678035470668486,
"grad_norm": 0.2836598753929138,
"learning_rate": 1.530877709582321e-05,
"loss": 0.153,
"step": 428
},
{
"epoch": 1.170532060027285,
"grad_norm": 0.2787971496582031,
"learning_rate": 1.5281803564461135e-05,
"loss": 0.1512,
"step": 429
},
{
"epoch": 1.1732605729877217,
"grad_norm": 0.2842913269996643,
"learning_rate": 1.5254776629843204e-05,
"loss": 0.1491,
"step": 430
},
{
"epoch": 1.1759890859481583,
"grad_norm": 0.2781684696674347,
"learning_rate": 1.522769656523333e-05,
"loss": 0.1509,
"step": 431
},
{
"epoch": 1.1787175989085947,
"grad_norm": 0.28691452741622925,
"learning_rate": 1.5200563644432614e-05,
"loss": 0.1517,
"step": 432
},
{
"epoch": 1.1814461118690314,
"grad_norm": 0.29217803478240967,
"learning_rate": 1.5173378141776569e-05,
"loss": 0.1549,
"step": 433
},
{
"epoch": 1.184174624829468,
"grad_norm": 0.28329455852508545,
"learning_rate": 1.5146140332132359e-05,
"loss": 0.1507,
"step": 434
},
{
"epoch": 1.1869031377899044,
"grad_norm": 0.2823912799358368,
"learning_rate": 1.5118850490896012e-05,
"loss": 0.1479,
"step": 435
},
{
"epoch": 1.189631650750341,
"grad_norm": 0.28157341480255127,
"learning_rate": 1.5091508893989633e-05,
"loss": 0.1498,
"step": 436
},
{
"epoch": 1.1923601637107777,
"grad_norm": 0.28963181376457214,
"learning_rate": 1.5064115817858622e-05,
"loss": 0.1544,
"step": 437
},
{
"epoch": 1.195088676671214,
"grad_norm": 0.2853146195411682,
"learning_rate": 1.5036671539468879e-05,
"loss": 0.1523,
"step": 438
},
{
"epoch": 1.1978171896316507,
"grad_norm": 0.27613565325737,
"learning_rate": 1.5009176336303987e-05,
"loss": 0.1527,
"step": 439
},
{
"epoch": 1.2005457025920874,
"grad_norm": 0.2780938446521759,
"learning_rate": 1.4981630486362435e-05,
"loss": 0.1506,
"step": 440
},
{
"epoch": 1.2032742155525238,
"grad_norm": 0.28589677810668945,
"learning_rate": 1.4954034268154777e-05,
"loss": 0.1544,
"step": 441
},
{
"epoch": 1.2060027285129604,
"grad_norm": 0.295510470867157,
"learning_rate": 1.4926387960700843e-05,
"loss": 0.152,
"step": 442
},
{
"epoch": 1.208731241473397,
"grad_norm": 0.281441330909729,
"learning_rate": 1.4898691843526897e-05,
"loss": 0.1516,
"step": 443
},
{
"epoch": 1.2114597544338335,
"grad_norm": 0.29004696011543274,
"learning_rate": 1.4870946196662822e-05,
"loss": 0.153,
"step": 444
},
{
"epoch": 1.21418826739427,
"grad_norm": 0.2828517258167267,
"learning_rate": 1.4843151300639282e-05,
"loss": 0.154,
"step": 445
},
{
"epoch": 1.2169167803547067,
"grad_norm": 0.2731991410255432,
"learning_rate": 1.4815307436484898e-05,
"loss": 0.1501,
"step": 446
},
{
"epoch": 1.2196452933151432,
"grad_norm": 0.27273738384246826,
"learning_rate": 1.4787414885723386e-05,
"loss": 0.1532,
"step": 447
},
{
"epoch": 1.2223738062755798,
"grad_norm": 0.2747660279273987,
"learning_rate": 1.4759473930370738e-05,
"loss": 0.1536,
"step": 448
},
{
"epoch": 1.2251023192360164,
"grad_norm": 0.2898302972316742,
"learning_rate": 1.4731484852932338e-05,
"loss": 0.1522,
"step": 449
},
{
"epoch": 1.2278308321964528,
"grad_norm": 0.2835775911808014,
"learning_rate": 1.4703447936400135e-05,
"loss": 0.1518,
"step": 450
},
{
"epoch": 1.2305593451568895,
"grad_norm": 0.28972864151000977,
"learning_rate": 1.4675363464249763e-05,
"loss": 0.1557,
"step": 451
},
{
"epoch": 1.233287858117326,
"grad_norm": 0.28045570850372314,
"learning_rate": 1.4647231720437687e-05,
"loss": 0.1509,
"step": 452
},
{
"epoch": 1.2360163710777625,
"grad_norm": 0.2846221625804901,
"learning_rate": 1.461905298939832e-05,
"loss": 0.1535,
"step": 453
},
{
"epoch": 1.2387448840381992,
"grad_norm": 0.26935145258903503,
"learning_rate": 1.4590827556041158e-05,
"loss": 0.151,
"step": 454
},
{
"epoch": 1.2414733969986358,
"grad_norm": 0.2784791886806488,
"learning_rate": 1.4562555705747894e-05,
"loss": 0.1559,
"step": 455
},
{
"epoch": 1.2442019099590724,
"grad_norm": 0.2784489393234253,
"learning_rate": 1.4534237724369534e-05,
"loss": 0.1517,
"step": 456
},
{
"epoch": 1.2469304229195088,
"grad_norm": 0.26813435554504395,
"learning_rate": 1.4505873898223498e-05,
"loss": 0.1511,
"step": 457
},
{
"epoch": 1.2496589358799455,
"grad_norm": 0.27907952666282654,
"learning_rate": 1.4477464514090745e-05,
"loss": 0.1517,
"step": 458
},
{
"epoch": 1.252387448840382,
"grad_norm": 0.2844650447368622,
"learning_rate": 1.4449009859212857e-05,
"loss": 0.1512,
"step": 459
},
{
"epoch": 1.2551159618008185,
"grad_norm": 0.28687214851379395,
"learning_rate": 1.4420510221289137e-05,
"loss": 0.1534,
"step": 460
},
{
"epoch": 1.2578444747612552,
"grad_norm": 0.27363499999046326,
"learning_rate": 1.4391965888473705e-05,
"loss": 0.1519,
"step": 461
},
{
"epoch": 1.2605729877216918,
"grad_norm": 0.28266823291778564,
"learning_rate": 1.4363377149372584e-05,
"loss": 0.1524,
"step": 462
},
{
"epoch": 1.2633015006821282,
"grad_norm": 0.27779197692871094,
"learning_rate": 1.4334744293040773e-05,
"loss": 0.1475,
"step": 463
},
{
"epoch": 1.2660300136425648,
"grad_norm": 0.27429601550102234,
"learning_rate": 1.430606760897934e-05,
"loss": 0.1514,
"step": 464
},
{
"epoch": 1.2687585266030013,
"grad_norm": 0.2683657705783844,
"learning_rate": 1.4277347387132482e-05,
"loss": 0.1476,
"step": 465
},
{
"epoch": 1.271487039563438,
"grad_norm": 0.27010712027549744,
"learning_rate": 1.4248583917884595e-05,
"loss": 0.151,
"step": 466
},
{
"epoch": 1.2742155525238745,
"grad_norm": 0.283557653427124,
"learning_rate": 1.4219777492057349e-05,
"loss": 0.153,
"step": 467
},
{
"epoch": 1.2769440654843112,
"grad_norm": 0.26577454805374146,
"learning_rate": 1.4190928400906731e-05,
"loss": 0.1493,
"step": 468
},
{
"epoch": 1.2796725784447476,
"grad_norm": 0.2751932740211487,
"learning_rate": 1.4162036936120115e-05,
"loss": 0.1501,
"step": 469
},
{
"epoch": 1.2824010914051842,
"grad_norm": 0.27455008029937744,
"learning_rate": 1.4133103389813302e-05,
"loss": 0.1484,
"step": 470
},
{
"epoch": 1.2851296043656206,
"grad_norm": 0.2785343825817108,
"learning_rate": 1.410412805452757e-05,
"loss": 0.1538,
"step": 471
},
{
"epoch": 1.2878581173260573,
"grad_norm": 0.2785497009754181,
"learning_rate": 1.4075111223226721e-05,
"loss": 0.1515,
"step": 472
},
{
"epoch": 1.290586630286494,
"grad_norm": 0.2835540175437927,
"learning_rate": 1.4046053189294114e-05,
"loss": 0.1566,
"step": 473
},
{
"epoch": 1.2933151432469305,
"grad_norm": 0.2900526523590088,
"learning_rate": 1.4016954246529697e-05,
"loss": 0.1557,
"step": 474
},
{
"epoch": 1.296043656207367,
"grad_norm": 0.28149256110191345,
"learning_rate": 1.3987814689147041e-05,
"loss": 0.1543,
"step": 475
},
{
"epoch": 1.2987721691678036,
"grad_norm": 0.2713780701160431,
"learning_rate": 1.3958634811770361e-05,
"loss": 0.1501,
"step": 476
},
{
"epoch": 1.30150068212824,
"grad_norm": 0.28081363439559937,
"learning_rate": 1.3929414909431544e-05,
"loss": 0.1528,
"step": 477
},
{
"epoch": 1.3042291950886766,
"grad_norm": 0.2775419354438782,
"learning_rate": 1.3900155277567157e-05,
"loss": 0.1542,
"step": 478
},
{
"epoch": 1.3069577080491133,
"grad_norm": 0.28829649090766907,
"learning_rate": 1.3870856212015468e-05,
"loss": 0.1542,
"step": 479
},
{
"epoch": 1.30968622100955,
"grad_norm": 0.2776946425437927,
"learning_rate": 1.3841518009013446e-05,
"loss": 0.1535,
"step": 480
},
{
"epoch": 1.3124147339699863,
"grad_norm": 0.27957138419151306,
"learning_rate": 1.3812140965193775e-05,
"loss": 0.1541,
"step": 481
},
{
"epoch": 1.315143246930423,
"grad_norm": 0.26690757274627686,
"learning_rate": 1.378272537758185e-05,
"loss": 0.1498,
"step": 482
},
{
"epoch": 1.3178717598908594,
"grad_norm": 0.2730836868286133,
"learning_rate": 1.3753271543592772e-05,
"loss": 0.1557,
"step": 483
},
{
"epoch": 1.320600272851296,
"grad_norm": 0.28376683592796326,
"learning_rate": 1.3723779761028349e-05,
"loss": 0.1544,
"step": 484
},
{
"epoch": 1.3233287858117326,
"grad_norm": 0.271511048078537,
"learning_rate": 1.3694250328074072e-05,
"loss": 0.1507,
"step": 485
},
{
"epoch": 1.3260572987721693,
"grad_norm": 0.2825874984264374,
"learning_rate": 1.3664683543296114e-05,
"loss": 0.154,
"step": 486
},
{
"epoch": 1.3287858117326057,
"grad_norm": 0.274321049451828,
"learning_rate": 1.3635079705638298e-05,
"loss": 0.1536,
"step": 487
},
{
"epoch": 1.3315143246930423,
"grad_norm": 0.27644291520118713,
"learning_rate": 1.3605439114419095e-05,
"loss": 0.1511,
"step": 488
},
{
"epoch": 1.3342428376534787,
"grad_norm": 0.2841475307941437,
"learning_rate": 1.3575762069328567e-05,
"loss": 0.1561,
"step": 489
},
{
"epoch": 1.3369713506139154,
"grad_norm": 0.2663029134273529,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.149,
"step": 490
},
{
"epoch": 1.339699863574352,
"grad_norm": 0.2731173634529114,
"learning_rate": 1.3516299818133664e-05,
"loss": 0.1505,
"step": 491
},
{
"epoch": 1.3424283765347886,
"grad_norm": 0.2730637192726135,
"learning_rate": 1.3486515213240188e-05,
"loss": 0.1498,
"step": 492
},
{
"epoch": 1.345156889495225,
"grad_norm": 0.2693032920360565,
"learning_rate": 1.3456695356891079e-05,
"loss": 0.1496,
"step": 493
},
{
"epoch": 1.3478854024556617,
"grad_norm": 0.2721140682697296,
"learning_rate": 1.3426840550588933e-05,
"loss": 0.1483,
"step": 494
},
{
"epoch": 1.350613915416098,
"grad_norm": 0.2766999304294586,
"learning_rate": 1.33969510961897e-05,
"loss": 0.1533,
"step": 495
},
{
"epoch": 1.3533424283765347,
"grad_norm": 0.28428953886032104,
"learning_rate": 1.3367027295899652e-05,
"loss": 0.1537,
"step": 496
},
{
"epoch": 1.3560709413369714,
"grad_norm": 0.2701844871044159,
"learning_rate": 1.3337069452272332e-05,
"loss": 0.1523,
"step": 497
},
{
"epoch": 1.358799454297408,
"grad_norm": 0.2715851366519928,
"learning_rate": 1.3307077868205487e-05,
"loss": 0.1508,
"step": 498
},
{
"epoch": 1.3615279672578444,
"grad_norm": 0.27837562561035156,
"learning_rate": 1.3277052846937997e-05,
"loss": 0.1504,
"step": 499
},
{
"epoch": 1.364256480218281,
"grad_norm": 0.2765304148197174,
"learning_rate": 1.3246994692046837e-05,
"loss": 0.1534,
"step": 500
},
{
"epoch": 1.3669849931787175,
"grad_norm": 0.26995524764060974,
"learning_rate": 1.321690370744397e-05,
"loss": 0.1503,
"step": 501
},
{
"epoch": 1.369713506139154,
"grad_norm": 0.2703496515750885,
"learning_rate": 1.3186780197373306e-05,
"loss": 0.1532,
"step": 502
},
{
"epoch": 1.3724420190995907,
"grad_norm": 0.2615608274936676,
"learning_rate": 1.3156624466407607e-05,
"loss": 0.1484,
"step": 503
},
{
"epoch": 1.3751705320600274,
"grad_norm": 0.26023590564727783,
"learning_rate": 1.3126436819445423e-05,
"loss": 0.1494,
"step": 504
},
{
"epoch": 1.3778990450204638,
"grad_norm": 0.27510491013526917,
"learning_rate": 1.309621756170799e-05,
"loss": 0.1513,
"step": 505
},
{
"epoch": 1.3806275579809004,
"grad_norm": 0.2730810046195984,
"learning_rate": 1.3065966998736155e-05,
"loss": 0.1545,
"step": 506
},
{
"epoch": 1.3833560709413368,
"grad_norm": 0.2774997055530548,
"learning_rate": 1.3035685436387297e-05,
"loss": 0.154,
"step": 507
},
{
"epoch": 1.3860845839017735,
"grad_norm": 0.26868611574172974,
"learning_rate": 1.300537318083221e-05,
"loss": 0.1532,
"step": 508
},
{
"epoch": 1.38881309686221,
"grad_norm": 0.2723233699798584,
"learning_rate": 1.297503053855203e-05,
"loss": 0.1535,
"step": 509
},
{
"epoch": 1.3915416098226467,
"grad_norm": 0.2790588438510895,
"learning_rate": 1.2944657816335124e-05,
"loss": 0.1495,
"step": 510
},
{
"epoch": 1.3942701227830832,
"grad_norm": 0.2784240245819092,
"learning_rate": 1.2914255321273987e-05,
"loss": 0.1515,
"step": 511
},
{
"epoch": 1.3969986357435198,
"grad_norm": 0.2623065412044525,
"learning_rate": 1.2883823360762149e-05,
"loss": 0.1484,
"step": 512
},
{
"epoch": 1.3997271487039564,
"grad_norm": 0.263235479593277,
"learning_rate": 1.2853362242491054e-05,
"loss": 0.1489,
"step": 513
},
{
"epoch": 1.4024556616643928,
"grad_norm": 0.2626051902770996,
"learning_rate": 1.2822872274446958e-05,
"loss": 0.1499,
"step": 514
},
{
"epoch": 1.4051841746248295,
"grad_norm": 0.26870426535606384,
"learning_rate": 1.2792353764907803e-05,
"loss": 0.1558,
"step": 515
},
{
"epoch": 1.407912687585266,
"grad_norm": 0.27244260907173157,
"learning_rate": 1.276180702244012e-05,
"loss": 0.1493,
"step": 516
},
{
"epoch": 1.4106412005457025,
"grad_norm": 0.2713581323623657,
"learning_rate": 1.273123235589589e-05,
"loss": 0.148,
"step": 517
},
{
"epoch": 1.4133697135061392,
"grad_norm": 0.26679232716560364,
"learning_rate": 1.2700630074409427e-05,
"loss": 0.1509,
"step": 518
},
{
"epoch": 1.4160982264665758,
"grad_norm": 0.27474647760391235,
"learning_rate": 1.2670000487394268e-05,
"loss": 0.1519,
"step": 519
},
{
"epoch": 1.4188267394270122,
"grad_norm": 0.2619456648826599,
"learning_rate": 1.2639343904540008e-05,
"loss": 0.1466,
"step": 520
},
{
"epoch": 1.4215552523874488,
"grad_norm": 0.27978694438934326,
"learning_rate": 1.260866063580921e-05,
"loss": 0.152,
"step": 521
},
{
"epoch": 1.4242837653478855,
"grad_norm": 0.2694023549556732,
"learning_rate": 1.2577950991434249e-05,
"loss": 0.1486,
"step": 522
},
{
"epoch": 1.427012278308322,
"grad_norm": 0.287028968334198,
"learning_rate": 1.254721528191417e-05,
"loss": 0.1565,
"step": 523
},
{
"epoch": 1.4297407912687585,
"grad_norm": 0.2575623691082001,
"learning_rate": 1.2516453818011567e-05,
"loss": 0.1504,
"step": 524
},
{
"epoch": 1.4324693042291952,
"grad_norm": 0.2718178331851959,
"learning_rate": 1.2485666910749427e-05,
"loss": 0.1545,
"step": 525
},
{
"epoch": 1.4351978171896316,
"grad_norm": 0.26796644926071167,
"learning_rate": 1.2454854871407993e-05,
"loss": 0.1496,
"step": 526
},
{
"epoch": 1.4379263301500682,
"grad_norm": 0.2680867612361908,
"learning_rate": 1.242401801152161e-05,
"loss": 0.1513,
"step": 527
},
{
"epoch": 1.4406548431105048,
"grad_norm": 0.2729087769985199,
"learning_rate": 1.2393156642875579e-05,
"loss": 0.157,
"step": 528
},
{
"epoch": 1.4433833560709413,
"grad_norm": 0.26261597871780396,
"learning_rate": 1.2362271077503007e-05,
"loss": 0.1508,
"step": 529
},
{
"epoch": 1.446111869031378,
"grad_norm": 0.25339093804359436,
"learning_rate": 1.2331361627681645e-05,
"loss": 0.1497,
"step": 530
},
{
"epoch": 1.4488403819918145,
"grad_norm": 0.26805320382118225,
"learning_rate": 1.2300428605930736e-05,
"loss": 0.1553,
"step": 531
},
{
"epoch": 1.451568894952251,
"grad_norm": 0.27098819613456726,
"learning_rate": 1.2269472325007858e-05,
"loss": 0.1526,
"step": 532
},
{
"epoch": 1.4542974079126876,
"grad_norm": 0.26341021060943604,
"learning_rate": 1.2238493097905754e-05,
"loss": 0.1522,
"step": 533
},
{
"epoch": 1.4570259208731242,
"grad_norm": 0.2733905613422394,
"learning_rate": 1.2207491237849174e-05,
"loss": 0.1529,
"step": 534
},
{
"epoch": 1.4597544338335606,
"grad_norm": 0.27226415276527405,
"learning_rate": 1.2176467058291699e-05,
"loss": 0.1508,
"step": 535
},
{
"epoch": 1.4624829467939973,
"grad_norm": 0.27076438069343567,
"learning_rate": 1.2145420872912586e-05,
"loss": 0.1504,
"step": 536
},
{
"epoch": 1.465211459754434,
"grad_norm": 0.2671010494232178,
"learning_rate": 1.2114352995613582e-05,
"loss": 0.1501,
"step": 537
},
{
"epoch": 1.4679399727148703,
"grad_norm": 0.26683861017227173,
"learning_rate": 1.2083263740515764e-05,
"loss": 0.1497,
"step": 538
},
{
"epoch": 1.470668485675307,
"grad_norm": 0.279232919216156,
"learning_rate": 1.2052153421956343e-05,
"loss": 0.1534,
"step": 539
},
{
"epoch": 1.4733969986357436,
"grad_norm": 0.2888340353965759,
"learning_rate": 1.2021022354485514e-05,
"loss": 0.1485,
"step": 540
},
{
"epoch": 1.4761255115961802,
"grad_norm": 0.27901962399482727,
"learning_rate": 1.1989870852863254e-05,
"loss": 0.1495,
"step": 541
},
{
"epoch": 1.4788540245566166,
"grad_norm": 0.2673856019973755,
"learning_rate": 1.1958699232056135e-05,
"loss": 0.1512,
"step": 542
},
{
"epoch": 1.4815825375170533,
"grad_norm": 0.27028724551200867,
"learning_rate": 1.1927507807234169e-05,
"loss": 0.1516,
"step": 543
},
{
"epoch": 1.4843110504774897,
"grad_norm": 0.2789236605167389,
"learning_rate": 1.1896296893767588e-05,
"loss": 0.152,
"step": 544
},
{
"epoch": 1.4870395634379263,
"grad_norm": 0.257271945476532,
"learning_rate": 1.186506680722367e-05,
"loss": 0.1483,
"step": 545
},
{
"epoch": 1.489768076398363,
"grad_norm": 0.2604617178440094,
"learning_rate": 1.1833817863363563e-05,
"loss": 0.1472,
"step": 546
},
{
"epoch": 1.4924965893587996,
"grad_norm": 0.26739516854286194,
"learning_rate": 1.180255037813906e-05,
"loss": 0.153,
"step": 547
},
{
"epoch": 1.495225102319236,
"grad_norm": 0.2652672827243805,
"learning_rate": 1.1771264667689428e-05,
"loss": 0.152,
"step": 548
},
{
"epoch": 1.4979536152796726,
"grad_norm": 0.26091867685317993,
"learning_rate": 1.1739961048338213e-05,
"loss": 0.1486,
"step": 549
},
{
"epoch": 1.500682128240109,
"grad_norm": 0.25840824842453003,
"learning_rate": 1.1708639836590024e-05,
"loss": 0.1501,
"step": 550
},
{
"epoch": 1.5034106412005457,
"grad_norm": 0.2686673104763031,
"learning_rate": 1.1677301349127349e-05,
"loss": 0.1492,
"step": 551
},
{
"epoch": 1.5061391541609823,
"grad_norm": 0.26495644450187683,
"learning_rate": 1.164594590280734e-05,
"loss": 0.1466,
"step": 552
},
{
"epoch": 1.508867667121419,
"grad_norm": 0.27758917212486267,
"learning_rate": 1.161457381465863e-05,
"loss": 0.1567,
"step": 553
},
{
"epoch": 1.5115961800818554,
"grad_norm": 0.2669970989227295,
"learning_rate": 1.15831854018781e-05,
"loss": 0.1523,
"step": 554
},
{
"epoch": 1.514324693042292,
"grad_norm": 0.258527010679245,
"learning_rate": 1.1551780981827699e-05,
"loss": 0.1495,
"step": 555
},
{
"epoch": 1.5170532060027284,
"grad_norm": 0.2614873945713043,
"learning_rate": 1.1520360872031208e-05,
"loss": 0.1483,
"step": 556
},
{
"epoch": 1.519781718963165,
"grad_norm": 0.2619025409221649,
"learning_rate": 1.148892539017106e-05,
"loss": 0.1518,
"step": 557
},
{
"epoch": 1.5225102319236017,
"grad_norm": 0.2671552002429962,
"learning_rate": 1.1457474854085095e-05,
"loss": 0.1518,
"step": 558
},
{
"epoch": 1.5252387448840383,
"grad_norm": 0.265127032995224,
"learning_rate": 1.1426009581763377e-05,
"loss": 0.1532,
"step": 559
},
{
"epoch": 1.5279672578444747,
"grad_norm": 0.2522525191307068,
"learning_rate": 1.139452989134496e-05,
"loss": 0.1465,
"step": 560
},
{
"epoch": 1.5306957708049114,
"grad_norm": 0.2574995160102844,
"learning_rate": 1.1363036101114671e-05,
"loss": 0.1453,
"step": 561
},
{
"epoch": 1.5334242837653478,
"grad_norm": 0.26629239320755005,
"learning_rate": 1.1331528529499909e-05,
"loss": 0.1468,
"step": 562
},
{
"epoch": 1.5361527967257844,
"grad_norm": 0.25669702887535095,
"learning_rate": 1.1300007495067403e-05,
"loss": 0.1493,
"step": 563
},
{
"epoch": 1.538881309686221,
"grad_norm": 0.28362584114074707,
"learning_rate": 1.1268473316520007e-05,
"loss": 0.1581,
"step": 564
},
{
"epoch": 1.5416098226466577,
"grad_norm": 0.26283422112464905,
"learning_rate": 1.123692631269348e-05,
"loss": 0.1521,
"step": 565
},
{
"epoch": 1.544338335607094,
"grad_norm": 0.2613787055015564,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.1526,
"step": 566
},
{
"epoch": 1.5470668485675307,
"grad_norm": 0.25800710916519165,
"learning_rate": 1.1173795105191146e-05,
"loss": 0.1487,
"step": 567
},
{
"epoch": 1.5497953615279672,
"grad_norm": 0.2534065246582031,
"learning_rate": 1.1142211539822318e-05,
"loss": 0.1481,
"step": 568
},
{
"epoch": 1.5525238744884038,
"grad_norm": 0.2590927481651306,
"learning_rate": 1.1110616425781833e-05,
"loss": 0.15,
"step": 569
},
{
"epoch": 1.5552523874488404,
"grad_norm": 0.2595134377479553,
"learning_rate": 1.1079010082521557e-05,
"loss": 0.1466,
"step": 570
},
{
"epoch": 1.557980900409277,
"grad_norm": 0.2580026090145111,
"learning_rate": 1.1047392829606876e-05,
"loss": 0.1478,
"step": 571
},
{
"epoch": 1.5607094133697135,
"grad_norm": 0.2645810544490814,
"learning_rate": 1.101576498671349e-05,
"loss": 0.1485,
"step": 572
},
{
"epoch": 1.56343792633015,
"grad_norm": 0.2832702696323395,
"learning_rate": 1.098412687362418e-05,
"loss": 0.1514,
"step": 573
},
{
"epoch": 1.5661664392905865,
"grad_norm": 0.26239389181137085,
"learning_rate": 1.095247881022555e-05,
"loss": 0.1511,
"step": 574
},
{
"epoch": 1.5688949522510232,
"grad_norm": 0.2691895067691803,
"learning_rate": 1.0920821116504816e-05,
"loss": 0.1507,
"step": 575
},
{
"epoch": 1.5716234652114598,
"grad_norm": 0.26281505823135376,
"learning_rate": 1.0889154112546569e-05,
"loss": 0.1483,
"step": 576
},
{
"epoch": 1.5743519781718964,
"grad_norm": 0.2522363066673279,
"learning_rate": 1.0857478118529534e-05,
"loss": 0.1498,
"step": 577
},
{
"epoch": 1.5770804911323328,
"grad_norm": 0.25809285044670105,
"learning_rate": 1.0825793454723325e-05,
"loss": 0.1487,
"step": 578
},
{
"epoch": 1.5798090040927695,
"grad_norm": 0.25884050130844116,
"learning_rate": 1.079410044148522e-05,
"loss": 0.1478,
"step": 579
},
{
"epoch": 1.5825375170532059,
"grad_norm": 0.2594729959964752,
"learning_rate": 1.0762399399256917e-05,
"loss": 0.1495,
"step": 580
},
{
"epoch": 1.5852660300136425,
"grad_norm": 0.2587921619415283,
"learning_rate": 1.0730690648561293e-05,
"loss": 0.1504,
"step": 581
},
{
"epoch": 1.5879945429740792,
"grad_norm": 0.2573103606700897,
"learning_rate": 1.0698974509999159e-05,
"loss": 0.1499,
"step": 582
},
{
"epoch": 1.5907230559345158,
"grad_norm": 0.2691209614276886,
"learning_rate": 1.0667251304246028e-05,
"loss": 0.1512,
"step": 583
},
{
"epoch": 1.5934515688949522,
"grad_norm": 0.2579781115055084,
"learning_rate": 1.0635521352048873e-05,
"loss": 0.1519,
"step": 584
},
{
"epoch": 1.5961800818553888,
"grad_norm": 0.26046183705329895,
"learning_rate": 1.0603784974222862e-05,
"loss": 0.1517,
"step": 585
},
{
"epoch": 1.5989085948158253,
"grad_norm": 0.2546302378177643,
"learning_rate": 1.057204249164815e-05,
"loss": 0.1527,
"step": 586
},
{
"epoch": 1.601637107776262,
"grad_norm": 0.2573506534099579,
"learning_rate": 1.0540294225266608e-05,
"loss": 0.1479,
"step": 587
},
{
"epoch": 1.6043656207366985,
"grad_norm": 0.25746142864227295,
"learning_rate": 1.0508540496078582e-05,
"loss": 0.1504,
"step": 588
},
{
"epoch": 1.6070941336971352,
"grad_norm": 0.26589056849479675,
"learning_rate": 1.0476781625139655e-05,
"loss": 0.1543,
"step": 589
},
{
"epoch": 1.6098226466575716,
"grad_norm": 0.2670317590236664,
"learning_rate": 1.0445017933557404e-05,
"loss": 0.1496,
"step": 590
},
{
"epoch": 1.6125511596180082,
"grad_norm": 0.26336291432380676,
"learning_rate": 1.0413249742488132e-05,
"loss": 0.1456,
"step": 591
},
{
"epoch": 1.6152796725784446,
"grad_norm": 0.2557404637336731,
"learning_rate": 1.0381477373133652e-05,
"loss": 0.1484,
"step": 592
},
{
"epoch": 1.6180081855388813,
"grad_norm": 0.2566952109336853,
"learning_rate": 1.0349701146738007e-05,
"loss": 0.1476,
"step": 593
},
{
"epoch": 1.620736698499318,
"grad_norm": 0.25710833072662354,
"learning_rate": 1.0317921384584245e-05,
"loss": 0.1482,
"step": 594
},
{
"epoch": 1.6234652114597545,
"grad_norm": 0.2633715271949768,
"learning_rate": 1.0286138407991171e-05,
"loss": 0.1519,
"step": 595
},
{
"epoch": 1.626193724420191,
"grad_norm": 0.2598545551300049,
"learning_rate": 1.0254352538310075e-05,
"loss": 0.1463,
"step": 596
},
{
"epoch": 1.6289222373806276,
"grad_norm": 0.26151853799819946,
"learning_rate": 1.0222564096921505e-05,
"loss": 0.148,
"step": 597
},
{
"epoch": 1.631650750341064,
"grad_norm": 0.26459234952926636,
"learning_rate": 1.0190773405232024e-05,
"loss": 0.1519,
"step": 598
},
{
"epoch": 1.6343792633015006,
"grad_norm": 0.26215240359306335,
"learning_rate": 1.0158980784670927e-05,
"loss": 0.1498,
"step": 599
},
{
"epoch": 1.6371077762619373,
"grad_norm": 0.2636496126651764,
"learning_rate": 1.012718655668702e-05,
"loss": 0.1499,
"step": 600
},
{
"epoch": 1.639836289222374,
"grad_norm": 0.2527211904525757,
"learning_rate": 1.0095391042745362e-05,
"loss": 0.1475,
"step": 601
},
{
"epoch": 1.6425648021828103,
"grad_norm": 0.2661401629447937,
"learning_rate": 1.0063594564324014e-05,
"loss": 0.1491,
"step": 602
},
{
"epoch": 1.645293315143247,
"grad_norm": 0.26444581151008606,
"learning_rate": 1.0031797442910788e-05,
"loss": 0.1485,
"step": 603
},
{
"epoch": 1.6480218281036834,
"grad_norm": 0.2744594216346741,
"learning_rate": 1e-05,
"loss": 0.151,
"step": 604
},
{
"epoch": 1.65075034106412,
"grad_norm": 0.2647473216056824,
"learning_rate": 9.968202557089213e-06,
"loss": 0.1499,
"step": 605
},
{
"epoch": 1.6534788540245566,
"grad_norm": 0.25674381852149963,
"learning_rate": 9.936405435675991e-06,
"loss": 0.1476,
"step": 606
},
{
"epoch": 1.6562073669849933,
"grad_norm": 0.24829277396202087,
"learning_rate": 9.904608957254643e-06,
"loss": 0.1466,
"step": 607
},
{
"epoch": 1.65893587994543,
"grad_norm": 0.26628366112709045,
"learning_rate": 9.872813443312984e-06,
"loss": 0.1495,
"step": 608
},
{
"epoch": 1.6616643929058663,
"grad_norm": 0.25105050206184387,
"learning_rate": 9.84101921532908e-06,
"loss": 0.1457,
"step": 609
},
{
"epoch": 1.6643929058663027,
"grad_norm": 0.25182658433914185,
"learning_rate": 9.809226594767979e-06,
"loss": 0.1456,
"step": 610
},
{
"epoch": 1.6671214188267394,
"grad_norm": 0.25435999035835266,
"learning_rate": 9.777435903078493e-06,
"loss": 0.1491,
"step": 611
},
{
"epoch": 1.669849931787176,
"grad_norm": 0.26035425066947937,
"learning_rate": 9.745647461689932e-06,
"loss": 0.1482,
"step": 612
},
{
"epoch": 1.6725784447476126,
"grad_norm": 0.2703112065792084,
"learning_rate": 9.713861592008834e-06,
"loss": 0.1507,
"step": 613
},
{
"epoch": 1.6753069577080493,
"grad_norm": 0.2530798316001892,
"learning_rate": 9.682078615415755e-06,
"loss": 0.1467,
"step": 614
},
{
"epoch": 1.6780354706684857,
"grad_norm": 0.2670607566833496,
"learning_rate": 9.650298853261998e-06,
"loss": 0.1507,
"step": 615
},
{
"epoch": 1.680763983628922,
"grad_norm": 0.25545379519462585,
"learning_rate": 9.618522626866351e-06,
"loss": 0.1462,
"step": 616
},
{
"epoch": 1.6834924965893587,
"grad_norm": 0.2772793769836426,
"learning_rate": 9.586750257511868e-06,
"loss": 0.1516,
"step": 617
},
{
"epoch": 1.6862210095497954,
"grad_norm": 0.24653498828411102,
"learning_rate": 9.554982066442601e-06,
"loss": 0.1422,
"step": 618
},
{
"epoch": 1.688949522510232,
"grad_norm": 0.2552987039089203,
"learning_rate": 9.523218374860348e-06,
"loss": 0.145,
"step": 619
},
{
"epoch": 1.6916780354706686,
"grad_norm": 0.26025906205177307,
"learning_rate": 9.49145950392142e-06,
"loss": 0.1481,
"step": 620
},
{
"epoch": 1.694406548431105,
"grad_norm": 0.2519054114818573,
"learning_rate": 9.459705774733397e-06,
"loss": 0.1487,
"step": 621
},
{
"epoch": 1.6971350613915415,
"grad_norm": 0.2666691839694977,
"learning_rate": 9.427957508351852e-06,
"loss": 0.1536,
"step": 622
},
{
"epoch": 1.699863574351978,
"grad_norm": 0.2632191479206085,
"learning_rate": 9.39621502577714e-06,
"loss": 0.1465,
"step": 623
},
{
"epoch": 1.7025920873124147,
"grad_norm": 0.2520408034324646,
"learning_rate": 9.364478647951132e-06,
"loss": 0.1453,
"step": 624
},
{
"epoch": 1.7053206002728514,
"grad_norm": 0.25211259722709656,
"learning_rate": 9.332748695753973e-06,
"loss": 0.148,
"step": 625
},
{
"epoch": 1.708049113233288,
"grad_norm": 0.2573055326938629,
"learning_rate": 9.301025490000843e-06,
"loss": 0.148,
"step": 626
},
{
"epoch": 1.7107776261937244,
"grad_norm": 0.25375857949256897,
"learning_rate": 9.26930935143871e-06,
"loss": 0.1488,
"step": 627
},
{
"epoch": 1.7135061391541608,
"grad_norm": 0.26745373010635376,
"learning_rate": 9.237600600743086e-06,
"loss": 0.151,
"step": 628
},
{
"epoch": 1.7162346521145975,
"grad_norm": 0.24393707513809204,
"learning_rate": 9.20589955851478e-06,
"loss": 0.1455,
"step": 629
},
{
"epoch": 1.718963165075034,
"grad_norm": 0.26576268672943115,
"learning_rate": 9.174206545276678e-06,
"loss": 0.149,
"step": 630
},
{
"epoch": 1.7216916780354707,
"grad_norm": 0.2506358325481415,
"learning_rate": 9.14252188147047e-06,
"loss": 0.1475,
"step": 631
},
{
"epoch": 1.7244201909959074,
"grad_norm": 0.2575211226940155,
"learning_rate": 9.11084588745343e-06,
"loss": 0.1513,
"step": 632
},
{
"epoch": 1.7271487039563438,
"grad_norm": 0.2662312090396881,
"learning_rate": 9.07917888349519e-06,
"loss": 0.1461,
"step": 633
},
{
"epoch": 1.7298772169167802,
"grad_norm": 0.2556709349155426,
"learning_rate": 9.047521189774456e-06,
"loss": 0.1493,
"step": 634
},
{
"epoch": 1.7326057298772168,
"grad_norm": 0.2519993185997009,
"learning_rate": 9.015873126375822e-06,
"loss": 0.1483,
"step": 635
},
{
"epoch": 1.7353342428376535,
"grad_norm": 0.24838009476661682,
"learning_rate": 8.984235013286512e-06,
"loss": 0.1471,
"step": 636
},
{
"epoch": 1.73806275579809,
"grad_norm": 0.2519931495189667,
"learning_rate": 8.952607170393126e-06,
"loss": 0.1474,
"step": 637
},
{
"epoch": 1.7407912687585267,
"grad_norm": 0.25966668128967285,
"learning_rate": 8.920989917478446e-06,
"loss": 0.147,
"step": 638
},
{
"epoch": 1.7435197817189632,
"grad_norm": 0.2602226138114929,
"learning_rate": 8.88938357421817e-06,
"loss": 0.1482,
"step": 639
},
{
"epoch": 1.7462482946793996,
"grad_norm": 0.2486027628183365,
"learning_rate": 8.857788460177685e-06,
"loss": 0.1477,
"step": 640
},
{
"epoch": 1.7489768076398362,
"grad_norm": 0.2588677704334259,
"learning_rate": 8.826204894808856e-06,
"loss": 0.1504,
"step": 641
},
{
"epoch": 1.7517053206002728,
"grad_norm": 0.25979679822921753,
"learning_rate": 8.79463319744677e-06,
"loss": 0.1521,
"step": 642
},
{
"epoch": 1.7544338335607095,
"grad_norm": 0.24735727906227112,
"learning_rate": 8.763073687306523e-06,
"loss": 0.1493,
"step": 643
},
{
"epoch": 1.7571623465211461,
"grad_norm": 0.24922150373458862,
"learning_rate": 8.731526683479991e-06,
"loss": 0.1469,
"step": 644
},
{
"epoch": 1.7598908594815825,
"grad_norm": 0.24837970733642578,
"learning_rate": 8.699992504932599e-06,
"loss": 0.1489,
"step": 645
},
{
"epoch": 1.762619372442019,
"grad_norm": 0.24551905691623688,
"learning_rate": 8.668471470500094e-06,
"loss": 0.1468,
"step": 646
},
{
"epoch": 1.7653478854024556,
"grad_norm": 0.25992244482040405,
"learning_rate": 8.63696389888533e-06,
"loss": 0.1512,
"step": 647
},
{
"epoch": 1.7680763983628922,
"grad_norm": 0.2457309365272522,
"learning_rate": 8.605470108655046e-06,
"loss": 0.1476,
"step": 648
},
{
"epoch": 1.7708049113233288,
"grad_norm": 0.2568957805633545,
"learning_rate": 8.573990418236626e-06,
"loss": 0.1516,
"step": 649
},
{
"epoch": 1.7735334242837655,
"grad_norm": 0.2809511423110962,
"learning_rate": 8.542525145914907e-06,
"loss": 0.1479,
"step": 650
},
{
"epoch": 1.776261937244202,
"grad_norm": 0.25830382108688354,
"learning_rate": 8.511074609828944e-06,
"loss": 0.1464,
"step": 651
},
{
"epoch": 1.7789904502046383,
"grad_norm": 0.2584952414035797,
"learning_rate": 8.479639127968793e-06,
"loss": 0.1477,
"step": 652
},
{
"epoch": 1.781718963165075,
"grad_norm": 0.26085707545280457,
"learning_rate": 8.448219018172303e-06,
"loss": 0.1493,
"step": 653
},
{
"epoch": 1.7844474761255116,
"grad_norm": 0.2670457363128662,
"learning_rate": 8.416814598121901e-06,
"loss": 0.1499,
"step": 654
},
{
"epoch": 1.7871759890859482,
"grad_norm": 0.25824981927871704,
"learning_rate": 8.385426185341374e-06,
"loss": 0.147,
"step": 655
},
{
"epoch": 1.7899045020463848,
"grad_norm": 0.25377392768859863,
"learning_rate": 8.35405409719266e-06,
"loss": 0.1472,
"step": 656
},
{
"epoch": 1.7926330150068213,
"grad_norm": 0.24447529017925262,
"learning_rate": 8.322698650872656e-06,
"loss": 0.1464,
"step": 657
},
{
"epoch": 1.795361527967258,
"grad_norm": 0.24391107261180878,
"learning_rate": 8.291360163409978e-06,
"loss": 0.1445,
"step": 658
},
{
"epoch": 1.7980900409276943,
"grad_norm": 0.25044581294059753,
"learning_rate": 8.260038951661787e-06,
"loss": 0.1461,
"step": 659
},
{
"epoch": 1.800818553888131,
"grad_norm": 0.2596738636493683,
"learning_rate": 8.228735332310575e-06,
"loss": 0.1498,
"step": 660
},
{
"epoch": 1.8035470668485676,
"grad_norm": 0.26180022954940796,
"learning_rate": 8.197449621860944e-06,
"loss": 0.1497,
"step": 661
},
{
"epoch": 1.8062755798090042,
"grad_norm": 0.24803955852985382,
"learning_rate": 8.16618213663644e-06,
"loss": 0.1462,
"step": 662
},
{
"epoch": 1.8090040927694406,
"grad_norm": 0.2657015919685364,
"learning_rate": 8.134933192776333e-06,
"loss": 0.1488,
"step": 663
},
{
"epoch": 1.8117326057298773,
"grad_norm": 0.25257861614227295,
"learning_rate": 8.103703106232416e-06,
"loss": 0.1476,
"step": 664
},
{
"epoch": 1.8144611186903137,
"grad_norm": 0.24522091448307037,
"learning_rate": 8.072492192765833e-06,
"loss": 0.1444,
"step": 665
},
{
"epoch": 1.8171896316507503,
"grad_norm": 0.24656806886196136,
"learning_rate": 8.041300767943867e-06,
"loss": 0.1461,
"step": 666
},
{
"epoch": 1.819918144611187,
"grad_norm": 0.2571374475955963,
"learning_rate": 8.010129147136749e-06,
"loss": 0.1508,
"step": 667
},
{
"epoch": 1.8226466575716236,
"grad_norm": 0.2505132853984833,
"learning_rate": 7.978977645514488e-06,
"loss": 0.1492,
"step": 668
},
{
"epoch": 1.82537517053206,
"grad_norm": 0.25223055481910706,
"learning_rate": 7.947846578043658e-06,
"loss": 0.1465,
"step": 669
},
{
"epoch": 1.8281036834924966,
"grad_norm": 0.2564716637134552,
"learning_rate": 7.916736259484239e-06,
"loss": 0.1487,
"step": 670
},
{
"epoch": 1.830832196452933,
"grad_norm": 0.24133503437042236,
"learning_rate": 7.885647004386421e-06,
"loss": 0.1443,
"step": 671
},
{
"epoch": 1.8335607094133697,
"grad_norm": 0.24343539774417877,
"learning_rate": 7.854579127087418e-06,
"loss": 0.1434,
"step": 672
},
{
"epoch": 1.8362892223738063,
"grad_norm": 0.25764116644859314,
"learning_rate": 7.823532941708305e-06,
"loss": 0.1504,
"step": 673
},
{
"epoch": 1.839017735334243,
"grad_norm": 0.2580263018608093,
"learning_rate": 7.792508762150833e-06,
"loss": 0.1524,
"step": 674
},
{
"epoch": 1.8417462482946794,
"grad_norm": 0.24214167892932892,
"learning_rate": 7.761506902094248e-06,
"loss": 0.146,
"step": 675
},
{
"epoch": 1.844474761255116,
"grad_norm": 0.25198671221733093,
"learning_rate": 7.730527674992143e-06,
"loss": 0.1488,
"step": 676
},
{
"epoch": 1.8472032742155524,
"grad_norm": 0.24607273936271667,
"learning_rate": 7.699571394069269e-06,
"loss": 0.1444,
"step": 677
},
{
"epoch": 1.849931787175989,
"grad_norm": 0.24693147838115692,
"learning_rate": 7.668638372318359e-06,
"loss": 0.1467,
"step": 678
},
{
"epoch": 1.8526603001364257,
"grad_norm": 0.2474067658185959,
"learning_rate": 7.637728922496996e-06,
"loss": 0.1431,
"step": 679
},
{
"epoch": 1.8553888130968623,
"grad_norm": 0.2558061182498932,
"learning_rate": 7.606843357124426e-06,
"loss": 0.1494,
"step": 680
},
{
"epoch": 1.8581173260572987,
"grad_norm": 0.25092291831970215,
"learning_rate": 7.575981988478393e-06,
"loss": 0.147,
"step": 681
},
{
"epoch": 1.8608458390177354,
"grad_norm": 0.25070279836654663,
"learning_rate": 7.545145128592009e-06,
"loss": 0.1464,
"step": 682
},
{
"epoch": 1.8635743519781718,
"grad_norm": 0.24806460738182068,
"learning_rate": 7.514333089250577e-06,
"loss": 0.1434,
"step": 683
},
{
"epoch": 1.8663028649386084,
"grad_norm": 0.26147374510765076,
"learning_rate": 7.483546181988437e-06,
"loss": 0.1499,
"step": 684
},
{
"epoch": 1.869031377899045,
"grad_norm": 0.2504315972328186,
"learning_rate": 7.452784718085834e-06,
"loss": 0.1467,
"step": 685
},
{
"epoch": 1.8717598908594817,
"grad_norm": 0.2534312307834625,
"learning_rate": 7.422049008565757e-06,
"loss": 0.1468,
"step": 686
},
{
"epoch": 1.874488403819918,
"grad_norm": 0.2536216676235199,
"learning_rate": 7.391339364190794e-06,
"loss": 0.1472,
"step": 687
},
{
"epoch": 1.8772169167803547,
"grad_norm": 0.25117945671081543,
"learning_rate": 7.360656095459995e-06,
"loss": 0.1479,
"step": 688
},
{
"epoch": 1.8799454297407912,
"grad_norm": 0.25060558319091797,
"learning_rate": 7.329999512605738e-06,
"loss": 0.145,
"step": 689
},
{
"epoch": 1.8826739427012278,
"grad_norm": 0.2718704640865326,
"learning_rate": 7.299369925590575e-06,
"loss": 0.1471,
"step": 690
},
{
"epoch": 1.8854024556616644,
"grad_norm": 0.25542333722114563,
"learning_rate": 7.268767644104114e-06,
"loss": 0.1492,
"step": 691
},
{
"epoch": 1.888130968622101,
"grad_norm": 0.25371450185775757,
"learning_rate": 7.2381929775598835e-06,
"loss": 0.1492,
"step": 692
},
{
"epoch": 1.8908594815825375,
"grad_norm": 0.25434619188308716,
"learning_rate": 7.207646235092201e-06,
"loss": 0.1441,
"step": 693
},
{
"epoch": 1.893587994542974,
"grad_norm": 0.2546011805534363,
"learning_rate": 7.1771277255530456e-06,
"loss": 0.1473,
"step": 694
},
{
"epoch": 1.8963165075034105,
"grad_norm": 0.2435360848903656,
"learning_rate": 7.14663775750895e-06,
"loss": 0.1449,
"step": 695
},
{
"epoch": 1.8990450204638472,
"grad_norm": 0.2683243751525879,
"learning_rate": 7.116176639237853e-06,
"loss": 0.147,
"step": 696
},
{
"epoch": 1.9017735334242838,
"grad_norm": 0.2562590539455414,
"learning_rate": 7.085744678726013e-06,
"loss": 0.1474,
"step": 697
},
{
"epoch": 1.9045020463847204,
"grad_norm": 0.25696176290512085,
"learning_rate": 7.05534218366488e-06,
"loss": 0.1459,
"step": 698
},
{
"epoch": 1.9072305593451568,
"grad_norm": 0.2516495883464813,
"learning_rate": 7.024969461447973e-06,
"loss": 0.1484,
"step": 699
},
{
"epoch": 1.9099590723055935,
"grad_norm": 0.25367051362991333,
"learning_rate": 6.994626819167789e-06,
"loss": 0.1497,
"step": 700
},
{
"epoch": 1.9126875852660299,
"grad_norm": 0.24233455955982208,
"learning_rate": 6.964314563612709e-06,
"loss": 0.1458,
"step": 701
},
{
"epoch": 1.9154160982264665,
"grad_norm": 0.2497435063123703,
"learning_rate": 6.934033001263847e-06,
"loss": 0.1464,
"step": 702
},
{
"epoch": 1.9181446111869032,
"grad_norm": 0.250087171792984,
"learning_rate": 6.9037824382920145e-06,
"loss": 0.1459,
"step": 703
},
{
"epoch": 1.9208731241473398,
"grad_norm": 0.24459220468997955,
"learning_rate": 6.873563180554583e-06,
"loss": 0.1465,
"step": 704
},
{
"epoch": 1.9236016371077762,
"grad_norm": 0.238655686378479,
"learning_rate": 6.843375533592395e-06,
"loss": 0.1463,
"step": 705
},
{
"epoch": 1.9263301500682128,
"grad_norm": 0.24296574294567108,
"learning_rate": 6.813219802626698e-06,
"loss": 0.1441,
"step": 706
},
{
"epoch": 1.9290586630286493,
"grad_norm": 0.25467896461486816,
"learning_rate": 6.783096292556035e-06,
"loss": 0.15,
"step": 707
},
{
"epoch": 1.931787175989086,
"grad_norm": 0.24202294647693634,
"learning_rate": 6.7530053079531664e-06,
"loss": 0.1453,
"step": 708
},
{
"epoch": 1.9345156889495225,
"grad_norm": 0.2509196996688843,
"learning_rate": 6.722947153062003e-06,
"loss": 0.1478,
"step": 709
},
{
"epoch": 1.9372442019099592,
"grad_norm": 0.25373610854148865,
"learning_rate": 6.692922131794517e-06,
"loss": 0.1495,
"step": 710
},
{
"epoch": 1.9399727148703958,
"grad_norm": 0.25147223472595215,
"learning_rate": 6.662930547727668e-06,
"loss": 0.1488,
"step": 711
},
{
"epoch": 1.9427012278308322,
"grad_norm": 0.25598257780075073,
"learning_rate": 6.632972704100349e-06,
"loss": 0.1435,
"step": 712
},
{
"epoch": 1.9454297407912686,
"grad_norm": 0.24954983592033386,
"learning_rate": 6.603048903810305e-06,
"loss": 0.146,
"step": 713
},
{
"epoch": 1.9481582537517053,
"grad_norm": 0.24284091591835022,
"learning_rate": 6.573159449411071e-06,
"loss": 0.1424,
"step": 714
},
{
"epoch": 1.950886766712142,
"grad_norm": 0.2458324134349823,
"learning_rate": 6.5433046431089205e-06,
"loss": 0.1455,
"step": 715
},
{
"epoch": 1.9536152796725785,
"grad_norm": 0.24704335629940033,
"learning_rate": 6.513484786759818e-06,
"loss": 0.1454,
"step": 716
},
{
"epoch": 1.9563437926330152,
"grad_norm": 0.24183446168899536,
"learning_rate": 6.483700181866337e-06,
"loss": 0.1449,
"step": 717
},
{
"epoch": 1.9590723055934516,
"grad_norm": 0.24773114919662476,
"learning_rate": 6.453951129574644e-06,
"loss": 0.1441,
"step": 718
},
{
"epoch": 1.961800818553888,
"grad_norm": 0.23333759605884552,
"learning_rate": 6.42423793067144e-06,
"loss": 0.1391,
"step": 719
},
{
"epoch": 1.9645293315143246,
"grad_norm": 0.24517878890037537,
"learning_rate": 6.39456088558091e-06,
"loss": 0.1409,
"step": 720
},
{
"epoch": 1.9672578444747613,
"grad_norm": 0.2502005100250244,
"learning_rate": 6.364920294361701e-06,
"loss": 0.1456,
"step": 721
},
{
"epoch": 1.969986357435198,
"grad_norm": 0.250099778175354,
"learning_rate": 6.335316456703891e-06,
"loss": 0.1481,
"step": 722
},
{
"epoch": 1.9727148703956345,
"grad_norm": 0.25519365072250366,
"learning_rate": 6.3057496719259314e-06,
"loss": 0.1482,
"step": 723
},
{
"epoch": 1.975443383356071,
"grad_norm": 0.24322687089443207,
"learning_rate": 6.276220238971653e-06,
"loss": 0.1437,
"step": 724
},
{
"epoch": 1.9781718963165074,
"grad_norm": 0.2547174096107483,
"learning_rate": 6.2467284564072294e-06,
"loss": 0.1489,
"step": 725
},
{
"epoch": 1.980900409276944,
"grad_norm": 0.24615444242954254,
"learning_rate": 6.2172746224181524e-06,
"loss": 0.145,
"step": 726
},
{
"epoch": 1.9836289222373806,
"grad_norm": 0.24662983417510986,
"learning_rate": 6.187859034806225e-06,
"loss": 0.1464,
"step": 727
},
{
"epoch": 1.9863574351978173,
"grad_norm": 0.24387580156326294,
"learning_rate": 6.158481990986558e-06,
"loss": 0.1468,
"step": 728
},
{
"epoch": 1.989085948158254,
"grad_norm": 0.23867018520832062,
"learning_rate": 6.1291437879845335e-06,
"loss": 0.1419,
"step": 729
},
{
"epoch": 1.9918144611186903,
"grad_norm": 0.25524288415908813,
"learning_rate": 6.099844722432844e-06,
"loss": 0.147,
"step": 730
},
{
"epoch": 1.9945429740791267,
"grad_norm": 0.24148067831993103,
"learning_rate": 6.07058509056846e-06,
"loss": 0.1454,
"step": 731
},
{
"epoch": 1.9972714870395634,
"grad_norm": 0.24991227686405182,
"learning_rate": 6.041365188229641e-06,
"loss": 0.1458,
"step": 732
},
{
"epoch": 2.0,
"grad_norm": 0.2470255196094513,
"learning_rate": 6.012185310852962e-06,
"loss": 0.1443,
"step": 733
},
{
"epoch": 2.0027285129604366,
"grad_norm": 0.28532329201698303,
"learning_rate": 5.983045753470308e-06,
"loss": 0.113,
"step": 734
},
{
"epoch": 2.0054570259208733,
"grad_norm": 0.2951700985431671,
"learning_rate": 5.9539468107058885e-06,
"loss": 0.1154,
"step": 735
},
{
"epoch": 2.00818553888131,
"grad_norm": 0.25754493474960327,
"learning_rate": 5.924888776773281e-06,
"loss": 0.1142,
"step": 736
},
{
"epoch": 2.010914051841746,
"grad_norm": 0.23163330554962158,
"learning_rate": 5.895871945472434e-06,
"loss": 0.1128,
"step": 737
},
{
"epoch": 2.0136425648021827,
"grad_norm": 0.2300463765859604,
"learning_rate": 5.866896610186701e-06,
"loss": 0.1102,
"step": 738
},
{
"epoch": 2.0163710777626194,
"grad_norm": 0.2664891481399536,
"learning_rate": 5.8379630638798845e-06,
"loss": 0.1128,
"step": 739
},
{
"epoch": 2.019099590723056,
"grad_norm": 0.3177363872528076,
"learning_rate": 5.809071599093272e-06,
"loss": 0.1143,
"step": 740
},
{
"epoch": 2.0218281036834926,
"grad_norm": 0.3134574890136719,
"learning_rate": 5.780222507942654e-06,
"loss": 0.1098,
"step": 741
},
{
"epoch": 2.0245566166439293,
"grad_norm": 0.3055528402328491,
"learning_rate": 5.7514160821154085e-06,
"loss": 0.1107,
"step": 742
},
{
"epoch": 2.0272851296043655,
"grad_norm": 0.30870890617370605,
"learning_rate": 5.7226526128675234e-06,
"loss": 0.1113,
"step": 743
},
{
"epoch": 2.030013642564802,
"grad_norm": 0.28284764289855957,
"learning_rate": 5.693932391020664e-06,
"loss": 0.1077,
"step": 744
},
{
"epoch": 2.0327421555252387,
"grad_norm": 0.28134864568710327,
"learning_rate": 5.665255706959231e-06,
"loss": 0.1083,
"step": 745
},
{
"epoch": 2.0354706684856754,
"grad_norm": 0.2702168822288513,
"learning_rate": 5.63662285062742e-06,
"loss": 0.11,
"step": 746
},
{
"epoch": 2.038199181446112,
"grad_norm": 0.27063465118408203,
"learning_rate": 5.608034111526298e-06,
"loss": 0.1131,
"step": 747
},
{
"epoch": 2.0409276944065486,
"grad_norm": 0.26345351338386536,
"learning_rate": 5.579489778710867e-06,
"loss": 0.1098,
"step": 748
},
{
"epoch": 2.043656207366985,
"grad_norm": 0.2631242275238037,
"learning_rate": 5.550990140787147e-06,
"loss": 0.1108,
"step": 749
},
{
"epoch": 2.0463847203274215,
"grad_norm": 0.2555985748767853,
"learning_rate": 5.522535485909258e-06,
"loss": 0.1077,
"step": 750
},
{
"epoch": 2.049113233287858,
"grad_norm": 0.2671637237071991,
"learning_rate": 5.494126101776505e-06,
"loss": 0.112,
"step": 751
},
{
"epoch": 2.0518417462482947,
"grad_norm": 0.2700263559818268,
"learning_rate": 5.465762275630471e-06,
"loss": 0.1073,
"step": 752
},
{
"epoch": 2.0545702592087314,
"grad_norm": 0.2737869620323181,
"learning_rate": 5.437444294252108e-06,
"loss": 0.1133,
"step": 753
},
{
"epoch": 2.057298772169168,
"grad_norm": 0.270761638879776,
"learning_rate": 5.409172443958844e-06,
"loss": 0.11,
"step": 754
},
{
"epoch": 2.060027285129604,
"grad_norm": 0.28410157561302185,
"learning_rate": 5.380947010601681e-06,
"loss": 0.1094,
"step": 755
},
{
"epoch": 2.062755798090041,
"grad_norm": 0.26760488748550415,
"learning_rate": 5.352768279562315e-06,
"loss": 0.1095,
"step": 756
},
{
"epoch": 2.0654843110504775,
"grad_norm": 0.2895072102546692,
"learning_rate": 5.324636535750238e-06,
"loss": 0.112,
"step": 757
},
{
"epoch": 2.068212824010914,
"grad_norm": 0.29230374097824097,
"learning_rate": 5.2965520635998676e-06,
"loss": 0.1085,
"step": 758
},
{
"epoch": 2.0709413369713507,
"grad_norm": 0.26666226983070374,
"learning_rate": 5.268515147067666e-06,
"loss": 0.1078,
"step": 759
},
{
"epoch": 2.0736698499317874,
"grad_norm": 0.27564340829849243,
"learning_rate": 5.240526069629265e-06,
"loss": 0.1109,
"step": 760
},
{
"epoch": 2.0763983628922236,
"grad_norm": 0.27139028906822205,
"learning_rate": 5.212585114276614e-06,
"loss": 0.1075,
"step": 761
},
{
"epoch": 2.07912687585266,
"grad_norm": 0.26753172278404236,
"learning_rate": 5.184692563515104e-06,
"loss": 0.1092,
"step": 762
},
{
"epoch": 2.081855388813097,
"grad_norm": 0.2602234482765198,
"learning_rate": 5.156848699360719e-06,
"loss": 0.1106,
"step": 763
},
{
"epoch": 2.0845839017735335,
"grad_norm": 0.2596394121646881,
"learning_rate": 5.129053803337181e-06,
"loss": 0.1074,
"step": 764
},
{
"epoch": 2.08731241473397,
"grad_norm": 0.2702711522579193,
"learning_rate": 5.101308156473104e-06,
"loss": 0.111,
"step": 765
},
{
"epoch": 2.0900409276944067,
"grad_norm": 0.2646404206752777,
"learning_rate": 5.073612039299157e-06,
"loss": 0.1089,
"step": 766
},
{
"epoch": 2.092769440654843,
"grad_norm": 0.26629284024238586,
"learning_rate": 5.045965731845223e-06,
"loss": 0.112,
"step": 767
},
{
"epoch": 2.0954979536152796,
"grad_norm": 0.2776147425174713,
"learning_rate": 5.018369513637567e-06,
"loss": 0.1112,
"step": 768
},
{
"epoch": 2.098226466575716,
"grad_norm": 0.26024189591407776,
"learning_rate": 4.990823663696013e-06,
"loss": 0.1082,
"step": 769
},
{
"epoch": 2.100954979536153,
"grad_norm": 0.2952435612678528,
"learning_rate": 4.963328460531127e-06,
"loss": 0.1085,
"step": 770
},
{
"epoch": 2.1036834924965895,
"grad_norm": 0.27994927763938904,
"learning_rate": 4.9358841821413775e-06,
"loss": 0.112,
"step": 771
},
{
"epoch": 2.106412005457026,
"grad_norm": 0.2703171372413635,
"learning_rate": 4.908491106010368e-06,
"loss": 0.1077,
"step": 772
},
{
"epoch": 2.1091405184174623,
"grad_norm": 0.2815234363079071,
"learning_rate": 4.881149509103993e-06,
"loss": 0.1079,
"step": 773
},
{
"epoch": 2.111869031377899,
"grad_norm": 0.2859188914299011,
"learning_rate": 4.853859667867641e-06,
"loss": 0.1106,
"step": 774
},
{
"epoch": 2.1145975443383356,
"grad_norm": 0.28008148074150085,
"learning_rate": 4.826621858223431e-06,
"loss": 0.1087,
"step": 775
},
{
"epoch": 2.117326057298772,
"grad_norm": 0.27847301959991455,
"learning_rate": 4.799436355567391e-06,
"loss": 0.108,
"step": 776
},
{
"epoch": 2.120054570259209,
"grad_norm": 0.2699519991874695,
"learning_rate": 4.772303434766669e-06,
"loss": 0.1071,
"step": 777
},
{
"epoch": 2.1227830832196455,
"grad_norm": 0.26534488797187805,
"learning_rate": 4.745223370156797e-06,
"loss": 0.1074,
"step": 778
},
{
"epoch": 2.1255115961800817,
"grad_norm": 0.25681716203689575,
"learning_rate": 4.7181964355388695e-06,
"loss": 0.1077,
"step": 779
},
{
"epoch": 2.1282401091405183,
"grad_norm": 0.26010218262672424,
"learning_rate": 4.691222904176791e-06,
"loss": 0.1081,
"step": 780
},
{
"epoch": 2.130968622100955,
"grad_norm": 0.2689591944217682,
"learning_rate": 4.664303048794533e-06,
"loss": 0.1105,
"step": 781
},
{
"epoch": 2.1336971350613916,
"grad_norm": 0.2674596905708313,
"learning_rate": 4.63743714157335e-06,
"loss": 0.1075,
"step": 782
},
{
"epoch": 2.136425648021828,
"grad_norm": 0.27123984694480896,
"learning_rate": 4.610625454149033e-06,
"loss": 0.1098,
"step": 783
},
{
"epoch": 2.139154160982265,
"grad_norm": 0.2624165415763855,
"learning_rate": 4.583868257609171e-06,
"loss": 0.1079,
"step": 784
},
{
"epoch": 2.141882673942701,
"grad_norm": 0.28557854890823364,
"learning_rate": 4.55716582249042e-06,
"loss": 0.1087,
"step": 785
},
{
"epoch": 2.1446111869031377,
"grad_norm": 0.2614542245864868,
"learning_rate": 4.530518418775734e-06,
"loss": 0.1076,
"step": 786
},
{
"epoch": 2.1473396998635743,
"grad_norm": 0.26394596695899963,
"learning_rate": 4.50392631589166e-06,
"loss": 0.1096,
"step": 787
},
{
"epoch": 2.150068212824011,
"grad_norm": 0.2653542757034302,
"learning_rate": 4.477389782705628e-06,
"loss": 0.1077,
"step": 788
},
{
"epoch": 2.1527967257844476,
"grad_norm": 0.2776028513908386,
"learning_rate": 4.4509090875231865e-06,
"loss": 0.1081,
"step": 789
},
{
"epoch": 2.155525238744884,
"grad_norm": 0.26990100741386414,
"learning_rate": 4.424484498085335e-06,
"loss": 0.1103,
"step": 790
},
{
"epoch": 2.1582537517053204,
"grad_norm": 0.26783284544944763,
"learning_rate": 4.398116281565794e-06,
"loss": 0.1093,
"step": 791
},
{
"epoch": 2.160982264665757,
"grad_norm": 0.26489558815956116,
"learning_rate": 4.371804704568309e-06,
"loss": 0.1116,
"step": 792
},
{
"epoch": 2.1637107776261937,
"grad_norm": 0.2781592607498169,
"learning_rate": 4.345550033123954e-06,
"loss": 0.1124,
"step": 793
},
{
"epoch": 2.1664392905866303,
"grad_norm": 0.27223044633865356,
"learning_rate": 4.319352532688444e-06,
"loss": 0.1113,
"step": 794
},
{
"epoch": 2.169167803547067,
"grad_norm": 0.2668192982673645,
"learning_rate": 4.293212468139447e-06,
"loss": 0.1101,
"step": 795
},
{
"epoch": 2.1718963165075036,
"grad_norm": 0.2734562158584595,
"learning_rate": 4.267130103773911e-06,
"loss": 0.1104,
"step": 796
},
{
"epoch": 2.17462482946794,
"grad_norm": 0.2697877883911133,
"learning_rate": 4.241105703305388e-06,
"loss": 0.1105,
"step": 797
},
{
"epoch": 2.1773533424283764,
"grad_norm": 0.2793903648853302,
"learning_rate": 4.2151395298613675e-06,
"loss": 0.1133,
"step": 798
},
{
"epoch": 2.180081855388813,
"grad_norm": 0.2638899087905884,
"learning_rate": 4.189231845980618e-06,
"loss": 0.1086,
"step": 799
},
{
"epoch": 2.1828103683492497,
"grad_norm": 0.2613518536090851,
"learning_rate": 4.163382913610533e-06,
"loss": 0.1105,
"step": 800
},
{
"epoch": 2.1855388813096863,
"grad_norm": 0.2611803114414215,
"learning_rate": 4.137592994104479e-06,
"loss": 0.1075,
"step": 801
},
{
"epoch": 2.188267394270123,
"grad_norm": 0.26179367303848267,
"learning_rate": 4.111862348219158e-06,
"loss": 0.109,
"step": 802
},
{
"epoch": 2.190995907230559,
"grad_norm": 0.2695135176181793,
"learning_rate": 4.086191236111964e-06,
"loss": 0.1083,
"step": 803
},
{
"epoch": 2.193724420190996,
"grad_norm": 0.2760058045387268,
"learning_rate": 4.060579917338362e-06,
"loss": 0.1109,
"step": 804
},
{
"epoch": 2.1964529331514324,
"grad_norm": 0.2639741897583008,
"learning_rate": 4.035028650849255e-06,
"loss": 0.1065,
"step": 805
},
{
"epoch": 2.199181446111869,
"grad_norm": 0.2773294150829315,
"learning_rate": 4.009537694988372e-06,
"loss": 0.1096,
"step": 806
},
{
"epoch": 2.2019099590723057,
"grad_norm": 0.2659907341003418,
"learning_rate": 3.984107307489652e-06,
"loss": 0.1095,
"step": 807
},
{
"epoch": 2.2046384720327423,
"grad_norm": 0.27414199709892273,
"learning_rate": 3.958737745474638e-06,
"loss": 0.1098,
"step": 808
},
{
"epoch": 2.2073669849931785,
"grad_norm": 0.2689702808856964,
"learning_rate": 3.933429265449882e-06,
"loss": 0.1087,
"step": 809
},
{
"epoch": 2.210095497953615,
"grad_norm": 0.2715907096862793,
"learning_rate": 3.908182123304344e-06,
"loss": 0.1083,
"step": 810
},
{
"epoch": 2.212824010914052,
"grad_norm": 0.2769804894924164,
"learning_rate": 3.882996574306818e-06,
"loss": 0.1088,
"step": 811
},
{
"epoch": 2.2155525238744884,
"grad_norm": 0.2748919427394867,
"learning_rate": 3.857872873103322e-06,
"loss": 0.1095,
"step": 812
},
{
"epoch": 2.218281036834925,
"grad_norm": 0.26264533400535583,
"learning_rate": 3.832811273714569e-06,
"loss": 0.1065,
"step": 813
},
{
"epoch": 2.2210095497953617,
"grad_norm": 0.27048805356025696,
"learning_rate": 3.807812029533362e-06,
"loss": 0.1108,
"step": 814
},
{
"epoch": 2.223738062755798,
"grad_norm": 0.2633140981197357,
"learning_rate": 3.78287539332203e-06,
"loss": 0.1111,
"step": 815
},
{
"epoch": 2.2264665757162345,
"grad_norm": 0.2589558959007263,
"learning_rate": 3.7580016172099067e-06,
"loss": 0.109,
"step": 816
},
{
"epoch": 2.229195088676671,
"grad_norm": 0.268934428691864,
"learning_rate": 3.7331909526907527e-06,
"loss": 0.1095,
"step": 817
},
{
"epoch": 2.231923601637108,
"grad_norm": 0.27397724986076355,
"learning_rate": 3.708443650620206e-06,
"loss": 0.1086,
"step": 818
},
{
"epoch": 2.2346521145975444,
"grad_norm": 0.27343809604644775,
"learning_rate": 3.6837599612132826e-06,
"loss": 0.1128,
"step": 819
},
{
"epoch": 2.237380627557981,
"grad_norm": 0.2655726969242096,
"learning_rate": 3.659140134041812e-06,
"loss": 0.1075,
"step": 820
},
{
"epoch": 2.2401091405184177,
"grad_norm": 0.2576850354671478,
"learning_rate": 3.6345844180319157e-06,
"loss": 0.1077,
"step": 821
},
{
"epoch": 2.242837653478854,
"grad_norm": 0.2726428508758545,
"learning_rate": 3.6100930614615204e-06,
"loss": 0.1124,
"step": 822
},
{
"epoch": 2.2455661664392905,
"grad_norm": 0.2689356505870819,
"learning_rate": 3.5856663119578174e-06,
"loss": 0.107,
"step": 823
},
{
"epoch": 2.248294679399727,
"grad_norm": 0.25623708963394165,
"learning_rate": 3.5613044164947617e-06,
"loss": 0.1083,
"step": 824
},
{
"epoch": 2.251023192360164,
"grad_norm": 0.26781338453292847,
"learning_rate": 3.5370076213905904e-06,
"loss": 0.1062,
"step": 825
},
{
"epoch": 2.2537517053206004,
"grad_norm": 0.2634563148021698,
"learning_rate": 3.5127761723053313e-06,
"loss": 0.1089,
"step": 826
},
{
"epoch": 2.2564802182810366,
"grad_norm": 0.2796477675437927,
"learning_rate": 3.4886103142382944e-06,
"loss": 0.1088,
"step": 827
},
{
"epoch": 2.2592087312414733,
"grad_norm": 0.26979538798332214,
"learning_rate": 3.46451029152562e-06,
"loss": 0.1069,
"step": 828
},
{
"epoch": 2.26193724420191,
"grad_norm": 0.268950879573822,
"learning_rate": 3.440476347837811e-06,
"loss": 0.1089,
"step": 829
},
{
"epoch": 2.2646657571623465,
"grad_norm": 0.26354870200157166,
"learning_rate": 3.41650872617724e-06,
"loss": 0.1087,
"step": 830
},
{
"epoch": 2.267394270122783,
"grad_norm": 0.2734336256980896,
"learning_rate": 3.392607668875718e-06,
"loss": 0.1081,
"step": 831
},
{
"epoch": 2.27012278308322,
"grad_norm": 0.26652878522872925,
"learning_rate": 3.3687734175920505e-06,
"loss": 0.1097,
"step": 832
},
{
"epoch": 2.2728512960436564,
"grad_norm": 0.2743259370326996,
"learning_rate": 3.3450062133095572e-06,
"loss": 0.1107,
"step": 833
},
{
"epoch": 2.2755798090040926,
"grad_norm": 0.2795916199684143,
"learning_rate": 3.321306296333673e-06,
"loss": 0.1104,
"step": 834
},
{
"epoch": 2.2783083219645293,
"grad_norm": 0.2681027054786682,
"learning_rate": 3.29767390628951e-06,
"loss": 0.1098,
"step": 835
},
{
"epoch": 2.281036834924966,
"grad_norm": 0.2739357352256775,
"learning_rate": 3.274109282119413e-06,
"loss": 0.1108,
"step": 836
},
{
"epoch": 2.2837653478854025,
"grad_norm": 0.2674682140350342,
"learning_rate": 3.2506126620805666e-06,
"loss": 0.1087,
"step": 837
},
{
"epoch": 2.286493860845839,
"grad_norm": 0.2670055031776428,
"learning_rate": 3.2271842837425917e-06,
"loss": 0.1101,
"step": 838
},
{
"epoch": 2.2892223738062754,
"grad_norm": 0.26569753885269165,
"learning_rate": 3.203824383985108e-06,
"loss": 0.1072,
"step": 839
},
{
"epoch": 2.291950886766712,
"grad_norm": 0.27032536268234253,
"learning_rate": 3.180533198995379e-06,
"loss": 0.107,
"step": 840
},
{
"epoch": 2.2946793997271486,
"grad_norm": 0.2656431496143341,
"learning_rate": 3.157310964265903e-06,
"loss": 0.1096,
"step": 841
},
{
"epoch": 2.2974079126875853,
"grad_norm": 0.269829660654068,
"learning_rate": 3.134157914592032e-06,
"loss": 0.1069,
"step": 842
},
{
"epoch": 2.300136425648022,
"grad_norm": 0.2710098326206207,
"learning_rate": 3.1110742840696063e-06,
"loss": 0.1098,
"step": 843
},
{
"epoch": 2.3028649386084585,
"grad_norm": 0.26863613724708557,
"learning_rate": 3.088060306092582e-06,
"loss": 0.1091,
"step": 844
},
{
"epoch": 2.305593451568895,
"grad_norm": 0.2896701991558075,
"learning_rate": 3.0651162133506707e-06,
"loss": 0.1121,
"step": 845
},
{
"epoch": 2.3083219645293314,
"grad_norm": 0.2537677586078644,
"learning_rate": 3.042242237826991e-06,
"loss": 0.1063,
"step": 846
},
{
"epoch": 2.311050477489768,
"grad_norm": 0.2653418481349945,
"learning_rate": 3.0194386107957175e-06,
"loss": 0.108,
"step": 847
},
{
"epoch": 2.3137789904502046,
"grad_norm": 0.26476767659187317,
"learning_rate": 2.996705562819747e-06,
"loss": 0.1078,
"step": 848
},
{
"epoch": 2.3165075034106413,
"grad_norm": 0.2813807427883148,
"learning_rate": 2.9740433237483667e-06,
"loss": 0.1082,
"step": 849
},
{
"epoch": 2.319236016371078,
"grad_norm": 0.2876110374927521,
"learning_rate": 2.951452122714926e-06,
"loss": 0.1063,
"step": 850
},
{
"epoch": 2.321964529331514,
"grad_norm": 0.26683416962623596,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.1112,
"step": 851
},
{
"epoch": 2.3246930422919507,
"grad_norm": 0.25410008430480957,
"learning_rate": 2.906483747701705e-06,
"loss": 0.1091,
"step": 852
},
{
"epoch": 2.3274215552523874,
"grad_norm": 0.2653825283050537,
"learning_rate": 2.88410702838814e-06,
"loss": 0.1096,
"step": 853
},
{
"epoch": 2.330150068212824,
"grad_norm": 0.2617843747138977,
"learning_rate": 2.861802256440348e-06,
"loss": 0.1075,
"step": 854
},
{
"epoch": 2.3328785811732606,
"grad_norm": 0.27584707736968994,
"learning_rate": 2.8395696573774034e-06,
"loss": 0.1108,
"step": 855
},
{
"epoch": 2.3356070941336973,
"grad_norm": 0.26570525765419006,
"learning_rate": 2.8174094559886535e-06,
"loss": 0.1096,
"step": 856
},
{
"epoch": 2.338335607094134,
"grad_norm": 0.2747531533241272,
"learning_rate": 2.795321876331446e-06,
"loss": 0.107,
"step": 857
},
{
"epoch": 2.34106412005457,
"grad_norm": 0.2622661590576172,
"learning_rate": 2.773307141728867e-06,
"loss": 0.1058,
"step": 858
},
{
"epoch": 2.3437926330150067,
"grad_norm": 0.27443957328796387,
"learning_rate": 2.751365474767479e-06,
"loss": 0.1098,
"step": 859
},
{
"epoch": 2.3465211459754434,
"grad_norm": 0.2646692395210266,
"learning_rate": 2.729497097295075e-06,
"loss": 0.1078,
"step": 860
},
{
"epoch": 2.34924965893588,
"grad_norm": 0.275473952293396,
"learning_rate": 2.70770223041843e-06,
"loss": 0.1092,
"step": 861
},
{
"epoch": 2.3519781718963166,
"grad_norm": 0.2787085473537445,
"learning_rate": 2.6859810945010687e-06,
"loss": 0.1113,
"step": 862
},
{
"epoch": 2.354706684856753,
"grad_norm": 0.2730537950992584,
"learning_rate": 2.6643339091610376e-06,
"loss": 0.1103,
"step": 863
},
{
"epoch": 2.3574351978171895,
"grad_norm": 0.27616754174232483,
"learning_rate": 2.642760893268684e-06,
"loss": 0.1085,
"step": 864
},
{
"epoch": 2.360163710777626,
"grad_norm": 0.2819797694683075,
"learning_rate": 2.621262264944444e-06,
"loss": 0.1088,
"step": 865
},
{
"epoch": 2.3628922237380627,
"grad_norm": 0.27417269349098206,
"learning_rate": 2.5998382415566258e-06,
"loss": 0.11,
"step": 866
},
{
"epoch": 2.3656207366984994,
"grad_norm": 0.27512454986572266,
"learning_rate": 2.5784890397192395e-06,
"loss": 0.11,
"step": 867
},
{
"epoch": 2.368349249658936,
"grad_norm": 0.27606305480003357,
"learning_rate": 2.55721487528978e-06,
"loss": 0.1083,
"step": 868
},
{
"epoch": 2.3710777626193726,
"grad_norm": 0.26631057262420654,
"learning_rate": 2.5360159633670456e-06,
"loss": 0.1083,
"step": 869
},
{
"epoch": 2.373806275579809,
"grad_norm": 0.2638460397720337,
"learning_rate": 2.514892518288988e-06,
"loss": 0.1084,
"step": 870
},
{
"epoch": 2.3765347885402455,
"grad_norm": 0.26732948422431946,
"learning_rate": 2.4938447536305243e-06,
"loss": 0.1088,
"step": 871
},
{
"epoch": 2.379263301500682,
"grad_norm": 0.26805248856544495,
"learning_rate": 2.4728728822013683e-06,
"loss": 0.1075,
"step": 872
},
{
"epoch": 2.3819918144611187,
"grad_norm": 0.2634589374065399,
"learning_rate": 2.451977116043911e-06,
"loss": 0.1102,
"step": 873
},
{
"epoch": 2.3847203274215554,
"grad_norm": 0.2670380771160126,
"learning_rate": 2.431157666431052e-06,
"loss": 0.1075,
"step": 874
},
{
"epoch": 2.3874488403819916,
"grad_norm": 0.2665899097919464,
"learning_rate": 2.410414743864059e-06,
"loss": 0.1102,
"step": 875
},
{
"epoch": 2.390177353342428,
"grad_norm": 0.28205356001853943,
"learning_rate": 2.3897485580704684e-06,
"loss": 0.1096,
"step": 876
},
{
"epoch": 2.392905866302865,
"grad_norm": 0.2688262462615967,
"learning_rate": 2.369159318001937e-06,
"loss": 0.1083,
"step": 877
},
{
"epoch": 2.3956343792633015,
"grad_norm": 0.25036585330963135,
"learning_rate": 2.348647231832131e-06,
"loss": 0.1072,
"step": 878
},
{
"epoch": 2.398362892223738,
"grad_norm": 0.2626017928123474,
"learning_rate": 2.3282125069546437e-06,
"loss": 0.1068,
"step": 879
},
{
"epoch": 2.4010914051841747,
"grad_norm": 0.2714347243309021,
"learning_rate": 2.30785534998088e-06,
"loss": 0.1096,
"step": 880
},
{
"epoch": 2.4038199181446114,
"grad_norm": 0.2627617120742798,
"learning_rate": 2.2875759667379616e-06,
"loss": 0.1078,
"step": 881
},
{
"epoch": 2.4065484311050476,
"grad_norm": 0.27050861716270447,
"learning_rate": 2.267374562266662e-06,
"loss": 0.1105,
"step": 882
},
{
"epoch": 2.409276944065484,
"grad_norm": 0.27512407302856445,
"learning_rate": 2.2472513408193385e-06,
"loss": 0.1078,
"step": 883
},
{
"epoch": 2.412005457025921,
"grad_norm": 0.26855772733688354,
"learning_rate": 2.227206505857834e-06,
"loss": 0.107,
"step": 884
},
{
"epoch": 2.4147339699863575,
"grad_norm": 0.2625463604927063,
"learning_rate": 2.207240260051453e-06,
"loss": 0.1087,
"step": 885
},
{
"epoch": 2.417462482946794,
"grad_norm": 0.27381083369255066,
"learning_rate": 2.1873528052749094e-06,
"loss": 0.1084,
"step": 886
},
{
"epoch": 2.4201909959072307,
"grad_norm": 0.26363614201545715,
"learning_rate": 2.167544342606256e-06,
"loss": 0.1062,
"step": 887
},
{
"epoch": 2.422919508867667,
"grad_norm": 0.2696930766105652,
"learning_rate": 2.147815072324886e-06,
"loss": 0.109,
"step": 888
},
{
"epoch": 2.4256480218281036,
"grad_norm": 0.27058467268943787,
"learning_rate": 2.1281651939094996e-06,
"loss": 0.1077,
"step": 889
},
{
"epoch": 2.42837653478854,
"grad_norm": 0.2752489149570465,
"learning_rate": 2.1085949060360654e-06,
"loss": 0.1093,
"step": 890
},
{
"epoch": 2.431105047748977,
"grad_norm": 0.2617742121219635,
"learning_rate": 2.089104406575837e-06,
"loss": 0.1083,
"step": 891
},
{
"epoch": 2.4338335607094135,
"grad_norm": 0.26945194602012634,
"learning_rate": 2.0696938925933505e-06,
"loss": 0.108,
"step": 892
},
{
"epoch": 2.43656207366985,
"grad_norm": 0.2854002118110657,
"learning_rate": 2.0503635603444094e-06,
"loss": 0.1111,
"step": 893
},
{
"epoch": 2.4392905866302863,
"grad_norm": 0.2578170895576477,
"learning_rate": 2.0311136052741274e-06,
"loss": 0.1067,
"step": 894
},
{
"epoch": 2.442019099590723,
"grad_norm": 0.28232836723327637,
"learning_rate": 2.0119442220149356e-06,
"loss": 0.1069,
"step": 895
},
{
"epoch": 2.4447476125511596,
"grad_norm": 0.26999664306640625,
"learning_rate": 1.9928556043846215e-06,
"loss": 0.1098,
"step": 896
},
{
"epoch": 2.447476125511596,
"grad_norm": 0.2636784613132477,
"learning_rate": 1.9738479453843685e-06,
"loss": 0.1075,
"step": 897
},
{
"epoch": 2.450204638472033,
"grad_norm": 0.2663605511188507,
"learning_rate": 1.9549214371968008e-06,
"loss": 0.1094,
"step": 898
},
{
"epoch": 2.4529331514324695,
"grad_norm": 0.26863041520118713,
"learning_rate": 1.936076271184044e-06,
"loss": 0.1099,
"step": 899
},
{
"epoch": 2.4556616643929057,
"grad_norm": 0.2696433663368225,
"learning_rate": 1.917312637885791e-06,
"loss": 0.1078,
"step": 900
},
{
"epoch": 2.4583901773533423,
"grad_norm": 0.27077409625053406,
"learning_rate": 1.898630727017371e-06,
"loss": 0.1081,
"step": 901
},
{
"epoch": 2.461118690313779,
"grad_norm": 0.27648037672042847,
"learning_rate": 1.8800307274678364e-06,
"loss": 0.1096,
"step": 902
},
{
"epoch": 2.4638472032742156,
"grad_norm": 0.2673186659812927,
"learning_rate": 1.861512827298051e-06,
"loss": 0.1076,
"step": 903
},
{
"epoch": 2.466575716234652,
"grad_norm": 0.2637488543987274,
"learning_rate": 1.8430772137387853e-06,
"loss": 0.1092,
"step": 904
},
{
"epoch": 2.469304229195089,
"grad_norm": 0.274550199508667,
"learning_rate": 1.8247240731888293e-06,
"loss": 0.1104,
"step": 905
},
{
"epoch": 2.472032742155525,
"grad_norm": 0.26185035705566406,
"learning_rate": 1.8064535912131032e-06,
"loss": 0.1086,
"step": 906
},
{
"epoch": 2.4747612551159617,
"grad_norm": 0.2731874883174896,
"learning_rate": 1.7882659525407842e-06,
"loss": 0.1084,
"step": 907
},
{
"epoch": 2.4774897680763983,
"grad_norm": 0.26598888635635376,
"learning_rate": 1.7701613410634367e-06,
"loss": 0.1091,
"step": 908
},
{
"epoch": 2.480218281036835,
"grad_norm": 0.2706669569015503,
"learning_rate": 1.752139939833154e-06,
"loss": 0.1074,
"step": 909
},
{
"epoch": 2.4829467939972716,
"grad_norm": 0.2622982859611511,
"learning_rate": 1.7342019310607062e-06,
"loss": 0.1079,
"step": 910
},
{
"epoch": 2.485675306957708,
"grad_norm": 0.270145446062088,
"learning_rate": 1.7163474961137029e-06,
"loss": 0.1089,
"step": 911
},
{
"epoch": 2.488403819918145,
"grad_norm": 0.26119205355644226,
"learning_rate": 1.6985768155147498e-06,
"loss": 0.1075,
"step": 912
},
{
"epoch": 2.491132332878581,
"grad_norm": 0.2709420621395111,
"learning_rate": 1.6808900689396334e-06,
"loss": 0.1073,
"step": 913
},
{
"epoch": 2.4938608458390177,
"grad_norm": 0.26236698031425476,
"learning_rate": 1.6632874352154982e-06,
"loss": 0.1092,
"step": 914
},
{
"epoch": 2.4965893587994543,
"grad_norm": 0.2690337598323822,
"learning_rate": 1.645769092319045e-06,
"loss": 0.1077,
"step": 915
},
{
"epoch": 2.499317871759891,
"grad_norm": 0.2827068865299225,
"learning_rate": 1.6283352173747148e-06,
"loss": 0.1087,
"step": 916
},
{
"epoch": 2.5020463847203276,
"grad_norm": 0.27215540409088135,
"learning_rate": 1.6109859866529253e-06,
"loss": 0.1094,
"step": 917
},
{
"epoch": 2.504774897680764,
"grad_norm": 0.2663928270339966,
"learning_rate": 1.5937215755682667e-06,
"loss": 0.1081,
"step": 918
},
{
"epoch": 2.5075034106412004,
"grad_norm": 0.26082897186279297,
"learning_rate": 1.5765421586777285e-06,
"loss": 0.1067,
"step": 919
},
{
"epoch": 2.510231923601637,
"grad_norm": 0.2696387767791748,
"learning_rate": 1.559447909678954e-06,
"loss": 0.1057,
"step": 920
},
{
"epoch": 2.5129604365620737,
"grad_norm": 0.26911720633506775,
"learning_rate": 1.5424390014084644e-06,
"loss": 0.109,
"step": 921
},
{
"epoch": 2.5156889495225103,
"grad_norm": 0.27101537585258484,
"learning_rate": 1.5255156058399124e-06,
"loss": 0.1059,
"step": 922
},
{
"epoch": 2.518417462482947,
"grad_norm": 0.2690192759037018,
"learning_rate": 1.5086778940823544e-06,
"loss": 0.1063,
"step": 923
},
{
"epoch": 2.5211459754433836,
"grad_norm": 0.26710933446884155,
"learning_rate": 1.4919260363785215e-06,
"loss": 0.1047,
"step": 924
},
{
"epoch": 2.52387448840382,
"grad_norm": 0.26335206627845764,
"learning_rate": 1.4752602021030794e-06,
"loss": 0.1079,
"step": 925
},
{
"epoch": 2.5266030013642564,
"grad_norm": 0.26651206612586975,
"learning_rate": 1.4586805597609333e-06,
"loss": 0.1075,
"step": 926
},
{
"epoch": 2.529331514324693,
"grad_norm": 0.25876766443252563,
"learning_rate": 1.4421872769855262e-06,
"loss": 0.1069,
"step": 927
},
{
"epoch": 2.5320600272851297,
"grad_norm": 0.27579307556152344,
"learning_rate": 1.4257805205371233e-06,
"loss": 0.1094,
"step": 928
},
{
"epoch": 2.5347885402455663,
"grad_norm": 0.2780783176422119,
"learning_rate": 1.409460456301147e-06,
"loss": 0.1099,
"step": 929
},
{
"epoch": 2.5375170532060025,
"grad_norm": 0.27787429094314575,
"learning_rate": 1.3932272492864984e-06,
"loss": 0.1093,
"step": 930
},
{
"epoch": 2.540245566166439,
"grad_norm": 0.27652138471603394,
"learning_rate": 1.3770810636238685e-06,
"loss": 0.1086,
"step": 931
},
{
"epoch": 2.542974079126876,
"grad_norm": 0.26061540842056274,
"learning_rate": 1.3610220625641002e-06,
"loss": 0.107,
"step": 932
},
{
"epoch": 2.5457025920873124,
"grad_norm": 0.26401764154434204,
"learning_rate": 1.3450504084765381e-06,
"loss": 0.1074,
"step": 933
},
{
"epoch": 2.548431105047749,
"grad_norm": 0.2741183936595917,
"learning_rate": 1.3291662628473634e-06,
"loss": 0.1112,
"step": 934
},
{
"epoch": 2.5511596180081857,
"grad_norm": 0.27535951137542725,
"learning_rate": 1.313369786277987e-06,
"loss": 0.1085,
"step": 935
},
{
"epoch": 2.5538881309686223,
"grad_norm": 0.2647560238838196,
"learning_rate": 1.2976611384834148e-06,
"loss": 0.1083,
"step": 936
},
{
"epoch": 2.5566166439290585,
"grad_norm": 0.27376842498779297,
"learning_rate": 1.2820404782906315e-06,
"loss": 0.1095,
"step": 937
},
{
"epoch": 2.559345156889495,
"grad_norm": 0.2654034495353699,
"learning_rate": 1.266507963636997e-06,
"loss": 0.1086,
"step": 938
},
{
"epoch": 2.562073669849932,
"grad_norm": 0.2726665735244751,
"learning_rate": 1.2510637515686497e-06,
"loss": 0.1072,
"step": 939
},
{
"epoch": 2.5648021828103684,
"grad_norm": 0.27220022678375244,
"learning_rate": 1.2357079982389197e-06,
"loss": 0.1069,
"step": 940
},
{
"epoch": 2.567530695770805,
"grad_norm": 0.26593485474586487,
"learning_rate": 1.2204408589067462e-06,
"loss": 0.1066,
"step": 941
},
{
"epoch": 2.5702592087312413,
"grad_norm": 0.26143768429756165,
"learning_rate": 1.2052624879351105e-06,
"loss": 0.1078,
"step": 942
},
{
"epoch": 2.572987721691678,
"grad_norm": 0.26813191175460815,
"learning_rate": 1.190173038789476e-06,
"loss": 0.1078,
"step": 943
},
{
"epoch": 2.5757162346521145,
"grad_norm": 0.26784244179725647,
"learning_rate": 1.175172664036235e-06,
"loss": 0.1085,
"step": 944
},
{
"epoch": 2.578444747612551,
"grad_norm": 0.2719161808490753,
"learning_rate": 1.1602615153411666e-06,
"loss": 0.1067,
"step": 945
},
{
"epoch": 2.581173260572988,
"grad_norm": 0.2792585790157318,
"learning_rate": 1.1454397434679022e-06,
"loss": 0.1079,
"step": 946
},
{
"epoch": 2.5839017735334244,
"grad_norm": 0.25650888681411743,
"learning_rate": 1.1307074982764022e-06,
"loss": 0.1081,
"step": 947
},
{
"epoch": 2.586630286493861,
"grad_norm": 0.2769523859024048,
"learning_rate": 1.116064928721442e-06,
"loss": 0.1109,
"step": 948
},
{
"epoch": 2.5893587994542973,
"grad_norm": 0.27244603633880615,
"learning_rate": 1.1015121828511033e-06,
"loss": 0.1099,
"step": 949
},
{
"epoch": 2.592087312414734,
"grad_norm": 0.27726295590400696,
"learning_rate": 1.0870494078052796e-06,
"loss": 0.1088,
"step": 950
},
{
"epoch": 2.5948158253751705,
"grad_norm": 0.2656663954257965,
"learning_rate": 1.0726767498141877e-06,
"loss": 0.1072,
"step": 951
},
{
"epoch": 2.597544338335607,
"grad_norm": 0.27096185088157654,
"learning_rate": 1.0583943541968856e-06,
"loss": 0.1069,
"step": 952
},
{
"epoch": 2.600272851296044,
"grad_norm": 0.26978376507759094,
"learning_rate": 1.044202365359811e-06,
"loss": 0.1067,
"step": 953
},
{
"epoch": 2.60300136425648,
"grad_norm": 0.2618367075920105,
"learning_rate": 1.0301009267953145e-06,
"loss": 0.1067,
"step": 954
},
{
"epoch": 2.6057298772169166,
"grad_norm": 0.2704748213291168,
"learning_rate": 1.0160901810802114e-06,
"loss": 0.1061,
"step": 955
},
{
"epoch": 2.6084583901773533,
"grad_norm": 0.26577070355415344,
"learning_rate": 1.0021702698743408e-06,
"loss": 0.1089,
"step": 956
},
{
"epoch": 2.61118690313779,
"grad_norm": 0.27934566140174866,
"learning_rate": 9.883413339191295e-07,
"loss": 0.1074,
"step": 957
},
{
"epoch": 2.6139154160982265,
"grad_norm": 0.2697683274745941,
"learning_rate": 9.746035130361741e-07,
"loss": 0.1069,
"step": 958
},
{
"epoch": 2.616643929058663,
"grad_norm": 0.25919702649116516,
"learning_rate": 9.609569461258262e-07,
"loss": 0.1055,
"step": 959
},
{
"epoch": 2.6193724420191,
"grad_norm": 0.2682025730609894,
"learning_rate": 9.474017711657835e-07,
"loss": 0.1085,
"step": 960
},
{
"epoch": 2.622100954979536,
"grad_norm": 0.2794445753097534,
"learning_rate": 9.339381252097001e-07,
"loss": 0.1084,
"step": 961
},
{
"epoch": 2.6248294679399726,
"grad_norm": 0.26595231890678406,
"learning_rate": 9.205661443857994e-07,
"loss": 0.1071,
"step": 962
},
{
"epoch": 2.6275579809004093,
"grad_norm": 0.2697543799877167,
"learning_rate": 9.072859638954956e-07,
"loss": 0.1091,
"step": 963
},
{
"epoch": 2.630286493860846,
"grad_norm": 0.27470725774765015,
"learning_rate": 8.940977180120247e-07,
"loss": 0.1059,
"step": 964
},
{
"epoch": 2.6330150068212825,
"grad_norm": 0.2674699127674103,
"learning_rate": 8.810015400790994e-07,
"loss": 0.1077,
"step": 965
},
{
"epoch": 2.6357435197817187,
"grad_norm": 0.26429158449172974,
"learning_rate": 8.67997562509546e-07,
"loss": 0.1036,
"step": 966
},
{
"epoch": 2.6384720327421554,
"grad_norm": 0.26330381631851196,
"learning_rate": 8.550859167839665e-07,
"loss": 0.1074,
"step": 967
},
{
"epoch": 2.641200545702592,
"grad_norm": 0.2676031291484833,
"learning_rate": 8.42266733449425e-07,
"loss": 0.1088,
"step": 968
},
{
"epoch": 2.6439290586630286,
"grad_norm": 0.26969388127326965,
"learning_rate": 8.295401421181126e-07,
"loss": 0.109,
"step": 969
},
{
"epoch": 2.6466575716234653,
"grad_norm": 0.27987077832221985,
"learning_rate": 8.169062714660347e-07,
"loss": 0.1049,
"step": 970
},
{
"epoch": 2.649386084583902,
"grad_norm": 0.27343514561653137,
"learning_rate": 8.043652492317256e-07,
"loss": 0.1059,
"step": 971
},
{
"epoch": 2.6521145975443385,
"grad_norm": 0.26633647084236145,
"learning_rate": 7.919172022149458e-07,
"loss": 0.1074,
"step": 972
},
{
"epoch": 2.6548431105047747,
"grad_norm": 0.2681094706058502,
"learning_rate": 7.795622562753957e-07,
"loss": 0.1075,
"step": 973
},
{
"epoch": 2.6575716234652114,
"grad_norm": 0.2750418484210968,
"learning_rate": 7.673005363314578e-07,
"loss": 0.1079,
"step": 974
},
{
"epoch": 2.660300136425648,
"grad_norm": 0.26239413022994995,
"learning_rate": 7.551321663589229e-07,
"loss": 0.1057,
"step": 975
},
{
"epoch": 2.6630286493860846,
"grad_norm": 0.2685093879699707,
"learning_rate": 7.430572693897342e-07,
"loss": 0.1081,
"step": 976
},
{
"epoch": 2.6657571623465213,
"grad_norm": 0.265713095664978,
"learning_rate": 7.310759675107515e-07,
"loss": 0.107,
"step": 977
},
{
"epoch": 2.6684856753069575,
"grad_norm": 0.26597341895103455,
"learning_rate": 7.19188381862519e-07,
"loss": 0.1085,
"step": 978
},
{
"epoch": 2.6712141882673945,
"grad_norm": 0.27451950311660767,
"learning_rate": 7.073946326380243e-07,
"loss": 0.1088,
"step": 979
},
{
"epoch": 2.6739427012278307,
"grad_norm": 0.26499027013778687,
"learning_rate": 6.956948390814977e-07,
"loss": 0.1072,
"step": 980
},
{
"epoch": 2.6766712141882674,
"grad_norm": 0.26630744338035583,
"learning_rate": 6.840891194872112e-07,
"loss": 0.106,
"step": 981
},
{
"epoch": 2.679399727148704,
"grad_norm": 0.2721530497074127,
"learning_rate": 6.725775911982602e-07,
"loss": 0.1069,
"step": 982
},
{
"epoch": 2.6821282401091406,
"grad_norm": 0.2674759030342102,
"learning_rate": 6.61160370605397e-07,
"loss": 0.1062,
"step": 983
},
{
"epoch": 2.6848567530695773,
"grad_norm": 0.2578357458114624,
"learning_rate": 6.498375731458529e-07,
"loss": 0.1066,
"step": 984
},
{
"epoch": 2.6875852660300135,
"grad_norm": 0.2588570713996887,
"learning_rate": 6.386093133021554e-07,
"loss": 0.1085,
"step": 985
},
{
"epoch": 2.69031377899045,
"grad_norm": 0.2657422125339508,
"learning_rate": 6.274757046009871e-07,
"loss": 0.1068,
"step": 986
},
{
"epoch": 2.6930422919508867,
"grad_norm": 0.26033133268356323,
"learning_rate": 6.164368596120351e-07,
"loss": 0.1074,
"step": 987
},
{
"epoch": 2.6957708049113234,
"grad_norm": 0.2675817608833313,
"learning_rate": 6.054928899468427e-07,
"loss": 0.1065,
"step": 988
},
{
"epoch": 2.69849931787176,
"grad_norm": 0.2733577787876129,
"learning_rate": 5.946439062576903e-07,
"loss": 0.109,
"step": 989
},
{
"epoch": 2.701227830832196,
"grad_norm": 0.2761248052120209,
"learning_rate": 5.83890018236476e-07,
"loss": 0.1063,
"step": 990
},
{
"epoch": 2.7039563437926333,
"grad_norm": 0.2787305414676666,
"learning_rate": 5.732313346136032e-07,
"loss": 0.1076,
"step": 991
},
{
"epoch": 2.7066848567530695,
"grad_norm": 0.26708054542541504,
"learning_rate": 5.626679631568832e-07,
"loss": 0.108,
"step": 992
},
{
"epoch": 2.709413369713506,
"grad_norm": 0.2809232771396637,
"learning_rate": 5.52200010670444e-07,
"loss": 0.1084,
"step": 993
},
{
"epoch": 2.7121418826739427,
"grad_norm": 0.28152310848236084,
"learning_rate": 5.418275829936537e-07,
"loss": 0.1081,
"step": 994
},
{
"epoch": 2.7148703956343794,
"grad_norm": 0.2751696705818176,
"learning_rate": 5.315507850000456e-07,
"loss": 0.1085,
"step": 995
},
{
"epoch": 2.717598908594816,
"grad_norm": 0.27097487449645996,
"learning_rate": 5.213697205962631e-07,
"loss": 0.1061,
"step": 996
},
{
"epoch": 2.720327421555252,
"grad_norm": 0.27399781346321106,
"learning_rate": 5.112844927210048e-07,
"loss": 0.1076,
"step": 997
},
{
"epoch": 2.723055934515689,
"grad_norm": 0.278167724609375,
"learning_rate": 5.012952033439844e-07,
"loss": 0.106,
"step": 998
},
{
"epoch": 2.7257844474761255,
"grad_norm": 0.2697390019893646,
"learning_rate": 4.914019534649039e-07,
"loss": 0.1089,
"step": 999
},
{
"epoch": 2.728512960436562,
"grad_norm": 0.27422428131103516,
"learning_rate": 4.816048431124265e-07,
"loss": 0.1082,
"step": 1000
},
{
"epoch": 2.7312414733969987,
"grad_norm": 0.2834809124469757,
"learning_rate": 4.7190397134316946e-07,
"loss": 0.1088,
"step": 1001
},
{
"epoch": 2.733969986357435,
"grad_norm": 0.26203733682632446,
"learning_rate": 4.6229943624069963e-07,
"loss": 0.1081,
"step": 1002
},
{
"epoch": 2.736698499317872,
"grad_norm": 0.2634824812412262,
"learning_rate": 4.5279133491454406e-07,
"loss": 0.1067,
"step": 1003
},
{
"epoch": 2.739427012278308,
"grad_norm": 0.2688659727573395,
"learning_rate": 4.4337976349920763e-07,
"loss": 0.1094,
"step": 1004
},
{
"epoch": 2.742155525238745,
"grad_norm": 0.2748819887638092,
"learning_rate": 4.3406481715319916e-07,
"loss": 0.1097,
"step": 1005
},
{
"epoch": 2.7448840381991815,
"grad_norm": 0.2626483738422394,
"learning_rate": 4.248465900580734e-07,
"loss": 0.1093,
"step": 1006
},
{
"epoch": 2.747612551159618,
"grad_norm": 0.26204735040664673,
"learning_rate": 4.1572517541747294e-07,
"loss": 0.1074,
"step": 1007
},
{
"epoch": 2.7503410641200547,
"grad_norm": 0.2688845992088318,
"learning_rate": 4.0670066545619224e-07,
"loss": 0.1089,
"step": 1008
},
{
"epoch": 2.753069577080491,
"grad_norm": 0.26710936427116394,
"learning_rate": 3.9777315141923847e-07,
"loss": 0.1063,
"step": 1009
},
{
"epoch": 2.7557980900409276,
"grad_norm": 0.27956438064575195,
"learning_rate": 3.889427235709153e-07,
"loss": 0.1071,
"step": 1010
},
{
"epoch": 2.758526603001364,
"grad_norm": 0.264320433139801,
"learning_rate": 3.802094711939075e-07,
"loss": 0.1063,
"step": 1011
},
{
"epoch": 2.761255115961801,
"grad_norm": 0.27795156836509705,
"learning_rate": 3.715734825883766e-07,
"loss": 0.1075,
"step": 1012
},
{
"epoch": 2.7639836289222375,
"grad_norm": 0.2652377188205719,
"learning_rate": 3.6303484507106965e-07,
"loss": 0.106,
"step": 1013
},
{
"epoch": 2.7667121418826737,
"grad_norm": 0.26418134570121765,
"learning_rate": 3.5459364497443696e-07,
"loss": 0.1049,
"step": 1014
},
{
"epoch": 2.7694406548431107,
"grad_norm": 0.271123468875885,
"learning_rate": 3.462499676457598e-07,
"loss": 0.1065,
"step": 1015
},
{
"epoch": 2.772169167803547,
"grad_norm": 0.26815399527549744,
"learning_rate": 3.38003897446284e-07,
"loss": 0.1072,
"step": 1016
},
{
"epoch": 2.7748976807639836,
"grad_norm": 0.26320740580558777,
"learning_rate": 3.298555177503726e-07,
"loss": 0.1027,
"step": 1017
},
{
"epoch": 2.77762619372442,
"grad_norm": 0.26707082986831665,
"learning_rate": 3.2180491094465414e-07,
"loss": 0.1042,
"step": 1018
},
{
"epoch": 2.780354706684857,
"grad_norm": 0.27013859152793884,
"learning_rate": 3.138521584272003e-07,
"loss": 0.1079,
"step": 1019
},
{
"epoch": 2.7830832196452935,
"grad_norm": 0.27672314643859863,
"learning_rate": 3.059973406066963e-07,
"loss": 0.107,
"step": 1020
},
{
"epoch": 2.7858117326057297,
"grad_norm": 0.27290451526641846,
"learning_rate": 2.982405369016272e-07,
"loss": 0.1088,
"step": 1021
},
{
"epoch": 2.7885402455661663,
"grad_norm": 0.2681630551815033,
"learning_rate": 2.905818257394799e-07,
"loss": 0.1078,
"step": 1022
},
{
"epoch": 2.791268758526603,
"grad_norm": 0.2599634826183319,
"learning_rate": 2.830212845559466e-07,
"loss": 0.107,
"step": 1023
},
{
"epoch": 2.7939972714870396,
"grad_norm": 0.2621892988681793,
"learning_rate": 2.7555898979413796e-07,
"loss": 0.1064,
"step": 1024
},
{
"epoch": 2.796725784447476,
"grad_norm": 0.26226896047592163,
"learning_rate": 2.6819501690382275e-07,
"loss": 0.1082,
"step": 1025
},
{
"epoch": 2.799454297407913,
"grad_norm": 0.26856353878974915,
"learning_rate": 2.609294403406537e-07,
"loss": 0.108,
"step": 1026
},
{
"epoch": 2.8021828103683495,
"grad_norm": 0.2593378722667694,
"learning_rate": 2.537623335654127e-07,
"loss": 0.1079,
"step": 1027
},
{
"epoch": 2.8049113233287857,
"grad_norm": 0.2652793228626251,
"learning_rate": 2.4669376904328244e-07,
"loss": 0.105,
"step": 1028
},
{
"epoch": 2.8076398362892223,
"grad_norm": 0.2759750187397003,
"learning_rate": 2.397238182430994e-07,
"loss": 0.1096,
"step": 1029
},
{
"epoch": 2.810368349249659,
"grad_norm": 0.2655271291732788,
"learning_rate": 2.3285255163663535e-07,
"loss": 0.1058,
"step": 1030
},
{
"epoch": 2.8130968622100956,
"grad_norm": 0.28118759393692017,
"learning_rate": 2.2608003869788786e-07,
"loss": 0.1069,
"step": 1031
},
{
"epoch": 2.815825375170532,
"grad_norm": 0.26823240518569946,
"learning_rate": 2.1940634790238003e-07,
"loss": 0.1069,
"step": 1032
},
{
"epoch": 2.8185538881309684,
"grad_norm": 0.27307194471359253,
"learning_rate": 2.1283154672645522e-07,
"loss": 0.1079,
"step": 1033
},
{
"epoch": 2.821282401091405,
"grad_norm": 0.27044185996055603,
"learning_rate": 2.063557016466111e-07,
"loss": 0.1077,
"step": 1034
},
{
"epoch": 2.8240109140518417,
"grad_norm": 0.2753000855445862,
"learning_rate": 1.999788781388201e-07,
"loss": 0.1086,
"step": 1035
},
{
"epoch": 2.8267394270122783,
"grad_norm": 0.2681461274623871,
"learning_rate": 1.9370114067785995e-07,
"loss": 0.1068,
"step": 1036
},
{
"epoch": 2.829467939972715,
"grad_norm": 0.268425315618515,
"learning_rate": 1.8752255273667752e-07,
"loss": 0.1062,
"step": 1037
},
{
"epoch": 2.8321964529331516,
"grad_norm": 0.268317848443985,
"learning_rate": 1.8144317678573497e-07,
"loss": 0.1067,
"step": 1038
},
{
"epoch": 2.8349249658935882,
"grad_norm": 0.27236369252204895,
"learning_rate": 1.7546307429238129e-07,
"loss": 0.1073,
"step": 1039
},
{
"epoch": 2.8376534788540244,
"grad_norm": 0.26471349596977234,
"learning_rate": 1.6958230572023504e-07,
"loss": 0.1055,
"step": 1040
},
{
"epoch": 2.840381991814461,
"grad_norm": 0.26626163721084595,
"learning_rate": 1.6380093052856482e-07,
"loss": 0.1049,
"step": 1041
},
{
"epoch": 2.8431105047748977,
"grad_norm": 0.27178752422332764,
"learning_rate": 1.5811900717169537e-07,
"loss": 0.1104,
"step": 1042
},
{
"epoch": 2.8458390177353343,
"grad_norm": 0.26552823185920715,
"learning_rate": 1.5253659309841463e-07,
"loss": 0.105,
"step": 1043
},
{
"epoch": 2.848567530695771,
"grad_norm": 0.2542631924152374,
"learning_rate": 1.4705374475138978e-07,
"loss": 0.106,
"step": 1044
},
{
"epoch": 2.851296043656207,
"grad_norm": 0.2633902430534363,
"learning_rate": 1.416705175666e-07,
"loss": 0.1071,
"step": 1045
},
{
"epoch": 2.854024556616644,
"grad_norm": 0.2680572271347046,
"learning_rate": 1.3638696597277678e-07,
"loss": 0.1087,
"step": 1046
},
{
"epoch": 2.8567530695770804,
"grad_norm": 0.26956793665885925,
"learning_rate": 1.3120314339084782e-07,
"loss": 0.1058,
"step": 1047
},
{
"epoch": 2.859481582537517,
"grad_norm": 0.2674187421798706,
"learning_rate": 1.2611910223340408e-07,
"loss": 0.1054,
"step": 1048
},
{
"epoch": 2.8622100954979537,
"grad_norm": 0.29006314277648926,
"learning_rate": 1.2113489390416565e-07,
"loss": 0.1095,
"step": 1049
},
{
"epoch": 2.8649386084583903,
"grad_norm": 0.2720702886581421,
"learning_rate": 1.1625056879746133e-07,
"loss": 0.1069,
"step": 1050
},
{
"epoch": 2.867667121418827,
"grad_norm": 0.27799519896507263,
"learning_rate": 1.1146617629772316e-07,
"loss": 0.1073,
"step": 1051
},
{
"epoch": 2.870395634379263,
"grad_norm": 0.2695223391056061,
"learning_rate": 1.0678176477898372e-07,
"loss": 0.1068,
"step": 1052
},
{
"epoch": 2.8731241473397,
"grad_norm": 0.2720678448677063,
"learning_rate": 1.0219738160438753e-07,
"loss": 0.1056,
"step": 1053
},
{
"epoch": 2.8758526603001364,
"grad_norm": 0.26623815298080444,
"learning_rate": 9.771307312571254e-08,
"loss": 0.1065,
"step": 1054
},
{
"epoch": 2.878581173260573,
"grad_norm": 0.26687225699424744,
"learning_rate": 9.332888468290168e-08,
"loss": 0.1061,
"step": 1055
},
{
"epoch": 2.8813096862210097,
"grad_norm": 0.2614527940750122,
"learning_rate": 8.90448606036054e-08,
"loss": 0.1053,
"step": 1056
},
{
"epoch": 2.884038199181446,
"grad_norm": 0.274239182472229,
"learning_rate": 8.486104420272979e-08,
"loss": 0.1061,
"step": 1057
},
{
"epoch": 2.8867667121418825,
"grad_norm": 0.2697349488735199,
"learning_rate": 8.077747778200474e-08,
"loss": 0.1072,
"step": 1058
},
{
"epoch": 2.889495225102319,
"grad_norm": 0.25701501965522766,
"learning_rate": 7.679420262954984e-08,
"loss": 0.1073,
"step": 1059
},
{
"epoch": 2.892223738062756,
"grad_norm": 0.2809615135192871,
"learning_rate": 7.291125901946027e-08,
"loss": 0.1058,
"step": 1060
},
{
"epoch": 2.8949522510231924,
"grad_norm": 0.27922967076301575,
"learning_rate": 6.912868621140045e-08,
"loss": 0.1084,
"step": 1061
},
{
"epoch": 2.897680763983629,
"grad_norm": 0.28092989325523376,
"learning_rate": 6.544652245020433e-08,
"loss": 0.1062,
"step": 1062
},
{
"epoch": 2.9004092769440657,
"grad_norm": 0.2724795937538147,
"learning_rate": 6.18648049654913e-08,
"loss": 0.1075,
"step": 1063
},
{
"epoch": 2.903137789904502,
"grad_norm": 0.26247331500053406,
"learning_rate": 5.838356997128869e-08,
"loss": 0.1055,
"step": 1064
},
{
"epoch": 2.9058663028649385,
"grad_norm": 0.2700982689857483,
"learning_rate": 5.500285266566319e-08,
"loss": 0.1043,
"step": 1065
},
{
"epoch": 2.908594815825375,
"grad_norm": 0.2781464755535126,
"learning_rate": 5.1722687230369995e-08,
"loss": 0.1101,
"step": 1066
},
{
"epoch": 2.911323328785812,
"grad_norm": 0.2614997327327728,
"learning_rate": 4.854310683050312e-08,
"loss": 0.106,
"step": 1067
},
{
"epoch": 2.9140518417462484,
"grad_norm": 0.27180665731430054,
"learning_rate": 4.5464143614162294e-08,
"loss": 0.108,
"step": 1068
},
{
"epoch": 2.9167803547066846,
"grad_norm": 0.2692631781101227,
"learning_rate": 4.2485828712126584e-08,
"loss": 0.1054,
"step": 1069
},
{
"epoch": 2.9195088676671213,
"grad_norm": 0.2767050862312317,
"learning_rate": 3.96081922375402e-08,
"loss": 0.1089,
"step": 1070
},
{
"epoch": 2.922237380627558,
"grad_norm": 0.2608284652233124,
"learning_rate": 3.683126328560826e-08,
"loss": 0.1072,
"step": 1071
},
{
"epoch": 2.9249658935879945,
"grad_norm": 0.2666699290275574,
"learning_rate": 3.4155069933301535e-08,
"loss": 0.1059,
"step": 1072
},
{
"epoch": 2.927694406548431,
"grad_norm": 0.27596232295036316,
"learning_rate": 3.1579639239074364e-08,
"loss": 0.1099,
"step": 1073
},
{
"epoch": 2.930422919508868,
"grad_norm": 0.266626238822937,
"learning_rate": 2.9104997242590528e-08,
"loss": 0.1078,
"step": 1074
},
{
"epoch": 2.9331514324693044,
"grad_norm": 0.26687273383140564,
"learning_rate": 2.673116896445671e-08,
"loss": 0.1059,
"step": 1075
},
{
"epoch": 2.9358799454297406,
"grad_norm": 0.26934775710105896,
"learning_rate": 2.4458178405974974e-08,
"loss": 0.104,
"step": 1076
},
{
"epoch": 2.9386084583901773,
"grad_norm": 0.2637055516242981,
"learning_rate": 2.2286048548897378e-08,
"loss": 0.1056,
"step": 1077
},
{
"epoch": 2.941336971350614,
"grad_norm": 0.2723630666732788,
"learning_rate": 2.0214801355192826e-08,
"loss": 0.1078,
"step": 1078
},
{
"epoch": 2.9440654843110505,
"grad_norm": 0.26885491609573364,
"learning_rate": 1.824445776682504e-08,
"loss": 0.1071,
"step": 1079
},
{
"epoch": 2.946793997271487,
"grad_norm": 0.2680526077747345,
"learning_rate": 1.6375037705543827e-08,
"loss": 0.1058,
"step": 1080
},
{
"epoch": 2.9495225102319234,
"grad_norm": 0.2671726942062378,
"learning_rate": 1.4606560072679687e-08,
"loss": 0.1048,
"step": 1081
},
{
"epoch": 2.9522510231923604,
"grad_norm": 0.2741990089416504,
"learning_rate": 1.2939042748955078e-08,
"loss": 0.1073,
"step": 1082
},
{
"epoch": 2.9549795361527966,
"grad_norm": 0.2567131221294403,
"learning_rate": 1.1372502594303448e-08,
"loss": 0.1035,
"step": 1083
},
{
"epoch": 2.9577080491132333,
"grad_norm": 0.2711924910545349,
"learning_rate": 9.906955447697153e-09,
"loss": 0.1074,
"step": 1084
},
{
"epoch": 2.96043656207367,
"grad_norm": 0.2609352767467499,
"learning_rate": 8.542416126989805e-09,
"loss": 0.1051,
"step": 1085
},
{
"epoch": 2.9631650750341065,
"grad_norm": 0.2701236307621002,
"learning_rate": 7.278898428764169e-09,
"loss": 0.1084,
"step": 1086
},
{
"epoch": 2.965893587994543,
"grad_norm": 0.2622404098510742,
"learning_rate": 6.1164151281944974e-09,
"loss": 0.1064,
"step": 1087
},
{
"epoch": 2.9686221009549794,
"grad_norm": 0.2681756317615509,
"learning_rate": 5.054977978916631e-09,
"loss": 0.1069,
"step": 1088
},
{
"epoch": 2.971350613915416,
"grad_norm": 0.26623058319091797,
"learning_rate": 4.094597712908099e-09,
"loss": 0.1067,
"step": 1089
},
{
"epoch": 2.9740791268758526,
"grad_norm": 0.2589554190635681,
"learning_rate": 3.2352840403804264e-09,
"loss": 0.1054,
"step": 1090
},
{
"epoch": 2.9768076398362893,
"grad_norm": 0.2684115469455719,
"learning_rate": 2.477045649681431e-09,
"loss": 0.1041,
"step": 1091
},
{
"epoch": 2.979536152796726,
"grad_norm": 0.2617541253566742,
"learning_rate": 1.8198902072097402e-09,
"loss": 0.1072,
"step": 1092
},
{
"epoch": 2.982264665757162,
"grad_norm": 0.3196612596511841,
"learning_rate": 1.2638243573293019e-09,
"loss": 0.1064,
"step": 1093
},
{
"epoch": 2.984993178717599,
"grad_norm": 0.2657419741153717,
"learning_rate": 8.088537223116533e-10,
"loss": 0.1052,
"step": 1094
},
{
"epoch": 2.9877216916780354,
"grad_norm": 0.270195871591568,
"learning_rate": 4.549829022748586e-10,
"loss": 0.1068,
"step": 1095
},
{
"epoch": 2.990450204638472,
"grad_norm": 0.2676283121109009,
"learning_rate": 2.02215475132439e-10,
"loss": 0.1069,
"step": 1096
},
{
"epoch": 2.9931787175989086,
"grad_norm": 0.2716352939605713,
"learning_rate": 5.0553996568947216e-11,
"loss": 0.1063,
"step": 1097
},
{
"epoch": 2.9959072305593453,
"grad_norm": 0.2776670753955841,
"learning_rate": 0.0,
"loss": 0.1066,
"step": 1098
},
{
"epoch": 2.9959072305593453,
"step": 1098,
"total_flos": 1.4375935422297539e+19,
"train_loss": 0.1783666251420649,
"train_runtime": 24222.5988,
"train_samples_per_second": 5.809,
"train_steps_per_second": 0.045
}
],
"logging_steps": 1,
"max_steps": 1098,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4375935422297539e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}