{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.966144517433047,
"eval_steps": 500,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00202122283981809,
"grad_norm": 4.962096691131592,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.6441,
"step": 1
},
{
"epoch": 0.00404244567963618,
"grad_norm": 5.593231678009033,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.8613,
"step": 2
},
{
"epoch": 0.00606366851945427,
"grad_norm": 5.743273735046387,
"learning_rate": 3e-06,
"loss": 1.9149,
"step": 3
},
{
"epoch": 0.00808489135927236,
"grad_norm": 5.530357360839844,
"learning_rate": 4.000000000000001e-06,
"loss": 2.0057,
"step": 4
},
{
"epoch": 0.01010611419909045,
"grad_norm": 6.651333332061768,
"learning_rate": 5e-06,
"loss": 1.9692,
"step": 5
},
{
"epoch": 0.01212733703890854,
"grad_norm": 6.602941513061523,
"learning_rate": 6e-06,
"loss": 2.4343,
"step": 6
},
{
"epoch": 0.01414855987872663,
"grad_norm": 6.895396709442139,
"learning_rate": 7.000000000000001e-06,
"loss": 2.26,
"step": 7
},
{
"epoch": 0.01616978271854472,
"grad_norm": 7.525021553039551,
"learning_rate": 8.000000000000001e-06,
"loss": 2.2767,
"step": 8
},
{
"epoch": 0.01819100555836281,
"grad_norm": 7.5351762771606445,
"learning_rate": 9e-06,
"loss": 2.7438,
"step": 9
},
{
"epoch": 0.0202122283981809,
"grad_norm": 7.658970832824707,
"learning_rate": 1e-05,
"loss": 2.764,
"step": 10
},
{
"epoch": 0.02223345123799899,
"grad_norm": 8.046220779418945,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.3894,
"step": 11
},
{
"epoch": 0.02425467407781708,
"grad_norm": 8.3847017288208,
"learning_rate": 1.2e-05,
"loss": 2.5517,
"step": 12
},
{
"epoch": 0.02627589691763517,
"grad_norm": 8.96577262878418,
"learning_rate": 1.3000000000000001e-05,
"loss": 2.2152,
"step": 13
},
{
"epoch": 0.02829711975745326,
"grad_norm": 8.063103675842285,
"learning_rate": 1.4000000000000001e-05,
"loss": 2.1623,
"step": 14
},
{
"epoch": 0.03031834259727135,
"grad_norm": 8.5758638381958,
"learning_rate": 1.5e-05,
"loss": 2.4497,
"step": 15
},
{
"epoch": 0.03233956543708944,
"grad_norm": 8.477540969848633,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.3183,
"step": 16
},
{
"epoch": 0.03436078827690753,
"grad_norm": 8.865395545959473,
"learning_rate": 1.7000000000000003e-05,
"loss": 2.2435,
"step": 17
},
{
"epoch": 0.03638201111672562,
"grad_norm": 8.725611686706543,
"learning_rate": 1.8e-05,
"loss": 2.1894,
"step": 18
},
{
"epoch": 0.03840323395654371,
"grad_norm": 8.353998184204102,
"learning_rate": 1.9e-05,
"loss": 2.0811,
"step": 19
},
{
"epoch": 0.0404244567963618,
"grad_norm": 8.999526977539062,
"learning_rate": 2e-05,
"loss": 2.2778,
"step": 20
},
{
"epoch": 0.04244567963617989,
"grad_norm": 8.592598915100098,
"learning_rate": 2.1e-05,
"loss": 2.3943,
"step": 21
},
{
"epoch": 0.04446690247599798,
"grad_norm": 7.57433557510376,
"learning_rate": 2.2000000000000003e-05,
"loss": 2.1125,
"step": 22
},
{
"epoch": 0.046488125315816066,
"grad_norm": 8.0515775680542,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.9887,
"step": 23
},
{
"epoch": 0.04850934815563416,
"grad_norm": 7.530181884765625,
"learning_rate": 2.4e-05,
"loss": 2.2077,
"step": 24
},
{
"epoch": 0.050530570995452245,
"grad_norm": 6.949326992034912,
"learning_rate": 2.5e-05,
"loss": 2.049,
"step": 25
},
{
"epoch": 0.05255179383527034,
"grad_norm": 7.002259254455566,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.9994,
"step": 26
},
{
"epoch": 0.05457301667508843,
"grad_norm": 7.145877838134766,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.7604,
"step": 27
},
{
"epoch": 0.05659423951490652,
"grad_norm": 7.082208156585693,
"learning_rate": 2.8000000000000003e-05,
"loss": 2.0095,
"step": 28
},
{
"epoch": 0.05861546235472461,
"grad_norm": 6.70477294921875,
"learning_rate": 2.9e-05,
"loss": 1.6048,
"step": 29
},
{
"epoch": 0.0606366851945427,
"grad_norm": 8.728182792663574,
"learning_rate": 3e-05,
"loss": 2.2502,
"step": 30
},
{
"epoch": 0.06265790803436079,
"grad_norm": 8.69613265991211,
"learning_rate": 3.1e-05,
"loss": 2.0332,
"step": 31
},
{
"epoch": 0.06467913087417888,
"grad_norm": 8.603922843933105,
"learning_rate": 3.2000000000000005e-05,
"loss": 2.3491,
"step": 32
},
{
"epoch": 0.06670035371399696,
"grad_norm": 7.335165977478027,
"learning_rate": 3.3e-05,
"loss": 1.7337,
"step": 33
},
{
"epoch": 0.06872157655381506,
"grad_norm": 8.186851501464844,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.9627,
"step": 34
},
{
"epoch": 0.07074279939363315,
"grad_norm": 7.595352649688721,
"learning_rate": 3.5e-05,
"loss": 1.5682,
"step": 35
},
{
"epoch": 0.07276402223345124,
"grad_norm": 7.205020904541016,
"learning_rate": 3.6e-05,
"loss": 1.7703,
"step": 36
},
{
"epoch": 0.07478524507326932,
"grad_norm": 7.933116436004639,
"learning_rate": 3.7e-05,
"loss": 1.8315,
"step": 37
},
{
"epoch": 0.07680646791308741,
"grad_norm": 7.590288162231445,
"learning_rate": 3.8e-05,
"loss": 1.8305,
"step": 38
},
{
"epoch": 0.07882769075290551,
"grad_norm": 7.468386650085449,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.6923,
"step": 39
},
{
"epoch": 0.0808489135927236,
"grad_norm": 8.244772911071777,
"learning_rate": 4e-05,
"loss": 2.0191,
"step": 40
},
{
"epoch": 0.0828701364325417,
"grad_norm": 8.714116096496582,
"learning_rate": 4.1e-05,
"loss": 1.9665,
"step": 41
},
{
"epoch": 0.08489135927235977,
"grad_norm": 8.570602416992188,
"learning_rate": 4.2e-05,
"loss": 1.9418,
"step": 42
},
{
"epoch": 0.08691258211217787,
"grad_norm": 7.338136196136475,
"learning_rate": 4.3e-05,
"loss": 1.1443,
"step": 43
},
{
"epoch": 0.08893380495199596,
"grad_norm": 8.277491569519043,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.5882,
"step": 44
},
{
"epoch": 0.09095502779181405,
"grad_norm": 7.305893421173096,
"learning_rate": 4.5e-05,
"loss": 1.1692,
"step": 45
},
{
"epoch": 0.09297625063163213,
"grad_norm": 7.610684871673584,
"learning_rate": 4.600000000000001e-05,
"loss": 1.3468,
"step": 46
},
{
"epoch": 0.09499747347145023,
"grad_norm": 7.890575885772705,
"learning_rate": 4.7e-05,
"loss": 1.1566,
"step": 47
},
{
"epoch": 0.09701869631126832,
"grad_norm": 8.964077949523926,
"learning_rate": 4.8e-05,
"loss": 1.635,
"step": 48
},
{
"epoch": 0.09903991915108641,
"grad_norm": 9.524826049804688,
"learning_rate": 4.9e-05,
"loss": 1.4733,
"step": 49
},
{
"epoch": 0.10106114199090449,
"grad_norm": 9.499811172485352,
"learning_rate": 5e-05,
"loss": 1.5931,
"step": 50
},
{
"epoch": 0.10308236483072258,
"grad_norm": 5.5525898933410645,
"learning_rate": 5.1000000000000006e-05,
"loss": 1.5699,
"step": 51
},
{
"epoch": 0.10510358767054068,
"grad_norm": 4.883670330047607,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.3653,
"step": 52
},
{
"epoch": 0.10712481051035877,
"grad_norm": 3.8409433364868164,
"learning_rate": 5.300000000000001e-05,
"loss": 1.2819,
"step": 53
},
{
"epoch": 0.10914603335017686,
"grad_norm": 4.300581932067871,
"learning_rate": 5.4000000000000005e-05,
"loss": 1.5477,
"step": 54
},
{
"epoch": 0.11116725618999494,
"grad_norm": 3.8485517501831055,
"learning_rate": 5.500000000000001e-05,
"loss": 1.4893,
"step": 55
},
{
"epoch": 0.11318847902981304,
"grad_norm": 4.364621639251709,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.6671,
"step": 56
},
{
"epoch": 0.11520970186963113,
"grad_norm": 4.246096134185791,
"learning_rate": 5.6999999999999996e-05,
"loss": 1.6163,
"step": 57
},
{
"epoch": 0.11723092470944922,
"grad_norm": 4.382571697235107,
"learning_rate": 5.8e-05,
"loss": 1.6223,
"step": 58
},
{
"epoch": 0.1192521475492673,
"grad_norm": 4.406397819519043,
"learning_rate": 5.9e-05,
"loss": 1.5827,
"step": 59
},
{
"epoch": 0.1212733703890854,
"grad_norm": 5.563169002532959,
"learning_rate": 6e-05,
"loss": 2.1763,
"step": 60
},
{
"epoch": 0.12329459322890349,
"grad_norm": 5.388707160949707,
"learning_rate": 6.1e-05,
"loss": 1.9962,
"step": 61
},
{
"epoch": 0.12531581606872158,
"grad_norm": 4.910810947418213,
"learning_rate": 6.2e-05,
"loss": 1.6651,
"step": 62
},
{
"epoch": 0.12733703890853967,
"grad_norm": 5.668425559997559,
"learning_rate": 6.3e-05,
"loss": 1.8851,
"step": 63
},
{
"epoch": 0.12935826174835777,
"grad_norm": 5.245799541473389,
"learning_rate": 6.400000000000001e-05,
"loss": 1.7111,
"step": 64
},
{
"epoch": 0.13137948458817586,
"grad_norm": 5.701318264007568,
"learning_rate": 6.500000000000001e-05,
"loss": 2.0466,
"step": 65
},
{
"epoch": 0.13340070742799393,
"grad_norm": 6.002028942108154,
"learning_rate": 6.6e-05,
"loss": 1.9546,
"step": 66
},
{
"epoch": 0.13542193026781202,
"grad_norm": 5.405800819396973,
"learning_rate": 6.7e-05,
"loss": 1.6626,
"step": 67
},
{
"epoch": 0.1374431531076301,
"grad_norm": 5.076318740844727,
"learning_rate": 6.800000000000001e-05,
"loss": 1.726,
"step": 68
},
{
"epoch": 0.1394643759474482,
"grad_norm": 5.462904930114746,
"learning_rate": 6.9e-05,
"loss": 1.6945,
"step": 69
},
{
"epoch": 0.1414855987872663,
"grad_norm": 5.7171783447265625,
"learning_rate": 7e-05,
"loss": 1.9263,
"step": 70
},
{
"epoch": 0.1435068216270844,
"grad_norm": 5.716061592102051,
"learning_rate": 7.1e-05,
"loss": 1.4844,
"step": 71
},
{
"epoch": 0.14552804446690248,
"grad_norm": 5.982063293457031,
"learning_rate": 7.2e-05,
"loss": 1.8287,
"step": 72
},
{
"epoch": 0.14754926730672058,
"grad_norm": 5.261101722717285,
"learning_rate": 7.3e-05,
"loss": 1.5783,
"step": 73
},
{
"epoch": 0.14957049014653864,
"grad_norm": 5.717907428741455,
"learning_rate": 7.4e-05,
"loss": 1.4726,
"step": 74
},
{
"epoch": 0.15159171298635674,
"grad_norm": 5.534896373748779,
"learning_rate": 7.500000000000001e-05,
"loss": 1.5899,
"step": 75
},
{
"epoch": 0.15361293582617483,
"grad_norm": 6.794299125671387,
"learning_rate": 7.6e-05,
"loss": 1.7261,
"step": 76
},
{
"epoch": 0.15563415866599292,
"grad_norm": 6.486598014831543,
"learning_rate": 7.7e-05,
"loss": 1.7126,
"step": 77
},
{
"epoch": 0.15765538150581102,
"grad_norm": 6.078768730163574,
"learning_rate": 7.800000000000001e-05,
"loss": 1.5131,
"step": 78
},
{
"epoch": 0.1596766043456291,
"grad_norm": 7.305526256561279,
"learning_rate": 7.900000000000001e-05,
"loss": 2.1216,
"step": 79
},
{
"epoch": 0.1616978271854472,
"grad_norm": 6.43522310256958,
"learning_rate": 8e-05,
"loss": 1.5078,
"step": 80
},
{
"epoch": 0.1637190500252653,
"grad_norm": 6.868276119232178,
"learning_rate": 8.1e-05,
"loss": 1.6478,
"step": 81
},
{
"epoch": 0.1657402728650834,
"grad_norm": 6.5684051513671875,
"learning_rate": 8.2e-05,
"loss": 1.554,
"step": 82
},
{
"epoch": 0.16776149570490145,
"grad_norm": 7.237800121307373,
"learning_rate": 8.3e-05,
"loss": 1.776,
"step": 83
},
{
"epoch": 0.16978271854471955,
"grad_norm": 10.40848445892334,
"learning_rate": 8.4e-05,
"loss": 1.3637,
"step": 84
},
{
"epoch": 0.17180394138453764,
"grad_norm": 7.5290846824646,
"learning_rate": 8.5e-05,
"loss": 1.8149,
"step": 85
},
{
"epoch": 0.17382516422435573,
"grad_norm": 6.535577297210693,
"learning_rate": 8.6e-05,
"loss": 1.5878,
"step": 86
},
{
"epoch": 0.17584638706417383,
"grad_norm": 6.797990322113037,
"learning_rate": 8.7e-05,
"loss": 1.6962,
"step": 87
},
{
"epoch": 0.17786760990399192,
"grad_norm": 8.046355247497559,
"learning_rate": 8.800000000000001e-05,
"loss": 1.6756,
"step": 88
},
{
"epoch": 0.17988883274381,
"grad_norm": 6.245670318603516,
"learning_rate": 8.900000000000001e-05,
"loss": 1.4684,
"step": 89
},
{
"epoch": 0.1819100555836281,
"grad_norm": 6.456711769104004,
"learning_rate": 9e-05,
"loss": 1.4074,
"step": 90
},
{
"epoch": 0.1839312784234462,
"grad_norm": 6.714746475219727,
"learning_rate": 9.1e-05,
"loss": 1.4863,
"step": 91
},
{
"epoch": 0.18595250126326426,
"grad_norm": 8.266717910766602,
"learning_rate": 9.200000000000001e-05,
"loss": 1.8342,
"step": 92
},
{
"epoch": 0.18797372410308236,
"grad_norm": 7.780879497528076,
"learning_rate": 9.300000000000001e-05,
"loss": 1.9541,
"step": 93
},
{
"epoch": 0.18999494694290045,
"grad_norm": 6.307599067687988,
"learning_rate": 9.4e-05,
"loss": 1.2528,
"step": 94
},
{
"epoch": 0.19201616978271854,
"grad_norm": 7.502289295196533,
"learning_rate": 9.5e-05,
"loss": 1.5187,
"step": 95
},
{
"epoch": 0.19403739262253664,
"grad_norm": 6.638027667999268,
"learning_rate": 9.6e-05,
"loss": 1.2167,
"step": 96
},
{
"epoch": 0.19605861546235473,
"grad_norm": 7.040843963623047,
"learning_rate": 9.7e-05,
"loss": 1.3433,
"step": 97
},
{
"epoch": 0.19807983830217282,
"grad_norm": 6.591531753540039,
"learning_rate": 9.8e-05,
"loss": 1.1483,
"step": 98
},
{
"epoch": 0.20010106114199092,
"grad_norm": 8.779806137084961,
"learning_rate": 9.900000000000001e-05,
"loss": 1.8501,
"step": 99
},
{
"epoch": 0.20212228398180898,
"grad_norm": 8.384221076965332,
"learning_rate": 0.0001,
"loss": 1.5389,
"step": 100
},
{
"epoch": 0.20414350682162707,
"grad_norm": 4.096580982208252,
"learning_rate": 9.999841055681184e-05,
"loss": 1.4128,
"step": 101
},
{
"epoch": 0.20616472966144517,
"grad_norm": 4.011407375335693,
"learning_rate": 9.999364232830052e-05,
"loss": 1.2588,
"step": 102
},
{
"epoch": 0.20818595250126326,
"grad_norm": 3.9498050212860107,
"learning_rate": 9.99856956176192e-05,
"loss": 1.4214,
"step": 103
},
{
"epoch": 0.21020717534108135,
"grad_norm": 3.8423962593078613,
"learning_rate": 9.997457093000164e-05,
"loss": 1.7436,
"step": 104
},
{
"epoch": 0.21222839818089945,
"grad_norm": 3.859107255935669,
"learning_rate": 9.996026897273024e-05,
"loss": 1.5557,
"step": 105
},
{
"epoch": 0.21424962102071754,
"grad_norm": 4.101138591766357,
"learning_rate": 9.994279065509093e-05,
"loss": 1.6637,
"step": 106
},
{
"epoch": 0.21627084386053563,
"grad_norm": 4.0843963623046875,
"learning_rate": 9.992213708831543e-05,
"loss": 1.6069,
"step": 107
},
{
"epoch": 0.21829206670035373,
"grad_norm": 4.65712833404541,
"learning_rate": 9.989830958551057e-05,
"loss": 1.8503,
"step": 108
},
{
"epoch": 0.2203132895401718,
"grad_norm": 5.231738090515137,
"learning_rate": 9.987130966157486e-05,
"loss": 1.7857,
"step": 109
},
{
"epoch": 0.22233451237998988,
"grad_norm": 5.475664138793945,
"learning_rate": 9.984113903310206e-05,
"loss": 2.1317,
"step": 110
},
{
"epoch": 0.22435573521980798,
"grad_norm": 5.313016414642334,
"learning_rate": 9.98077996182722e-05,
"loss": 2.0104,
"step": 111
},
{
"epoch": 0.22637695805962607,
"grad_norm": 4.913154125213623,
"learning_rate": 9.97712935367295e-05,
"loss": 1.6801,
"step": 112
},
{
"epoch": 0.22839818089944416,
"grad_norm": 5.323668479919434,
"learning_rate": 9.973162310944768e-05,
"loss": 1.8818,
"step": 113
},
{
"epoch": 0.23041940373926226,
"grad_norm": 5.191682815551758,
"learning_rate": 9.968879085858234e-05,
"loss": 1.6902,
"step": 114
},
{
"epoch": 0.23244062657908035,
"grad_norm": 5.358806133270264,
"learning_rate": 9.964279950731066e-05,
"loss": 1.5777,
"step": 115
},
{
"epoch": 0.23446184941889844,
"grad_norm": 4.940889358520508,
"learning_rate": 9.959365197965824e-05,
"loss": 1.4829,
"step": 116
},
{
"epoch": 0.23648307225871654,
"grad_norm": 5.8027191162109375,
"learning_rate": 9.954135140031321e-05,
"loss": 1.743,
"step": 117
},
{
"epoch": 0.2385042950985346,
"grad_norm": 6.13554048538208,
"learning_rate": 9.948590109442754e-05,
"loss": 1.6588,
"step": 118
},
{
"epoch": 0.2405255179383527,
"grad_norm": 5.018206596374512,
"learning_rate": 9.942730458740568e-05,
"loss": 1.4198,
"step": 119
},
{
"epoch": 0.2425467407781708,
"grad_norm": 6.798793792724609,
"learning_rate": 9.936556560468037e-05,
"loss": 1.9842,
"step": 120
},
{
"epoch": 0.24456796361798888,
"grad_norm": 5.7152323722839355,
"learning_rate": 9.930068807147584e-05,
"loss": 1.739,
"step": 121
},
{
"epoch": 0.24658918645780697,
"grad_norm": 5.72061824798584,
"learning_rate": 9.923267611255825e-05,
"loss": 1.5452,
"step": 122
},
{
"epoch": 0.24861040929762507,
"grad_norm": 5.586998462677002,
"learning_rate": 9.916153405197332e-05,
"loss": 1.5731,
"step": 123
},
{
"epoch": 0.25063163213744316,
"grad_norm": 5.702105522155762,
"learning_rate": 9.908726641277167e-05,
"loss": 1.8093,
"step": 124
},
{
"epoch": 0.25265285497726125,
"grad_norm": 6.596114158630371,
"learning_rate": 9.9009877916721e-05,
"loss": 1.6054,
"step": 125
},
{
"epoch": 0.25467407781707935,
"grad_norm": 5.10860013961792,
"learning_rate": 9.892937348400601e-05,
"loss": 1.4648,
"step": 126
},
{
"epoch": 0.25669530065689744,
"grad_norm": 6.096838474273682,
"learning_rate": 9.88457582329156e-05,
"loss": 1.5438,
"step": 127
},
{
"epoch": 0.25871652349671553,
"grad_norm": 6.351302146911621,
"learning_rate": 9.875903747951742e-05,
"loss": 1.7681,
"step": 128
},
{
"epoch": 0.2607377463365336,
"grad_norm": 5.202528476715088,
"learning_rate": 9.866921673731992e-05,
"loss": 1.2863,
"step": 129
},
{
"epoch": 0.2627589691763517,
"grad_norm": 6.953995227813721,
"learning_rate": 9.857630171692174e-05,
"loss": 2.0508,
"step": 130
},
{
"epoch": 0.26478019201616976,
"grad_norm": 6.509955883026123,
"learning_rate": 9.848029832564875e-05,
"loss": 1.6179,
"step": 131
},
{
"epoch": 0.26680141485598785,
"grad_norm": 6.635447025299072,
"learning_rate": 9.838121266717839e-05,
"loss": 1.7545,
"step": 132
},
{
"epoch": 0.26882263769580594,
"grad_norm": 7.2970099449157715,
"learning_rate": 9.827905104115166e-05,
"loss": 1.7671,
"step": 133
},
{
"epoch": 0.27084386053562404,
"grad_norm": 5.802951335906982,
"learning_rate": 9.817381994277261e-05,
"loss": 1.4303,
"step": 134
},
{
"epoch": 0.27286508337544213,
"grad_norm": 5.940200328826904,
"learning_rate": 9.80655260623953e-05,
"loss": 1.6777,
"step": 135
},
{
"epoch": 0.2748863062152602,
"grad_norm": 6.702271938323975,
"learning_rate": 9.795417628509857e-05,
"loss": 1.7169,
"step": 136
},
{
"epoch": 0.2769075290550783,
"grad_norm": 5.698646068572998,
"learning_rate": 9.783977769024821e-05,
"loss": 1.4602,
"step": 137
},
{
"epoch": 0.2789287518948964,
"grad_norm": 7.149135589599609,
"learning_rate": 9.772233755104694e-05,
"loss": 1.6813,
"step": 138
},
{
"epoch": 0.2809499747347145,
"grad_norm": 6.527937412261963,
"learning_rate": 9.760186333407189e-05,
"loss": 1.5575,
"step": 139
},
{
"epoch": 0.2829711975745326,
"grad_norm": 9.34347152709961,
"learning_rate": 9.747836269880003e-05,
"loss": 1.6409,
"step": 140
},
{
"epoch": 0.2849924204143507,
"grad_norm": 5.21145486831665,
"learning_rate": 9.735184349712109e-05,
"loss": 1.0628,
"step": 141
},
{
"epoch": 0.2870136432541688,
"grad_norm": 5.601922988891602,
"learning_rate": 9.722231377283841e-05,
"loss": 1.2334,
"step": 142
},
{
"epoch": 0.2890348660939869,
"grad_norm": 6.080234050750732,
"learning_rate": 9.708978176115751e-05,
"loss": 1.1516,
"step": 143
},
{
"epoch": 0.29105608893380497,
"grad_norm": 6.611027717590332,
"learning_rate": 9.695425588816249e-05,
"loss": 1.3611,
"step": 144
},
{
"epoch": 0.29307731177362306,
"grad_norm": 5.696514129638672,
"learning_rate": 9.681574477028039e-05,
"loss": 1.0987,
"step": 145
},
{
"epoch": 0.29509853461344115,
"grad_norm": 8.035621643066406,
"learning_rate": 9.667425721373332e-05,
"loss": 1.7412,
"step": 146
},
{
"epoch": 0.29711975745325925,
"grad_norm": 8.804828643798828,
"learning_rate": 9.65298022139786e-05,
"loss": 1.7839,
"step": 147
},
{
"epoch": 0.2991409802930773,
"grad_norm": 6.8422064781188965,
"learning_rate": 9.638238895513687e-05,
"loss": 1.3999,
"step": 148
},
{
"epoch": 0.3011622031328954,
"grad_norm": 9.455986022949219,
"learning_rate": 9.623202680940811e-05,
"loss": 1.8304,
"step": 149
},
{
"epoch": 0.30318342597271347,
"grad_norm": 7.243097305297852,
"learning_rate": 9.607872533647584e-05,
"loss": 1.2319,
"step": 150
},
{
"epoch": 0.30520464881253156,
"grad_norm": 3.3011507987976074,
"learning_rate": 9.592249428289934e-05,
"loss": 1.2937,
"step": 151
},
{
"epoch": 0.30722587165234966,
"grad_norm": 3.4213240146636963,
"learning_rate": 9.5763343581494e-05,
"loss": 1.4224,
"step": 152
},
{
"epoch": 0.30924709449216775,
"grad_norm": 2.8826541900634766,
"learning_rate": 9.56012833506997e-05,
"loss": 1.0413,
"step": 153
},
{
"epoch": 0.31126831733198584,
"grad_norm": 3.137408494949341,
"learning_rate": 9.543632389393767e-05,
"loss": 1.3462,
"step": 154
},
{
"epoch": 0.31328954017180394,
"grad_norm": 3.6072700023651123,
"learning_rate": 9.52684756989553e-05,
"loss": 1.4831,
"step": 155
},
{
"epoch": 0.31531076301162203,
"grad_norm": 3.7660746574401855,
"learning_rate": 9.509774943715939e-05,
"loss": 1.6384,
"step": 156
},
{
"epoch": 0.3173319858514401,
"grad_norm": 4.279458522796631,
"learning_rate": 9.492415596293769e-05,
"loss": 1.8637,
"step": 157
},
{
"epoch": 0.3193532086912582,
"grad_norm": 3.9801113605499268,
"learning_rate": 9.474770631296881e-05,
"loss": 1.47,
"step": 158
},
{
"epoch": 0.3213744315310763,
"grad_norm": 4.086119174957275,
"learning_rate": 9.456841170552053e-05,
"loss": 1.7968,
"step": 159
},
{
"epoch": 0.3233956543708944,
"grad_norm": 3.968665838241577,
"learning_rate": 9.438628353973653e-05,
"loss": 1.4681,
"step": 160
},
{
"epoch": 0.3254168772107125,
"grad_norm": 4.774650573730469,
"learning_rate": 9.420133339491171e-05,
"loss": 1.9598,
"step": 161
},
{
"epoch": 0.3274381000505306,
"grad_norm": 4.956033706665039,
"learning_rate": 9.401357302975599e-05,
"loss": 1.7753,
"step": 162
},
{
"epoch": 0.3294593228903487,
"grad_norm": 4.571745872497559,
"learning_rate": 9.382301438164672e-05,
"loss": 1.5018,
"step": 163
},
{
"epoch": 0.3314805457301668,
"grad_norm": 4.879904747009277,
"learning_rate": 9.362966956586969e-05,
"loss": 1.795,
"step": 164
},
{
"epoch": 0.3335017685699848,
"grad_norm": 4.576776027679443,
"learning_rate": 9.343355087484894e-05,
"loss": 1.6005,
"step": 165
},
{
"epoch": 0.3355229914098029,
"grad_norm": 4.379726409912109,
"learning_rate": 9.323467077736511e-05,
"loss": 1.3807,
"step": 166
},
{
"epoch": 0.337544214249621,
"grad_norm": 5.608097553253174,
"learning_rate": 9.303304191776291e-05,
"loss": 1.7641,
"step": 167
},
{
"epoch": 0.3395654370894391,
"grad_norm": 4.803214073181152,
"learning_rate": 9.282867711514702e-05,
"loss": 1.5029,
"step": 168
},
{
"epoch": 0.3415866599292572,
"grad_norm": 5.842015266418457,
"learning_rate": 9.262158936256717e-05,
"loss": 1.9833,
"step": 169
},
{
"epoch": 0.3436078827690753,
"grad_norm": 5.395236968994141,
"learning_rate": 9.241179182619206e-05,
"loss": 1.6689,
"step": 170
},
{
"epoch": 0.34562910560889337,
"grad_norm": 6.057033061981201,
"learning_rate": 9.219929784447231e-05,
"loss": 1.9053,
"step": 171
},
{
"epoch": 0.34765032844871147,
"grad_norm": 5.41602897644043,
"learning_rate": 9.19841209272924e-05,
"loss": 1.56,
"step": 172
},
{
"epoch": 0.34967155128852956,
"grad_norm": 5.450588703155518,
"learning_rate": 9.17662747551117e-05,
"loss": 1.5914,
"step": 173
},
{
"epoch": 0.35169277412834765,
"grad_norm": 5.975580215454102,
"learning_rate": 9.154577317809482e-05,
"loss": 1.7152,
"step": 174
},
{
"epoch": 0.35371399696816574,
"grad_norm": 5.361056804656982,
"learning_rate": 9.132263021523096e-05,
"loss": 1.804,
"step": 175
},
{
"epoch": 0.35573521980798384,
"grad_norm": 5.825102806091309,
"learning_rate": 9.109686005344258e-05,
"loss": 1.5358,
"step": 176
},
{
"epoch": 0.35775644264780193,
"grad_norm": 5.946747779846191,
"learning_rate": 9.086847704668351e-05,
"loss": 1.964,
"step": 177
},
{
"epoch": 0.35977766548762,
"grad_norm": 5.494548320770264,
"learning_rate": 9.063749571502634e-05,
"loss": 1.4712,
"step": 178
},
{
"epoch": 0.3617988883274381,
"grad_norm": 5.114996433258057,
"learning_rate": 9.040393074373921e-05,
"loss": 1.5819,
"step": 179
},
{
"epoch": 0.3638201111672562,
"grad_norm": 5.512078285217285,
"learning_rate": 9.016779698235227e-05,
"loss": 1.6159,
"step": 180
},
{
"epoch": 0.3658413340070743,
"grad_norm": 5.07801628112793,
"learning_rate": 8.992910944371342e-05,
"loss": 1.4474,
"step": 181
},
{
"epoch": 0.3678625568468924,
"grad_norm": 6.879641056060791,
"learning_rate": 8.9687883303034e-05,
"loss": 1.7858,
"step": 182
},
{
"epoch": 0.36988377968671043,
"grad_norm": 5.092156887054443,
"learning_rate": 8.94441338969238e-05,
"loss": 1.4168,
"step": 183
},
{
"epoch": 0.3719050025265285,
"grad_norm": 5.373355865478516,
"learning_rate": 8.919787672241619e-05,
"loss": 1.5532,
"step": 184
},
{
"epoch": 0.3739262253663466,
"grad_norm": 5.264693737030029,
"learning_rate": 8.894912743598268e-05,
"loss": 1.3256,
"step": 185
},
{
"epoch": 0.3759474482061647,
"grad_norm": 5.930539608001709,
"learning_rate": 8.869790185253766e-05,
"loss": 1.6318,
"step": 186
},
{
"epoch": 0.3779686710459828,
"grad_norm": 6.245593547821045,
"learning_rate": 8.84442159444328e-05,
"loss": 1.6553,
"step": 187
},
{
"epoch": 0.3799898938858009,
"grad_norm": 6.605845928192139,
"learning_rate": 8.818808584044162e-05,
"loss": 1.1972,
"step": 188
},
{
"epoch": 0.382011116725619,
"grad_norm": 5.590306758880615,
"learning_rate": 8.792952782473413e-05,
"loss": 1.2016,
"step": 189
},
{
"epoch": 0.3840323395654371,
"grad_norm": 6.213796138763428,
"learning_rate": 8.76685583358414e-05,
"loss": 1.393,
"step": 190
},
{
"epoch": 0.3860535624052552,
"grad_norm": 6.904628753662109,
"learning_rate": 8.740519396561044e-05,
"loss": 1.5803,
"step": 191
},
{
"epoch": 0.3880747852450733,
"grad_norm": 6.069091796875,
"learning_rate": 8.713945145814946e-05,
"loss": 1.4332,
"step": 192
},
{
"epoch": 0.39009600808489137,
"grad_norm": 6.582734107971191,
"learning_rate": 8.687134770876319e-05,
"loss": 1.3049,
"step": 193
},
{
"epoch": 0.39211723092470946,
"grad_norm": 5.841890811920166,
"learning_rate": 8.660089976287875e-05,
"loss": 1.3647,
"step": 194
},
{
"epoch": 0.39413845376452755,
"grad_norm": 5.880384922027588,
"learning_rate": 8.632812481496195e-05,
"loss": 1.37,
"step": 195
},
{
"epoch": 0.39615967660434565,
"grad_norm": 5.35994291305542,
"learning_rate": 8.60530402074241e-05,
"loss": 1.103,
"step": 196
},
{
"epoch": 0.39818089944416374,
"grad_norm": 5.617462635040283,
"learning_rate": 8.577566342951943e-05,
"loss": 0.9969,
"step": 197
},
{
"epoch": 0.40020212228398183,
"grad_norm": 5.336188793182373,
"learning_rate": 8.549601211623316e-05,
"loss": 0.9578,
"step": 198
},
{
"epoch": 0.4022233451237999,
"grad_norm": 7.3241047859191895,
"learning_rate": 8.521410404716028e-05,
"loss": 1.4591,
"step": 199
},
{
"epoch": 0.40424456796361796,
"grad_norm": 5.509421348571777,
"learning_rate": 8.492995714537518e-05,
"loss": 1.113,
"step": 200
},
{
"epoch": 0.40626579080343606,
"grad_norm": 2.5618975162506104,
"learning_rate": 8.464358947629218e-05,
"loss": 1.4167,
"step": 201
},
{
"epoch": 0.40828701364325415,
"grad_norm": 3.311715602874756,
"learning_rate": 8.435501924651691e-05,
"loss": 1.3956,
"step": 202
},
{
"epoch": 0.41030823648307224,
"grad_norm": 2.9615299701690674,
"learning_rate": 8.406426480268881e-05,
"loss": 1.1263,
"step": 203
},
{
"epoch": 0.41232945932289033,
"grad_norm": 3.3075735569000244,
"learning_rate": 8.377134463031469e-05,
"loss": 1.3577,
"step": 204
},
{
"epoch": 0.41435068216270843,
"grad_norm": 3.4304211139678955,
"learning_rate": 8.347627735259343e-05,
"loss": 1.6109,
"step": 205
},
{
"epoch": 0.4163719050025265,
"grad_norm": 3.1346213817596436,
"learning_rate": 8.317908172923205e-05,
"loss": 1.1557,
"step": 206
},
{
"epoch": 0.4183931278423446,
"grad_norm": 3.6796648502349854,
"learning_rate": 8.287977665525292e-05,
"loss": 1.3844,
"step": 207
},
{
"epoch": 0.4204143506821627,
"grad_norm": 3.5582165718078613,
"learning_rate": 8.257838115979244e-05,
"loss": 1.4043,
"step": 208
},
{
"epoch": 0.4224355735219808,
"grad_norm": 4.690045356750488,
"learning_rate": 8.227491440489133e-05,
"loss": 1.9057,
"step": 209
},
{
"epoch": 0.4244567963617989,
"grad_norm": 4.603220462799072,
"learning_rate": 8.196939568427624e-05,
"loss": 2.0138,
"step": 210
},
{
"epoch": 0.426478019201617,
"grad_norm": 4.161036014556885,
"learning_rate": 8.166184442213313e-05,
"loss": 1.5959,
"step": 211
},
{
"epoch": 0.4284992420414351,
"grad_norm": 4.600170612335205,
"learning_rate": 8.135228017187237e-05,
"loss": 1.656,
"step": 212
},
{
"epoch": 0.4305204648812532,
"grad_norm": 4.487064838409424,
"learning_rate": 8.10407226148855e-05,
"loss": 1.8356,
"step": 213
},
{
"epoch": 0.43254168772107127,
"grad_norm": 4.709319591522217,
"learning_rate": 8.0727191559294e-05,
"loss": 1.6691,
"step": 214
},
{
"epoch": 0.43456291056088936,
"grad_norm": 4.594611644744873,
"learning_rate": 8.041170693868985e-05,
"loss": 1.5736,
"step": 215
},
{
"epoch": 0.43658413340070745,
"grad_norm": 4.373988628387451,
"learning_rate": 8.009428881086835e-05,
"loss": 1.5365,
"step": 216
},
{
"epoch": 0.4386053562405255,
"grad_norm": 4.379314422607422,
"learning_rate": 7.977495735655272e-05,
"loss": 1.3149,
"step": 217
},
{
"epoch": 0.4406265790803436,
"grad_norm": 5.018416881561279,
"learning_rate": 7.945373287811116e-05,
"loss": 1.6653,
"step": 218
},
{
"epoch": 0.4426478019201617,
"grad_norm": 5.0816969871521,
"learning_rate": 7.913063579826601e-05,
"loss": 1.6499,
"step": 219
},
{
"epoch": 0.44466902475997977,
"grad_norm": 4.877917289733887,
"learning_rate": 7.880568665879542e-05,
"loss": 1.6308,
"step": 220
},
{
"epoch": 0.44669024759979786,
"grad_norm": 4.950286388397217,
"learning_rate": 7.847890611922721e-05,
"loss": 1.4406,
"step": 221
},
{
"epoch": 0.44871147043961596,
"grad_norm": 5.2699480056762695,
"learning_rate": 7.815031495552549e-05,
"loss": 1.5366,
"step": 222
},
{
"epoch": 0.45073269327943405,
"grad_norm": 5.018312931060791,
"learning_rate": 7.781993405876972e-05,
"loss": 1.3646,
"step": 223
},
{
"epoch": 0.45275391611925214,
"grad_norm": 5.903739929199219,
"learning_rate": 7.748778443382658e-05,
"loss": 1.713,
"step": 224
},
{
"epoch": 0.45477513895907024,
"grad_norm": 5.679375648498535,
"learning_rate": 7.715388719801438e-05,
"loss": 1.6164,
"step": 225
},
{
"epoch": 0.45679636179888833,
"grad_norm": 5.779827117919922,
"learning_rate": 7.68182635797606e-05,
"loss": 1.8518,
"step": 226
},
{
"epoch": 0.4588175846387064,
"grad_norm": 5.680889129638672,
"learning_rate": 7.648093491725223e-05,
"loss": 1.6413,
"step": 227
},
{
"epoch": 0.4608388074785245,
"grad_norm": 6.258683681488037,
"learning_rate": 7.614192265707905e-05,
"loss": 1.4253,
"step": 228
},
{
"epoch": 0.4628600303183426,
"grad_norm": 6.006860733032227,
"learning_rate": 7.580124835287013e-05,
"loss": 1.5382,
"step": 229
},
{
"epoch": 0.4648812531581607,
"grad_norm": 5.656060695648193,
"learning_rate": 7.545893366392358e-05,
"loss": 1.491,
"step": 230
},
{
"epoch": 0.4669024759979788,
"grad_norm": 6.140045642852783,
"learning_rate": 7.511500035382942e-05,
"loss": 1.754,
"step": 231
},
{
"epoch": 0.4689236988377969,
"grad_norm": 6.063406467437744,
"learning_rate": 7.476947028908594e-05,
"loss": 1.5948,
"step": 232
},
{
"epoch": 0.470944921677615,
"grad_norm": 5.987018585205078,
"learning_rate": 7.442236543770944e-05,
"loss": 1.6062,
"step": 233
},
{
"epoch": 0.4729661445174331,
"grad_norm": 5.543860912322998,
"learning_rate": 7.407370786783757e-05,
"loss": 1.5026,
"step": 234
},
{
"epoch": 0.4749873673572511,
"grad_norm": 5.393790245056152,
"learning_rate": 7.372351974632634e-05,
"loss": 1.5174,
"step": 235
},
{
"epoch": 0.4770085901970692,
"grad_norm": 6.32440710067749,
"learning_rate": 7.33718233373407e-05,
"loss": 1.6588,
"step": 236
},
{
"epoch": 0.4790298130368873,
"grad_norm": 5.640255928039551,
"learning_rate": 7.301864100093912e-05,
"loss": 1.3006,
"step": 237
},
{
"epoch": 0.4810510358767054,
"grad_norm": 5.789175033569336,
"learning_rate": 7.266399519165192e-05,
"loss": 1.5062,
"step": 238
},
{
"epoch": 0.4830722587165235,
"grad_norm": 5.77946662902832,
"learning_rate": 7.230790845705379e-05,
"loss": 1.3896,
"step": 239
},
{
"epoch": 0.4850934815563416,
"grad_norm": 5.353186130523682,
"learning_rate": 7.195040343633007e-05,
"loss": 1.321,
"step": 240
},
{
"epoch": 0.48711470439615967,
"grad_norm": 7.649440765380859,
"learning_rate": 7.159150285883756e-05,
"loss": 2.033,
"step": 241
},
{
"epoch": 0.48913592723597776,
"grad_norm": 5.809901237487793,
"learning_rate": 7.123122954265941e-05,
"loss": 1.2447,
"step": 242
},
{
"epoch": 0.49115715007579586,
"grad_norm": 5.7666215896606445,
"learning_rate": 7.086960639315436e-05,
"loss": 1.3123,
"step": 243
},
{
"epoch": 0.49317837291561395,
"grad_norm": 6.335599422454834,
"learning_rate": 7.050665640150045e-05,
"loss": 1.6371,
"step": 244
},
{
"epoch": 0.49519959575543204,
"grad_norm": 5.487225532531738,
"learning_rate": 7.014240264323334e-05,
"loss": 1.2528,
"step": 245
},
{
"epoch": 0.49722081859525014,
"grad_norm": 5.477118968963623,
"learning_rate": 6.977686827677926e-05,
"loss": 1.1761,
"step": 246
},
{
"epoch": 0.49924204143506823,
"grad_norm": 4.509613037109375,
"learning_rate": 6.941007654198254e-05,
"loss": 1.0277,
"step": 247
},
{
"epoch": 0.5012632642748863,
"grad_norm": 7.54964542388916,
"learning_rate": 6.904205075862816e-05,
"loss": 1.6264,
"step": 248
},
{
"epoch": 0.5032844871147044,
"grad_norm": 7.848708152770996,
"learning_rate": 6.867281432495912e-05,
"loss": 1.4215,
"step": 249
},
{
"epoch": 0.5053057099545225,
"grad_norm": 7.514161109924316,
"learning_rate": 6.830239071618873e-05,
"loss": 1.4708,
"step": 250
},
{
"epoch": 0.5073269327943406,
"grad_norm": 2.8156673908233643,
"learning_rate": 6.793080348300833e-05,
"loss": 1.4503,
"step": 251
},
{
"epoch": 0.5093481556341587,
"grad_norm": 2.978626012802124,
"learning_rate": 6.755807625008974e-05,
"loss": 1.17,
"step": 252
},
{
"epoch": 0.5113693784739768,
"grad_norm": 3.320791482925415,
"learning_rate": 6.718423271458343e-05,
"loss": 1.4699,
"step": 253
},
{
"epoch": 0.5133906013137949,
"grad_norm": 3.1808319091796875,
"learning_rate": 6.680929664461185e-05,
"loss": 1.2698,
"step": 254
},
{
"epoch": 0.515411824153613,
"grad_norm": 3.3711514472961426,
"learning_rate": 6.643329187775827e-05,
"loss": 1.5507,
"step": 255
},
{
"epoch": 0.5174330469934311,
"grad_norm": 3.6619150638580322,
"learning_rate": 6.605624231955131e-05,
"loss": 1.6664,
"step": 256
},
{
"epoch": 0.5194542698332492,
"grad_norm": 4.01786994934082,
"learning_rate": 6.567817194194507e-05,
"loss": 1.7517,
"step": 257
},
{
"epoch": 0.5214754926730673,
"grad_norm": 4.117989540100098,
"learning_rate": 6.529910478179499e-05,
"loss": 1.7831,
"step": 258
},
{
"epoch": 0.5234967155128853,
"grad_norm": 3.7679736614227295,
"learning_rate": 6.491906493932968e-05,
"loss": 1.4514,
"step": 259
},
{
"epoch": 0.5255179383527034,
"grad_norm": 4.316635608673096,
"learning_rate": 6.45380765766187e-05,
"loss": 1.6405,
"step": 260
},
{
"epoch": 0.5275391611925214,
"grad_norm": 3.958988904953003,
"learning_rate": 6.415616391603638e-05,
"loss": 1.5774,
"step": 261
},
{
"epoch": 0.5295603840323395,
"grad_norm": 4.332874774932861,
"learning_rate": 6.377335123872177e-05,
"loss": 1.8736,
"step": 262
},
{
"epoch": 0.5315816068721576,
"grad_norm": 4.065393447875977,
"learning_rate": 6.338966288303499e-05,
"loss": 1.5071,
"step": 263
},
{
"epoch": 0.5336028297119757,
"grad_norm": 4.553988456726074,
"learning_rate": 6.300512324300975e-05,
"loss": 1.806,
"step": 264
},
{
"epoch": 0.5356240525517938,
"grad_norm": 4.563177108764648,
"learning_rate": 6.261975676680252e-05,
"loss": 1.567,
"step": 265
},
{
"epoch": 0.5376452753916119,
"grad_norm": 4.3816142082214355,
"learning_rate": 6.223358795513812e-05,
"loss": 1.6037,
"step": 266
},
{
"epoch": 0.53966649823143,
"grad_norm": 4.977108001708984,
"learning_rate": 6.184664135975203e-05,
"loss": 1.8076,
"step": 267
},
{
"epoch": 0.5416877210712481,
"grad_norm": 4.56311559677124,
"learning_rate": 6.145894158182944e-05,
"loss": 1.6309,
"step": 268
},
{
"epoch": 0.5437089439110662,
"grad_norm": 5.014670372009277,
"learning_rate": 6.107051327044124e-05,
"loss": 1.6022,
"step": 269
},
{
"epoch": 0.5457301667508843,
"grad_norm": 4.538066864013672,
"learning_rate": 6.068138112097674e-05,
"loss": 1.494,
"step": 270
},
{
"epoch": 0.5477513895907024,
"grad_norm": 5.387009143829346,
"learning_rate": 6.029156987357373e-05,
"loss": 1.7367,
"step": 271
},
{
"epoch": 0.5497726124305204,
"grad_norm": 4.673946857452393,
"learning_rate": 5.9901104311545487e-05,
"loss": 1.6585,
"step": 272
},
{
"epoch": 0.5517938352703385,
"grad_norm": 5.331272125244141,
"learning_rate": 5.9510009259805085e-05,
"loss": 1.7205,
"step": 273
},
{
"epoch": 0.5538150581101566,
"grad_norm": 5.224307537078857,
"learning_rate": 5.91183095832872e-05,
"loss": 1.8472,
"step": 274
},
{
"epoch": 0.5558362809499747,
"grad_norm": 4.787731170654297,
"learning_rate": 5.872603018536713e-05,
"loss": 1.5981,
"step": 275
},
{
"epoch": 0.5578575037897928,
"grad_norm": 5.033946990966797,
"learning_rate": 5.833319600627753e-05,
"loss": 1.5519,
"step": 276
},
{
"epoch": 0.5598787266296109,
"grad_norm": 5.653214931488037,
"learning_rate": 5.793983202152282e-05,
"loss": 1.9657,
"step": 277
},
{
"epoch": 0.561899949469429,
"grad_norm": 5.225715160369873,
"learning_rate": 5.7545963240291246e-05,
"loss": 1.4663,
"step": 278
},
{
"epoch": 0.5639211723092471,
"grad_norm": 5.952142715454102,
"learning_rate": 5.715161470386485e-05,
"loss": 1.8356,
"step": 279
},
{
"epoch": 0.5659423951490652,
"grad_norm": 5.057675361633301,
"learning_rate": 5.6756811484027425e-05,
"loss": 1.6058,
"step": 280
},
{
"epoch": 0.5679636179888833,
"grad_norm": 4.865301132202148,
"learning_rate": 5.636157868147054e-05,
"loss": 1.2382,
"step": 281
},
{
"epoch": 0.5699848408287014,
"grad_norm": 5.245824337005615,
"learning_rate": 5.596594142419759e-05,
"loss": 1.4634,
"step": 282
},
{
"epoch": 0.5720060636685195,
"grad_norm": 5.30856990814209,
"learning_rate": 5.556992486592634e-05,
"loss": 1.5013,
"step": 283
},
{
"epoch": 0.5740272865083376,
"grad_norm": 6.301365375518799,
"learning_rate": 5.517355418448961e-05,
"loss": 1.683,
"step": 284
},
{
"epoch": 0.5760485093481557,
"grad_norm": 5.439041614532471,
"learning_rate": 5.477685458023459e-05,
"loss": 1.4477,
"step": 285
},
{
"epoch": 0.5780697321879738,
"grad_norm": 5.788546085357666,
"learning_rate": 5.437985127442065e-05,
"loss": 1.5466,
"step": 286
},
{
"epoch": 0.5800909550277918,
"grad_norm": 4.990469932556152,
"learning_rate": 5.3982569507615775e-05,
"loss": 1.4082,
"step": 287
},
{
"epoch": 0.5821121778676099,
"grad_norm": 6.371885776519775,
"learning_rate": 5.3585034538091885e-05,
"loss": 1.5582,
"step": 288
},
{
"epoch": 0.584133400707428,
"grad_norm": 5.770207405090332,
"learning_rate": 5.318727164021896e-05,
"loss": 1.6081,
"step": 289
},
{
"epoch": 0.5861546235472461,
"grad_norm": 5.396596908569336,
"learning_rate": 5.278930610285813e-05,
"loss": 1.1804,
"step": 290
},
{
"epoch": 0.5881758463870642,
"grad_norm": 5.193579196929932,
"learning_rate": 5.239116322775391e-05,
"loss": 1.1155,
"step": 291
},
{
"epoch": 0.5901970692268823,
"grad_norm": 6.787877559661865,
"learning_rate": 5.1992868327925526e-05,
"loss": 1.7875,
"step": 292
},
{
"epoch": 0.5922182920667004,
"grad_norm": 5.3459696769714355,
"learning_rate": 5.159444672605759e-05,
"loss": 1.3469,
"step": 293
},
{
"epoch": 0.5942395149065185,
"grad_norm": 5.520552635192871,
"learning_rate": 5.119592375289015e-05,
"loss": 1.187,
"step": 294
},
{
"epoch": 0.5962607377463366,
"grad_norm": 5.6787519454956055,
"learning_rate": 5.079732474560821e-05,
"loss": 1.4493,
"step": 295
},
{
"epoch": 0.5982819605861546,
"grad_norm": 5.608784198760986,
"learning_rate": 5.0398675046230835e-05,
"loss": 1.2803,
"step": 296
},
{
"epoch": 0.6003031834259727,
"grad_norm": 4.6050872802734375,
"learning_rate": 5e-05,
"loss": 0.8699,
"step": 297
},
{
"epoch": 0.6023244062657908,
"grad_norm": 5.878370761871338,
"learning_rate": 4.960132495376918e-05,
"loss": 1.3753,
"step": 298
},
{
"epoch": 0.6043456291056089,
"grad_norm": 6.23378324508667,
"learning_rate": 4.92026752543918e-05,
"loss": 1.3375,
"step": 299
},
{
"epoch": 0.6063668519454269,
"grad_norm": 8.689998626708984,
"learning_rate": 4.8804076247109865e-05,
"loss": 1.3833,
"step": 300
},
{
"epoch": 0.608388074785245,
"grad_norm": 2.740164279937744,
"learning_rate": 4.840555327394241e-05,
"loss": 1.2242,
"step": 301
},
{
"epoch": 0.6104092976250631,
"grad_norm": 2.697038173675537,
"learning_rate": 4.800713167207449e-05,
"loss": 1.2152,
"step": 302
},
{
"epoch": 0.6124305204648812,
"grad_norm": 3.111619234085083,
"learning_rate": 4.760883677224609e-05,
"loss": 1.4117,
"step": 303
},
{
"epoch": 0.6144517433046993,
"grad_norm": 2.8779137134552,
"learning_rate": 4.721069389714188e-05,
"loss": 1.1105,
"step": 304
},
{
"epoch": 0.6164729661445174,
"grad_norm": 3.4639029502868652,
"learning_rate": 4.681272835978107e-05,
"loss": 1.4196,
"step": 305
},
{
"epoch": 0.6184941889843355,
"grad_norm": 4.024080753326416,
"learning_rate": 4.6414965461908126e-05,
"loss": 1.9051,
"step": 306
},
{
"epoch": 0.6205154118241536,
"grad_norm": 3.319389581680298,
"learning_rate": 4.601743049238424e-05,
"loss": 1.3579,
"step": 307
},
{
"epoch": 0.6225366346639717,
"grad_norm": 4.286203384399414,
"learning_rate": 4.562014872557935e-05,
"loss": 1.8763,
"step": 308
},
{
"epoch": 0.6245578575037898,
"grad_norm": 4.204199314117432,
"learning_rate": 4.522314541976541e-05,
"loss": 1.6859,
"step": 309
},
{
"epoch": 0.6265790803436079,
"grad_norm": 4.099395751953125,
"learning_rate": 4.482644581551039e-05,
"loss": 1.5438,
"step": 310
},
{
"epoch": 0.628600303183426,
"grad_norm": 4.58848237991333,
"learning_rate": 4.443007513407368e-05,
"loss": 1.9432,
"step": 311
},
{
"epoch": 0.6306215260232441,
"grad_norm": 4.800745010375977,
"learning_rate": 4.4034058575802424e-05,
"loss": 1.7121,
"step": 312
},
{
"epoch": 0.6326427488630622,
"grad_norm": 4.463362216949463,
"learning_rate": 4.3638421318529474e-05,
"loss": 1.6288,
"step": 313
},
{
"epoch": 0.6346639717028802,
"grad_norm": 4.3990864753723145,
"learning_rate": 4.324318851597258e-05,
"loss": 1.5733,
"step": 314
},
{
"epoch": 0.6366851945426983,
"grad_norm": 4.997748851776123,
"learning_rate": 4.284838529613516e-05,
"loss": 1.9203,
"step": 315
},
{
"epoch": 0.6387064173825164,
"grad_norm": 5.128242015838623,
"learning_rate": 4.2454036759708765e-05,
"loss": 1.8486,
"step": 316
},
{
"epoch": 0.6407276402223345,
"grad_norm": 4.875668525695801,
"learning_rate": 4.2060167978477184e-05,
"loss": 1.5951,
"step": 317
},
{
"epoch": 0.6427488630621526,
"grad_norm": 4.408964157104492,
"learning_rate": 4.166680399372248e-05,
"loss": 1.3977,
"step": 318
},
{
"epoch": 0.6447700859019707,
"grad_norm": 4.5807905197143555,
"learning_rate": 4.1273969814632894e-05,
"loss": 1.4649,
"step": 319
},
{
"epoch": 0.6467913087417888,
"grad_norm": 5.094422817230225,
"learning_rate": 4.0881690416712805e-05,
"loss": 1.6607,
"step": 320
},
{
"epoch": 0.6488125315816069,
"grad_norm": 4.936136722564697,
"learning_rate": 4.0489990740194926e-05,
"loss": 1.6117,
"step": 321
},
{
"epoch": 0.650833754421425,
"grad_norm": 5.264697074890137,
"learning_rate": 4.009889568845453e-05,
"loss": 1.6412,
"step": 322
},
{
"epoch": 0.6528549772612431,
"grad_norm": 4.067869663238525,
"learning_rate": 3.9708430126426284e-05,
"loss": 1.2319,
"step": 323
},
{
"epoch": 0.6548762001010612,
"grad_norm": 5.502519607543945,
"learning_rate": 3.9318618879023256e-05,
"loss": 1.6435,
"step": 324
},
{
"epoch": 0.6568974229408793,
"grad_norm": 5.370153903961182,
"learning_rate": 3.892948672955877e-05,
"loss": 1.6528,
"step": 325
},
{
"epoch": 0.6589186457806974,
"grad_norm": 4.501315593719482,
"learning_rate": 3.854105841817056e-05,
"loss": 1.4033,
"step": 326
},
{
"epoch": 0.6609398686205155,
"grad_norm": 5.458628177642822,
"learning_rate": 3.815335864024799e-05,
"loss": 1.7448,
"step": 327
},
{
"epoch": 0.6629610914603336,
"grad_norm": 5.266726016998291,
"learning_rate": 3.776641204486191e-05,
"loss": 1.6844,
"step": 328
},
{
"epoch": 0.6649823143001516,
"grad_norm": 4.972016334533691,
"learning_rate": 3.738024323319749e-05,
"loss": 1.66,
"step": 329
},
{
"epoch": 0.6670035371399696,
"grad_norm": 4.560551643371582,
"learning_rate": 3.699487675699026e-05,
"loss": 1.2507,
"step": 330
},
{
"epoch": 0.6690247599797877,
"grad_norm": 5.447690010070801,
"learning_rate": 3.661033711696501e-05,
"loss": 1.4381,
"step": 331
},
{
"epoch": 0.6710459828196058,
"grad_norm": 6.1798787117004395,
"learning_rate": 3.6226648761278235e-05,
"loss": 1.6519,
"step": 332
},
{
"epoch": 0.6730672056594239,
"grad_norm": 8.073100090026855,
"learning_rate": 3.584383608396362e-05,
"loss": 1.5615,
"step": 333
},
{
"epoch": 0.675088428499242,
"grad_norm": 6.568238735198975,
"learning_rate": 3.546192342338131e-05,
"loss": 1.5244,
"step": 334
},
{
"epoch": 0.6771096513390601,
"grad_norm": 5.141592979431152,
"learning_rate": 3.508093506067034e-05,
"loss": 1.3669,
"step": 335
},
{
"epoch": 0.6791308741788782,
"grad_norm": 5.7515950202941895,
"learning_rate": 3.470089521820502e-05,
"loss": 1.4939,
"step": 336
},
{
"epoch": 0.6811520970186963,
"grad_norm": 5.398025035858154,
"learning_rate": 3.432182805805495e-05,
"loss": 1.3243,
"step": 337
},
{
"epoch": 0.6831733198585144,
"grad_norm": 5.3287272453308105,
"learning_rate": 3.394375768044869e-05,
"loss": 1.3026,
"step": 338
},
{
"epoch": 0.6851945426983325,
"grad_norm": 5.701461315155029,
"learning_rate": 3.3566708122241756e-05,
"loss": 1.5187,
"step": 339
},
{
"epoch": 0.6872157655381506,
"grad_norm": 6.3350067138671875,
"learning_rate": 3.3190703355388166e-05,
"loss": 1.9201,
"step": 340
},
{
"epoch": 0.6892369883779687,
"grad_norm": 6.057320594787598,
"learning_rate": 3.2815767285416576e-05,
"loss": 1.3178,
"step": 341
},
{
"epoch": 0.6912582112177867,
"grad_norm": 4.327114105224609,
"learning_rate": 3.244192374991027e-05,
"loss": 1.0027,
"step": 342
},
{
"epoch": 0.6932794340576048,
"grad_norm": 5.175032615661621,
"learning_rate": 3.2069196516991686e-05,
"loss": 1.1705,
"step": 343
},
{
"epoch": 0.6953006568974229,
"grad_norm": 5.2318644523620605,
"learning_rate": 3.169760928381127e-05,
"loss": 1.1488,
"step": 344
},
{
"epoch": 0.697321879737241,
"grad_norm": 4.282010555267334,
"learning_rate": 3.13271856750409e-05,
"loss": 0.9302,
"step": 345
},
{
"epoch": 0.6993431025770591,
"grad_norm": 6.124838829040527,
"learning_rate": 3.095794924137184e-05,
"loss": 1.328,
"step": 346
},
{
"epoch": 0.7013643254168772,
"grad_norm": 4.892329216003418,
"learning_rate": 3.058992345801747e-05,
"loss": 1.0958,
"step": 347
},
{
"epoch": 0.7033855482566953,
"grad_norm": 5.745382308959961,
"learning_rate": 3.0223131723220756e-05,
"loss": 1.224,
"step": 348
},
{
"epoch": 0.7054067710965134,
"grad_norm": 7.191976547241211,
"learning_rate": 2.9857597356766674e-05,
"loss": 1.1652,
"step": 349
},
{
"epoch": 0.7074279939363315,
"grad_norm": 5.166662693023682,
"learning_rate": 2.9493343598499567e-05,
"loss": 1.0203,
"step": 350
},
{
"epoch": 0.7094492167761496,
"grad_norm": 2.4570610523223877,
"learning_rate": 2.913039360684565e-05,
"loss": 1.2315,
"step": 351
},
{
"epoch": 0.7114704396159677,
"grad_norm": 2.5633747577667236,
"learning_rate": 2.8768770457340575e-05,
"loss": 1.2238,
"step": 352
},
{
"epoch": 0.7134916624557858,
"grad_norm": 2.9392590522766113,
"learning_rate": 2.8408497141162438e-05,
"loss": 1.2994,
"step": 353
},
{
"epoch": 0.7155128852956039,
"grad_norm": 2.7536635398864746,
"learning_rate": 2.8049596563669932e-05,
"loss": 1.0344,
"step": 354
},
{
"epoch": 0.717534108135422,
"grad_norm": 3.7724685668945312,
"learning_rate": 2.769209154294623e-05,
"loss": 1.6256,
"step": 355
},
{
"epoch": 0.71955533097524,
"grad_norm": 3.661170721054077,
"learning_rate": 2.7336004808348093e-05,
"loss": 1.4207,
"step": 356
},
{
"epoch": 0.7215765538150581,
"grad_norm": 4.080316066741943,
"learning_rate": 2.69813589990609e-05,
"loss": 1.6689,
"step": 357
},
{
"epoch": 0.7235977766548762,
"grad_norm": 4.936508655548096,
"learning_rate": 2.662817666265932e-05,
"loss": 1.996,
"step": 358
},
{
"epoch": 0.7256189994946943,
"grad_norm": 4.0,
"learning_rate": 2.6276480253673662e-05,
"loss": 1.6679,
"step": 359
},
{
"epoch": 0.7276402223345124,
"grad_norm": 4.3267903327941895,
"learning_rate": 2.5926292132162433e-05,
"loss": 1.6934,
"step": 360
},
{
"epoch": 0.7296614451743305,
"grad_norm": 4.0602593421936035,
"learning_rate": 2.5577634562290564e-05,
"loss": 1.5712,
"step": 361
},
{
"epoch": 0.7316826680141486,
"grad_norm": 4.487820148468018,
"learning_rate": 2.5230529710914076e-05,
"loss": 1.5552,
"step": 362
},
{
"epoch": 0.7337038908539667,
"grad_norm": 4.205357074737549,
"learning_rate": 2.4884999646170597e-05,
"loss": 1.5065,
"step": 363
},
{
"epoch": 0.7357251136937848,
"grad_norm": 4.995065689086914,
"learning_rate": 2.4541066336076434e-05,
"loss": 1.7604,
"step": 364
},
{
"epoch": 0.7377463365336028,
"grad_norm": 4.657031059265137,
"learning_rate": 2.4198751647129897e-05,
"loss": 1.6113,
"step": 365
},
{
"epoch": 0.7397675593734209,
"grad_norm": 4.953658103942871,
"learning_rate": 2.3858077342920972e-05,
"loss": 1.4499,
"step": 366
},
{
"epoch": 0.741788782213239,
"grad_norm": 4.663423538208008,
"learning_rate": 2.3519065082747778e-05,
"loss": 1.6878,
"step": 367
},
{
"epoch": 0.743810005053057,
"grad_norm": 4.570845127105713,
"learning_rate": 2.3181736420239385e-05,
"loss": 1.5128,
"step": 368
},
{
"epoch": 0.7458312278928751,
"grad_norm": 4.937878608703613,
"learning_rate": 2.2846112801985632e-05,
"loss": 1.7156,
"step": 369
},
{
"epoch": 0.7478524507326932,
"grad_norm": 4.761300086975098,
"learning_rate": 2.251221556617344e-05,
"loss": 1.7288,
"step": 370
},
{
"epoch": 0.7498736735725113,
"grad_norm": 4.429453372955322,
"learning_rate": 2.2180065941230277e-05,
"loss": 1.4495,
"step": 371
},
{
"epoch": 0.7518948964123294,
"grad_norm": 3.9287309646606445,
"learning_rate": 2.1849685044474533e-05,
"loss": 1.2037,
"step": 372
},
{
"epoch": 0.7539161192521475,
"grad_norm": 5.036333084106445,
"learning_rate": 2.15210938807728e-05,
"loss": 1.4006,
"step": 373
},
{
"epoch": 0.7559373420919656,
"grad_norm": 4.479243755340576,
"learning_rate": 2.1194313341204597e-05,
"loss": 1.3916,
"step": 374
},
{
"epoch": 0.7579585649317837,
"grad_norm": 4.996969699859619,
"learning_rate": 2.0869364201733987e-05,
"loss": 1.2482,
"step": 375
},
{
"epoch": 0.7599797877716018,
"grad_norm": 5.1381449699401855,
"learning_rate": 2.054626712188886e-05,
"loss": 1.6205,
"step": 376
},
{
"epoch": 0.7620010106114199,
"grad_norm": 5.011663913726807,
"learning_rate": 2.0225042643447283e-05,
"loss": 1.6553,
"step": 377
},
{
"epoch": 0.764022233451238,
"grad_norm": 4.932290554046631,
"learning_rate": 1.990571118913166e-05,
"loss": 1.3811,
"step": 378
},
{
"epoch": 0.7660434562910561,
"grad_norm": 5.215028285980225,
"learning_rate": 1.9588293061310163e-05,
"loss": 1.4943,
"step": 379
},
{
"epoch": 0.7680646791308742,
"grad_norm": 4.604588985443115,
"learning_rate": 1.9272808440706026e-05,
"loss": 1.1947,
"step": 380
},
{
"epoch": 0.7700859019706923,
"grad_norm": 5.850764274597168,
"learning_rate": 1.8959277385114514e-05,
"loss": 1.4795,
"step": 381
},
{
"epoch": 0.7721071248105104,
"grad_norm": 5.08169412612915,
"learning_rate": 1.864771982812763e-05,
"loss": 1.4163,
"step": 382
},
{
"epoch": 0.7741283476503285,
"grad_norm": 5.118016719818115,
"learning_rate": 1.8338155577866873e-05,
"loss": 1.4816,
"step": 383
},
{
"epoch": 0.7761495704901465,
"grad_norm": 5.448619842529297,
"learning_rate": 1.8030604315723766e-05,
"loss": 1.3162,
"step": 384
},
{
"epoch": 0.7781707933299646,
"grad_norm": 6.636441707611084,
"learning_rate": 1.7725085595108682e-05,
"loss": 1.4221,
"step": 385
},
{
"epoch": 0.7801920161697827,
"grad_norm": 5.057902812957764,
"learning_rate": 1.7421618840207578e-05,
"loss": 1.4411,
"step": 386
},
{
"epoch": 0.7822132390096008,
"grad_norm": 4.244833946228027,
"learning_rate": 1.71202233447471e-05,
"loss": 1.0515,
"step": 387
},
{
"epoch": 0.7842344618494189,
"grad_norm": 4.543421745300293,
"learning_rate": 1.682091827076796e-05,
"loss": 1.3154,
"step": 388
},
{
"epoch": 0.786255684689237,
"grad_norm": 5.357529163360596,
"learning_rate": 1.6523722647406576e-05,
"loss": 1.3857,
"step": 389
},
{
"epoch": 0.7882769075290551,
"grad_norm": 5.082853317260742,
"learning_rate": 1.622865536968534e-05,
"loss": 1.2635,
"step": 390
},
{
"epoch": 0.7902981303688732,
"grad_norm": 5.57720947265625,
"learning_rate": 1.5935735197311202e-05,
"loss": 1.422,
"step": 391
},
{
"epoch": 0.7923193532086913,
"grad_norm": 5.431817054748535,
"learning_rate": 1.5644980753483107e-05,
"loss": 1.2788,
"step": 392
},
{
"epoch": 0.7943405760485094,
"grad_norm": 5.892661094665527,
"learning_rate": 1.5356410523707825e-05,
"loss": 1.5827,
"step": 393
},
{
"epoch": 0.7963617988883275,
"grad_norm": 5.396627902984619,
"learning_rate": 1.5070042854624834e-05,
"loss": 1.2314,
"step": 394
},
{
"epoch": 0.7983830217281456,
"grad_norm": 5.037806510925293,
"learning_rate": 1.4785895952839734e-05,
"loss": 1.2281,
"step": 395
},
{
"epoch": 0.8004042445679637,
"grad_norm": 6.198902130126953,
"learning_rate": 1.4503987883766857e-05,
"loss": 1.3708,
"step": 396
},
{
"epoch": 0.8024254674077818,
"grad_norm": 4.604989051818848,
"learning_rate": 1.4224336570480573e-05,
"loss": 0.9869,
"step": 397
},
{
"epoch": 0.8044466902475998,
"grad_norm": 6.081021785736084,
"learning_rate": 1.3946959792575915e-05,
"loss": 1.3921,
"step": 398
},
{
"epoch": 0.8064679130874179,
"grad_norm": 4.676353931427002,
"learning_rate": 1.3671875185038063e-05,
"loss": 0.9632,
"step": 399
},
{
"epoch": 0.8084891359272359,
"grad_norm": 6.441154956817627,
"learning_rate": 1.3399100237121265e-05,
"loss": 1.4048,
"step": 400
},
{
"epoch": 0.810510358767054,
"grad_norm": 2.4489996433258057,
"learning_rate": 1.312865229123681e-05,
"loss": 1.2112,
"step": 401
},
{
"epoch": 0.8125315816068721,
"grad_norm": 2.628634214401245,
"learning_rate": 1.2860548541850542e-05,
"loss": 1.1693,
"step": 402
},
{
"epoch": 0.8145528044466902,
"grad_norm": 3.1994330883026123,
"learning_rate": 1.2594806034389556e-05,
"loss": 1.3584,
"step": 403
},
{
"epoch": 0.8165740272865083,
"grad_norm": 2.7979207038879395,
"learning_rate": 1.2331441664158611e-05,
"loss": 1.1963,
"step": 404
},
{
"epoch": 0.8185952501263264,
"grad_norm": 3.172201156616211,
"learning_rate": 1.2070472175265856e-05,
"loss": 1.308,
"step": 405
},
{
"epoch": 0.8206164729661445,
"grad_norm": 3.279130458831787,
"learning_rate": 1.1811914159558374e-05,
"loss": 1.4214,
"step": 406
},
{
"epoch": 0.8226376958059626,
"grad_norm": 3.5428879261016846,
"learning_rate": 1.155578405556722e-05,
"loss": 1.5145,
"step": 407
},
{
"epoch": 0.8246589186457807,
"grad_norm": 4.0169830322265625,
"learning_rate": 1.1302098147462347e-05,
"loss": 1.6019,
"step": 408
},
{
"epoch": 0.8266801414855988,
"grad_norm": 4.363892078399658,
"learning_rate": 1.1050872564017328e-05,
"loss": 1.9656,
"step": 409
},
{
"epoch": 0.8287013643254169,
"grad_norm": 4.119546890258789,
"learning_rate": 1.0802123277583819e-05,
"loss": 1.6649,
"step": 410
},
{
"epoch": 0.830722587165235,
"grad_norm": 4.996504783630371,
"learning_rate": 1.0555866103076212e-05,
"loss": 1.8195,
"step": 411
},
{
"epoch": 0.832743810005053,
"grad_norm": 4.646332263946533,
"learning_rate": 1.0312116696966012e-05,
"loss": 1.9406,
"step": 412
},
{
"epoch": 0.8347650328448711,
"grad_norm": 4.677328109741211,
"learning_rate": 1.0070890556286577e-05,
"loss": 1.6832,
"step": 413
},
{
"epoch": 0.8367862556846892,
"grad_norm": 5.113035202026367,
"learning_rate": 9.832203017647745e-06,
"loss": 1.7782,
"step": 414
},
{
"epoch": 0.8388074785245073,
"grad_norm": 4.52169942855835,
"learning_rate": 9.596069256260792e-06,
"loss": 1.5927,
"step": 415
},
{
"epoch": 0.8408287013643254,
"grad_norm": 4.952767848968506,
"learning_rate": 9.362504284973683e-06,
"loss": 1.9372,
"step": 416
},
{
"epoch": 0.8428499242041435,
"grad_norm": 5.01066255569458,
"learning_rate": 9.131522953316501e-06,
"loss": 1.5843,
"step": 417
},
{
"epoch": 0.8448711470439616,
"grad_norm": 4.628442764282227,
"learning_rate": 8.903139946557438e-06,
"loss": 1.4746,
"step": 418
},
{
"epoch": 0.8468923698837797,
"grad_norm": 4.917454242706299,
"learning_rate": 8.67736978476904e-06,
"loss": 1.6411,
"step": 419
},
{
"epoch": 0.8489135927235978,
"grad_norm": 5.071280002593994,
"learning_rate": 8.45422682190517e-06,
"loss": 1.6055,
"step": 420
},
{
"epoch": 0.8509348155634159,
"grad_norm": 4.860715866088867,
"learning_rate": 8.233725244888291e-06,
"loss": 1.4997,
"step": 421
},
{
"epoch": 0.852956038403234,
"grad_norm": 4.802791595458984,
"learning_rate": 8.01587907270761e-06,
"loss": 1.3955,
"step": 422
},
{
"epoch": 0.8549772612430521,
"grad_norm": 4.843109607696533,
"learning_rate": 7.800702155527696e-06,
"loss": 1.4801,
"step": 423
},
{
"epoch": 0.8569984840828702,
"grad_norm": 5.050382137298584,
"learning_rate": 7.588208173807943e-06,
"loss": 1.5216,
"step": 424
},
{
"epoch": 0.8590197069226883,
"grad_norm": 4.331174373626709,
"learning_rate": 7.378410637432847e-06,
"loss": 1.4275,
"step": 425
},
{
"epoch": 0.8610409297625063,
"grad_norm": 5.057369709014893,
"learning_rate": 7.171322884852988e-06,
"loss": 1.6479,
"step": 426
},
{
"epoch": 0.8630621526023244,
"grad_norm": 5.058509826660156,
"learning_rate": 6.966958082237096e-06,
"loss": 1.483,
"step": 427
},
{
"epoch": 0.8650833754421425,
"grad_norm": 5.39819860458374,
"learning_rate": 6.765329222634892e-06,
"loss": 1.6475,
"step": 428
},
{
"epoch": 0.8671045982819606,
"grad_norm": 5.942454814910889,
"learning_rate": 6.566449125151086e-06,
"loss": 1.4512,
"step": 429
},
{
"epoch": 0.8691258211217787,
"grad_norm": 5.583020210266113,
"learning_rate": 6.370330434130317e-06,
"loss": 1.5802,
"step": 430
},
{
"epoch": 0.8711470439615968,
"grad_norm": 5.76262903213501,
"learning_rate": 6.176985618353282e-06,
"loss": 1.7253,
"step": 431
},
{
"epoch": 0.8731682668014149,
"grad_norm": 5.057263374328613,
"learning_rate": 5.9864269702440075e-06,
"loss": 1.424,
"step": 432
},
{
"epoch": 0.875189489641233,
"grad_norm": 7.444010257720947,
"learning_rate": 5.798666605088293e-06,
"loss": 1.4872,
"step": 433
},
{
"epoch": 0.877210712481051,
"grad_norm": 4.746302127838135,
"learning_rate": 5.613716460263485e-06,
"loss": 1.1225,
"step": 434
},
{
"epoch": 0.8792319353208691,
"grad_norm": 4.994941234588623,
"learning_rate": 5.431588294479478e-06,
"loss": 1.2954,
"step": 435
},
{
"epoch": 0.8812531581606872,
"grad_norm": 5.951868057250977,
"learning_rate": 5.2522936870311955e-06,
"loss": 1.4107,
"step": 436
},
{
"epoch": 0.8832743810005053,
"grad_norm": 5.421668529510498,
"learning_rate": 5.0758440370623214e-06,
"loss": 1.8054,
"step": 437
},
{
"epoch": 0.8852956038403234,
"grad_norm": 6.007269382476807,
"learning_rate": 4.902250562840621e-06,
"loss": 1.6518,
"step": 438
},
{
"epoch": 0.8873168266801414,
"grad_norm": 4.964114189147949,
"learning_rate": 4.731524301044715e-06,
"loss": 1.1705,
"step": 439
},
{
"epoch": 0.8893380495199595,
"grad_norm": 5.869405269622803,
"learning_rate": 4.563676106062331e-06,
"loss": 1.6333,
"step": 440
},
{
"epoch": 0.8913592723597776,
"grad_norm": 5.429330348968506,
"learning_rate": 4.398716649300311e-06,
"loss": 1.277,
"step": 441
},
{
"epoch": 0.8933804951995957,
"grad_norm": 6.709824562072754,
"learning_rate": 4.236656418506013e-06,
"loss": 1.4216,
"step": 442
},
{
"epoch": 0.8954017180394138,
"grad_norm": 5.117186546325684,
"learning_rate": 4.077505717100666e-06,
"loss": 1.2206,
"step": 443
},
{
"epoch": 0.8974229408792319,
"grad_norm": 4.522552967071533,
"learning_rate": 3.921274663524182e-06,
"loss": 0.9311,
"step": 444
},
{
"epoch": 0.89944416371905,
"grad_norm": 6.130990505218506,
"learning_rate": 3.767973190591906e-06,
"loss": 1.3967,
"step": 445
},
{
"epoch": 0.9014653865588681,
"grad_norm": 4.977992057800293,
"learning_rate": 3.6176110448631394e-06,
"loss": 1.062,
"step": 446
},
{
"epoch": 0.9034866093986862,
"grad_norm": 4.500341892242432,
"learning_rate": 3.4701977860213953e-06,
"loss": 0.9393,
"step": 447
},
{
"epoch": 0.9055078322385043,
"grad_norm": 5.150080680847168,
"learning_rate": 3.325742786266689e-06,
"loss": 1.1028,
"step": 448
},
{
"epoch": 0.9075290550783224,
"grad_norm": 5.322256088256836,
"learning_rate": 3.184255229719624e-06,
"loss": 1.0733,
"step": 449
},
{
"epoch": 0.9095502779181405,
"grad_norm": 7.01747989654541,
"learning_rate": 3.0457441118375283e-06,
"loss": 1.5135,
"step": 450
},
{
"epoch": 0.9115715007579586,
"grad_norm": 2.3920202255249023,
"learning_rate": 2.91021823884251e-06,
"loss": 1.0922,
"step": 451
},
{
"epoch": 0.9135927235977767,
"grad_norm": 2.5058305263519287,
"learning_rate": 2.7776862271615912e-06,
"loss": 1.1713,
"step": 452
},
{
"epoch": 0.9156139464375948,
"grad_norm": 2.848264217376709,
"learning_rate": 2.6481565028789067e-06,
"loss": 1.1941,
"step": 453
},
{
"epoch": 0.9176351692774128,
"grad_norm": 3.537458896636963,
"learning_rate": 2.5216373011999695e-06,
"loss": 1.5756,
"step": 454
},
{
"epoch": 0.9196563921172309,
"grad_norm": 3.5460147857666016,
"learning_rate": 2.3981366659281134e-06,
"loss": 1.3457,
"step": 455
},
{
"epoch": 0.921677614957049,
"grad_norm": 4.265406608581543,
"learning_rate": 2.277662448953066e-06,
"loss": 1.7724,
"step": 456
},
{
"epoch": 0.9236988377968671,
"grad_norm": 3.961560010910034,
"learning_rate": 2.1602223097517913e-06,
"loss": 1.7004,
"step": 457
},
{
"epoch": 0.9257200606366852,
"grad_norm": 4.308675289154053,
"learning_rate": 2.0458237149014347e-06,
"loss": 1.7709,
"step": 458
},
{
"epoch": 0.9277412834765033,
"grad_norm": 3.9854753017425537,
"learning_rate": 1.9344739376047083e-06,
"loss": 1.6308,
"step": 459
},
{
"epoch": 0.9297625063163214,
"grad_norm": 4.271021366119385,
"learning_rate": 1.8261800572274001e-06,
"loss": 1.65,
"step": 460
},
{
"epoch": 0.9317837291561395,
"grad_norm": 4.231322288513184,
"learning_rate": 1.7209489588483395e-06,
"loss": 1.529,
"step": 461
},
{
"epoch": 0.9338049519959576,
"grad_norm": 3.8486905097961426,
"learning_rate": 1.6187873328216142e-06,
"loss": 1.3554,
"step": 462
},
{
"epoch": 0.9358261748357757,
"grad_norm": 3.996654510498047,
"learning_rate": 1.519701674351265e-06,
"loss": 1.4264,
"step": 463
},
{
"epoch": 0.9378473976755938,
"grad_norm": 4.656818389892578,
"learning_rate": 1.4236982830782674e-06,
"loss": 1.5068,
"step": 464
},
{
"epoch": 0.9398686205154119,
"grad_norm": 4.660806179046631,
"learning_rate": 1.3307832626800964e-06,
"loss": 1.5453,
"step": 465
},
{
"epoch": 0.94188984335523,
"grad_norm": 4.372664928436279,
"learning_rate": 1.2409625204825803e-06,
"loss": 1.6273,
"step": 466
},
{
"epoch": 0.943911066195048,
"grad_norm": 4.731912612915039,
"learning_rate": 1.1542417670844074e-06,
"loss": 1.5745,
"step": 467
},
{
"epoch": 0.9459322890348661,
"grad_norm": 4.093106269836426,
"learning_rate": 1.0706265159939943e-06,
"loss": 1.2102,
"step": 468
},
{
"epoch": 0.9479535118746841,
"grad_norm": 4.671316623687744,
"learning_rate": 9.901220832790103e-07,
"loss": 1.3927,
"step": 469
},
{
"epoch": 0.9499747347145022,
"grad_norm": 5.522085666656494,
"learning_rate": 9.12733587228326e-07,
"loss": 1.7408,
"step": 470
},
{
"epoch": 0.9519959575543203,
"grad_norm": 5.045165061950684,
"learning_rate": 8.384659480266732e-07,
"loss": 1.4933,
"step": 471
},
{
"epoch": 0.9540171803941384,
"grad_norm": 5.388870716094971,
"learning_rate": 7.673238874417677e-07,
"loss": 1.6984,
"step": 472
},
{
"epoch": 0.9560384032339565,
"grad_norm": 4.820789813995361,
"learning_rate": 6.993119285241601e-07,
"loss": 1.5436,
"step": 473
},
{
"epoch": 0.9580596260737746,
"grad_norm": 5.232425212860107,
"learning_rate": 6.344343953196385e-07,
"loss": 1.5351,
"step": 474
},
{
"epoch": 0.9600808489135927,
"grad_norm": 4.9825639724731445,
"learning_rate": 5.726954125943318e-07,
"loss": 1.5842,
"step": 475
},
{
"epoch": 0.9621020717534108,
"grad_norm": 5.142207145690918,
"learning_rate": 5.140989055724687e-07,
"loss": 1.6371,
"step": 476
},
{
"epoch": 0.9641232945932289,
"grad_norm": 5.354437828063965,
"learning_rate": 4.5864859968679506e-07,
"loss": 1.4612,
"step": 477
},
{
"epoch": 0.966144517433047,
"grad_norm": 5.071149826049805,
"learning_rate": 4.0634802034176244e-07,
"loss": 1.3784,
"step": 478
}
],
"logging_steps": 1,
"max_steps": 494,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 239,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1989875311968256e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}