alpaca_struq / trainer_state.json
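The rest of this file is the raw training state as saved by the Hugging Face `transformers` Trainer for the alpaca_struq run: `log_history` holds one record per logged optimizer step, each with its `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`. As a minimal sketch of how the log can be inspected (assuming Python 3 with matplotlib installed; the file path below is a placeholder for wherever this JSON is stored), the loss curve can be plotted like this:

# Minimal sketch, not part of the original checkpoint: parse trainer_state.json
# and plot training loss against global step. Assumes Python 3 with matplotlib;
# "trainer_state.json" is a placeholder path for this file.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only records that carry a training loss (eval records, if any, omit it).
train_log = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_log]
losses = [e["loss"] for e in train_log]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("alpaca_struq training loss")
plt.show()

The JSON content follows unchanged below.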
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9972179289026277,
"eval_steps": 500,
"global_step": 1212,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 12.10108470916748,
"learning_rate": 5.405405405405406e-07,
"loss": 1.3381,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 7.443129539489746,
"learning_rate": 1.0810810810810812e-06,
"loss": 1.3317,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 7.6787285804748535,
"learning_rate": 1.6216216216216219e-06,
"loss": 1.3403,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 6.186384677886963,
"learning_rate": 2.1621621621621623e-06,
"loss": 1.3279,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 12.216065406799316,
"learning_rate": 2.702702702702703e-06,
"loss": 1.3095,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 9.98418140411377,
"learning_rate": 3.2432432432432437e-06,
"loss": 1.2631,
"step": 6
},
{
"epoch": 0.02,
"grad_norm": 5.183889389038086,
"learning_rate": 3.7837837837837844e-06,
"loss": 1.1647,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 4.002997398376465,
"learning_rate": 4.324324324324325e-06,
"loss": 1.1655,
"step": 8
},
{
"epoch": 0.02,
"grad_norm": 5.9775800704956055,
"learning_rate": 4.864864864864866e-06,
"loss": 1.165,
"step": 9
},
{
"epoch": 0.02,
"grad_norm": 2.6240146160125732,
"learning_rate": 5.405405405405406e-06,
"loss": 1.1835,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 2.16780686378479,
"learning_rate": 5.945945945945947e-06,
"loss": 1.1001,
"step": 11
},
{
"epoch": 0.03,
"grad_norm": 1.752638816833496,
"learning_rate": 6.486486486486487e-06,
"loss": 1.1154,
"step": 12
},
{
"epoch": 0.03,
"grad_norm": 1.459815502166748,
"learning_rate": 7.027027027027028e-06,
"loss": 1.0633,
"step": 13
},
{
"epoch": 0.03,
"grad_norm": 1.4126054048538208,
"learning_rate": 7.567567567567569e-06,
"loss": 1.0693,
"step": 14
},
{
"epoch": 0.04,
"grad_norm": 1.370827555656433,
"learning_rate": 8.108108108108109e-06,
"loss": 1.0763,
"step": 15
},
{
"epoch": 0.04,
"grad_norm": 1.2742247581481934,
"learning_rate": 8.64864864864865e-06,
"loss": 1.0182,
"step": 16
},
{
"epoch": 0.04,
"grad_norm": 1.2456259727478027,
"learning_rate": 9.189189189189191e-06,
"loss": 1.0418,
"step": 17
},
{
"epoch": 0.04,
"grad_norm": 1.1927070617675781,
"learning_rate": 9.729729729729732e-06,
"loss": 1.0756,
"step": 18
},
{
"epoch": 0.05,
"grad_norm": 1.1686992645263672,
"learning_rate": 1.027027027027027e-05,
"loss": 1.035,
"step": 19
},
{
"epoch": 0.05,
"grad_norm": 1.2025060653686523,
"learning_rate": 1.0810810810810812e-05,
"loss": 1.0085,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 1.0982664823532104,
"learning_rate": 1.1351351351351352e-05,
"loss": 1.0811,
"step": 21
},
{
"epoch": 0.05,
"grad_norm": 1.089470624923706,
"learning_rate": 1.1891891891891894e-05,
"loss": 1.0245,
"step": 22
},
{
"epoch": 0.06,
"grad_norm": 1.0580064058303833,
"learning_rate": 1.2432432432432433e-05,
"loss": 1.0332,
"step": 23
},
{
"epoch": 0.06,
"grad_norm": 1.1338077783584595,
"learning_rate": 1.2972972972972975e-05,
"loss": 1.0092,
"step": 24
},
{
"epoch": 0.06,
"grad_norm": 1.0969806909561157,
"learning_rate": 1.3513513513513515e-05,
"loss": 1.0386,
"step": 25
},
{
"epoch": 0.06,
"grad_norm": 1.1216936111450195,
"learning_rate": 1.4054054054054055e-05,
"loss": 0.96,
"step": 26
},
{
"epoch": 0.07,
"grad_norm": 1.131313681602478,
"learning_rate": 1.4594594594594596e-05,
"loss": 1.053,
"step": 27
},
{
"epoch": 0.07,
"grad_norm": 1.070252537727356,
"learning_rate": 1.5135135135135138e-05,
"loss": 1.0161,
"step": 28
},
{
"epoch": 0.07,
"grad_norm": 1.0395774841308594,
"learning_rate": 1.5675675675675676e-05,
"loss": 1.0493,
"step": 29
},
{
"epoch": 0.07,
"grad_norm": 1.0543193817138672,
"learning_rate": 1.6216216216216218e-05,
"loss": 1.0496,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 1.0976182222366333,
"learning_rate": 1.6756756756756757e-05,
"loss": 1.0433,
"step": 31
},
{
"epoch": 0.08,
"grad_norm": 1.112975001335144,
"learning_rate": 1.72972972972973e-05,
"loss": 0.9491,
"step": 32
},
{
"epoch": 0.08,
"grad_norm": 0.9319718480110168,
"learning_rate": 1.783783783783784e-05,
"loss": 1.0123,
"step": 33
},
{
"epoch": 0.08,
"grad_norm": 1.1316465139389038,
"learning_rate": 1.8378378378378383e-05,
"loss": 1.0137,
"step": 34
},
{
"epoch": 0.09,
"grad_norm": 1.0440936088562012,
"learning_rate": 1.891891891891892e-05,
"loss": 1.0521,
"step": 35
},
{
"epoch": 0.09,
"grad_norm": 1.0576587915420532,
"learning_rate": 1.9459459459459463e-05,
"loss": 1.0321,
"step": 36
},
{
"epoch": 0.09,
"grad_norm": 1.0005910396575928,
"learning_rate": 2e-05,
"loss": 1.0236,
"step": 37
},
{
"epoch": 0.09,
"grad_norm": 1.0735234022140503,
"learning_rate": 1.9999964256773125e-05,
"loss": 1.0013,
"step": 38
},
{
"epoch": 0.1,
"grad_norm": 1.1121292114257812,
"learning_rate": 1.9999857027348008e-05,
"loss": 0.9861,
"step": 39
},
{
"epoch": 0.1,
"grad_norm": 1.0172284841537476,
"learning_rate": 1.9999678312491194e-05,
"loss": 1.0562,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 0.9417063593864441,
"learning_rate": 1.999942811348026e-05,
"loss": 0.9934,
"step": 41
},
{
"epoch": 0.1,
"grad_norm": 0.9323516488075256,
"learning_rate": 1.9999106432103785e-05,
"loss": 0.9927,
"step": 42
},
{
"epoch": 0.11,
"grad_norm": 1.0684179067611694,
"learning_rate": 1.999871327066135e-05,
"loss": 0.9595,
"step": 43
},
{
"epoch": 0.11,
"grad_norm": 1.0439395904541016,
"learning_rate": 1.9998248631963532e-05,
"loss": 1.0175,
"step": 44
},
{
"epoch": 0.11,
"grad_norm": 0.9043775796890259,
"learning_rate": 1.999771251933187e-05,
"loss": 0.9598,
"step": 45
},
{
"epoch": 0.11,
"grad_norm": 0.9753316044807434,
"learning_rate": 1.999710493659884e-05,
"loss": 0.9802,
"step": 46
},
{
"epoch": 0.12,
"grad_norm": 1.0120280981063843,
"learning_rate": 1.999642588810784e-05,
"loss": 1.0248,
"step": 47
},
{
"epoch": 0.12,
"grad_norm": 1.0215895175933838,
"learning_rate": 1.999567537871314e-05,
"loss": 0.9361,
"step": 48
},
{
"epoch": 0.12,
"grad_norm": 1.006943702697754,
"learning_rate": 1.999485341377987e-05,
"loss": 0.9485,
"step": 49
},
{
"epoch": 0.12,
"grad_norm": 0.9099878668785095,
"learning_rate": 1.9993959999183964e-05,
"loss": 0.9008,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 0.9488409161567688,
"learning_rate": 1.9992995141312126e-05,
"loss": 0.9229,
"step": 51
},
{
"epoch": 0.13,
"grad_norm": 1.0199425220489502,
"learning_rate": 1.9991958847061786e-05,
"loss": 0.9754,
"step": 52
},
{
"epoch": 0.13,
"grad_norm": 1.0538727045059204,
"learning_rate": 1.999085112384104e-05,
"loss": 1.0028,
"step": 53
},
{
"epoch": 0.13,
"grad_norm": 1.098109483718872,
"learning_rate": 1.998967197956861e-05,
"loss": 0.9324,
"step": 54
},
{
"epoch": 0.14,
"grad_norm": 1.155998945236206,
"learning_rate": 1.998842142267378e-05,
"loss": 0.9955,
"step": 55
},
{
"epoch": 0.14,
"grad_norm": 0.972683310508728,
"learning_rate": 1.9987099462096342e-05,
"loss": 0.9839,
"step": 56
},
{
"epoch": 0.14,
"grad_norm": 0.8845910429954529,
"learning_rate": 1.9985706107286515e-05,
"loss": 0.9754,
"step": 57
},
{
"epoch": 0.14,
"grad_norm": 0.9837936162948608,
"learning_rate": 1.9984241368204907e-05,
"loss": 0.949,
"step": 58
},
{
"epoch": 0.15,
"grad_norm": 0.8097039461135864,
"learning_rate": 1.998270525532241e-05,
"loss": 0.939,
"step": 59
},
{
"epoch": 0.15,
"grad_norm": 0.9656291007995605,
"learning_rate": 1.9981097779620156e-05,
"loss": 0.9329,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 0.959920346736908,
"learning_rate": 1.9979418952589417e-05,
"loss": 1.0008,
"step": 61
},
{
"epoch": 0.15,
"grad_norm": 0.9785797595977783,
"learning_rate": 1.9977668786231536e-05,
"loss": 0.9345,
"step": 62
},
{
"epoch": 0.16,
"grad_norm": 0.8965155482292175,
"learning_rate": 1.9975847293057822e-05,
"loss": 0.9304,
"step": 63
},
{
"epoch": 0.16,
"grad_norm": 1.1089284420013428,
"learning_rate": 1.9973954486089494e-05,
"loss": 1.015,
"step": 64
},
{
"epoch": 0.16,
"grad_norm": 0.9888070225715637,
"learning_rate": 1.997199037885755e-05,
"loss": 1.0032,
"step": 65
},
{
"epoch": 0.16,
"grad_norm": 0.9456419348716736,
"learning_rate": 1.9969954985402702e-05,
"loss": 0.9988,
"step": 66
},
{
"epoch": 0.17,
"grad_norm": 0.9384573698043823,
"learning_rate": 1.9967848320275253e-05,
"loss": 1.013,
"step": 67
},
{
"epoch": 0.17,
"grad_norm": 1.035064935684204,
"learning_rate": 1.9965670398535004e-05,
"loss": 0.9888,
"step": 68
},
{
"epoch": 0.17,
"grad_norm": 0.9343249201774597,
"learning_rate": 1.996342123575115e-05,
"loss": 0.9788,
"step": 69
},
{
"epoch": 0.17,
"grad_norm": 1.063715934753418,
"learning_rate": 1.9961100848002154e-05,
"loss": 0.949,
"step": 70
},
{
"epoch": 0.18,
"grad_norm": 1.094469666481018,
"learning_rate": 1.9958709251875642e-05,
"loss": 0.9659,
"step": 71
},
{
"epoch": 0.18,
"grad_norm": 0.899391770362854,
"learning_rate": 1.9956246464468294e-05,
"loss": 0.9735,
"step": 72
},
{
"epoch": 0.18,
"grad_norm": 0.8771396279335022,
"learning_rate": 1.9953712503385702e-05,
"loss": 0.9622,
"step": 73
},
{
"epoch": 0.18,
"grad_norm": 0.9920835494995117,
"learning_rate": 1.995110738674225e-05,
"loss": 0.9235,
"step": 74
},
{
"epoch": 0.19,
"grad_norm": 1.021485447883606,
"learning_rate": 1.9948431133160998e-05,
"loss": 0.9669,
"step": 75
},
{
"epoch": 0.19,
"grad_norm": 0.9151023030281067,
"learning_rate": 1.9945683761773533e-05,
"loss": 0.9663,
"step": 76
},
{
"epoch": 0.19,
"grad_norm": 1.0832756757736206,
"learning_rate": 1.9942865292219837e-05,
"loss": 1.0037,
"step": 77
},
{
"epoch": 0.19,
"grad_norm": 0.9813768863677979,
"learning_rate": 1.9939975744648152e-05,
"loss": 0.9182,
"step": 78
},
{
"epoch": 0.2,
"grad_norm": 0.9876184463500977,
"learning_rate": 1.9937015139714825e-05,
"loss": 0.9985,
"step": 79
},
{
"epoch": 0.2,
"grad_norm": 1.0073497295379639,
"learning_rate": 1.9933983498584175e-05,
"loss": 0.9705,
"step": 80
},
{
"epoch": 0.2,
"grad_norm": 1.095999002456665,
"learning_rate": 1.9930880842928325e-05,
"loss": 0.9947,
"step": 81
},
{
"epoch": 0.2,
"grad_norm": 1.0470014810562134,
"learning_rate": 1.9927707194927067e-05,
"loss": 0.9406,
"step": 82
},
{
"epoch": 0.21,
"grad_norm": 1.0602818727493286,
"learning_rate": 1.9924462577267676e-05,
"loss": 0.96,
"step": 83
},
{
"epoch": 0.21,
"grad_norm": 0.9280155897140503,
"learning_rate": 1.9921147013144782e-05,
"loss": 0.9749,
"step": 84
},
{
"epoch": 0.21,
"grad_norm": 1.0354458093643188,
"learning_rate": 1.991776052626017e-05,
"loss": 0.9528,
"step": 85
},
{
"epoch": 0.21,
"grad_norm": 0.9226938486099243,
"learning_rate": 1.9914303140822634e-05,
"loss": 0.9673,
"step": 86
},
{
"epoch": 0.22,
"grad_norm": 1.0336785316467285,
"learning_rate": 1.9910774881547803e-05,
"loss": 0.9649,
"step": 87
},
{
"epoch": 0.22,
"grad_norm": 0.9763470888137817,
"learning_rate": 1.9907175773657945e-05,
"loss": 1.0167,
"step": 88
},
{
"epoch": 0.22,
"grad_norm": 1.0126240253448486,
"learning_rate": 1.990350584288181e-05,
"loss": 1.0057,
"step": 89
},
{
"epoch": 0.22,
"grad_norm": 0.8690929412841797,
"learning_rate": 1.989976511545443e-05,
"loss": 0.9853,
"step": 90
},
{
"epoch": 0.23,
"grad_norm": 0.8878690600395203,
"learning_rate": 1.9895953618116935e-05,
"loss": 0.9533,
"step": 91
},
{
"epoch": 0.23,
"grad_norm": 0.9222694039344788,
"learning_rate": 1.9892071378116378e-05,
"loss": 0.9911,
"step": 92
},
{
"epoch": 0.23,
"grad_norm": 0.9882602095603943,
"learning_rate": 1.9888118423205504e-05,
"loss": 0.9798,
"step": 93
},
{
"epoch": 0.23,
"grad_norm": 0.8841018080711365,
"learning_rate": 1.9884094781642592e-05,
"loss": 0.9744,
"step": 94
},
{
"epoch": 0.23,
"grad_norm": 0.9339718222618103,
"learning_rate": 1.988000048219123e-05,
"loss": 0.9685,
"step": 95
},
{
"epoch": 0.24,
"grad_norm": 0.9282500147819519,
"learning_rate": 1.9875835554120114e-05,
"loss": 1.0026,
"step": 96
},
{
"epoch": 0.24,
"grad_norm": 0.9409290552139282,
"learning_rate": 1.987160002720283e-05,
"loss": 0.9889,
"step": 97
},
{
"epoch": 0.24,
"grad_norm": 0.9274194240570068,
"learning_rate": 1.9867293931717664e-05,
"loss": 0.9946,
"step": 98
},
{
"epoch": 0.24,
"grad_norm": 0.8885007500648499,
"learning_rate": 1.9862917298447365e-05,
"loss": 0.9662,
"step": 99
},
{
"epoch": 0.25,
"grad_norm": 0.8919392824172974,
"learning_rate": 1.9858470158678932e-05,
"loss": 0.9725,
"step": 100
},
{
"epoch": 0.25,
"grad_norm": 0.9394062757492065,
"learning_rate": 1.9853952544203387e-05,
"loss": 0.9575,
"step": 101
},
{
"epoch": 0.25,
"grad_norm": 1.1470295190811157,
"learning_rate": 1.984936448731556e-05,
"loss": 0.9431,
"step": 102
},
{
"epoch": 0.25,
"grad_norm": 0.8493837714195251,
"learning_rate": 1.9844706020813835e-05,
"loss": 0.9904,
"step": 103
},
{
"epoch": 0.26,
"grad_norm": 1.0324151515960693,
"learning_rate": 1.9839977177999942e-05,
"loss": 0.9743,
"step": 104
},
{
"epoch": 0.26,
"grad_norm": 0.9960366487503052,
"learning_rate": 1.9835177992678704e-05,
"loss": 0.9572,
"step": 105
},
{
"epoch": 0.26,
"grad_norm": 0.9168769121170044,
"learning_rate": 1.9830308499157787e-05,
"loss": 0.9762,
"step": 106
},
{
"epoch": 0.26,
"grad_norm": 1.0085129737854004,
"learning_rate": 1.982536873224748e-05,
"loss": 0.9273,
"step": 107
},
{
"epoch": 0.27,
"grad_norm": 0.8885951638221741,
"learning_rate": 1.982035872726042e-05,
"loss": 0.9439,
"step": 108
},
{
"epoch": 0.27,
"grad_norm": 0.9167288541793823,
"learning_rate": 1.9815278520011364e-05,
"loss": 0.9737,
"step": 109
},
{
"epoch": 0.27,
"grad_norm": 0.9074441194534302,
"learning_rate": 1.98101281468169e-05,
"loss": 0.9909,
"step": 110
},
{
"epoch": 0.27,
"grad_norm": 1.047237753868103,
"learning_rate": 1.980490764449523e-05,
"loss": 0.9794,
"step": 111
},
{
"epoch": 0.28,
"grad_norm": 0.9899821281433105,
"learning_rate": 1.979961705036587e-05,
"loss": 0.9838,
"step": 112
},
{
"epoch": 0.28,
"grad_norm": 1.0370337963104248,
"learning_rate": 1.9794256402249398e-05,
"loss": 0.9588,
"step": 113
},
{
"epoch": 0.28,
"grad_norm": 1.0261287689208984,
"learning_rate": 1.9788825738467194e-05,
"loss": 0.9107,
"step": 114
},
{
"epoch": 0.28,
"grad_norm": 0.9958740472793579,
"learning_rate": 1.978332509784114e-05,
"loss": 1.0112,
"step": 115
},
{
"epoch": 0.29,
"grad_norm": 0.9102867841720581,
"learning_rate": 1.977775451969337e-05,
"loss": 0.9411,
"step": 116
},
{
"epoch": 0.29,
"grad_norm": 0.9279481768608093,
"learning_rate": 1.9772114043845968e-05,
"loss": 0.9159,
"step": 117
},
{
"epoch": 0.29,
"grad_norm": 0.9951130151748657,
"learning_rate": 1.97664037106207e-05,
"loss": 1.0187,
"step": 118
},
{
"epoch": 0.29,
"grad_norm": 0.897072970867157,
"learning_rate": 1.9760623560838707e-05,
"loss": 0.9252,
"step": 119
},
{
"epoch": 0.3,
"grad_norm": 1.045141339302063,
"learning_rate": 1.9754773635820236e-05,
"loss": 0.9259,
"step": 120
},
{
"epoch": 0.3,
"grad_norm": 0.9293466806411743,
"learning_rate": 1.9748853977384326e-05,
"loss": 0.9454,
"step": 121
},
{
"epoch": 0.3,
"grad_norm": 0.8189428448677063,
"learning_rate": 1.974286462784851e-05,
"loss": 0.9602,
"step": 122
},
{
"epoch": 0.3,
"grad_norm": 0.9392070770263672,
"learning_rate": 1.973680563002853e-05,
"loss": 0.9481,
"step": 123
},
{
"epoch": 0.31,
"grad_norm": 0.9261330366134644,
"learning_rate": 1.973067702723801e-05,
"loss": 0.9705,
"step": 124
},
{
"epoch": 0.31,
"grad_norm": 0.8699203133583069,
"learning_rate": 1.972447886328816e-05,
"loss": 0.9723,
"step": 125
},
{
"epoch": 0.31,
"grad_norm": 0.9999490976333618,
"learning_rate": 1.9718211182487455e-05,
"loss": 0.9577,
"step": 126
},
{
"epoch": 0.31,
"grad_norm": 0.9014701843261719,
"learning_rate": 1.971187402964132e-05,
"loss": 0.9847,
"step": 127
},
{
"epoch": 0.32,
"grad_norm": 0.9983301758766174,
"learning_rate": 1.970546745005182e-05,
"loss": 0.9717,
"step": 128
},
{
"epoch": 0.32,
"grad_norm": 0.9064127802848816,
"learning_rate": 1.969899148951731e-05,
"loss": 0.9598,
"step": 129
},
{
"epoch": 0.32,
"grad_norm": 0.9075201749801636,
"learning_rate": 1.9692446194332144e-05,
"loss": 0.9888,
"step": 130
},
{
"epoch": 0.32,
"grad_norm": 0.8838136196136475,
"learning_rate": 1.9685831611286312e-05,
"loss": 0.9472,
"step": 131
},
{
"epoch": 0.33,
"grad_norm": 0.9827654361724854,
"learning_rate": 1.9679147787665128e-05,
"loss": 0.9355,
"step": 132
},
{
"epoch": 0.33,
"grad_norm": 1.0343549251556396,
"learning_rate": 1.9672394771248867e-05,
"loss": 0.9681,
"step": 133
},
{
"epoch": 0.33,
"grad_norm": 1.118025302886963,
"learning_rate": 1.966557261031246e-05,
"loss": 0.9789,
"step": 134
},
{
"epoch": 0.33,
"grad_norm": 0.9481855034828186,
"learning_rate": 1.9658681353625105e-05,
"loss": 0.9587,
"step": 135
},
{
"epoch": 0.34,
"grad_norm": 1.0200761556625366,
"learning_rate": 1.9651721050449964e-05,
"loss": 1.0104,
"step": 136
},
{
"epoch": 0.34,
"grad_norm": 0.8867937922477722,
"learning_rate": 1.964469175054377e-05,
"loss": 0.9225,
"step": 137
},
{
"epoch": 0.34,
"grad_norm": 0.9916879534721375,
"learning_rate": 1.963759350415649e-05,
"loss": 0.9162,
"step": 138
},
{
"epoch": 0.34,
"grad_norm": 0.9483277797698975,
"learning_rate": 1.9630426362030978e-05,
"loss": 0.9088,
"step": 139
},
{
"epoch": 0.35,
"grad_norm": 0.8852947354316711,
"learning_rate": 1.962319037540259e-05,
"loss": 0.9577,
"step": 140
},
{
"epoch": 0.35,
"grad_norm": 1.0522266626358032,
"learning_rate": 1.9615885595998825e-05,
"loss": 0.9813,
"step": 141
},
{
"epoch": 0.35,
"grad_norm": 0.9052250385284424,
"learning_rate": 1.9608512076038964e-05,
"loss": 0.9103,
"step": 142
},
{
"epoch": 0.35,
"grad_norm": 0.9529223442077637,
"learning_rate": 1.9601069868233687e-05,
"loss": 0.9549,
"step": 143
},
{
"epoch": 0.36,
"grad_norm": 0.9327247738838196,
"learning_rate": 1.9593559025784692e-05,
"loss": 0.92,
"step": 144
},
{
"epoch": 0.36,
"grad_norm": 0.8364620804786682,
"learning_rate": 1.9585979602384334e-05,
"loss": 0.9342,
"step": 145
},
{
"epoch": 0.36,
"grad_norm": 0.9773866534233093,
"learning_rate": 1.9578331652215224e-05,
"loss": 0.9505,
"step": 146
},
{
"epoch": 0.36,
"grad_norm": 1.0143717527389526,
"learning_rate": 1.9570615229949844e-05,
"loss": 0.9836,
"step": 147
},
{
"epoch": 0.37,
"grad_norm": 0.9267006516456604,
"learning_rate": 1.9562830390750157e-05,
"loss": 0.9328,
"step": 148
},
{
"epoch": 0.37,
"grad_norm": 1.0241330862045288,
"learning_rate": 1.955497719026722e-05,
"loss": 0.9475,
"step": 149
},
{
"epoch": 0.37,
"grad_norm": 0.8536331653594971,
"learning_rate": 1.954705568464078e-05,
"loss": 0.9342,
"step": 150
},
{
"epoch": 0.37,
"grad_norm": 0.9117676019668579,
"learning_rate": 1.953906593049887e-05,
"loss": 0.9764,
"step": 151
},
{
"epoch": 0.38,
"grad_norm": 0.9041135311126709,
"learning_rate": 1.9531007984957408e-05,
"loss": 0.9525,
"step": 152
},
{
"epoch": 0.38,
"grad_norm": 0.7728226780891418,
"learning_rate": 1.9522881905619794e-05,
"loss": 0.88,
"step": 153
},
{
"epoch": 0.38,
"grad_norm": 0.9432837963104248,
"learning_rate": 1.9514687750576483e-05,
"loss": 0.9813,
"step": 154
},
{
"epoch": 0.38,
"grad_norm": 0.9156582355499268,
"learning_rate": 1.950642557840458e-05,
"loss": 0.9643,
"step": 155
},
{
"epoch": 0.39,
"grad_norm": 1.0463101863861084,
"learning_rate": 1.9498095448167435e-05,
"loss": 0.9231,
"step": 156
},
{
"epoch": 0.39,
"grad_norm": 0.9591227173805237,
"learning_rate": 1.948969741941418e-05,
"loss": 0.8738,
"step": 157
},
{
"epoch": 0.39,
"grad_norm": 0.9342914819717407,
"learning_rate": 1.948123155217936e-05,
"loss": 0.9654,
"step": 158
},
{
"epoch": 0.39,
"grad_norm": 1.0308187007904053,
"learning_rate": 1.947269790698245e-05,
"loss": 1.015,
"step": 159
},
{
"epoch": 0.4,
"grad_norm": 0.8782010674476624,
"learning_rate": 1.946409654482745e-05,
"loss": 0.9639,
"step": 160
},
{
"epoch": 0.4,
"grad_norm": 0.9808392524719238,
"learning_rate": 1.945542752720245e-05,
"loss": 0.92,
"step": 161
},
{
"epoch": 0.4,
"grad_norm": 0.9607554078102112,
"learning_rate": 1.944669091607919e-05,
"loss": 0.9605,
"step": 162
},
{
"epoch": 0.4,
"grad_norm": 0.9280065298080444,
"learning_rate": 1.9437886773912595e-05,
"loss": 0.9741,
"step": 163
},
{
"epoch": 0.41,
"grad_norm": 0.947117030620575,
"learning_rate": 1.9429015163640363e-05,
"loss": 0.9671,
"step": 164
},
{
"epoch": 0.41,
"grad_norm": 0.8978593349456787,
"learning_rate": 1.942007614868248e-05,
"loss": 0.9359,
"step": 165
},
{
"epoch": 0.41,
"grad_norm": 0.8768041133880615,
"learning_rate": 1.9411069792940803e-05,
"loss": 0.9273,
"step": 166
},
{
"epoch": 0.41,
"grad_norm": 0.9203951358795166,
"learning_rate": 1.9401996160798574e-05,
"loss": 0.9283,
"step": 167
},
{
"epoch": 0.42,
"grad_norm": 0.89815753698349,
"learning_rate": 1.9392855317119966e-05,
"loss": 0.9044,
"step": 168
},
{
"epoch": 0.42,
"grad_norm": 0.8957310914993286,
"learning_rate": 1.9383647327249635e-05,
"loss": 0.9367,
"step": 169
},
{
"epoch": 0.42,
"grad_norm": 0.9908803701400757,
"learning_rate": 1.937437225701223e-05,
"loss": 1.029,
"step": 170
},
{
"epoch": 0.42,
"grad_norm": 1.0380170345306396,
"learning_rate": 1.9365030172711946e-05,
"loss": 0.9446,
"step": 171
},
{
"epoch": 0.43,
"grad_norm": 0.9037762880325317,
"learning_rate": 1.9355621141132022e-05,
"loss": 0.985,
"step": 172
},
{
"epoch": 0.43,
"grad_norm": 0.9550085067749023,
"learning_rate": 1.9346145229534295e-05,
"loss": 1.0058,
"step": 173
},
{
"epoch": 0.43,
"grad_norm": 0.9788523316383362,
"learning_rate": 1.933660250565869e-05,
"loss": 0.9463,
"step": 174
},
{
"epoch": 0.43,
"grad_norm": 0.9898518323898315,
"learning_rate": 1.9326993037722762e-05,
"loss": 0.9352,
"step": 175
},
{
"epoch": 0.44,
"grad_norm": 0.8734515905380249,
"learning_rate": 1.931731689442119e-05,
"loss": 0.9938,
"step": 176
},
{
"epoch": 0.44,
"grad_norm": 1.1014068126678467,
"learning_rate": 1.9307574144925288e-05,
"loss": 0.9508,
"step": 177
},
{
"epoch": 0.44,
"grad_norm": 0.9668450355529785,
"learning_rate": 1.9297764858882516e-05,
"loss": 0.945,
"step": 178
},
{
"epoch": 0.44,
"grad_norm": 1.0787605047225952,
"learning_rate": 1.9287889106415983e-05,
"loss": 0.9366,
"step": 179
},
{
"epoch": 0.45,
"grad_norm": 0.9039891362190247,
"learning_rate": 1.927794695812394e-05,
"loss": 0.9548,
"step": 180
},
{
"epoch": 0.45,
"grad_norm": 1.1057233810424805,
"learning_rate": 1.9267938485079285e-05,
"loss": 1.0035,
"step": 181
},
{
"epoch": 0.45,
"grad_norm": 1.007955551147461,
"learning_rate": 1.9257863758829038e-05,
"loss": 1.0001,
"step": 182
},
{
"epoch": 0.45,
"grad_norm": 0.941587507724762,
"learning_rate": 1.9247722851393838e-05,
"loss": 0.946,
"step": 183
},
{
"epoch": 0.46,
"grad_norm": 0.9681749939918518,
"learning_rate": 1.9237515835267447e-05,
"loss": 1.0084,
"step": 184
},
{
"epoch": 0.46,
"grad_norm": 0.9005982875823975,
"learning_rate": 1.92272427834162e-05,
"loss": 0.9339,
"step": 185
},
{
"epoch": 0.46,
"grad_norm": 1.0263770818710327,
"learning_rate": 1.9216903769278498e-05,
"loss": 0.9207,
"step": 186
},
{
"epoch": 0.46,
"grad_norm": 0.8374605178833008,
"learning_rate": 1.920649886676429e-05,
"loss": 0.9709,
"step": 187
},
{
"epoch": 0.46,
"grad_norm": 1.0261112451553345,
"learning_rate": 1.9196028150254535e-05,
"loss": 0.9851,
"step": 188
},
{
"epoch": 0.47,
"grad_norm": 0.9642433524131775,
"learning_rate": 1.9185491694600668e-05,
"loss": 0.9663,
"step": 189
},
{
"epoch": 0.47,
"grad_norm": 0.9279437065124512,
"learning_rate": 1.9174889575124077e-05,
"loss": 0.942,
"step": 190
},
{
"epoch": 0.47,
"grad_norm": 0.9306063055992126,
"learning_rate": 1.9164221867615556e-05,
"loss": 0.962,
"step": 191
},
{
"epoch": 0.47,
"grad_norm": 0.9552830457687378,
"learning_rate": 1.915348864833476e-05,
"loss": 1.0035,
"step": 192
},
{
"epoch": 0.48,
"grad_norm": 0.9896847009658813,
"learning_rate": 1.9142689994009666e-05,
"loss": 0.933,
"step": 193
},
{
"epoch": 0.48,
"grad_norm": 0.9258723855018616,
"learning_rate": 1.913182598183603e-05,
"loss": 0.9384,
"step": 194
},
{
"epoch": 0.48,
"grad_norm": 0.82286137342453,
"learning_rate": 1.9120896689476817e-05,
"loss": 0.9674,
"step": 195
},
{
"epoch": 0.48,
"grad_norm": 0.8647574186325073,
"learning_rate": 1.9109902195061666e-05,
"loss": 0.9566,
"step": 196
},
{
"epoch": 0.49,
"grad_norm": 0.8555217981338501,
"learning_rate": 1.9098842577186315e-05,
"loss": 0.9296,
"step": 197
},
{
"epoch": 0.49,
"grad_norm": 1.0212464332580566,
"learning_rate": 1.9087717914912054e-05,
"loss": 0.973,
"step": 198
},
{
"epoch": 0.49,
"grad_norm": 1.1092183589935303,
"learning_rate": 1.9076528287765145e-05,
"loss": 0.9872,
"step": 199
},
{
"epoch": 0.49,
"grad_norm": 0.9690393805503845,
"learning_rate": 1.9065273775736264e-05,
"loss": 0.9043,
"step": 200
},
{
"epoch": 0.5,
"grad_norm": 1.1144682168960571,
"learning_rate": 1.9053954459279934e-05,
"loss": 0.9637,
"step": 201
},
{
"epoch": 0.5,
"grad_norm": 0.9438928961753845,
"learning_rate": 1.9042570419313927e-05,
"loss": 0.9321,
"step": 202
},
{
"epoch": 0.5,
"grad_norm": 0.8914815783500671,
"learning_rate": 1.9031121737218706e-05,
"loss": 0.9516,
"step": 203
},
{
"epoch": 0.5,
"grad_norm": 1.0489622354507446,
"learning_rate": 1.9019608494836843e-05,
"loss": 0.8967,
"step": 204
},
{
"epoch": 0.51,
"grad_norm": 0.9271294474601746,
"learning_rate": 1.900803077447243e-05,
"loss": 0.9241,
"step": 205
},
{
"epoch": 0.51,
"grad_norm": 1.1498748064041138,
"learning_rate": 1.899638865889047e-05,
"loss": 1.0105,
"step": 206
},
{
"epoch": 0.51,
"grad_norm": 0.9851434230804443,
"learning_rate": 1.8984682231316335e-05,
"loss": 0.9798,
"step": 207
},
{
"epoch": 0.51,
"grad_norm": 0.9529551267623901,
"learning_rate": 1.8972911575435112e-05,
"loss": 0.9895,
"step": 208
},
{
"epoch": 0.52,
"grad_norm": 0.8517554998397827,
"learning_rate": 1.896107677539105e-05,
"loss": 0.879,
"step": 209
},
{
"epoch": 0.52,
"grad_norm": 0.9570808410644531,
"learning_rate": 1.8949177915786942e-05,
"loss": 0.9677,
"step": 210
},
{
"epoch": 0.52,
"grad_norm": 0.8422967791557312,
"learning_rate": 1.893721508168351e-05,
"loss": 0.9558,
"step": 211
},
{
"epoch": 0.52,
"grad_norm": 0.8420692086219788,
"learning_rate": 1.8925188358598815e-05,
"loss": 0.9269,
"step": 212
},
{
"epoch": 0.53,
"grad_norm": 0.9592350721359253,
"learning_rate": 1.8913097832507632e-05,
"loss": 0.9153,
"step": 213
},
{
"epoch": 0.53,
"grad_norm": 0.8914132714271545,
"learning_rate": 1.890094358984085e-05,
"loss": 0.9337,
"step": 214
},
{
"epoch": 0.53,
"grad_norm": 0.991773247718811,
"learning_rate": 1.8888725717484834e-05,
"loss": 0.9508,
"step": 215
},
{
"epoch": 0.53,
"grad_norm": 0.8696689605712891,
"learning_rate": 1.8876444302780826e-05,
"loss": 0.9336,
"step": 216
},
{
"epoch": 0.54,
"grad_norm": 0.9667624235153198,
"learning_rate": 1.8864099433524302e-05,
"loss": 1.0045,
"step": 217
},
{
"epoch": 0.54,
"grad_norm": 0.9799702763557434,
"learning_rate": 1.8851691197964356e-05,
"loss": 0.9736,
"step": 218
},
{
"epoch": 0.54,
"grad_norm": 0.9233551025390625,
"learning_rate": 1.8839219684803057e-05,
"loss": 0.9236,
"step": 219
},
{
"epoch": 0.54,
"grad_norm": 0.9129133820533752,
"learning_rate": 1.882668498319484e-05,
"loss": 0.9493,
"step": 220
},
{
"epoch": 0.55,
"grad_norm": 0.9735204577445984,
"learning_rate": 1.8814087182745835e-05,
"loss": 0.9424,
"step": 221
},
{
"epoch": 0.55,
"grad_norm": 0.8610967397689819,
"learning_rate": 1.880142637351325e-05,
"loss": 0.9401,
"step": 222
},
{
"epoch": 0.55,
"grad_norm": 0.9463930130004883,
"learning_rate": 1.8788702646004725e-05,
"loss": 0.9718,
"step": 223
},
{
"epoch": 0.55,
"grad_norm": 0.9404011368751526,
"learning_rate": 1.8775916091177674e-05,
"loss": 0.9679,
"step": 224
},
{
"epoch": 0.56,
"grad_norm": 0.9345746636390686,
"learning_rate": 1.8763066800438638e-05,
"loss": 0.9298,
"step": 225
},
{
"epoch": 0.56,
"grad_norm": 0.8628617525100708,
"learning_rate": 1.8750154865642644e-05,
"loss": 0.9257,
"step": 226
},
{
"epoch": 0.56,
"grad_norm": 0.9928317070007324,
"learning_rate": 1.8737180379092536e-05,
"loss": 0.8931,
"step": 227
},
{
"epoch": 0.56,
"grad_norm": 0.9910153746604919,
"learning_rate": 1.8724143433538317e-05,
"loss": 0.8912,
"step": 228
},
{
"epoch": 0.57,
"grad_norm": 1.0308330059051514,
"learning_rate": 1.8711044122176484e-05,
"loss": 0.9302,
"step": 229
},
{
"epoch": 0.57,
"grad_norm": 0.9643605947494507,
"learning_rate": 1.8697882538649373e-05,
"loss": 0.9206,
"step": 230
},
{
"epoch": 0.57,
"grad_norm": 0.911734402179718,
"learning_rate": 1.8684658777044478e-05,
"loss": 0.8819,
"step": 231
},
{
"epoch": 0.57,
"grad_norm": 1.0010952949523926,
"learning_rate": 1.8671372931893775e-05,
"loss": 0.9707,
"step": 232
},
{
"epoch": 0.58,
"grad_norm": 0.9280171990394592,
"learning_rate": 1.865802509817306e-05,
"loss": 0.9474,
"step": 233
},
{
"epoch": 0.58,
"grad_norm": 1.011705994606018,
"learning_rate": 1.8644615371301275e-05,
"loss": 0.9165,
"step": 234
},
{
"epoch": 0.58,
"grad_norm": 0.8890619874000549,
"learning_rate": 1.8631143847139785e-05,
"loss": 0.881,
"step": 235
},
{
"epoch": 0.58,
"grad_norm": 1.0437440872192383,
"learning_rate": 1.8617610621991753e-05,
"loss": 0.928,
"step": 236
},
{
"epoch": 0.59,
"grad_norm": 0.8093174695968628,
"learning_rate": 1.8604015792601395e-05,
"loss": 0.9186,
"step": 237
},
{
"epoch": 0.59,
"grad_norm": 1.0239651203155518,
"learning_rate": 1.8590359456153333e-05,
"loss": 0.9397,
"step": 238
},
{
"epoch": 0.59,
"grad_norm": 0.8863648176193237,
"learning_rate": 1.857664171027187e-05,
"loss": 1.0017,
"step": 239
},
{
"epoch": 0.59,
"grad_norm": 0.872449517250061,
"learning_rate": 1.8562862653020306e-05,
"loss": 0.97,
"step": 240
},
{
"epoch": 0.6,
"grad_norm": 0.8939850330352783,
"learning_rate": 1.854902238290024e-05,
"loss": 0.9891,
"step": 241
},
{
"epoch": 0.6,
"grad_norm": 0.9652605056762695,
"learning_rate": 1.853512099885085e-05,
"loss": 0.9363,
"step": 242
},
{
"epoch": 0.6,
"grad_norm": 0.934234619140625,
"learning_rate": 1.85211586002482e-05,
"loss": 0.9908,
"step": 243
},
{
"epoch": 0.6,
"grad_norm": 0.8643277883529663,
"learning_rate": 1.8507135286904527e-05,
"loss": 1.0107,
"step": 244
},
{
"epoch": 0.61,
"grad_norm": 0.9351961612701416,
"learning_rate": 1.849305115906753e-05,
"loss": 0.9375,
"step": 245
},
{
"epoch": 0.61,
"grad_norm": 0.9504395127296448,
"learning_rate": 1.8478906317419644e-05,
"loss": 0.9011,
"step": 246
},
{
"epoch": 0.61,
"grad_norm": 0.8627222180366516,
"learning_rate": 1.8464700863077313e-05,
"loss": 0.8714,
"step": 247
},
{
"epoch": 0.61,
"grad_norm": 0.9281494617462158,
"learning_rate": 1.845043489759031e-05,
"loss": 0.9504,
"step": 248
},
{
"epoch": 0.62,
"grad_norm": 0.9497500061988831,
"learning_rate": 1.8436108522940953e-05,
"loss": 0.9485,
"step": 249
},
{
"epoch": 0.62,
"grad_norm": 0.8968126177787781,
"learning_rate": 1.8421721841543412e-05,
"loss": 0.969,
"step": 250
},
{
"epoch": 0.62,
"grad_norm": 0.8765113949775696,
"learning_rate": 1.8407274956242983e-05,
"loss": 0.9459,
"step": 251
},
{
"epoch": 0.62,
"grad_norm": 0.9863468408584595,
"learning_rate": 1.8392767970315314e-05,
"loss": 0.9509,
"step": 252
},
{
"epoch": 0.63,
"grad_norm": 0.9541934728622437,
"learning_rate": 1.8378200987465704e-05,
"loss": 0.9497,
"step": 253
},
{
"epoch": 0.63,
"grad_norm": 0.9275906682014465,
"learning_rate": 1.836357411182835e-05,
"loss": 0.907,
"step": 254
},
{
"epoch": 0.63,
"grad_norm": 0.9415097236633301,
"learning_rate": 1.83488874479656e-05,
"loss": 0.9655,
"step": 255
},
{
"epoch": 0.63,
"grad_norm": 0.9965115189552307,
"learning_rate": 1.8334141100867208e-05,
"loss": 0.9046,
"step": 256
},
{
"epoch": 0.64,
"grad_norm": 1.0025869607925415,
"learning_rate": 1.831933517594957e-05,
"loss": 0.9464,
"step": 257
},
{
"epoch": 0.64,
"grad_norm": 1.0173474550247192,
"learning_rate": 1.8304469779055e-05,
"loss": 0.9255,
"step": 258
},
{
"epoch": 0.64,
"grad_norm": 0.8364382386207581,
"learning_rate": 1.8289545016450953e-05,
"loss": 0.9389,
"step": 259
},
{
"epoch": 0.64,
"grad_norm": 0.880821168422699,
"learning_rate": 1.8274560994829256e-05,
"loss": 0.8535,
"step": 260
},
{
"epoch": 0.65,
"grad_norm": 0.8849993348121643,
"learning_rate": 1.825951782130537e-05,
"loss": 0.9088,
"step": 261
},
{
"epoch": 0.65,
"grad_norm": 0.8260980248451233,
"learning_rate": 1.8244415603417603e-05,
"loss": 0.9189,
"step": 262
},
{
"epoch": 0.65,
"grad_norm": 0.9681034684181213,
"learning_rate": 1.8229254449126365e-05,
"loss": 0.9535,
"step": 263
},
{
"epoch": 0.65,
"grad_norm": 0.8485463857650757,
"learning_rate": 1.821403446681336e-05,
"loss": 0.9525,
"step": 264
},
{
"epoch": 0.66,
"grad_norm": 0.9151450395584106,
"learning_rate": 1.819875576528085e-05,
"loss": 0.9254,
"step": 265
},
{
"epoch": 0.66,
"grad_norm": 1.044825553894043,
"learning_rate": 1.818341845375086e-05,
"loss": 0.9023,
"step": 266
},
{
"epoch": 0.66,
"grad_norm": 0.9123022556304932,
"learning_rate": 1.816802264186438e-05,
"loss": 0.9918,
"step": 267
},
{
"epoch": 0.66,
"grad_norm": 0.8421608805656433,
"learning_rate": 1.8152568439680612e-05,
"loss": 0.9638,
"step": 268
},
{
"epoch": 0.67,
"grad_norm": 0.8400166630744934,
"learning_rate": 1.8137055957676172e-05,
"loss": 0.9442,
"step": 269
},
{
"epoch": 0.67,
"grad_norm": 0.8485128879547119,
"learning_rate": 1.8121485306744286e-05,
"loss": 0.9667,
"step": 270
},
{
"epoch": 0.67,
"grad_norm": 0.9254871606826782,
"learning_rate": 1.8105856598194026e-05,
"loss": 0.9809,
"step": 271
},
{
"epoch": 0.67,
"grad_norm": 0.8990374207496643,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.8996,
"step": 272
},
{
"epoch": 0.68,
"grad_norm": 0.9757938981056213,
"learning_rate": 1.8074425455548972e-05,
"loss": 0.9734,
"step": 273
},
{
"epoch": 0.68,
"grad_norm": 1.0201201438903809,
"learning_rate": 1.8058623246144274e-05,
"loss": 0.9188,
"step": 274
},
{
"epoch": 0.68,
"grad_norm": 0.8527243733406067,
"learning_rate": 1.8042763428499777e-05,
"loss": 0.9401,
"step": 275
},
{
"epoch": 0.68,
"grad_norm": 0.9102773070335388,
"learning_rate": 1.802684611599169e-05,
"loss": 0.9415,
"step": 276
},
{
"epoch": 0.69,
"grad_norm": 0.8753496408462524,
"learning_rate": 1.8010871422407238e-05,
"loss": 0.9326,
"step": 277
},
{
"epoch": 0.69,
"grad_norm": 0.9664900302886963,
"learning_rate": 1.7994839461943834e-05,
"loss": 0.92,
"step": 278
},
{
"epoch": 0.69,
"grad_norm": 0.8483555912971497,
"learning_rate": 1.7978750349208284e-05,
"loss": 0.9526,
"step": 279
},
{
"epoch": 0.69,
"grad_norm": 0.9533878564834595,
"learning_rate": 1.7962604199215946e-05,
"loss": 0.9409,
"step": 280
},
{
"epoch": 0.69,
"grad_norm": 0.9228258728981018,
"learning_rate": 1.7946401127389928e-05,
"loss": 0.972,
"step": 281
},
{
"epoch": 0.7,
"grad_norm": 0.9358240962028503,
"learning_rate": 1.7930141249560235e-05,
"loss": 0.9624,
"step": 282
},
{
"epoch": 0.7,
"grad_norm": 0.9039192199707031,
"learning_rate": 1.791382468196297e-05,
"loss": 0.9152,
"step": 283
},
{
"epoch": 0.7,
"grad_norm": 0.9258562922477722,
"learning_rate": 1.789745154123949e-05,
"loss": 0.9392,
"step": 284
},
{
"epoch": 0.7,
"grad_norm": 0.8800731897354126,
"learning_rate": 1.788102194443557e-05,
"loss": 0.9225,
"step": 285
},
{
"epoch": 0.71,
"grad_norm": 0.9394617676734924,
"learning_rate": 1.7864536009000575e-05,
"loss": 0.9482,
"step": 286
},
{
"epoch": 0.71,
"grad_norm": 0.9548134803771973,
"learning_rate": 1.7847993852786612e-05,
"loss": 0.8954,
"step": 287
},
{
"epoch": 0.71,
"grad_norm": 0.8929195404052734,
"learning_rate": 1.7831395594047682e-05,
"loss": 0.938,
"step": 288
},
{
"epoch": 0.71,
"grad_norm": 0.9912259578704834,
"learning_rate": 1.7814741351438855e-05,
"loss": 0.9557,
"step": 289
},
{
"epoch": 0.72,
"grad_norm": 0.8686590194702148,
"learning_rate": 1.7798031244015406e-05,
"loss": 0.9637,
"step": 290
},
{
"epoch": 0.72,
"grad_norm": 1.0021827220916748,
"learning_rate": 1.7781265391231968e-05,
"loss": 0.9704,
"step": 291
},
{
"epoch": 0.72,
"grad_norm": 0.9183592796325684,
"learning_rate": 1.7764443912941675e-05,
"loss": 0.9164,
"step": 292
},
{
"epoch": 0.72,
"grad_norm": 0.8944042325019836,
"learning_rate": 1.7747566929395307e-05,
"loss": 0.9674,
"step": 293
},
{
"epoch": 0.73,
"grad_norm": 0.924879789352417,
"learning_rate": 1.7730634561240442e-05,
"loss": 0.9854,
"step": 294
},
{
"epoch": 0.73,
"grad_norm": 0.9109692573547363,
"learning_rate": 1.7713646929520568e-05,
"loss": 0.911,
"step": 295
},
{
"epoch": 0.73,
"grad_norm": 0.8695762157440186,
"learning_rate": 1.769660415567424e-05,
"loss": 0.884,
"step": 296
},
{
"epoch": 0.73,
"grad_norm": 0.9606033563613892,
"learning_rate": 1.7679506361534216e-05,
"loss": 1.0099,
"step": 297
},
{
"epoch": 0.74,
"grad_norm": 0.8884177207946777,
"learning_rate": 1.766235366932655e-05,
"loss": 0.9538,
"step": 298
},
{
"epoch": 0.74,
"grad_norm": 0.9658393263816833,
"learning_rate": 1.764514620166976e-05,
"loss": 0.9319,
"step": 299
},
{
"epoch": 0.74,
"grad_norm": 0.9385345578193665,
"learning_rate": 1.762788408157393e-05,
"loss": 0.9537,
"step": 300
},
{
"epoch": 0.74,
"grad_norm": 0.9953750371932983,
"learning_rate": 1.7610567432439834e-05,
"loss": 0.9259,
"step": 301
},
{
"epoch": 0.75,
"grad_norm": 1.038669228553772,
"learning_rate": 1.759319637805806e-05,
"loss": 0.9937,
"step": 302
},
{
"epoch": 0.75,
"grad_norm": 0.9077236652374268,
"learning_rate": 1.757577104260811e-05,
"loss": 0.9005,
"step": 303
},
{
"epoch": 0.75,
"grad_norm": 1.1027412414550781,
"learning_rate": 1.755829155065753e-05,
"loss": 0.9809,
"step": 304
},
{
"epoch": 0.75,
"grad_norm": 0.8355712294578552,
"learning_rate": 1.7540758027161014e-05,
"loss": 0.8971,
"step": 305
},
{
"epoch": 0.76,
"grad_norm": 0.8847547769546509,
"learning_rate": 1.7523170597459497e-05,
"loss": 0.9613,
"step": 306
},
{
"epoch": 0.76,
"grad_norm": 0.8579404950141907,
"learning_rate": 1.750552938727928e-05,
"loss": 0.936,
"step": 307
},
{
"epoch": 0.76,
"grad_norm": 0.8035609722137451,
"learning_rate": 1.7487834522731115e-05,
"loss": 0.9303,
"step": 308
},
{
"epoch": 0.76,
"grad_norm": 0.8296695947647095,
"learning_rate": 1.747008613030932e-05,
"loss": 0.9458,
"step": 309
},
{
"epoch": 0.77,
"grad_norm": 0.8970544934272766,
"learning_rate": 1.7452284336890853e-05,
"loss": 0.9999,
"step": 310
},
{
"epoch": 0.77,
"grad_norm": 0.9501840472221375,
"learning_rate": 1.7434429269734426e-05,
"loss": 0.9492,
"step": 311
},
{
"epoch": 0.77,
"grad_norm": 0.9118046760559082,
"learning_rate": 1.7416521056479577e-05,
"loss": 0.926,
"step": 312
},
{
"epoch": 0.77,
"grad_norm": 0.9785935282707214,
"learning_rate": 1.7398559825145776e-05,
"loss": 0.9719,
"step": 313
},
{
"epoch": 0.78,
"grad_norm": 1.0440707206726074,
"learning_rate": 1.7380545704131496e-05,
"loss": 0.926,
"step": 314
},
{
"epoch": 0.78,
"grad_norm": 0.9193445444107056,
"learning_rate": 1.73624788222133e-05,
"loss": 0.9227,
"step": 315
},
{
"epoch": 0.78,
"grad_norm": 0.8930751085281372,
"learning_rate": 1.734435930854492e-05,
"loss": 0.9569,
"step": 316
},
{
"epoch": 0.78,
"grad_norm": 0.8954054117202759,
"learning_rate": 1.7326187292656332e-05,
"loss": 0.9531,
"step": 317
},
{
"epoch": 0.79,
"grad_norm": 0.8552334308624268,
"learning_rate": 1.7307962904452837e-05,
"loss": 0.8935,
"step": 318
},
{
"epoch": 0.79,
"grad_norm": 0.9402141571044922,
"learning_rate": 1.7289686274214116e-05,
"loss": 0.9386,
"step": 319
},
{
"epoch": 0.79,
"grad_norm": 0.9156544208526611,
"learning_rate": 1.7271357532593325e-05,
"loss": 0.9029,
"step": 320
},
{
"epoch": 0.79,
"grad_norm": 0.9618916511535645,
"learning_rate": 1.7252976810616134e-05,
"loss": 0.9303,
"step": 321
},
{
"epoch": 0.8,
"grad_norm": 0.9479423761367798,
"learning_rate": 1.7234544239679807e-05,
"loss": 0.9224,
"step": 322
},
{
"epoch": 0.8,
"grad_norm": 0.8893961906433105,
"learning_rate": 1.7216059951552256e-05,
"loss": 0.9229,
"step": 323
},
{
"epoch": 0.8,
"grad_norm": 0.8631827235221863,
"learning_rate": 1.7197524078371105e-05,
"loss": 1.0178,
"step": 324
},
{
"epoch": 0.8,
"grad_norm": 1.0318481922149658,
"learning_rate": 1.7178936752642737e-05,
"loss": 0.9222,
"step": 325
},
{
"epoch": 0.81,
"grad_norm": 0.9426582455635071,
"learning_rate": 1.7160298107241347e-05,
"loss": 0.9286,
"step": 326
},
{
"epoch": 0.81,
"grad_norm": 0.8652179837226868,
"learning_rate": 1.714160827540801e-05,
"loss": 0.9112,
"step": 327
},
{
"epoch": 0.81,
"grad_norm": 1.006952166557312,
"learning_rate": 1.7122867390749697e-05,
"loss": 0.9774,
"step": 328
},
{
"epoch": 0.81,
"grad_norm": 0.8762472867965698,
"learning_rate": 1.7104075587238353e-05,
"loss": 0.8923,
"step": 329
},
{
"epoch": 0.82,
"grad_norm": 0.9208068251609802,
"learning_rate": 1.7085232999209915e-05,
"loss": 0.946,
"step": 330
},
{
"epoch": 0.82,
"grad_norm": 1.0112638473510742,
"learning_rate": 1.7066339761363364e-05,
"loss": 0.8921,
"step": 331
},
{
"epoch": 0.82,
"grad_norm": 0.8371056318283081,
"learning_rate": 1.7047396008759755e-05,
"loss": 0.9129,
"step": 332
},
{
"epoch": 0.82,
"grad_norm": 0.8354660272598267,
"learning_rate": 1.7028401876821257e-05,
"loss": 0.894,
"step": 333
},
{
"epoch": 0.83,
"grad_norm": 0.8596228361129761,
"learning_rate": 1.7009357501330188e-05,
"loss": 0.9125,
"step": 334
},
{
"epoch": 0.83,
"grad_norm": 0.8694031238555908,
"learning_rate": 1.699026301842803e-05,
"loss": 0.9293,
"step": 335
},
{
"epoch": 0.83,
"grad_norm": 0.9017357230186462,
"learning_rate": 1.6971118564614473e-05,
"loss": 0.9319,
"step": 336
},
{
"epoch": 0.83,
"grad_norm": 0.9348689317703247,
"learning_rate": 1.6951924276746425e-05,
"loss": 0.9846,
"step": 337
},
{
"epoch": 0.84,
"grad_norm": 1.02806556224823,
"learning_rate": 1.6932680292037045e-05,
"loss": 0.8774,
"step": 338
},
{
"epoch": 0.84,
"grad_norm": 0.8359406590461731,
"learning_rate": 1.6913386748054757e-05,
"loss": 0.9137,
"step": 339
},
{
"epoch": 0.84,
"grad_norm": 0.8839374780654907,
"learning_rate": 1.689404378272226e-05,
"loss": 0.9285,
"step": 340
},
{
"epoch": 0.84,
"grad_norm": 0.9413475394248962,
"learning_rate": 1.687465153431556e-05,
"loss": 0.9835,
"step": 341
},
{
"epoch": 0.85,
"grad_norm": 0.849159836769104,
"learning_rate": 1.6855210141462964e-05,
"loss": 0.8963,
"step": 342
},
{
"epoch": 0.85,
"grad_norm": 0.9252450466156006,
"learning_rate": 1.683571974314409e-05,
"loss": 0.9006,
"step": 343
},
{
"epoch": 0.85,
"grad_norm": 0.800440788269043,
"learning_rate": 1.6816180478688885e-05,
"loss": 0.8678,
"step": 344
},
{
"epoch": 0.85,
"grad_norm": 0.9242329597473145,
"learning_rate": 1.679659248777662e-05,
"loss": 0.9017,
"step": 345
},
{
"epoch": 0.86,
"grad_norm": 0.8285741209983826,
"learning_rate": 1.67769559104349e-05,
"loss": 0.9184,
"step": 346
},
{
"epoch": 0.86,
"grad_norm": 0.9821171760559082,
"learning_rate": 1.6757270887038653e-05,
"loss": 0.8811,
"step": 347
},
{
"epoch": 0.86,
"grad_norm": 0.8693691492080688,
"learning_rate": 1.6737537558309128e-05,
"loss": 0.8838,
"step": 348
},
{
"epoch": 0.86,
"grad_norm": 0.8885882496833801,
"learning_rate": 1.6717756065312892e-05,
"loss": 0.9457,
"step": 349
},
{
"epoch": 0.87,
"grad_norm": 0.9387298226356506,
"learning_rate": 1.6697926549460826e-05,
"loss": 0.916,
"step": 350
},
{
"epoch": 0.87,
"grad_norm": 0.9151391983032227,
"learning_rate": 1.667804915250711e-05,
"loss": 0.9675,
"step": 351
},
{
"epoch": 0.87,
"grad_norm": 0.8945315480232239,
"learning_rate": 1.66581240165482e-05,
"loss": 0.8943,
"step": 352
},
{
"epoch": 0.87,
"grad_norm": 0.828498125076294,
"learning_rate": 1.6638151284021828e-05,
"loss": 0.878,
"step": 353
},
{
"epoch": 0.88,
"grad_norm": 0.9389723539352417,
"learning_rate": 1.661813109770598e-05,
"loss": 0.9186,
"step": 354
},
{
"epoch": 0.88,
"grad_norm": 0.8923815488815308,
"learning_rate": 1.6598063600717865e-05,
"loss": 0.915,
"step": 355
},
{
"epoch": 0.88,
"grad_norm": 1.029050588607788,
"learning_rate": 1.6577948936512905e-05,
"loss": 0.9704,
"step": 356
},
{
"epoch": 0.88,
"grad_norm": 0.8715476393699646,
"learning_rate": 1.6557787248883698e-05,
"loss": 0.8839,
"step": 357
},
{
"epoch": 0.89,
"grad_norm": 0.9403415322303772,
"learning_rate": 1.6537578681958998e-05,
"loss": 0.9316,
"step": 358
},
{
"epoch": 0.89,
"grad_norm": 1.0738519430160522,
"learning_rate": 1.6517323380202693e-05,
"loss": 0.9496,
"step": 359
},
{
"epoch": 0.89,
"grad_norm": 0.9317395687103271,
"learning_rate": 1.649702148841274e-05,
"loss": 0.9179,
"step": 360
},
{
"epoch": 0.89,
"grad_norm": 1.0131531953811646,
"learning_rate": 1.647667315172017e-05,
"loss": 0.9191,
"step": 361
},
{
"epoch": 0.9,
"grad_norm": 0.8759732246398926,
"learning_rate": 1.6456278515588023e-05,
"loss": 0.9744,
"step": 362
},
{
"epoch": 0.9,
"grad_norm": 0.8557033538818359,
"learning_rate": 1.6435837725810326e-05,
"loss": 0.9046,
"step": 363
},
{
"epoch": 0.9,
"grad_norm": 0.8780690431594849,
"learning_rate": 1.6415350928511037e-05,
"loss": 0.9018,
"step": 364
},
{
"epoch": 0.9,
"grad_norm": 1.0143564939498901,
"learning_rate": 1.6394818270142995e-05,
"loss": 0.8701,
"step": 365
},
{
"epoch": 0.91,
"grad_norm": 0.882732629776001,
"learning_rate": 1.63742398974869e-05,
"loss": 0.9321,
"step": 366
},
{
"epoch": 0.91,
"grad_norm": 0.835397481918335,
"learning_rate": 1.635361595765024e-05,
"loss": 0.8976,
"step": 367
},
{
"epoch": 0.91,
"grad_norm": 0.9068905115127563,
"learning_rate": 1.6332946598066244e-05,
"loss": 0.9203,
"step": 368
},
{
"epoch": 0.91,
"grad_norm": 0.8993863463401794,
"learning_rate": 1.631223196649284e-05,
"loss": 0.8459,
"step": 369
},
{
"epoch": 0.91,
"grad_norm": 0.9969024658203125,
"learning_rate": 1.6291472211011575e-05,
"loss": 0.9748,
"step": 370
},
{
"epoch": 0.92,
"grad_norm": 0.9903284907341003,
"learning_rate": 1.6270667480026588e-05,
"loss": 0.9503,
"step": 371
},
{
"epoch": 0.92,
"grad_norm": 0.8794125318527222,
"learning_rate": 1.6249817922263518e-05,
"loss": 0.9277,
"step": 372
},
{
"epoch": 0.92,
"grad_norm": 0.9173590540885925,
"learning_rate": 1.6228923686768458e-05,
"loss": 0.9212,
"step": 373
},
{
"epoch": 0.92,
"grad_norm": 0.9650622606277466,
"learning_rate": 1.6207984922906893e-05,
"loss": 0.9431,
"step": 374
},
{
"epoch": 0.93,
"grad_norm": 0.945070743560791,
"learning_rate": 1.6187001780362613e-05,
"loss": 0.9434,
"step": 375
},
{
"epoch": 0.93,
"grad_norm": 0.8877692222595215,
"learning_rate": 1.6165974409136673e-05,
"loss": 0.9123,
"step": 376
},
{
"epoch": 0.93,
"grad_norm": 0.9542880058288574,
"learning_rate": 1.6144902959546286e-05,
"loss": 0.8207,
"step": 377
},
{
"epoch": 0.93,
"grad_norm": 0.858532726764679,
"learning_rate": 1.6123787582223774e-05,
"loss": 0.9526,
"step": 378
},
{
"epoch": 0.94,
"grad_norm": 0.9449251890182495,
"learning_rate": 1.610262842811548e-05,
"loss": 0.9136,
"step": 379
},
{
"epoch": 0.94,
"grad_norm": 0.9061833620071411,
"learning_rate": 1.6081425648480696e-05,
"loss": 0.9652,
"step": 380
},
{
"epoch": 0.94,
"grad_norm": 0.8926973938941956,
"learning_rate": 1.6060179394890573e-05,
"loss": 0.898,
"step": 381
},
{
"epoch": 0.94,
"grad_norm": 0.8653205633163452,
"learning_rate": 1.6038889819227047e-05,
"loss": 0.9273,
"step": 382
},
{
"epoch": 0.95,
"grad_norm": 0.9271880388259888,
"learning_rate": 1.601755707368174e-05,
"loss": 0.8709,
"step": 383
},
{
"epoch": 0.95,
"grad_norm": 0.838083803653717,
"learning_rate": 1.5996181310754883e-05,
"loss": 0.8773,
"step": 384
},
{
"epoch": 0.95,
"grad_norm": 0.8962357640266418,
"learning_rate": 1.5974762683254232e-05,
"loss": 0.9538,
"step": 385
},
{
"epoch": 0.95,
"grad_norm": 0.9841724634170532,
"learning_rate": 1.5953301344293954e-05,
"loss": 0.9115,
"step": 386
},
{
"epoch": 0.96,
"grad_norm": 0.8522834181785583,
"learning_rate": 1.5931797447293553e-05,
"loss": 0.9283,
"step": 387
},
{
"epoch": 0.96,
"grad_norm": 0.8607474565505981,
"learning_rate": 1.5910251145976762e-05,
"loss": 0.886,
"step": 388
},
{
"epoch": 0.96,
"grad_norm": 1.185672402381897,
"learning_rate": 1.5888662594370448e-05,
"loss": 0.9027,
"step": 389
},
{
"epoch": 0.96,
"grad_norm": 0.8950862288475037,
"learning_rate": 1.5867031946803512e-05,
"loss": 0.951,
"step": 390
},
{
"epoch": 0.97,
"grad_norm": 1.0167317390441895,
"learning_rate": 1.584535935790578e-05,
"loss": 0.9591,
"step": 391
},
{
"epoch": 0.97,
"grad_norm": 0.8440210223197937,
"learning_rate": 1.5823644982606905e-05,
"loss": 0.9769,
"step": 392
},
{
"epoch": 0.97,
"grad_norm": 0.9818696975708008,
"learning_rate": 1.580188897613526e-05,
"loss": 0.9215,
"step": 393
},
{
"epoch": 0.97,
"grad_norm": 0.8964303135871887,
"learning_rate": 1.578009149401681e-05,
"loss": 0.8964,
"step": 394
},
{
"epoch": 0.98,
"grad_norm": 0.8808884024620056,
"learning_rate": 1.5758252692074036e-05,
"loss": 0.9841,
"step": 395
},
{
"epoch": 0.98,
"grad_norm": 0.9643732309341431,
"learning_rate": 1.5736372726424784e-05,
"loss": 0.924,
"step": 396
},
{
"epoch": 0.98,
"grad_norm": 1.0200897455215454,
"learning_rate": 1.571445175348117e-05,
"loss": 0.892,
"step": 397
},
{
"epoch": 0.98,
"grad_norm": 0.9758608937263489,
"learning_rate": 1.5692489929948453e-05,
"loss": 0.9619,
"step": 398
},
{
"epoch": 0.99,
"grad_norm": 0.9189080595970154,
"learning_rate": 1.5670487412823922e-05,
"loss": 0.8645,
"step": 399
},
{
"epoch": 0.99,
"grad_norm": 0.8707435727119446,
"learning_rate": 1.564844435939577e-05,
"loss": 0.8698,
"step": 400
},
{
"epoch": 0.99,
"grad_norm": 0.9341516494750977,
"learning_rate": 1.5626360927241974e-05,
"loss": 0.9557,
"step": 401
},
{
"epoch": 0.99,
"grad_norm": 0.7838379144668579,
"learning_rate": 1.560423727422915e-05,
"loss": 0.9594,
"step": 402
},
{
"epoch": 1.0,
"grad_norm": 0.9376362562179565,
"learning_rate": 1.5582073558511452e-05,
"loss": 0.9702,
"step": 403
},
{
"epoch": 1.0,
"grad_norm": 0.9549680948257446,
"learning_rate": 1.5559869938529428e-05,
"loss": 0.9455,
"step": 404
},
{
"epoch": 1.0,
"grad_norm": 0.9809777140617371,
"learning_rate": 1.5537626573008878e-05,
"loss": 0.7453,
"step": 405
},
{
"epoch": 1.0,
"grad_norm": 1.0656132698059082,
"learning_rate": 1.551534362095973e-05,
"loss": 0.6621,
"step": 406
},
{
"epoch": 1.01,
"grad_norm": 1.0354893207550049,
"learning_rate": 1.549302124167492e-05,
"loss": 0.6656,
"step": 407
},
{
"epoch": 1.01,
"grad_norm": 0.8907589912414551,
"learning_rate": 1.547065959472921e-05,
"loss": 0.6761,
"step": 408
},
{
"epoch": 1.01,
"grad_norm": 0.868365466594696,
"learning_rate": 1.544825883997809e-05,
"loss": 0.6581,
"step": 409
},
{
"epoch": 1.01,
"grad_norm": 0.9552692174911499,
"learning_rate": 1.5425819137556605e-05,
"loss": 0.6658,
"step": 410
},
{
"epoch": 1.02,
"grad_norm": 1.0614516735076904,
"learning_rate": 1.5403340647878234e-05,
"loss": 0.6521,
"step": 411
},
{
"epoch": 1.02,
"grad_norm": 1.1384412050247192,
"learning_rate": 1.5380823531633727e-05,
"loss": 0.6916,
"step": 412
},
{
"epoch": 1.02,
"grad_norm": 0.9832707047462463,
"learning_rate": 1.5358267949789968e-05,
"loss": 0.6792,
"step": 413
},
{
"epoch": 1.02,
"grad_norm": 1.025050163269043,
"learning_rate": 1.5335674063588808e-05,
"loss": 0.6658,
"step": 414
},
{
"epoch": 1.03,
"grad_norm": 1.0268765687942505,
"learning_rate": 1.531304203454593e-05,
"loss": 0.6148,
"step": 415
},
{
"epoch": 1.03,
"grad_norm": 0.9756706953048706,
"learning_rate": 1.529037202444968e-05,
"loss": 0.6175,
"step": 416
},
{
"epoch": 1.03,
"grad_norm": 1.0162984132766724,
"learning_rate": 1.5267664195359917e-05,
"loss": 0.6118,
"step": 417
},
{
"epoch": 1.03,
"grad_norm": 1.031238317489624,
"learning_rate": 1.524491870960687e-05,
"loss": 0.6502,
"step": 418
},
{
"epoch": 1.04,
"grad_norm": 1.0331997871398926,
"learning_rate": 1.5222135729789944e-05,
"loss": 0.6462,
"step": 419
},
{
"epoch": 1.04,
"grad_norm": 1.0492758750915527,
"learning_rate": 1.5199315418776584e-05,
"loss": 0.6314,
"step": 420
},
{
"epoch": 1.04,
"grad_norm": 1.0537834167480469,
"learning_rate": 1.51764579397011e-05,
"loss": 0.6211,
"step": 421
},
{
"epoch": 1.04,
"grad_norm": 0.9573667049407959,
"learning_rate": 1.5153563455963501e-05,
"loss": 0.6126,
"step": 422
},
{
"epoch": 1.05,
"grad_norm": 1.0148391723632812,
"learning_rate": 1.5130632131228336e-05,
"loss": 0.6096,
"step": 423
},
{
"epoch": 1.05,
"grad_norm": 1.1061570644378662,
"learning_rate": 1.5107664129423513e-05,
"loss": 0.5909,
"step": 424
},
{
"epoch": 1.05,
"grad_norm": 1.0046223402023315,
"learning_rate": 1.5084659614739133e-05,
"loss": 0.6172,
"step": 425
},
{
"epoch": 1.05,
"grad_norm": 1.0545165538787842,
"learning_rate": 1.506161875162631e-05,
"loss": 0.6433,
"step": 426
},
{
"epoch": 1.06,
"grad_norm": 0.8336237072944641,
"learning_rate": 1.5038541704796004e-05,
"loss": 0.6062,
"step": 427
},
{
"epoch": 1.06,
"grad_norm": 0.9275774955749512,
"learning_rate": 1.5015428639217844e-05,
"loss": 0.5803,
"step": 428
},
{
"epoch": 1.06,
"grad_norm": 1.0565983057022095,
"learning_rate": 1.4992279720118936e-05,
"loss": 0.617,
"step": 429
},
{
"epoch": 1.06,
"grad_norm": 1.0599391460418701,
"learning_rate": 1.4969095112982692e-05,
"loss": 0.6538,
"step": 430
},
{
"epoch": 1.07,
"grad_norm": 1.0006245374679565,
"learning_rate": 1.4945874983547647e-05,
"loss": 0.6214,
"step": 431
},
{
"epoch": 1.07,
"grad_norm": 0.8739469647407532,
"learning_rate": 1.4922619497806276e-05,
"loss": 0.63,
"step": 432
},
{
"epoch": 1.07,
"grad_norm": 0.9901002645492554,
"learning_rate": 1.4899328822003796e-05,
"loss": 0.6258,
"step": 433
},
{
"epoch": 1.07,
"grad_norm": 0.9801365733146667,
"learning_rate": 1.4876003122636989e-05,
"loss": 0.5663,
"step": 434
},
{
"epoch": 1.08,
"grad_norm": 1.0043010711669922,
"learning_rate": 1.4852642566453008e-05,
"loss": 0.6277,
"step": 435
},
{
"epoch": 1.08,
"grad_norm": 0.863335132598877,
"learning_rate": 1.4829247320448187e-05,
"loss": 0.6042,
"step": 436
},
{
"epoch": 1.08,
"grad_norm": 0.9847108721733093,
"learning_rate": 1.4805817551866839e-05,
"loss": 0.6376,
"step": 437
},
{
"epoch": 1.08,
"grad_norm": 1.0984731912612915,
"learning_rate": 1.4782353428200075e-05,
"loss": 0.6212,
"step": 438
},
{
"epoch": 1.09,
"grad_norm": 0.890733540058136,
"learning_rate": 1.4758855117184591e-05,
"loss": 0.6377,
"step": 439
},
{
"epoch": 1.09,
"grad_norm": 0.9059834480285645,
"learning_rate": 1.473532278680148e-05,
"loss": 0.6355,
"step": 440
},
{
"epoch": 1.09,
"grad_norm": 0.9394241571426392,
"learning_rate": 1.4711756605275031e-05,
"loss": 0.6597,
"step": 441
},
{
"epoch": 1.09,
"grad_norm": 0.9297809600830078,
"learning_rate": 1.4688156741071513e-05,
"loss": 0.5901,
"step": 442
},
{
"epoch": 1.1,
"grad_norm": 0.9530428647994995,
"learning_rate": 1.4664523362897991e-05,
"loss": 0.6371,
"step": 443
},
{
"epoch": 1.1,
"grad_norm": 0.9405369758605957,
"learning_rate": 1.46408566397011e-05,
"loss": 0.6296,
"step": 444
},
{
"epoch": 1.1,
"grad_norm": 0.9997767806053162,
"learning_rate": 1.4617156740665852e-05,
"loss": 0.6117,
"step": 445
},
{
"epoch": 1.1,
"grad_norm": 0.9971033930778503,
"learning_rate": 1.4593423835214421e-05,
"loss": 0.6048,
"step": 446
},
{
"epoch": 1.11,
"grad_norm": 0.9242191314697266,
"learning_rate": 1.4569658093004935e-05,
"loss": 0.5894,
"step": 447
},
{
"epoch": 1.11,
"grad_norm": 1.0100167989730835,
"learning_rate": 1.4545859683930252e-05,
"loss": 0.6318,
"step": 448
},
{
"epoch": 1.11,
"grad_norm": 1.0533432960510254,
"learning_rate": 1.4522028778116765e-05,
"loss": 0.6712,
"step": 449
},
{
"epoch": 1.11,
"grad_norm": 0.9874811172485352,
"learning_rate": 1.4498165545923167e-05,
"loss": 0.646,
"step": 450
},
{
"epoch": 1.12,
"grad_norm": 1.0431230068206787,
"learning_rate": 1.4474270157939236e-05,
"loss": 0.5988,
"step": 451
},
{
"epoch": 1.12,
"grad_norm": 1.019249677658081,
"learning_rate": 1.4450342784984632e-05,
"loss": 0.634,
"step": 452
},
{
"epoch": 1.12,
"grad_norm": 0.9447318315505981,
"learning_rate": 1.4426383598107663e-05,
"loss": 0.5728,
"step": 453
},
{
"epoch": 1.12,
"grad_norm": 0.940975546836853,
"learning_rate": 1.4402392768584053e-05,
"loss": 0.6225,
"step": 454
},
{
"epoch": 1.13,
"grad_norm": 0.9953579306602478,
"learning_rate": 1.4378370467915736e-05,
"loss": 0.6517,
"step": 455
},
{
"epoch": 1.13,
"grad_norm": 1.0336792469024658,
"learning_rate": 1.4354316867829622e-05,
"loss": 0.6121,
"step": 456
},
{
"epoch": 1.13,
"grad_norm": 0.9227294325828552,
"learning_rate": 1.4330232140276365e-05,
"loss": 0.6425,
"step": 457
},
{
"epoch": 1.13,
"grad_norm": 0.9596818089485168,
"learning_rate": 1.4306116457429146e-05,
"loss": 0.6478,
"step": 458
},
{
"epoch": 1.14,
"grad_norm": 1.0404927730560303,
"learning_rate": 1.4281969991682427e-05,
"loss": 0.6455,
"step": 459
},
{
"epoch": 1.14,
"grad_norm": 1.024898648262024,
"learning_rate": 1.4257792915650728e-05,
"loss": 0.6066,
"step": 460
},
{
"epoch": 1.14,
"grad_norm": 1.0144062042236328,
"learning_rate": 1.4233585402167394e-05,
"loss": 0.6159,
"step": 461
},
{
"epoch": 1.14,
"grad_norm": 1.03654944896698,
"learning_rate": 1.4209347624283352e-05,
"loss": 0.6734,
"step": 462
},
{
"epoch": 1.14,
"grad_norm": 0.9717058539390564,
"learning_rate": 1.418507975526588e-05,
"loss": 0.6392,
"step": 463
},
{
"epoch": 1.15,
"grad_norm": 0.9585022330284119,
"learning_rate": 1.4160781968597372e-05,
"loss": 0.6114,
"step": 464
},
{
"epoch": 1.15,
"grad_norm": 0.9625274538993835,
"learning_rate": 1.4136454437974086e-05,
"loss": 0.6292,
"step": 465
},
{
"epoch": 1.15,
"grad_norm": 0.9512671828269958,
"learning_rate": 1.4112097337304908e-05,
"loss": 0.6408,
"step": 466
},
{
"epoch": 1.15,
"grad_norm": 0.9498721361160278,
"learning_rate": 1.408771084071012e-05,
"loss": 0.6458,
"step": 467
},
{
"epoch": 1.16,
"grad_norm": 0.9445478320121765,
"learning_rate": 1.406329512252013e-05,
"loss": 0.6353,
"step": 468
},
{
"epoch": 1.16,
"grad_norm": 1.0758302211761475,
"learning_rate": 1.4038850357274254e-05,
"loss": 0.6305,
"step": 469
},
{
"epoch": 1.16,
"grad_norm": 0.9867809414863586,
"learning_rate": 1.4014376719719454e-05,
"loss": 0.5953,
"step": 470
},
{
"epoch": 1.16,
"grad_norm": 0.9557908177375793,
"learning_rate": 1.3989874384809077e-05,
"loss": 0.5915,
"step": 471
},
{
"epoch": 1.17,
"grad_norm": 0.8721527457237244,
"learning_rate": 1.3965343527701629e-05,
"loss": 0.612,
"step": 472
},
{
"epoch": 1.17,
"grad_norm": 0.9285739660263062,
"learning_rate": 1.3940784323759511e-05,
"loss": 0.6454,
"step": 473
},
{
"epoch": 1.17,
"grad_norm": 1.063903570175171,
"learning_rate": 1.391619694854776e-05,
"loss": 0.6087,
"step": 474
},
{
"epoch": 1.17,
"grad_norm": 0.8913307189941406,
"learning_rate": 1.3891581577832804e-05,
"loss": 0.6262,
"step": 475
},
{
"epoch": 1.18,
"grad_norm": 1.043076753616333,
"learning_rate": 1.3866938387581199e-05,
"loss": 0.6128,
"step": 476
},
{
"epoch": 1.18,
"grad_norm": 1.027060866355896,
"learning_rate": 1.3842267553958373e-05,
"loss": 0.6639,
"step": 477
},
{
"epoch": 1.18,
"grad_norm": 1.0449680089950562,
"learning_rate": 1.3817569253327363e-05,
"loss": 0.618,
"step": 478
},
{
"epoch": 1.18,
"grad_norm": 0.9534300565719604,
"learning_rate": 1.3792843662247565e-05,
"loss": 0.6048,
"step": 479
},
{
"epoch": 1.19,
"grad_norm": 1.0519123077392578,
"learning_rate": 1.3768090957473464e-05,
"loss": 0.5747,
"step": 480
},
{
"epoch": 1.19,
"grad_norm": 0.9293439388275146,
"learning_rate": 1.3743311315953363e-05,
"loss": 0.5929,
"step": 481
},
{
"epoch": 1.19,
"grad_norm": 0.9092494249343872,
"learning_rate": 1.3718504914828135e-05,
"loss": 0.6259,
"step": 482
},
{
"epoch": 1.19,
"grad_norm": 1.0047491788864136,
"learning_rate": 1.3693671931429941e-05,
"loss": 0.6207,
"step": 483
},
{
"epoch": 1.2,
"grad_norm": 1.0500539541244507,
"learning_rate": 1.3668812543280976e-05,
"loss": 0.6219,
"step": 484
},
{
"epoch": 1.2,
"grad_norm": 0.9104728698730469,
"learning_rate": 1.3643926928092192e-05,
"loss": 0.5948,
"step": 485
},
{
"epoch": 1.2,
"grad_norm": 1.1024607419967651,
"learning_rate": 1.3619015263762028e-05,
"loss": 0.6159,
"step": 486
},
{
"epoch": 1.2,
"grad_norm": 1.0737922191619873,
"learning_rate": 1.3594077728375129e-05,
"loss": 0.6409,
"step": 487
},
{
"epoch": 1.21,
"grad_norm": 0.9735491871833801,
"learning_rate": 1.35691145002011e-05,
"loss": 0.6434,
"step": 488
},
{
"epoch": 1.21,
"grad_norm": 0.9941731691360474,
"learning_rate": 1.3544125757693207e-05,
"loss": 0.6179,
"step": 489
},
{
"epoch": 1.21,
"grad_norm": 0.9234891533851624,
"learning_rate": 1.35191116794871e-05,
"loss": 0.615,
"step": 490
},
{
"epoch": 1.21,
"grad_norm": 1.0279288291931152,
"learning_rate": 1.3494072444399566e-05,
"loss": 0.6516,
"step": 491
},
{
"epoch": 1.22,
"grad_norm": 1.0623533725738525,
"learning_rate": 1.3469008231427207e-05,
"loss": 0.6274,
"step": 492
},
{
"epoch": 1.22,
"grad_norm": 0.9189989566802979,
"learning_rate": 1.3443919219745199e-05,
"loss": 0.5938,
"step": 493
},
{
"epoch": 1.22,
"grad_norm": 1.0581763982772827,
"learning_rate": 1.3418805588705986e-05,
"loss": 0.6587,
"step": 494
},
{
"epoch": 1.22,
"grad_norm": 1.0061285495758057,
"learning_rate": 1.3393667517838012e-05,
"loss": 0.5864,
"step": 495
},
{
"epoch": 1.23,
"grad_norm": 0.9764752388000488,
"learning_rate": 1.3368505186844427e-05,
"loss": 0.5979,
"step": 496
},
{
"epoch": 1.23,
"grad_norm": 1.2984907627105713,
"learning_rate": 1.334331877560182e-05,
"loss": 0.637,
"step": 497
},
{
"epoch": 1.23,
"grad_norm": 1.0544503927230835,
"learning_rate": 1.3318108464158907e-05,
"loss": 0.5968,
"step": 498
},
{
"epoch": 1.23,
"grad_norm": 1.0052608251571655,
"learning_rate": 1.3292874432735268e-05,
"loss": 0.5985,
"step": 499
},
{
"epoch": 1.24,
"grad_norm": 0.8912485837936401,
"learning_rate": 1.3267616861720041e-05,
"loss": 0.6147,
"step": 500
},
{
"epoch": 1.24,
"grad_norm": 1.0198827981948853,
"learning_rate": 1.3242335931670647e-05,
"loss": 0.5905,
"step": 501
},
{
"epoch": 1.24,
"grad_norm": 1.0287050008773804,
"learning_rate": 1.3217031823311488e-05,
"loss": 0.676,
"step": 502
},
{
"epoch": 1.24,
"grad_norm": 0.9980819821357727,
"learning_rate": 1.3191704717532667e-05,
"loss": 0.6149,
"step": 503
},
{
"epoch": 1.25,
"grad_norm": 0.9271026849746704,
"learning_rate": 1.3166354795388677e-05,
"loss": 0.6545,
"step": 504
},
{
"epoch": 1.25,
"grad_norm": 0.8525359630584717,
"learning_rate": 1.3140982238097117e-05,
"loss": 0.603,
"step": 505
},
{
"epoch": 1.25,
"grad_norm": 0.9558356404304504,
"learning_rate": 1.3115587227037408e-05,
"loss": 0.5766,
"step": 506
},
{
"epoch": 1.25,
"grad_norm": 0.9394976496696472,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.6305,
"step": 507
},
{
"epoch": 1.26,
"grad_norm": 1.0280481576919556,
"learning_rate": 1.3064730569932467e-05,
"loss": 0.5879,
"step": 508
},
{
"epoch": 1.26,
"grad_norm": 0.9981361627578735,
"learning_rate": 1.3039269287443442e-05,
"loss": 0.6551,
"step": 509
},
{
"epoch": 1.26,
"grad_norm": 1.0905154943466187,
"learning_rate": 1.301378627829608e-05,
"loss": 0.6621,
"step": 510
},
{
"epoch": 1.26,
"grad_norm": 0.9698947668075562,
"learning_rate": 1.2988281724659375e-05,
"loss": 0.6153,
"step": 511
},
{
"epoch": 1.27,
"grad_norm": 0.9609881043434143,
"learning_rate": 1.2962755808856341e-05,
"loss": 0.5923,
"step": 512
},
{
"epoch": 1.27,
"grad_norm": 1.167883038520813,
"learning_rate": 1.2937208713362694e-05,
"loss": 0.6681,
"step": 513
},
{
"epoch": 1.27,
"grad_norm": 0.9012638330459595,
"learning_rate": 1.2911640620805561e-05,
"loss": 0.6109,
"step": 514
},
{
"epoch": 1.27,
"grad_norm": 0.8938215970993042,
"learning_rate": 1.2886051713962172e-05,
"loss": 0.6522,
"step": 515
},
{
"epoch": 1.28,
"grad_norm": 1.029787302017212,
"learning_rate": 1.2860442175758543e-05,
"loss": 0.6127,
"step": 516
},
{
"epoch": 1.28,
"grad_norm": 1.1626105308532715,
"learning_rate": 1.283481218926818e-05,
"loss": 0.6325,
"step": 517
},
{
"epoch": 1.28,
"grad_norm": 0.9775775671005249,
"learning_rate": 1.280916193771077e-05,
"loss": 0.6329,
"step": 518
},
{
"epoch": 1.28,
"grad_norm": 0.9691373109817505,
"learning_rate": 1.2783491604450869e-05,
"loss": 0.6153,
"step": 519
},
{
"epoch": 1.29,
"grad_norm": 0.9563249349594116,
"learning_rate": 1.2757801372996577e-05,
"loss": 0.644,
"step": 520
},
{
"epoch": 1.29,
"grad_norm": 0.9542986750602722,
"learning_rate": 1.2732091426998258e-05,
"loss": 0.5939,
"step": 521
},
{
"epoch": 1.29,
"grad_norm": 0.9499353170394897,
"learning_rate": 1.270636195024719e-05,
"loss": 0.6172,
"step": 522
},
{
"epoch": 1.29,
"grad_norm": 1.0935438871383667,
"learning_rate": 1.2680613126674285e-05,
"loss": 0.6006,
"step": 523
},
{
"epoch": 1.3,
"grad_norm": 1.1205679178237915,
"learning_rate": 1.2654845140348746e-05,
"loss": 0.6472,
"step": 524
},
{
"epoch": 1.3,
"grad_norm": 0.8894829750061035,
"learning_rate": 1.2629058175476774e-05,
"loss": 0.587,
"step": 525
},
{
"epoch": 1.3,
"grad_norm": 0.9492902755737305,
"learning_rate": 1.2603252416400232e-05,
"loss": 0.6175,
"step": 526
},
{
"epoch": 1.3,
"grad_norm": 1.077660083770752,
"learning_rate": 1.2577428047595343e-05,
"loss": 0.6474,
"step": 527
},
{
"epoch": 1.31,
"grad_norm": 0.9132212996482849,
"learning_rate": 1.255158525367136e-05,
"loss": 0.6248,
"step": 528
},
{
"epoch": 1.31,
"grad_norm": 1.0104243755340576,
"learning_rate": 1.2525724219369253e-05,
"loss": 0.634,
"step": 529
},
{
"epoch": 1.31,
"grad_norm": 0.998405933380127,
"learning_rate": 1.2499845129560386e-05,
"loss": 0.6332,
"step": 530
},
{
"epoch": 1.31,
"grad_norm": 1.0179756879806519,
"learning_rate": 1.2473948169245196e-05,
"loss": 0.6283,
"step": 531
},
{
"epoch": 1.32,
"grad_norm": 0.9786280989646912,
"learning_rate": 1.2448033523551866e-05,
"loss": 0.66,
"step": 532
},
{
"epoch": 1.32,
"grad_norm": 0.9850865006446838,
"learning_rate": 1.2422101377735007e-05,
"loss": 0.6504,
"step": 533
},
{
"epoch": 1.32,
"grad_norm": 0.8938550353050232,
"learning_rate": 1.2396151917174335e-05,
"loss": 0.5528,
"step": 534
},
{
"epoch": 1.32,
"grad_norm": 1.077877402305603,
"learning_rate": 1.2370185327373341e-05,
"loss": 0.5879,
"step": 535
},
{
"epoch": 1.33,
"grad_norm": 1.0089164972305298,
"learning_rate": 1.234420179395797e-05,
"loss": 0.6132,
"step": 536
},
{
"epoch": 1.33,
"grad_norm": 0.9469050765037537,
"learning_rate": 1.2318201502675285e-05,
"loss": 0.571,
"step": 537
},
{
"epoch": 1.33,
"grad_norm": 0.9590569138526917,
"learning_rate": 1.2292184639392146e-05,
"loss": 0.6046,
"step": 538
},
{
"epoch": 1.33,
"grad_norm": 1.0073339939117432,
"learning_rate": 1.2266151390093887e-05,
"loss": 0.6111,
"step": 539
},
{
"epoch": 1.34,
"grad_norm": 0.997769832611084,
"learning_rate": 1.224010194088297e-05,
"loss": 0.6198,
"step": 540
},
{
"epoch": 1.34,
"grad_norm": 1.0480035543441772,
"learning_rate": 1.2214036477977675e-05,
"loss": 0.654,
"step": 541
},
{
"epoch": 1.34,
"grad_norm": 1.042383074760437,
"learning_rate": 1.2187955187710752e-05,
"loss": 0.6222,
"step": 542
},
{
"epoch": 1.34,
"grad_norm": 1.0312013626098633,
"learning_rate": 1.2161858256528092e-05,
"loss": 0.5356,
"step": 543
},
{
"epoch": 1.35,
"grad_norm": 1.0612717866897583,
"learning_rate": 1.2135745870987406e-05,
"loss": 0.6243,
"step": 544
},
{
"epoch": 1.35,
"grad_norm": 0.9805787205696106,
"learning_rate": 1.2109618217756876e-05,
"loss": 0.6197,
"step": 545
},
{
"epoch": 1.35,
"grad_norm": 1.0467524528503418,
"learning_rate": 1.2083475483613828e-05,
"loss": 0.6377,
"step": 546
},
{
"epoch": 1.35,
"grad_norm": 0.9999006390571594,
"learning_rate": 1.2057317855443395e-05,
"loss": 0.6516,
"step": 547
},
{
"epoch": 1.36,
"grad_norm": 0.9280255436897278,
"learning_rate": 1.2031145520237194e-05,
"loss": 0.6058,
"step": 548
},
{
"epoch": 1.36,
"grad_norm": 0.9208453893661499,
"learning_rate": 1.2004958665091964e-05,
"loss": 0.6071,
"step": 549
},
{
"epoch": 1.36,
"grad_norm": 1.0235896110534668,
"learning_rate": 1.1978757477208242e-05,
"loss": 0.5772,
"step": 550
},
{
"epoch": 1.36,
"grad_norm": 0.9581038951873779,
"learning_rate": 1.1952542143889034e-05,
"loss": 0.6438,
"step": 551
},
{
"epoch": 1.37,
"grad_norm": 0.9629083871841431,
"learning_rate": 1.1926312852538456e-05,
"loss": 0.5897,
"step": 552
},
{
"epoch": 1.37,
"grad_norm": 1.1396433115005493,
"learning_rate": 1.1900069790660411e-05,
"loss": 0.6267,
"step": 553
},
{
"epoch": 1.37,
"grad_norm": 0.9454533457756042,
"learning_rate": 1.187381314585725e-05,
"loss": 0.6167,
"step": 554
},
{
"epoch": 1.37,
"grad_norm": 0.9393343329429626,
"learning_rate": 1.1847543105828404e-05,
"loss": 0.5861,
"step": 555
},
{
"epoch": 1.37,
"grad_norm": 0.9355834722518921,
"learning_rate": 1.1821259858369082e-05,
"loss": 0.6694,
"step": 556
},
{
"epoch": 1.38,
"grad_norm": 0.9925780892372131,
"learning_rate": 1.1794963591368893e-05,
"loss": 0.5937,
"step": 557
},
{
"epoch": 1.38,
"grad_norm": 0.9895220398902893,
"learning_rate": 1.1768654492810525e-05,
"loss": 0.6416,
"step": 558
},
{
"epoch": 1.38,
"grad_norm": 1.0594924688339233,
"learning_rate": 1.1742332750768402e-05,
"loss": 0.6139,
"step": 559
},
{
"epoch": 1.38,
"grad_norm": 1.1669124364852905,
"learning_rate": 1.1715998553407315e-05,
"loss": 0.6561,
"step": 560
},
{
"epoch": 1.39,
"grad_norm": 0.9161636829376221,
"learning_rate": 1.1689652088981102e-05,
"loss": 0.5935,
"step": 561
},
{
"epoch": 1.39,
"grad_norm": 0.8627362251281738,
"learning_rate": 1.1663293545831302e-05,
"loss": 0.6457,
"step": 562
},
{
"epoch": 1.39,
"grad_norm": 0.962069571018219,
"learning_rate": 1.1636923112385785e-05,
"loss": 0.6398,
"step": 563
},
{
"epoch": 1.39,
"grad_norm": 0.9659212827682495,
"learning_rate": 1.161054097715743e-05,
"loss": 0.5907,
"step": 564
},
{
"epoch": 1.4,
"grad_norm": 0.9934397339820862,
"learning_rate": 1.1584147328742767e-05,
"loss": 0.6029,
"step": 565
},
{
"epoch": 1.4,
"grad_norm": 0.9827930331230164,
"learning_rate": 1.155774235582063e-05,
"loss": 0.5824,
"step": 566
},
{
"epoch": 1.4,
"grad_norm": 0.9656209945678711,
"learning_rate": 1.1531326247150802e-05,
"loss": 0.5986,
"step": 567
},
{
"epoch": 1.4,
"grad_norm": 1.017004132270813,
"learning_rate": 1.1504899191572682e-05,
"loss": 0.5915,
"step": 568
},
{
"epoch": 1.41,
"grad_norm": 0.9455320835113525,
"learning_rate": 1.1478461378003913e-05,
"loss": 0.6283,
"step": 569
},
{
"epoch": 1.41,
"grad_norm": 0.9357017278671265,
"learning_rate": 1.145201299543905e-05,
"loss": 0.6459,
"step": 570
},
{
"epoch": 1.41,
"grad_norm": 0.9253040552139282,
"learning_rate": 1.1425554232948206e-05,
"loss": 0.6166,
"step": 571
},
{
"epoch": 1.41,
"grad_norm": 1.0472208261489868,
"learning_rate": 1.1399085279675688e-05,
"loss": 0.6671,
"step": 572
},
{
"epoch": 1.42,
"grad_norm": 0.9446719288825989,
"learning_rate": 1.1372606324838651e-05,
"loss": 0.5965,
"step": 573
},
{
"epoch": 1.42,
"grad_norm": 0.9347842931747437,
"learning_rate": 1.1346117557725757e-05,
"loss": 0.637,
"step": 574
},
{
"epoch": 1.42,
"grad_norm": 1.023077368736267,
"learning_rate": 1.1319619167695814e-05,
"loss": 0.6457,
"step": 575
},
{
"epoch": 1.42,
"grad_norm": 0.949116051197052,
"learning_rate": 1.1293111344176406e-05,
"loss": 0.6483,
"step": 576
},
{
"epoch": 1.43,
"grad_norm": 1.096680760383606,
"learning_rate": 1.126659427666257e-05,
"loss": 0.6242,
"step": 577
},
{
"epoch": 1.43,
"grad_norm": 0.8694916367530823,
"learning_rate": 1.1240068154715416e-05,
"loss": 0.6179,
"step": 578
},
{
"epoch": 1.43,
"grad_norm": 1.0984961986541748,
"learning_rate": 1.121353316796078e-05,
"loss": 0.6552,
"step": 579
},
{
"epoch": 1.43,
"grad_norm": 0.9651594161987305,
"learning_rate": 1.1186989506087876e-05,
"loss": 0.5931,
"step": 580
},
{
"epoch": 1.44,
"grad_norm": 0.9294801354408264,
"learning_rate": 1.116043735884793e-05,
"loss": 0.5662,
"step": 581
},
{
"epoch": 1.44,
"grad_norm": 1.0226821899414062,
"learning_rate": 1.1133876916052822e-05,
"loss": 0.6017,
"step": 582
},
{
"epoch": 1.44,
"grad_norm": 1.0314971208572388,
"learning_rate": 1.1107308367573744e-05,
"loss": 0.6144,
"step": 583
},
{
"epoch": 1.44,
"grad_norm": 0.9721775650978088,
"learning_rate": 1.1080731903339825e-05,
"loss": 0.6361,
"step": 584
},
{
"epoch": 1.45,
"grad_norm": 0.9814489483833313,
"learning_rate": 1.1054147713336782e-05,
"loss": 0.6227,
"step": 585
},
{
"epoch": 1.45,
"grad_norm": 0.9761776924133301,
"learning_rate": 1.1027555987605562e-05,
"loss": 0.6154,
"step": 586
},
{
"epoch": 1.45,
"grad_norm": 1.0100598335266113,
"learning_rate": 1.1000956916240985e-05,
"loss": 0.6178,
"step": 587
},
{
"epoch": 1.45,
"grad_norm": 1.193156123161316,
"learning_rate": 1.0974350689390376e-05,
"loss": 0.6675,
"step": 588
},
{
"epoch": 1.46,
"grad_norm": 1.0012152194976807,
"learning_rate": 1.094773749725222e-05,
"loss": 0.6202,
"step": 589
},
{
"epoch": 1.46,
"grad_norm": 0.9017734527587891,
"learning_rate": 1.0921117530074785e-05,
"loss": 0.619,
"step": 590
},
{
"epoch": 1.46,
"grad_norm": 0.932767391204834,
"learning_rate": 1.0894490978154777e-05,
"loss": 0.5801,
"step": 591
},
{
"epoch": 1.46,
"grad_norm": 0.9889426827430725,
"learning_rate": 1.0867858031835975e-05,
"loss": 0.6712,
"step": 592
},
{
"epoch": 1.47,
"grad_norm": 1.012459635734558,
"learning_rate": 1.084121888150787e-05,
"loss": 0.6553,
"step": 593
},
{
"epoch": 1.47,
"grad_norm": 1.0420746803283691,
"learning_rate": 1.0814573717604295e-05,
"loss": 0.5916,
"step": 594
},
{
"epoch": 1.47,
"grad_norm": 1.005211353302002,
"learning_rate": 1.0787922730602083e-05,
"loss": 0.6043,
"step": 595
},
{
"epoch": 1.47,
"grad_norm": 1.069198489189148,
"learning_rate": 1.0761266111019685e-05,
"loss": 0.598,
"step": 596
},
{
"epoch": 1.48,
"grad_norm": 0.9799802899360657,
"learning_rate": 1.0734604049415822e-05,
"loss": 0.5933,
"step": 597
},
{
"epoch": 1.48,
"grad_norm": 1.0022966861724854,
"learning_rate": 1.070793673638812e-05,
"loss": 0.6514,
"step": 598
},
{
"epoch": 1.48,
"grad_norm": 1.001762866973877,
"learning_rate": 1.0681264362571744e-05,
"loss": 0.6686,
"step": 599
},
{
"epoch": 1.48,
"grad_norm": 0.985598087310791,
"learning_rate": 1.0654587118638027e-05,
"loss": 0.601,
"step": 600
},
{
"epoch": 1.49,
"grad_norm": 0.9582387804985046,
"learning_rate": 1.0627905195293135e-05,
"loss": 0.6139,
"step": 601
},
{
"epoch": 1.49,
"grad_norm": 1.035148024559021,
"learning_rate": 1.0601218783276673e-05,
"loss": 0.5909,
"step": 602
},
{
"epoch": 1.49,
"grad_norm": 1.0642892122268677,
"learning_rate": 1.0574528073360333e-05,
"loss": 0.6194,
"step": 603
},
{
"epoch": 1.49,
"grad_norm": 1.1434149742126465,
"learning_rate": 1.054783325634654e-05,
"loss": 0.5967,
"step": 604
},
{
"epoch": 1.5,
"grad_norm": 1.0699044466018677,
"learning_rate": 1.0521134523067076e-05,
"loss": 0.652,
"step": 605
},
{
"epoch": 1.5,
"grad_norm": 0.926213800907135,
"learning_rate": 1.0494432064381707e-05,
"loss": 0.6236,
"step": 606
},
{
"epoch": 1.5,
"grad_norm": 1.1603809595108032,
"learning_rate": 1.0467726071176854e-05,
"loss": 0.5861,
"step": 607
},
{
"epoch": 1.5,
"grad_norm": 1.1133846044540405,
"learning_rate": 1.044101673436418e-05,
"loss": 0.6999,
"step": 608
},
{
"epoch": 1.51,
"grad_norm": 1.0773183107376099,
"learning_rate": 1.041430424487927e-05,
"loss": 0.6397,
"step": 609
},
{
"epoch": 1.51,
"grad_norm": 0.9208213686943054,
"learning_rate": 1.0387588793680235e-05,
"loss": 0.6361,
"step": 610
},
{
"epoch": 1.51,
"grad_norm": 0.9352226853370667,
"learning_rate": 1.0360870571746364e-05,
"loss": 0.609,
"step": 611
},
{
"epoch": 1.51,
"grad_norm": 1.0909321308135986,
"learning_rate": 1.0334149770076747e-05,
"loss": 0.6224,
"step": 612
},
{
"epoch": 1.52,
"grad_norm": 0.9950162172317505,
"learning_rate": 1.0307426579688924e-05,
"loss": 0.6293,
"step": 613
},
{
"epoch": 1.52,
"grad_norm": 1.1023386716842651,
"learning_rate": 1.0280701191617502e-05,
"loss": 0.5652,
"step": 614
},
{
"epoch": 1.52,
"grad_norm": 1.1158206462860107,
"learning_rate": 1.0253973796912801e-05,
"loss": 0.6174,
"step": 615
},
{
"epoch": 1.52,
"grad_norm": 1.006576657295227,
"learning_rate": 1.0227244586639498e-05,
"loss": 0.6335,
"step": 616
},
{
"epoch": 1.53,
"grad_norm": 1.0058975219726562,
"learning_rate": 1.0200513751875227e-05,
"loss": 0.6564,
"step": 617
},
{
"epoch": 1.53,
"grad_norm": 0.9998272657394409,
"learning_rate": 1.0173781483709253e-05,
"loss": 0.6517,
"step": 618
},
{
"epoch": 1.53,
"grad_norm": 0.9474151730537415,
"learning_rate": 1.0147047973241078e-05,
"loss": 0.5813,
"step": 619
},
{
"epoch": 1.53,
"grad_norm": 0.9911189079284668,
"learning_rate": 1.012031341157909e-05,
"loss": 0.6308,
"step": 620
},
{
"epoch": 1.54,
"grad_norm": 1.004300832748413,
"learning_rate": 1.009357798983919e-05,
"loss": 0.5864,
"step": 621
},
{
"epoch": 1.54,
"grad_norm": 0.9834850430488586,
"learning_rate": 1.0066841899143424e-05,
"loss": 0.6631,
"step": 622
},
{
"epoch": 1.54,
"grad_norm": 1.0142207145690918,
"learning_rate": 1.0040105330618624e-05,
"loss": 0.6337,
"step": 623
},
{
"epoch": 1.54,
"grad_norm": 1.0587005615234375,
"learning_rate": 1.001336847539504e-05,
"loss": 0.5773,
"step": 624
},
{
"epoch": 1.55,
"grad_norm": 1.0602400302886963,
"learning_rate": 9.986631524604967e-06,
"loss": 0.627,
"step": 625
},
{
"epoch": 1.55,
"grad_norm": 1.0758881568908691,
"learning_rate": 9.95989466938138e-06,
"loss": 0.6312,
"step": 626
},
{
"epoch": 1.55,
"grad_norm": 0.9769020080566406,
"learning_rate": 9.93315810085658e-06,
"loss": 0.6003,
"step": 627
},
{
"epoch": 1.55,
"grad_norm": 0.8988069891929626,
"learning_rate": 9.906422010160815e-06,
"loss": 0.638,
"step": 628
},
{
"epoch": 1.56,
"grad_norm": 1.04930579662323,
"learning_rate": 9.879686588420912e-06,
"loss": 0.6,
"step": 629
},
{
"epoch": 1.56,
"grad_norm": 0.9747940897941589,
"learning_rate": 9.852952026758923e-06,
"loss": 0.6223,
"step": 630
},
{
"epoch": 1.56,
"grad_norm": 0.9301921725273132,
"learning_rate": 9.826218516290749e-06,
"loss": 0.5485,
"step": 631
},
{
"epoch": 1.56,
"grad_norm": 1.037238597869873,
"learning_rate": 9.799486248124775e-06,
"loss": 0.573,
"step": 632
},
{
"epoch": 1.57,
"grad_norm": 0.9284355044364929,
"learning_rate": 9.772755413360505e-06,
"loss": 0.5852,
"step": 633
},
{
"epoch": 1.57,
"grad_norm": 0.9785302877426147,
"learning_rate": 9.746026203087198e-06,
"loss": 0.666,
"step": 634
},
{
"epoch": 1.57,
"grad_norm": 0.9351165294647217,
"learning_rate": 9.719298808382502e-06,
"loss": 0.6178,
"step": 635
},
{
"epoch": 1.57,
"grad_norm": 1.0524693727493286,
"learning_rate": 9.69257342031108e-06,
"loss": 0.6928,
"step": 636
},
{
"epoch": 1.58,
"grad_norm": 1.005279541015625,
"learning_rate": 9.665850229923258e-06,
"loss": 0.5992,
"step": 637
},
{
"epoch": 1.58,
"grad_norm": 1.0178667306900024,
"learning_rate": 9.639129428253639e-06,
"loss": 0.6411,
"step": 638
},
{
"epoch": 1.58,
"grad_norm": 0.9722139239311218,
"learning_rate": 9.612411206319765e-06,
"loss": 0.6265,
"step": 639
},
{
"epoch": 1.58,
"grad_norm": 0.9661009907722473,
"learning_rate": 9.585695755120735e-06,
"loss": 0.649,
"step": 640
},
{
"epoch": 1.59,
"grad_norm": 1.2120968103408813,
"learning_rate": 9.558983265635822e-06,
"loss": 0.6839,
"step": 641
},
{
"epoch": 1.59,
"grad_norm": 1.0278242826461792,
"learning_rate": 9.532273928823151e-06,
"loss": 0.5958,
"step": 642
},
{
"epoch": 1.59,
"grad_norm": 0.9723142385482788,
"learning_rate": 9.505567935618295e-06,
"loss": 0.6621,
"step": 643
},
{
"epoch": 1.59,
"grad_norm": 1.026511549949646,
"learning_rate": 9.47886547693293e-06,
"loss": 0.6396,
"step": 644
},
{
"epoch": 1.6,
"grad_norm": 0.996562659740448,
"learning_rate": 9.452166743653461e-06,
"loss": 0.6786,
"step": 645
},
{
"epoch": 1.6,
"grad_norm": 0.9691541194915771,
"learning_rate": 9.425471926639667e-06,
"loss": 0.6305,
"step": 646
},
{
"epoch": 1.6,
"grad_norm": 1.1188807487487793,
"learning_rate": 9.39878121672333e-06,
"loss": 0.6518,
"step": 647
},
{
"epoch": 1.6,
"grad_norm": 0.8913319110870361,
"learning_rate": 9.372094804706867e-06,
"loss": 0.6458,
"step": 648
},
{
"epoch": 1.6,
"grad_norm": 1.0041614770889282,
"learning_rate": 9.345412881361978e-06,
"loss": 0.6229,
"step": 649
},
{
"epoch": 1.61,
"grad_norm": 0.9724077582359314,
"learning_rate": 9.31873563742826e-06,
"loss": 0.5926,
"step": 650
},
{
"epoch": 1.61,
"grad_norm": 1.0334805250167847,
"learning_rate": 9.29206326361188e-06,
"loss": 0.6482,
"step": 651
},
{
"epoch": 1.61,
"grad_norm": 0.955544650554657,
"learning_rate": 9.265395950584181e-06,
"loss": 0.6301,
"step": 652
},
{
"epoch": 1.61,
"grad_norm": 0.9824047088623047,
"learning_rate": 9.238733888980316e-06,
"loss": 0.5432,
"step": 653
},
{
"epoch": 1.62,
"grad_norm": 1.0226794481277466,
"learning_rate": 9.21207726939792e-06,
"loss": 0.644,
"step": 654
},
{
"epoch": 1.62,
"grad_norm": 1.1623178720474243,
"learning_rate": 9.185426282395707e-06,
"loss": 0.6399,
"step": 655
},
{
"epoch": 1.62,
"grad_norm": 1.005444049835205,
"learning_rate": 9.158781118492133e-06,
"loss": 0.5909,
"step": 656
},
{
"epoch": 1.62,
"grad_norm": 1.0435283184051514,
"learning_rate": 9.132141968164026e-06,
"loss": 0.6455,
"step": 657
},
{
"epoch": 1.63,
"grad_norm": 0.9722042083740234,
"learning_rate": 9.105509021845224e-06,
"loss": 0.5917,
"step": 658
},
{
"epoch": 1.63,
"grad_norm": 1.1557525396347046,
"learning_rate": 9.078882469925219e-06,
"loss": 0.6092,
"step": 659
},
{
"epoch": 1.63,
"grad_norm": 0.9559000730514526,
"learning_rate": 9.052262502747784e-06,
"loss": 0.6808,
"step": 660
},
{
"epoch": 1.63,
"grad_norm": 0.9796317219734192,
"learning_rate": 9.025649310609627e-06,
"loss": 0.592,
"step": 661
},
{
"epoch": 1.64,
"grad_norm": 1.1970458030700684,
"learning_rate": 8.999043083759016e-06,
"loss": 0.6143,
"step": 662
},
{
"epoch": 1.64,
"grad_norm": 1.0335098505020142,
"learning_rate": 8.97244401239444e-06,
"loss": 0.6122,
"step": 663
},
{
"epoch": 1.64,
"grad_norm": 1.0141277313232422,
"learning_rate": 8.945852286663224e-06,
"loss": 0.6471,
"step": 664
},
{
"epoch": 1.64,
"grad_norm": 1.0685157775878906,
"learning_rate": 8.919268096660178e-06,
"loss": 0.648,
"step": 665
},
{
"epoch": 1.65,
"grad_norm": 1.0332282781600952,
"learning_rate": 8.89269163242626e-06,
"loss": 0.583,
"step": 666
},
{
"epoch": 1.65,
"grad_norm": 1.0065079927444458,
"learning_rate": 8.866123083947182e-06,
"loss": 0.6102,
"step": 667
},
{
"epoch": 1.65,
"grad_norm": 1.0752369165420532,
"learning_rate": 8.839562641152074e-06,
"loss": 0.6244,
"step": 668
},
{
"epoch": 1.65,
"grad_norm": 1.0231661796569824,
"learning_rate": 8.813010493912127e-06,
"loss": 0.6492,
"step": 669
},
{
"epoch": 1.66,
"grad_norm": 1.0296612977981567,
"learning_rate": 8.786466832039222e-06,
"loss": 0.6345,
"step": 670
},
{
"epoch": 1.66,
"grad_norm": 0.9212373495101929,
"learning_rate": 8.759931845284589e-06,
"loss": 0.61,
"step": 671
},
{
"epoch": 1.66,
"grad_norm": 0.9772341251373291,
"learning_rate": 8.733405723337433e-06,
"loss": 0.6099,
"step": 672
},
{
"epoch": 1.66,
"grad_norm": 1.0963752269744873,
"learning_rate": 8.706888655823594e-06,
"loss": 0.6536,
"step": 673
},
{
"epoch": 1.67,
"grad_norm": 1.0175827741622925,
"learning_rate": 8.680380832304189e-06,
"loss": 0.5874,
"step": 674
},
{
"epoch": 1.67,
"grad_norm": 0.9240247011184692,
"learning_rate": 8.653882442274243e-06,
"loss": 0.6009,
"step": 675
},
{
"epoch": 1.67,
"grad_norm": 1.154167652130127,
"learning_rate": 8.627393675161354e-06,
"loss": 0.608,
"step": 676
},
{
"epoch": 1.67,
"grad_norm": 1.7089484930038452,
"learning_rate": 8.600914720324315e-06,
"loss": 0.6657,
"step": 677
},
{
"epoch": 1.68,
"grad_norm": 0.9948485493659973,
"learning_rate": 8.574445767051794e-06,
"loss": 0.6419,
"step": 678
},
{
"epoch": 1.68,
"grad_norm": 0.9488326907157898,
"learning_rate": 8.547987004560952e-06,
"loss": 0.6241,
"step": 679
},
{
"epoch": 1.68,
"grad_norm": 1.0953195095062256,
"learning_rate": 8.521538621996087e-06,
"loss": 0.5945,
"step": 680
},
{
"epoch": 1.68,
"grad_norm": 0.9409649968147278,
"learning_rate": 8.495100808427323e-06,
"loss": 0.5869,
"step": 681
},
{
"epoch": 1.69,
"grad_norm": 1.0395634174346924,
"learning_rate": 8.468673752849201e-06,
"loss": 0.5999,
"step": 682
},
{
"epoch": 1.69,
"grad_norm": 0.996700644493103,
"learning_rate": 8.442257644179374e-06,
"loss": 0.601,
"step": 683
},
{
"epoch": 1.69,
"grad_norm": 1.024767518043518,
"learning_rate": 8.415852671257235e-06,
"loss": 0.6195,
"step": 684
},
{
"epoch": 1.69,
"grad_norm": 0.9146267175674438,
"learning_rate": 8.38945902284257e-06,
"loss": 0.595,
"step": 685
},
{
"epoch": 1.7,
"grad_norm": 1.0181987285614014,
"learning_rate": 8.363076887614218e-06,
"loss": 0.6269,
"step": 686
},
{
"epoch": 1.7,
"grad_norm": 0.9889507293701172,
"learning_rate": 8.336706454168701e-06,
"loss": 0.6069,
"step": 687
},
{
"epoch": 1.7,
"grad_norm": 0.9235756397247314,
"learning_rate": 8.3103479110189e-06,
"loss": 0.586,
"step": 688
},
{
"epoch": 1.7,
"grad_norm": 1.0416637659072876,
"learning_rate": 8.284001446592687e-06,
"loss": 0.6085,
"step": 689
},
{
"epoch": 1.71,
"grad_norm": 1.0352030992507935,
"learning_rate": 8.2576672492316e-06,
"loss": 0.6387,
"step": 690
},
{
"epoch": 1.71,
"grad_norm": 0.9668939113616943,
"learning_rate": 8.231345507189478e-06,
"loss": 0.6113,
"step": 691
},
{
"epoch": 1.71,
"grad_norm": 1.0079621076583862,
"learning_rate": 8.20503640863111e-06,
"loss": 0.6257,
"step": 692
},
{
"epoch": 1.71,
"grad_norm": 0.9561861753463745,
"learning_rate": 8.178740141630925e-06,
"loss": 0.6228,
"step": 693
},
{
"epoch": 1.72,
"grad_norm": 0.8857633471488953,
"learning_rate": 8.1524568941716e-06,
"loss": 0.6184,
"step": 694
},
{
"epoch": 1.72,
"grad_norm": 0.9727585315704346,
"learning_rate": 8.126186854142752e-06,
"loss": 0.6176,
"step": 695
},
{
"epoch": 1.72,
"grad_norm": 0.9954246282577515,
"learning_rate": 8.09993020933959e-06,
"loss": 0.6427,
"step": 696
},
{
"epoch": 1.72,
"grad_norm": 0.9415956735610962,
"learning_rate": 8.073687147461548e-06,
"loss": 0.6224,
"step": 697
},
{
"epoch": 1.73,
"grad_norm": 1.0226082801818848,
"learning_rate": 8.047457856110972e-06,
"loss": 0.5697,
"step": 698
},
{
"epoch": 1.73,
"grad_norm": 0.932956337928772,
"learning_rate": 8.021242522791761e-06,
"loss": 0.5983,
"step": 699
},
{
"epoch": 1.73,
"grad_norm": 0.9568089246749878,
"learning_rate": 7.99504133490804e-06,
"loss": 0.6338,
"step": 700
},
{
"epoch": 1.73,
"grad_norm": 0.891200840473175,
"learning_rate": 7.968854479762807e-06,
"loss": 0.6253,
"step": 701
},
{
"epoch": 1.74,
"grad_norm": 0.9749317765235901,
"learning_rate": 7.942682144556605e-06,
"loss": 0.5868,
"step": 702
},
{
"epoch": 1.74,
"grad_norm": 1.0237942934036255,
"learning_rate": 7.916524516386177e-06,
"loss": 0.6026,
"step": 703
},
{
"epoch": 1.74,
"grad_norm": 0.8944117426872253,
"learning_rate": 7.890381782243129e-06,
"loss": 0.5886,
"step": 704
},
{
"epoch": 1.74,
"grad_norm": 0.8560101389884949,
"learning_rate": 7.864254129012599e-06,
"loss": 0.5571,
"step": 705
},
{
"epoch": 1.75,
"grad_norm": 0.9961780905723572,
"learning_rate": 7.838141743471912e-06,
"loss": 0.6196,
"step": 706
},
{
"epoch": 1.75,
"grad_norm": 0.9380569458007812,
"learning_rate": 7.81204481228925e-06,
"loss": 0.6182,
"step": 707
},
{
"epoch": 1.75,
"grad_norm": 0.881510853767395,
"learning_rate": 7.785963522022328e-06,
"loss": 0.6177,
"step": 708
},
{
"epoch": 1.75,
"grad_norm": 1.0555378198623657,
"learning_rate": 7.759898059117031e-06,
"loss": 0.5773,
"step": 709
},
{
"epoch": 1.76,
"grad_norm": 1.115706205368042,
"learning_rate": 7.733848609906118e-06,
"loss": 0.6126,
"step": 710
},
{
"epoch": 1.76,
"grad_norm": 1.0175631046295166,
"learning_rate": 7.707815360607857e-06,
"loss": 0.6435,
"step": 711
},
{
"epoch": 1.76,
"grad_norm": 0.9289364218711853,
"learning_rate": 7.681798497324717e-06,
"loss": 0.6466,
"step": 712
},
{
"epoch": 1.76,
"grad_norm": 1.0058919191360474,
"learning_rate": 7.655798206042033e-06,
"loss": 0.6462,
"step": 713
},
{
"epoch": 1.77,
"grad_norm": 1.0242526531219482,
"learning_rate": 7.629814672626659e-06,
"loss": 0.6278,
"step": 714
},
{
"epoch": 1.77,
"grad_norm": 1.0230293273925781,
"learning_rate": 7.603848082825667e-06,
"loss": 0.6173,
"step": 715
},
{
"epoch": 1.77,
"grad_norm": 0.9613228440284729,
"learning_rate": 7.577898622264995e-06,
"loss": 0.6383,
"step": 716
},
{
"epoch": 1.77,
"grad_norm": 1.0192224979400635,
"learning_rate": 7.55196647644814e-06,
"loss": 0.6351,
"step": 717
},
{
"epoch": 1.78,
"grad_norm": 0.9832161664962769,
"learning_rate": 7.526051830754806e-06,
"loss": 0.5862,
"step": 718
},
{
"epoch": 1.78,
"grad_norm": 0.9572023749351501,
"learning_rate": 7.500154870439613e-06,
"loss": 0.624,
"step": 719
},
{
"epoch": 1.78,
"grad_norm": 0.9719714522361755,
"learning_rate": 7.474275780630749e-06,
"loss": 0.5904,
"step": 720
},
{
"epoch": 1.78,
"grad_norm": 1.0092636346817017,
"learning_rate": 7.4484147463286425e-06,
"loss": 0.5794,
"step": 721
},
{
"epoch": 1.79,
"grad_norm": 0.9607662558555603,
"learning_rate": 7.422571952404662e-06,
"loss": 0.5801,
"step": 722
},
{
"epoch": 1.79,
"grad_norm": 0.9463273286819458,
"learning_rate": 7.3967475835997715e-06,
"loss": 0.5916,
"step": 723
},
{
"epoch": 1.79,
"grad_norm": 0.9618302583694458,
"learning_rate": 7.37094182452323e-06,
"loss": 0.6052,
"step": 724
},
{
"epoch": 1.79,
"grad_norm": 1.005319356918335,
"learning_rate": 7.345154859651258e-06,
"loss": 0.6004,
"step": 725
},
{
"epoch": 1.8,
"grad_norm": 1.0791444778442383,
"learning_rate": 7.319386873325718e-06,
"loss": 0.6356,
"step": 726
},
{
"epoch": 1.8,
"grad_norm": 1.1857832670211792,
"learning_rate": 7.293638049752813e-06,
"loss": 0.6772,
"step": 727
},
{
"epoch": 1.8,
"grad_norm": 0.9920181632041931,
"learning_rate": 7.267908573001745e-06,
"loss": 0.5627,
"step": 728
},
{
"epoch": 1.8,
"grad_norm": 1.02803635597229,
"learning_rate": 7.242198627003423e-06,
"loss": 0.622,
"step": 729
},
{
"epoch": 1.81,
"grad_norm": 1.0053447484970093,
"learning_rate": 7.216508395549134e-06,
"loss": 0.6066,
"step": 730
},
{
"epoch": 1.81,
"grad_norm": 1.0622904300689697,
"learning_rate": 7.19083806228923e-06,
"loss": 0.6346,
"step": 731
},
{
"epoch": 1.81,
"grad_norm": 0.9941219091415405,
"learning_rate": 7.165187810731824e-06,
"loss": 0.6347,
"step": 732
},
{
"epoch": 1.81,
"grad_norm": 0.975704550743103,
"learning_rate": 7.13955782424146e-06,
"loss": 0.6394,
"step": 733
},
{
"epoch": 1.82,
"grad_norm": 1.0743510723114014,
"learning_rate": 7.1139482860378325e-06,
"loss": 0.651,
"step": 734
},
{
"epoch": 1.82,
"grad_norm": 0.9468650221824646,
"learning_rate": 7.0883593791944405e-06,
"loss": 0.6473,
"step": 735
},
{
"epoch": 1.82,
"grad_norm": 1.0278937816619873,
"learning_rate": 7.062791286637307e-06,
"loss": 0.677,
"step": 736
},
{
"epoch": 1.82,
"grad_norm": 1.0085780620574951,
"learning_rate": 7.037244191143662e-06,
"loss": 0.5808,
"step": 737
},
{
"epoch": 1.83,
"grad_norm": 0.9366616606712341,
"learning_rate": 7.011718275340626e-06,
"loss": 0.6272,
"step": 738
},
{
"epoch": 1.83,
"grad_norm": 1.1304408311843872,
"learning_rate": 6.986213721703925e-06,
"loss": 0.6539,
"step": 739
},
{
"epoch": 1.83,
"grad_norm": 0.9665270447731018,
"learning_rate": 6.960730712556561e-06,
"loss": 0.6165,
"step": 740
},
{
"epoch": 1.83,
"grad_norm": 1.007178783416748,
"learning_rate": 6.9352694300675345e-06,
"loss": 0.617,
"step": 741
},
{
"epoch": 1.83,
"grad_norm": 0.9570013284683228,
"learning_rate": 6.909830056250527e-06,
"loss": 0.6026,
"step": 742
},
{
"epoch": 1.84,
"grad_norm": 1.0597633123397827,
"learning_rate": 6.884412772962594e-06,
"loss": 0.561,
"step": 743
},
{
"epoch": 1.84,
"grad_norm": 0.9947226047515869,
"learning_rate": 6.859017761902888e-06,
"loss": 0.6183,
"step": 744
},
{
"epoch": 1.84,
"grad_norm": 1.0463498830795288,
"learning_rate": 6.8336452046113276e-06,
"loss": 0.5819,
"step": 745
},
{
"epoch": 1.84,
"grad_norm": 1.047746181488037,
"learning_rate": 6.8082952824673345e-06,
"loss": 0.6718,
"step": 746
},
{
"epoch": 1.85,
"grad_norm": 0.9246806502342224,
"learning_rate": 6.782968176688514e-06,
"loss": 0.5836,
"step": 747
},
{
"epoch": 1.85,
"grad_norm": 0.9811944365501404,
"learning_rate": 6.757664068329353e-06,
"loss": 0.6438,
"step": 748
},
{
"epoch": 1.85,
"grad_norm": 0.9842390418052673,
"learning_rate": 6.732383138279963e-06,
"loss": 0.618,
"step": 749
},
{
"epoch": 1.85,
"grad_norm": 1.0551859140396118,
"learning_rate": 6.7071255672647366e-06,
"loss": 0.6041,
"step": 750
},
{
"epoch": 1.86,
"grad_norm": 1.0227152109146118,
"learning_rate": 6.681891535841094e-06,
"loss": 0.6214,
"step": 751
},
{
"epoch": 1.86,
"grad_norm": 1.0405597686767578,
"learning_rate": 6.656681224398182e-06,
"loss": 0.6428,
"step": 752
},
{
"epoch": 1.86,
"grad_norm": 1.0254696607589722,
"learning_rate": 6.631494813155574e-06,
"loss": 0.626,
"step": 753
},
{
"epoch": 1.86,
"grad_norm": 1.108877182006836,
"learning_rate": 6.606332482161992e-06,
"loss": 0.6285,
"step": 754
},
{
"epoch": 1.87,
"grad_norm": 1.0474169254302979,
"learning_rate": 6.581194411294018e-06,
"loss": 0.6101,
"step": 755
},
{
"epoch": 1.87,
"grad_norm": 1.1109755039215088,
"learning_rate": 6.556080780254805e-06,
"loss": 0.6379,
"step": 756
},
{
"epoch": 1.87,
"grad_norm": 1.0906550884246826,
"learning_rate": 6.530991768572794e-06,
"loss": 0.6692,
"step": 757
},
{
"epoch": 1.87,
"grad_norm": 0.937291145324707,
"learning_rate": 6.505927555600435e-06,
"loss": 0.5968,
"step": 758
},
{
"epoch": 1.88,
"grad_norm": 0.9941329956054688,
"learning_rate": 6.480888320512901e-06,
"loss": 0.6337,
"step": 759
},
{
"epoch": 1.88,
"grad_norm": 0.9972838759422302,
"learning_rate": 6.455874242306795e-06,
"loss": 0.5937,
"step": 760
},
{
"epoch": 1.88,
"grad_norm": 1.045379638671875,
"learning_rate": 6.430885499798903e-06,
"loss": 0.6405,
"step": 761
},
{
"epoch": 1.88,
"grad_norm": 0.9919976592063904,
"learning_rate": 6.405922271624874e-06,
"loss": 0.5793,
"step": 762
},
{
"epoch": 1.89,
"grad_norm": 0.9332768321037292,
"learning_rate": 6.3809847362379765e-06,
"loss": 0.6133,
"step": 763
},
{
"epoch": 1.89,
"grad_norm": 1.1040133237838745,
"learning_rate": 6.356073071907809e-06,
"loss": 0.5769,
"step": 764
},
{
"epoch": 1.89,
"grad_norm": 1.022218108177185,
"learning_rate": 6.331187456719023e-06,
"loss": 0.6322,
"step": 765
},
{
"epoch": 1.89,
"grad_norm": 1.0963839292526245,
"learning_rate": 6.306328068570062e-06,
"loss": 0.6277,
"step": 766
},
{
"epoch": 1.9,
"grad_norm": 1.0245170593261719,
"learning_rate": 6.2814950851718695e-06,
"loss": 0.6039,
"step": 767
},
{
"epoch": 1.9,
"grad_norm": 0.9054310917854309,
"learning_rate": 6.256688684046639e-06,
"loss": 0.5974,
"step": 768
},
{
"epoch": 1.9,
"grad_norm": 1.024070382118225,
"learning_rate": 6.231909042526539e-06,
"loss": 0.6311,
"step": 769
},
{
"epoch": 1.9,
"grad_norm": 1.0201619863510132,
"learning_rate": 6.207156337752435e-06,
"loss": 0.6453,
"step": 770
},
{
"epoch": 1.91,
"grad_norm": 1.1757748126983643,
"learning_rate": 6.1824307466726405e-06,
"loss": 0.5946,
"step": 771
},
{
"epoch": 1.91,
"grad_norm": 0.91058748960495,
"learning_rate": 6.15773244604163e-06,
"loss": 0.652,
"step": 772
},
{
"epoch": 1.91,
"grad_norm": 1.0924842357635498,
"learning_rate": 6.133061612418804e-06,
"loss": 0.6322,
"step": 773
},
{
"epoch": 1.91,
"grad_norm": 0.9574403166770935,
"learning_rate": 6.108418422167199e-06,
"loss": 0.6716,
"step": 774
},
{
"epoch": 1.92,
"grad_norm": 1.0027289390563965,
"learning_rate": 6.08380305145224e-06,
"loss": 0.6145,
"step": 775
},
{
"epoch": 1.92,
"grad_norm": 1.1438909769058228,
"learning_rate": 6.059215676240493e-06,
"loss": 0.6237,
"step": 776
},
{
"epoch": 1.92,
"grad_norm": 1.1727540493011475,
"learning_rate": 6.034656472298374e-06,
"loss": 0.5684,
"step": 777
},
{
"epoch": 1.92,
"grad_norm": 1.1187207698822021,
"learning_rate": 6.0101256151909286e-06,
"loss": 0.594,
"step": 778
},
{
"epoch": 1.93,
"grad_norm": 1.0779290199279785,
"learning_rate": 5.9856232802805505e-06,
"loss": 0.5759,
"step": 779
},
{
"epoch": 1.93,
"grad_norm": 1.0938560962677002,
"learning_rate": 5.961149642725745e-06,
"loss": 0.6111,
"step": 780
},
{
"epoch": 1.93,
"grad_norm": 1.029662847518921,
"learning_rate": 5.936704877479872e-06,
"loss": 0.6049,
"step": 781
},
{
"epoch": 1.93,
"grad_norm": 1.0182905197143555,
"learning_rate": 5.912289159289884e-06,
"loss": 0.6123,
"step": 782
},
{
"epoch": 1.94,
"grad_norm": 0.9786838293075562,
"learning_rate": 5.887902662695093e-06,
"loss": 0.6258,
"step": 783
},
{
"epoch": 1.94,
"grad_norm": 0.9776577353477478,
"learning_rate": 5.863545562025916e-06,
"loss": 0.6418,
"step": 784
},
{
"epoch": 1.94,
"grad_norm": 0.9853367805480957,
"learning_rate": 5.839218031402629e-06,
"loss": 0.6124,
"step": 785
},
{
"epoch": 1.94,
"grad_norm": 0.9788981080055237,
"learning_rate": 5.814920244734124e-06,
"loss": 0.6229,
"step": 786
},
{
"epoch": 1.95,
"grad_norm": 1.10306715965271,
"learning_rate": 5.790652375716653e-06,
"loss": 0.6275,
"step": 787
},
{
"epoch": 1.95,
"grad_norm": 0.9367828369140625,
"learning_rate": 5.7664145978326095e-06,
"loss": 0.5829,
"step": 788
},
{
"epoch": 1.95,
"grad_norm": 1.1416945457458496,
"learning_rate": 5.742207084349274e-06,
"loss": 0.6467,
"step": 789
},
{
"epoch": 1.95,
"grad_norm": 1.0426101684570312,
"learning_rate": 5.718030008317578e-06,
"loss": 0.6057,
"step": 790
},
{
"epoch": 1.96,
"grad_norm": 0.9459037780761719,
"learning_rate": 5.6938835425708575e-06,
"loss": 0.6362,
"step": 791
},
{
"epoch": 1.96,
"grad_norm": 1.133233666419983,
"learning_rate": 5.669767859723636e-06,
"loss": 0.6561,
"step": 792
},
{
"epoch": 1.96,
"grad_norm": 1.196579098701477,
"learning_rate": 5.645683132170384e-06,
"loss": 0.6231,
"step": 793
},
{
"epoch": 1.96,
"grad_norm": 1.10919988155365,
"learning_rate": 5.621629532084265e-06,
"loss": 0.6073,
"step": 794
},
{
"epoch": 1.97,
"grad_norm": 0.9457207918167114,
"learning_rate": 5.597607231415952e-06,
"loss": 0.5828,
"step": 795
},
{
"epoch": 1.97,
"grad_norm": 1.0295524597167969,
"learning_rate": 5.57361640189234e-06,
"loss": 0.6061,
"step": 796
},
{
"epoch": 1.97,
"grad_norm": 0.9679163694381714,
"learning_rate": 5.549657215015367e-06,
"loss": 0.6393,
"step": 797
},
{
"epoch": 1.97,
"grad_norm": 1.0385793447494507,
"learning_rate": 5.525729842060768e-06,
"loss": 0.6045,
"step": 798
},
{
"epoch": 1.98,
"grad_norm": 1.0008641481399536,
"learning_rate": 5.501834454076838e-06,
"loss": 0.6174,
"step": 799
},
{
"epoch": 1.98,
"grad_norm": 0.9990904331207275,
"learning_rate": 5.4779712218832356e-06,
"loss": 0.6405,
"step": 800
},
{
"epoch": 1.98,
"grad_norm": 1.0095916986465454,
"learning_rate": 5.454140316069747e-06,
"loss": 0.5964,
"step": 801
},
{
"epoch": 1.98,
"grad_norm": 1.0264003276824951,
"learning_rate": 5.430341906995064e-06,
"loss": 0.5937,
"step": 802
},
{
"epoch": 1.99,
"grad_norm": 0.9655827879905701,
"learning_rate": 5.406576164785582e-06,
"loss": 0.6226,
"step": 803
},
{
"epoch": 1.99,
"grad_norm": 0.9920186996459961,
"learning_rate": 5.382843259334152e-06,
"loss": 0.6189,
"step": 804
},
{
"epoch": 1.99,
"grad_norm": 0.9311228394508362,
"learning_rate": 5.3591433602989076e-06,
"loss": 0.5913,
"step": 805
},
{
"epoch": 1.99,
"grad_norm": 0.9763147234916687,
"learning_rate": 5.3354766371020106e-06,
"loss": 0.5837,
"step": 806
},
{
"epoch": 2.0,
"grad_norm": 1.0070594549179077,
"learning_rate": 5.311843258928489e-06,
"loss": 0.579,
"step": 807
},
{
"epoch": 2.0,
"grad_norm": 1.0727875232696533,
"learning_rate": 5.288243394724971e-06,
"loss": 0.6467,
"step": 808
},
{
"epoch": 2.0,
"grad_norm": 0.971811056137085,
"learning_rate": 5.264677213198519e-06,
"loss": 0.5764,
"step": 809
},
{
"epoch": 2.0,
"grad_norm": 1.2393842935562134,
"learning_rate": 5.241144882815413e-06,
"loss": 0.3858,
"step": 810
},
{
"epoch": 2.01,
"grad_norm": 1.2670291662216187,
"learning_rate": 5.217646571799929e-06,
"loss": 0.3888,
"step": 811
},
{
"epoch": 2.01,
"grad_norm": 1.1530663967132568,
"learning_rate": 5.194182448133163e-06,
"loss": 0.4032,
"step": 812
},
{
"epoch": 2.01,
"grad_norm": 1.1262778043746948,
"learning_rate": 5.170752679551816e-06,
"loss": 0.3776,
"step": 813
},
{
"epoch": 2.01,
"grad_norm": 1.1338114738464355,
"learning_rate": 5.147357433546992e-06,
"loss": 0.3838,
"step": 814
},
{
"epoch": 2.02,
"grad_norm": 1.0503406524658203,
"learning_rate": 5.123996877363015e-06,
"loss": 0.4049,
"step": 815
},
{
"epoch": 2.02,
"grad_norm": 1.0421979427337646,
"learning_rate": 5.100671177996206e-06,
"loss": 0.378,
"step": 816
},
{
"epoch": 2.02,
"grad_norm": 0.9778011441230774,
"learning_rate": 5.077380502193725e-06,
"loss": 0.3765,
"step": 817
},
{
"epoch": 2.02,
"grad_norm": 1.1578431129455566,
"learning_rate": 5.054125016452352e-06,
"loss": 0.4303,
"step": 818
},
{
"epoch": 2.03,
"grad_norm": 1.2118648290634155,
"learning_rate": 5.0309048870173074e-06,
"loss": 0.3904,
"step": 819
},
{
"epoch": 2.03,
"grad_norm": 1.1190721988677979,
"learning_rate": 5.0077202798810675e-06,
"loss": 0.3864,
"step": 820
},
{
"epoch": 2.03,
"grad_norm": 1.2787443399429321,
"learning_rate": 4.984571360782158e-06,
"loss": 0.384,
"step": 821
},
{
"epoch": 2.03,
"grad_norm": 1.375259518623352,
"learning_rate": 4.961458295203999e-06,
"loss": 0.4013,
"step": 822
},
{
"epoch": 2.04,
"grad_norm": 1.1658669710159302,
"learning_rate": 4.938381248373695e-06,
"loss": 0.4077,
"step": 823
},
{
"epoch": 2.04,
"grad_norm": 1.029345989227295,
"learning_rate": 4.915340385260871e-06,
"loss": 0.3275,
"step": 824
},
{
"epoch": 2.04,
"grad_norm": 1.1690287590026855,
"learning_rate": 4.8923358705764885e-06,
"loss": 0.3395,
"step": 825
},
{
"epoch": 2.04,
"grad_norm": 1.077650547027588,
"learning_rate": 4.869367868771666e-06,
"loss": 0.4032,
"step": 826
},
{
"epoch": 2.05,
"grad_norm": 1.276775598526001,
"learning_rate": 4.846436544036505e-06,
"loss": 0.4053,
"step": 827
},
{
"epoch": 2.05,
"grad_norm": 1.1765282154083252,
"learning_rate": 4.823542060298905e-06,
"loss": 0.3554,
"step": 828
},
{
"epoch": 2.05,
"grad_norm": 1.1307880878448486,
"learning_rate": 4.80068458122342e-06,
"loss": 0.3487,
"step": 829
},
{
"epoch": 2.05,
"grad_norm": 1.0298750400543213,
"learning_rate": 4.777864270210057e-06,
"loss": 0.3857,
"step": 830
},
{
"epoch": 2.06,
"grad_norm": 0.9388719797134399,
"learning_rate": 4.75508129039313e-06,
"loss": 0.3694,
"step": 831
},
{
"epoch": 2.06,
"grad_norm": 0.9721542000770569,
"learning_rate": 4.7323358046400844e-06,
"loss": 0.3944,
"step": 832
},
{
"epoch": 2.06,
"grad_norm": 0.9660404920578003,
"learning_rate": 4.709627975550326e-06,
"loss": 0.3604,
"step": 833
},
{
"epoch": 2.06,
"grad_norm": 1.0129330158233643,
"learning_rate": 4.686957965454078e-06,
"loss": 0.3131,
"step": 834
},
{
"epoch": 2.06,
"grad_norm": 1.3064758777618408,
"learning_rate": 4.664325936411197e-06,
"loss": 0.4035,
"step": 835
},
{
"epoch": 2.07,
"grad_norm": 1.203345775604248,
"learning_rate": 4.641732050210032e-06,
"loss": 0.3636,
"step": 836
},
{
"epoch": 2.07,
"grad_norm": 1.0145697593688965,
"learning_rate": 4.619176468366274e-06,
"loss": 0.3997,
"step": 837
},
{
"epoch": 2.07,
"grad_norm": 1.1014325618743896,
"learning_rate": 4.596659352121768e-06,
"loss": 0.4159,
"step": 838
},
{
"epoch": 2.07,
"grad_norm": 1.0002981424331665,
"learning_rate": 4.574180862443402e-06,
"loss": 0.3767,
"step": 839
},
{
"epoch": 2.08,
"grad_norm": 1.0357248783111572,
"learning_rate": 4.551741160021916e-06,
"loss": 0.3172,
"step": 840
},
{
"epoch": 2.08,
"grad_norm": 1.2248551845550537,
"learning_rate": 4.529340405270792e-06,
"loss": 0.3774,
"step": 841
},
{
"epoch": 2.08,
"grad_norm": 1.0676765441894531,
"learning_rate": 4.5069787583250815e-06,
"loss": 0.3796,
"step": 842
},
{
"epoch": 2.08,
"grad_norm": 1.013757586479187,
"learning_rate": 4.484656379040268e-06,
"loss": 0.3868,
"step": 843
},
{
"epoch": 2.09,
"grad_norm": 0.9967157244682312,
"learning_rate": 4.4623734269911274e-06,
"loss": 0.3727,
"step": 844
},
{
"epoch": 2.09,
"grad_norm": 0.8882321715354919,
"learning_rate": 4.4401300614705765e-06,
"loss": 0.3648,
"step": 845
},
{
"epoch": 2.09,
"grad_norm": 1.1337214708328247,
"learning_rate": 4.417926441488553e-06,
"loss": 0.364,
"step": 846
},
{
"epoch": 2.09,
"grad_norm": 1.1071412563323975,
"learning_rate": 4.395762725770852e-06,
"loss": 0.3898,
"step": 847
},
{
"epoch": 2.1,
"grad_norm": 1.106566071510315,
"learning_rate": 4.3736390727580295e-06,
"loss": 0.3404,
"step": 848
},
{
"epoch": 2.1,
"grad_norm": 1.0419895648956299,
"learning_rate": 4.351555640604233e-06,
"loss": 0.399,
"step": 849
},
{
"epoch": 2.1,
"grad_norm": 1.0877364873886108,
"learning_rate": 4.329512587176081e-06,
"loss": 0.3499,
"step": 850
},
{
"epoch": 2.1,
"grad_norm": 1.0401561260223389,
"learning_rate": 4.307510070051554e-06,
"loss": 0.3831,
"step": 851
},
{
"epoch": 2.11,
"grad_norm": 1.0114710330963135,
"learning_rate": 4.285548246518837e-06,
"loss": 0.3743,
"step": 852
},
{
"epoch": 2.11,
"grad_norm": 0.92411208152771,
"learning_rate": 4.2636272735752195e-06,
"loss": 0.3369,
"step": 853
},
{
"epoch": 2.11,
"grad_norm": 1.210862159729004,
"learning_rate": 4.241747307925966e-06,
"loss": 0.3815,
"step": 854
},
{
"epoch": 2.11,
"grad_norm": 1.109056830406189,
"learning_rate": 4.21990850598319e-06,
"loss": 0.3597,
"step": 855
},
{
"epoch": 2.12,
"grad_norm": 1.0660123825073242,
"learning_rate": 4.198111023864747e-06,
"loss": 0.377,
"step": 856
},
{
"epoch": 2.12,
"grad_norm": 1.0329532623291016,
"learning_rate": 4.176355017393099e-06,
"loss": 0.3282,
"step": 857
},
{
"epoch": 2.12,
"grad_norm": 0.9710352420806885,
"learning_rate": 4.154640642094223e-06,
"loss": 0.3676,
"step": 858
},
{
"epoch": 2.12,
"grad_norm": 1.0102200508117676,
"learning_rate": 4.1329680531964914e-06,
"loss": 0.4018,
"step": 859
},
{
"epoch": 2.13,
"grad_norm": 0.9945006370544434,
"learning_rate": 4.111337405629553e-06,
"loss": 0.3984,
"step": 860
},
{
"epoch": 2.13,
"grad_norm": 0.9301691055297852,
"learning_rate": 4.089748854023241e-06,
"loss": 0.361,
"step": 861
},
{
"epoch": 2.13,
"grad_norm": 1.0488731861114502,
"learning_rate": 4.0682025527064486e-06,
"loss": 0.3614,
"step": 862
},
{
"epoch": 2.13,
"grad_norm": 0.9186500906944275,
"learning_rate": 4.04669865570605e-06,
"loss": 0.3857,
"step": 863
},
{
"epoch": 2.14,
"grad_norm": 1.0314873456954956,
"learning_rate": 4.025237316745771e-06,
"loss": 0.3722,
"step": 864
},
{
"epoch": 2.14,
"grad_norm": 1.003535509109497,
"learning_rate": 4.003818689245118e-06,
"loss": 0.3675,
"step": 865
},
{
"epoch": 2.14,
"grad_norm": 1.1849119663238525,
"learning_rate": 3.982442926318263e-06,
"loss": 0.3972,
"step": 866
},
{
"epoch": 2.14,
"grad_norm": 1.0405722856521606,
"learning_rate": 3.961110180772955e-06,
"loss": 0.3863,
"step": 867
},
{
"epoch": 2.15,
"grad_norm": 0.9948720335960388,
"learning_rate": 3.939820605109429e-06,
"loss": 0.385,
"step": 868
},
{
"epoch": 2.15,
"grad_norm": 1.059756875038147,
"learning_rate": 3.9185743515193065e-06,
"loss": 0.3785,
"step": 869
},
{
"epoch": 2.15,
"grad_norm": 1.1081238985061646,
"learning_rate": 3.897371571884521e-06,
"loss": 0.3502,
"step": 870
},
{
"epoch": 2.15,
"grad_norm": 0.9353616833686829,
"learning_rate": 3.8762124177762285e-06,
"loss": 0.3723,
"step": 871
},
{
"epoch": 2.16,
"grad_norm": 0.9215275049209595,
"learning_rate": 3.855097040453715e-06,
"loss": 0.4104,
"step": 872
},
{
"epoch": 2.16,
"grad_norm": 1.007318139076233,
"learning_rate": 3.83402559086333e-06,
"loss": 0.3417,
"step": 873
},
{
"epoch": 2.16,
"grad_norm": 0.9856567978858948,
"learning_rate": 3.812998219637387e-06,
"loss": 0.3887,
"step": 874
},
{
"epoch": 2.16,
"grad_norm": 0.9427929520606995,
"learning_rate": 3.7920150770931095e-06,
"loss": 0.3523,
"step": 875
},
{
"epoch": 2.17,
"grad_norm": 1.0284315347671509,
"learning_rate": 3.7710763132315455e-06,
"loss": 0.3622,
"step": 876
},
{
"epoch": 2.17,
"grad_norm": 0.9978756308555603,
"learning_rate": 3.750182077736486e-06,
"loss": 0.4096,
"step": 877
},
{
"epoch": 2.17,
"grad_norm": 1.1080024242401123,
"learning_rate": 3.7293325199734144e-06,
"loss": 0.3986,
"step": 878
},
{
"epoch": 2.17,
"grad_norm": 1.1263668537139893,
"learning_rate": 3.7085277889884253e-06,
"loss": 0.4085,
"step": 879
},
{
"epoch": 2.18,
"grad_norm": 0.8740857243537903,
"learning_rate": 3.6877680335071653e-06,
"loss": 0.3564,
"step": 880
},
{
"epoch": 2.18,
"grad_norm": 0.9901694059371948,
"learning_rate": 3.667053401933759e-06,
"loss": 0.3689,
"step": 881
},
{
"epoch": 2.18,
"grad_norm": 1.0043367147445679,
"learning_rate": 3.6463840423497643e-06,
"loss": 0.3585,
"step": 882
},
{
"epoch": 2.18,
"grad_norm": 1.034173846244812,
"learning_rate": 3.625760102513103e-06,
"loss": 0.3353,
"step": 883
},
{
"epoch": 2.19,
"grad_norm": 1.0721750259399414,
"learning_rate": 3.6051817298570067e-06,
"loss": 0.3798,
"step": 884
},
{
"epoch": 2.19,
"grad_norm": 0.9780704379081726,
"learning_rate": 3.5846490714889694e-06,
"loss": 0.3393,
"step": 885
},
{
"epoch": 2.19,
"grad_norm": 1.0679874420166016,
"learning_rate": 3.5641622741896742e-06,
"loss": 0.3506,
"step": 886
},
{
"epoch": 2.19,
"grad_norm": 1.1510306596755981,
"learning_rate": 3.543721484411976e-06,
"loss": 0.3939,
"step": 887
},
{
"epoch": 2.2,
"grad_norm": 1.0783063173294067,
"learning_rate": 3.5233268482798353e-06,
"loss": 0.3808,
"step": 888
},
{
"epoch": 2.2,
"grad_norm": 0.9867876768112183,
"learning_rate": 3.5029785115872617e-06,
"loss": 0.3586,
"step": 889
},
{
"epoch": 2.2,
"grad_norm": 1.2214995622634888,
"learning_rate": 3.4826766197973127e-06,
"loss": 0.3611,
"step": 890
},
{
"epoch": 2.2,
"grad_norm": 1.017261028289795,
"learning_rate": 3.462421318041003e-06,
"loss": 0.41,
"step": 891
},
{
"epoch": 2.21,
"grad_norm": 0.9691404700279236,
"learning_rate": 3.442212751116305e-06,
"loss": 0.3767,
"step": 892
},
{
"epoch": 2.21,
"grad_norm": 0.9148778915405273,
"learning_rate": 3.4220510634871005e-06,
"loss": 0.3441,
"step": 893
},
{
"epoch": 2.21,
"grad_norm": 1.102925419807434,
"learning_rate": 3.4019363992821386e-06,
"loss": 0.3347,
"step": 894
},
{
"epoch": 2.21,
"grad_norm": 1.0539733171463013,
"learning_rate": 3.381868902294023e-06,
"loss": 0.3705,
"step": 895
},
{
"epoch": 2.22,
"grad_norm": 1.1010009050369263,
"learning_rate": 3.361848715978173e-06,
"loss": 0.3813,
"step": 896
},
{
"epoch": 2.22,
"grad_norm": 0.9682654738426208,
"learning_rate": 3.3418759834518056e-06,
"loss": 0.3709,
"step": 897
},
{
"epoch": 2.22,
"grad_norm": 1.3582557439804077,
"learning_rate": 3.321950847492895e-06,
"loss": 0.344,
"step": 898
},
{
"epoch": 2.22,
"grad_norm": 0.981054425239563,
"learning_rate": 3.302073450539176e-06,
"loss": 0.3971,
"step": 899
},
{
"epoch": 2.23,
"grad_norm": 0.9968892931938171,
"learning_rate": 3.2822439346871127e-06,
"loss": 0.3853,
"step": 900
},
{
"epoch": 2.23,
"grad_norm": 1.0093852281570435,
"learning_rate": 3.2624624416908745e-06,
"loss": 0.3476,
"step": 901
},
{
"epoch": 2.23,
"grad_norm": 0.8982293605804443,
"learning_rate": 3.2427291129613502e-06,
"loss": 0.3239,
"step": 902
},
{
"epoch": 2.23,
"grad_norm": 1.1306248903274536,
"learning_rate": 3.2230440895651006e-06,
"loss": 0.3988,
"step": 903
},
{
"epoch": 2.24,
"grad_norm": 1.0701810121536255,
"learning_rate": 3.2034075122233798e-06,
"loss": 0.3628,
"step": 904
},
{
"epoch": 2.24,
"grad_norm": 1.1889069080352783,
"learning_rate": 3.18381952131112e-06,
"loss": 0.3936,
"step": 905
},
{
"epoch": 2.24,
"grad_norm": 0.8822778463363647,
"learning_rate": 3.164280256855914e-06,
"loss": 0.3456,
"step": 906
},
{
"epoch": 2.24,
"grad_norm": 0.9765056371688843,
"learning_rate": 3.1447898585370386e-06,
"loss": 0.3213,
"step": 907
},
{
"epoch": 2.25,
"grad_norm": 1.1653448343276978,
"learning_rate": 3.125348465684439e-06,
"loss": 0.3927,
"step": 908
},
{
"epoch": 2.25,
"grad_norm": 1.027599811553955,
"learning_rate": 3.105956217277738e-06,
"loss": 0.346,
"step": 909
},
{
"epoch": 2.25,
"grad_norm": 1.0844277143478394,
"learning_rate": 3.086613251945246e-06,
"loss": 0.4082,
"step": 910
},
{
"epoch": 2.25,
"grad_norm": 1.0551422834396362,
"learning_rate": 3.067319707962957e-06,
"loss": 0.3862,
"step": 911
},
{
"epoch": 2.26,
"grad_norm": 1.1803510189056396,
"learning_rate": 3.0480757232535773e-06,
"loss": 0.3932,
"step": 912
},
{
"epoch": 2.26,
"grad_norm": 0.9642703533172607,
"learning_rate": 3.02888143538553e-06,
"loss": 0.3842,
"step": 913
},
{
"epoch": 2.26,
"grad_norm": 0.9816460609436035,
"learning_rate": 3.0097369815719746e-06,
"loss": 0.3978,
"step": 914
},
{
"epoch": 2.26,
"grad_norm": 1.063370943069458,
"learning_rate": 2.990642498669816e-06,
"loss": 0.3512,
"step": 915
},
{
"epoch": 2.27,
"grad_norm": 1.0633240938186646,
"learning_rate": 2.971598123178744e-06,
"loss": 0.3679,
"step": 916
},
{
"epoch": 2.27,
"grad_norm": 0.9631143808364868,
"learning_rate": 2.9526039912402504e-06,
"loss": 0.4003,
"step": 917
},
{
"epoch": 2.27,
"grad_norm": 1.0357810258865356,
"learning_rate": 2.9336602386366396e-06,
"loss": 0.3878,
"step": 918
},
{
"epoch": 2.27,
"grad_norm": 0.965857982635498,
"learning_rate": 2.9147670007900875e-06,
"loss": 0.366,
"step": 919
},
{
"epoch": 2.28,
"grad_norm": 0.9633121490478516,
"learning_rate": 2.8959244127616483e-06,
"loss": 0.3191,
"step": 920
},
{
"epoch": 2.28,
"grad_norm": 1.093406319618225,
"learning_rate": 2.877132609250303e-06,
"loss": 0.3572,
"step": 921
},
{
"epoch": 2.28,
"grad_norm": 1.123749017715454,
"learning_rate": 2.8583917245919944e-06,
"loss": 0.3693,
"step": 922
},
{
"epoch": 2.28,
"grad_norm": 1.0981041193008423,
"learning_rate": 2.839701892758655e-06,
"loss": 0.3731,
"step": 923
},
{
"epoch": 2.29,
"grad_norm": 0.9987170100212097,
"learning_rate": 2.8210632473572664e-06,
"loss": 0.3364,
"step": 924
},
{
"epoch": 2.29,
"grad_norm": 0.9356394410133362,
"learning_rate": 2.8024759216288953e-06,
"loss": 0.3768,
"step": 925
},
{
"epoch": 2.29,
"grad_norm": 1.053085207939148,
"learning_rate": 2.783940048447743e-06,
"loss": 0.3797,
"step": 926
},
{
"epoch": 2.29,
"grad_norm": 1.0330283641815186,
"learning_rate": 2.765455760320196e-06,
"loss": 0.3577,
"step": 927
},
{
"epoch": 2.29,
"grad_norm": 1.184908390045166,
"learning_rate": 2.7470231893838684e-06,
"loss": 0.4114,
"step": 928
},
{
"epoch": 2.3,
"grad_norm": 0.9896612167358398,
"learning_rate": 2.728642467406679e-06,
"loss": 0.3806,
"step": 929
},
{
"epoch": 2.3,
"grad_norm": 0.9521954655647278,
"learning_rate": 2.7103137257858867e-06,
"loss": 0.3781,
"step": 930
},
{
"epoch": 2.3,
"grad_norm": 1.0808932781219482,
"learning_rate": 2.692037095547164e-06,
"loss": 0.375,
"step": 931
},
{
"epoch": 2.3,
"grad_norm": 1.0219560861587524,
"learning_rate": 2.6738127073436694e-06,
"loss": 0.3789,
"step": 932
},
{
"epoch": 2.31,
"grad_norm": 1.1612478494644165,
"learning_rate": 2.6556406914550803e-06,
"loss": 0.3625,
"step": 933
},
{
"epoch": 2.31,
"grad_norm": 0.9443746209144592,
"learning_rate": 2.6375211777867015e-06,
"loss": 0.3291,
"step": 934
},
{
"epoch": 2.31,
"grad_norm": 0.932725727558136,
"learning_rate": 2.6194542958685052e-06,
"loss": 0.3633,
"step": 935
},
{
"epoch": 2.31,
"grad_norm": 1.0153712034225464,
"learning_rate": 2.601440174854225e-06,
"loss": 0.3542,
"step": 936
},
{
"epoch": 2.32,
"grad_norm": 0.9318667650222778,
"learning_rate": 2.5834789435204245e-06,
"loss": 0.3358,
"step": 937
},
{
"epoch": 2.32,
"grad_norm": 1.0376856327056885,
"learning_rate": 2.5655707302655766e-06,
"loss": 0.3631,
"step": 938
},
{
"epoch": 2.32,
"grad_norm": 1.0927082300186157,
"learning_rate": 2.5477156631091503e-06,
"loss": 0.3779,
"step": 939
},
{
"epoch": 2.32,
"grad_norm": 1.1744165420532227,
"learning_rate": 2.5299138696906833e-06,
"loss": 0.3473,
"step": 940
},
{
"epoch": 2.33,
"grad_norm": 0.9817997813224792,
"learning_rate": 2.512165477268889e-06,
"loss": 0.3636,
"step": 941
},
{
"epoch": 2.33,
"grad_norm": 1.0878486633300781,
"learning_rate": 2.4944706127207252e-06,
"loss": 0.4015,
"step": 942
},
{
"epoch": 2.33,
"grad_norm": 1.0829596519470215,
"learning_rate": 2.476829402540504e-06,
"loss": 0.3734,
"step": 943
},
{
"epoch": 2.33,
"grad_norm": 1.122971773147583,
"learning_rate": 2.459241972838988e-06,
"loss": 0.3629,
"step": 944
},
{
"epoch": 2.34,
"grad_norm": 1.0560338497161865,
"learning_rate": 2.4417084493424693e-06,
"loss": 0.3897,
"step": 945
},
{
"epoch": 2.34,
"grad_norm": 1.1771371364593506,
"learning_rate": 2.4242289573918933e-06,
"loss": 0.3897,
"step": 946
},
{
"epoch": 2.34,
"grad_norm": 0.9809534549713135,
"learning_rate": 2.4068036219419433e-06,
"loss": 0.3768,
"step": 947
},
{
"epoch": 2.34,
"grad_norm": 0.9981406331062317,
"learning_rate": 2.3894325675601683e-06,
"loss": 0.3449,
"step": 948
},
{
"epoch": 2.35,
"grad_norm": 0.9740926623344421,
"learning_rate": 2.3721159184260733e-06,
"loss": 0.3633,
"step": 949
},
{
"epoch": 2.35,
"grad_norm": 0.9102757573127747,
"learning_rate": 2.354853798330242e-06,
"loss": 0.3533,
"step": 950
},
{
"epoch": 2.35,
"grad_norm": 1.2017672061920166,
"learning_rate": 2.3376463306734543e-06,
"loss": 0.4059,
"step": 951
},
{
"epoch": 2.35,
"grad_norm": 1.1060692071914673,
"learning_rate": 2.3204936384657873e-06,
"loss": 0.3661,
"step": 952
},
{
"epoch": 2.36,
"grad_norm": 1.068207025527954,
"learning_rate": 2.303395844325761e-06,
"loss": 0.3814,
"step": 953
},
{
"epoch": 2.36,
"grad_norm": 0.9582358598709106,
"learning_rate": 2.2863530704794334e-06,
"loss": 0.3912,
"step": 954
},
{
"epoch": 2.36,
"grad_norm": 0.9677960872650146,
"learning_rate": 2.26936543875956e-06,
"loss": 0.3706,
"step": 955
},
{
"epoch": 2.36,
"grad_norm": 0.9783523678779602,
"learning_rate": 2.252433070604695e-06,
"loss": 0.366,
"step": 956
},
{
"epoch": 2.37,
"grad_norm": 1.0802927017211914,
"learning_rate": 2.2355560870583283e-06,
"loss": 0.3453,
"step": 957
},
{
"epoch": 2.37,
"grad_norm": 1.0811406373977661,
"learning_rate": 2.2187346087680363e-06,
"loss": 0.3426,
"step": 958
},
{
"epoch": 2.37,
"grad_norm": 0.9628420472145081,
"learning_rate": 2.201968755984596e-06,
"loss": 0.3757,
"step": 959
},
{
"epoch": 2.37,
"grad_norm": 1.030654788017273,
"learning_rate": 2.185258648561147e-06,
"loss": 0.3455,
"step": 960
},
{
"epoch": 2.38,
"grad_norm": 1.0548131465911865,
"learning_rate": 2.1686044059523192e-06,
"loss": 0.3432,
"step": 961
},
{
"epoch": 2.38,
"grad_norm": 1.0581459999084473,
"learning_rate": 2.1520061472133903e-06,
"loss": 0.4014,
"step": 962
},
{
"epoch": 2.38,
"grad_norm": 1.1126346588134766,
"learning_rate": 2.1354639909994258e-06,
"loss": 0.3412,
"step": 963
},
{
"epoch": 2.38,
"grad_norm": 1.108038067817688,
"learning_rate": 2.1189780555644302e-06,
"loss": 0.3699,
"step": 964
},
{
"epoch": 2.39,
"grad_norm": 1.047153353691101,
"learning_rate": 2.1025484587605115e-06,
"loss": 0.3794,
"step": 965
},
{
"epoch": 2.39,
"grad_norm": 1.0419840812683105,
"learning_rate": 2.0861753180370324e-06,
"loss": 0.4154,
"step": 966
},
{
"epoch": 2.39,
"grad_norm": 1.0939159393310547,
"learning_rate": 2.0698587504397684e-06,
"loss": 0.3856,
"step": 967
},
{
"epoch": 2.39,
"grad_norm": 1.1994597911834717,
"learning_rate": 2.0535988726100774e-06,
"loss": 0.4137,
"step": 968
},
{
"epoch": 2.4,
"grad_norm": 1.089043140411377,
"learning_rate": 2.0373958007840545e-06,
"loss": 0.4091,
"step": 969
},
{
"epoch": 2.4,
"grad_norm": 1.0016350746154785,
"learning_rate": 2.0212496507917214e-06,
"loss": 0.3792,
"step": 970
},
{
"epoch": 2.4,
"grad_norm": 1.0850452184677124,
"learning_rate": 2.0051605380561702e-06,
"loss": 0.3811,
"step": 971
},
{
"epoch": 2.4,
"grad_norm": 1.1471667289733887,
"learning_rate": 1.9891285775927684e-06,
"loss": 0.3476,
"step": 972
},
{
"epoch": 2.41,
"grad_norm": 0.8458806872367859,
"learning_rate": 1.973153884008312e-06,
"loss": 0.3635,
"step": 973
},
{
"epoch": 2.41,
"grad_norm": 1.008909821510315,
"learning_rate": 1.957236571500224e-06,
"loss": 0.3514,
"step": 974
},
{
"epoch": 2.41,
"grad_norm": 1.0104005336761475,
"learning_rate": 1.941376753855728e-06,
"loss": 0.3863,
"step": 975
},
{
"epoch": 2.41,
"grad_norm": 1.1356624364852905,
"learning_rate": 1.925574544451031e-06,
"loss": 0.4084,
"step": 976
},
{
"epoch": 2.42,
"grad_norm": 0.9510722756385803,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.3683,
"step": 977
},
{
"epoch": 2.42,
"grad_norm": 1.0902206897735596,
"learning_rate": 1.8941434018059779e-06,
"loss": 0.3351,
"step": 978
},
{
"epoch": 2.42,
"grad_norm": 1.1588971614837646,
"learning_rate": 1.878514693255714e-06,
"loss": 0.3436,
"step": 979
},
{
"epoch": 2.42,
"grad_norm": 1.1468517780303955,
"learning_rate": 1.8629440423238333e-06,
"loss": 0.3305,
"step": 980
},
{
"epoch": 2.43,
"grad_norm": 1.119321346282959,
"learning_rate": 1.8474315603193916e-06,
"loss": 0.4021,
"step": 981
},
{
"epoch": 2.43,
"grad_norm": 1.0432164669036865,
"learning_rate": 1.8319773581356248e-06,
"loss": 0.3419,
"step": 982
},
{
"epoch": 2.43,
"grad_norm": 1.1679702997207642,
"learning_rate": 1.8165815462491466e-06,
"loss": 0.3892,
"step": 983
},
{
"epoch": 2.43,
"grad_norm": 0.934533953666687,
"learning_rate": 1.8012442347191483e-06,
"loss": 0.3555,
"step": 984
},
{
"epoch": 2.44,
"grad_norm": 1.0839966535568237,
"learning_rate": 1.7859655331866422e-06,
"loss": 0.3948,
"step": 985
},
{
"epoch": 2.44,
"grad_norm": 1.1098887920379639,
"learning_rate": 1.7707455508736381e-06,
"loss": 0.3793,
"step": 986
},
{
"epoch": 2.44,
"grad_norm": 0.9264780879020691,
"learning_rate": 1.7555843965823992e-06,
"loss": 0.3552,
"step": 987
},
{
"epoch": 2.44,
"grad_norm": 1.1234818696975708,
"learning_rate": 1.7404821786946346e-06,
"loss": 0.4372,
"step": 988
},
{
"epoch": 2.45,
"grad_norm": 1.0034370422363281,
"learning_rate": 1.725439005170747e-06,
"loss": 0.3827,
"step": 989
},
{
"epoch": 2.45,
"grad_norm": 1.1202595233917236,
"learning_rate": 1.7104549835490491e-06,
"loss": 0.3891,
"step": 990
},
{
"epoch": 2.45,
"grad_norm": 1.0200791358947754,
"learning_rate": 1.6955302209449987e-06,
"loss": 0.4046,
"step": 991
},
{
"epoch": 2.45,
"grad_norm": 1.0529330968856812,
"learning_rate": 1.680664824050432e-06,
"loss": 0.4219,
"step": 992
},
{
"epoch": 2.46,
"grad_norm": 1.1495938301086426,
"learning_rate": 1.6658588991327962e-06,
"loss": 0.3416,
"step": 993
},
{
"epoch": 2.46,
"grad_norm": 1.0006402730941772,
"learning_rate": 1.6511125520344007e-06,
"loss": 0.3688,
"step": 994
},
{
"epoch": 2.46,
"grad_norm": 0.9660508632659912,
"learning_rate": 1.636425888171652e-06,
"loss": 0.386,
"step": 995
},
{
"epoch": 2.46,
"grad_norm": 1.0930101871490479,
"learning_rate": 1.6217990125342964e-06,
"loss": 0.3759,
"step": 996
},
{
"epoch": 2.47,
"grad_norm": 0.9058865308761597,
"learning_rate": 1.6072320296846898e-06,
"loss": 0.404,
"step": 997
},
{
"epoch": 2.47,
"grad_norm": 1.0616929531097412,
"learning_rate": 1.5927250437570197e-06,
"loss": 0.3596,
"step": 998
},
{
"epoch": 2.47,
"grad_norm": 1.0566691160202026,
"learning_rate": 1.5782781584565854e-06,
"loss": 0.3778,
"step": 999
},
{
"epoch": 2.47,
"grad_norm": 1.012123703956604,
"learning_rate": 1.5638914770590508e-06,
"loss": 0.3415,
"step": 1000
},
{
"epoch": 2.48,
"grad_norm": 1.0266258716583252,
"learning_rate": 1.5495651024096925e-06,
"loss": 0.4095,
"step": 1001
},
{
"epoch": 2.48,
"grad_norm": 0.9796007871627808,
"learning_rate": 1.5352991369226865e-06,
"loss": 0.4002,
"step": 1002
},
{
"epoch": 2.48,
"grad_norm": 1.0556604862213135,
"learning_rate": 1.5210936825803602e-06,
"loss": 0.3394,
"step": 1003
},
{
"epoch": 2.48,
"grad_norm": 1.0829135179519653,
"learning_rate": 1.5069488409324696e-06,
"loss": 0.3859,
"step": 1004
},
{
"epoch": 2.49,
"grad_norm": 0.9986345171928406,
"learning_rate": 1.4928647130954743e-06,
"loss": 0.3893,
"step": 1005
},
{
"epoch": 2.49,
"grad_norm": 0.9846001863479614,
"learning_rate": 1.4788413997518026e-06,
"loss": 0.3415,
"step": 1006
},
{
"epoch": 2.49,
"grad_norm": 0.998688280582428,
"learning_rate": 1.4648790011491544e-06,
"loss": 0.3699,
"step": 1007
},
{
"epoch": 2.49,
"grad_norm": 1.0853443145751953,
"learning_rate": 1.4509776170997625e-06,
"loss": 0.3483,
"step": 1008
},
{
"epoch": 2.5,
"grad_norm": 1.1299796104431152,
"learning_rate": 1.4371373469796956e-06,
"loss": 0.383,
"step": 1009
},
{
"epoch": 2.5,
"grad_norm": 1.091850757598877,
"learning_rate": 1.4233582897281328e-06,
"loss": 0.3405,
"step": 1010
},
{
"epoch": 2.5,
"grad_norm": 1.159667730331421,
"learning_rate": 1.4096405438466687e-06,
"loss": 0.3682,
"step": 1011
},
{
"epoch": 2.5,
"grad_norm": 1.0162327289581299,
"learning_rate": 1.3959842073986085e-06,
"loss": 0.3824,
"step": 1012
},
{
"epoch": 2.51,
"grad_norm": 1.0610222816467285,
"learning_rate": 1.3823893780082508e-06,
"loss": 0.3945,
"step": 1013
},
{
"epoch": 2.51,
"grad_norm": 0.9760584235191345,
"learning_rate": 1.368856152860215e-06,
"loss": 0.3673,
"step": 1014
},
{
"epoch": 2.51,
"grad_norm": 0.964137077331543,
"learning_rate": 1.3553846286987271e-06,
"loss": 0.3615,
"step": 1015
},
{
"epoch": 2.51,
"grad_norm": 1.068863868713379,
"learning_rate": 1.3419749018269368e-06,
"loss": 0.3698,
"step": 1016
},
{
"epoch": 2.51,
"grad_norm": 1.0485306978225708,
"learning_rate": 1.3286270681062275e-06,
"loss": 0.3907,
"step": 1017
},
{
"epoch": 2.52,
"grad_norm": 1.0279109477996826,
"learning_rate": 1.3153412229555251e-06,
"loss": 0.3407,
"step": 1018
},
{
"epoch": 2.52,
"grad_norm": 1.1816673278808594,
"learning_rate": 1.302117461350627e-06,
"loss": 0.3924,
"step": 1019
},
{
"epoch": 2.52,
"grad_norm": 1.0322753190994263,
"learning_rate": 1.2889558778235157e-06,
"loss": 0.3109,
"step": 1020
},
{
"epoch": 2.52,
"grad_norm": 1.0824332237243652,
"learning_rate": 1.2758565664616829e-06,
"loss": 0.3831,
"step": 1021
},
{
"epoch": 2.53,
"grad_norm": 1.1403456926345825,
"learning_rate": 1.262819620907465e-06,
"loss": 0.3975,
"step": 1022
},
{
"epoch": 2.53,
"grad_norm": 0.9186389446258545,
"learning_rate": 1.249845134357357e-06,
"loss": 0.4084,
"step": 1023
},
{
"epoch": 2.53,
"grad_norm": 1.1256766319274902,
"learning_rate": 1.2369331995613664e-06,
"loss": 0.3524,
"step": 1024
},
{
"epoch": 2.53,
"grad_norm": 1.12443208694458,
"learning_rate": 1.224083908822331e-06,
"loss": 0.4011,
"step": 1025
},
{
"epoch": 2.54,
"grad_norm": 1.045150876045227,
"learning_rate": 1.2112973539952777e-06,
"loss": 0.3609,
"step": 1026
},
{
"epoch": 2.54,
"grad_norm": 1.305837869644165,
"learning_rate": 1.198573626486751e-06,
"loss": 0.4315,
"step": 1027
},
{
"epoch": 2.54,
"grad_norm": 0.9822626113891602,
"learning_rate": 1.1859128172541668e-06,
"loss": 0.357,
"step": 1028
},
{
"epoch": 2.54,
"grad_norm": 0.9743748903274536,
"learning_rate": 1.1733150168051632e-06,
"loss": 0.364,
"step": 1029
},
{
"epoch": 2.55,
"grad_norm": 0.9916324019432068,
"learning_rate": 1.1607803151969443e-06,
"loss": 0.3729,
"step": 1030
},
{
"epoch": 2.55,
"grad_norm": 1.18027663230896,
"learning_rate": 1.148308802035648e-06,
"loss": 0.3667,
"step": 1031
},
{
"epoch": 2.55,
"grad_norm": 1.1447166204452515,
"learning_rate": 1.1359005664756994e-06,
"loss": 0.3707,
"step": 1032
},
{
"epoch": 2.55,
"grad_norm": 1.0819268226623535,
"learning_rate": 1.123555697219174e-06,
"loss": 0.3907,
"step": 1033
},
{
"epoch": 2.56,
"grad_norm": 1.018946886062622,
"learning_rate": 1.1112742825151669e-06,
"loss": 0.3577,
"step": 1034
},
{
"epoch": 2.56,
"grad_norm": 1.0342707633972168,
"learning_rate": 1.0990564101591527e-06,
"loss": 0.335,
"step": 1035
},
{
"epoch": 2.56,
"grad_norm": 1.2041442394256592,
"learning_rate": 1.0869021674923708e-06,
"loss": 0.3969,
"step": 1036
},
{
"epoch": 2.56,
"grad_norm": 1.0217515230178833,
"learning_rate": 1.074811641401189e-06,
"loss": 0.3416,
"step": 1037
},
{
"epoch": 2.57,
"grad_norm": 0.9454570412635803,
"learning_rate": 1.0627849183164906e-06,
"loss": 0.3731,
"step": 1038
},
{
"epoch": 2.57,
"grad_norm": 1.1685900688171387,
"learning_rate": 1.0508220842130602e-06,
"loss": 0.368,
"step": 1039
},
{
"epoch": 2.57,
"grad_norm": 1.0981504917144775,
"learning_rate": 1.0389232246089499e-06,
"loss": 0.3884,
"step": 1040
},
{
"epoch": 2.57,
"grad_norm": 1.085720419883728,
"learning_rate": 1.0270884245648905e-06,
"loss": 0.3683,
"step": 1041
},
{
"epoch": 2.58,
"grad_norm": 0.9990667700767517,
"learning_rate": 1.015317768683669e-06,
"loss": 0.354,
"step": 1042
},
{
"epoch": 2.58,
"grad_norm": 1.0519665479660034,
"learning_rate": 1.0036113411095304e-06,
"loss": 0.3681,
"step": 1043
},
{
"epoch": 2.58,
"grad_norm": 1.1080559492111206,
"learning_rate": 9.919692255275747e-07,
"loss": 0.3691,
"step": 1044
},
{
"epoch": 2.58,
"grad_norm": 1.012698769569397,
"learning_rate": 9.803915051631574e-07,
"loss": 0.356,
"step": 1045
},
{
"epoch": 2.59,
"grad_norm": 1.0640681982040405,
"learning_rate": 9.688782627812965e-07,
"loss": 0.3452,
"step": 1046
},
{
"epoch": 2.59,
"grad_norm": 0.9197502732276917,
"learning_rate": 9.574295806860767e-07,
"loss": 0.3919,
"step": 1047
},
{
"epoch": 2.59,
"grad_norm": 1.1475162506103516,
"learning_rate": 9.460455407200708e-07,
"loss": 0.3938,
"step": 1048
},
{
"epoch": 2.59,
"grad_norm": 1.1591166257858276,
"learning_rate": 9.347262242637345e-07,
"loss": 0.3901,
"step": 1049
},
{
"epoch": 2.6,
"grad_norm": 1.102064847946167,
"learning_rate": 9.234717122348558e-07,
"loss": 0.4048,
"step": 1050
},
{
"epoch": 2.6,
"grad_norm": 1.1424314975738525,
"learning_rate": 9.122820850879488e-07,
"loss": 0.3597,
"step": 1051
},
{
"epoch": 2.6,
"grad_norm": 1.2532248497009277,
"learning_rate": 9.011574228136866e-07,
"loss": 0.3674,
"step": 1052
},
{
"epoch": 2.6,
"grad_norm": 1.1703746318817139,
"learning_rate": 8.90097804938338e-07,
"loss": 0.4226,
"step": 1053
},
{
"epoch": 2.61,
"grad_norm": 1.0363982915878296,
"learning_rate": 8.791033105231861e-07,
"loss": 0.3609,
"step": 1054
},
{
"epoch": 2.61,
"grad_norm": 1.1072263717651367,
"learning_rate": 8.681740181639731e-07,
"loss": 0.3995,
"step": 1055
},
{
"epoch": 2.61,
"grad_norm": 1.1055591106414795,
"learning_rate": 8.573100059903349e-07,
"loss": 0.3452,
"step": 1056
},
{
"epoch": 2.61,
"grad_norm": 1.0444055795669556,
"learning_rate": 8.465113516652424e-07,
"loss": 0.369,
"step": 1057
},
{
"epoch": 2.62,
"grad_norm": 1.04262375831604,
"learning_rate": 8.357781323844482e-07,
"loss": 0.3242,
"step": 1058
},
{
"epoch": 2.62,
"grad_norm": 0.9722965955734253,
"learning_rate": 8.251104248759256e-07,
"loss": 0.3625,
"step": 1059
},
{
"epoch": 2.62,
"grad_norm": 1.1053335666656494,
"learning_rate": 8.145083053993364e-07,
"loss": 0.3739,
"step": 1060
},
{
"epoch": 2.62,
"grad_norm": 1.0828070640563965,
"learning_rate": 8.039718497454685e-07,
"loss": 0.3723,
"step": 1061
},
{
"epoch": 2.63,
"grad_norm": 1.1141027212142944,
"learning_rate": 7.935011332357113e-07,
"loss": 0.3588,
"step": 1062
},
{
"epoch": 2.63,
"grad_norm": 1.0269464254379272,
"learning_rate": 7.83096230721505e-07,
"loss": 0.3514,
"step": 1063
},
{
"epoch": 2.63,
"grad_norm": 0.9290771484375,
"learning_rate": 7.727572165838038e-07,
"loss": 0.3697,
"step": 1064
},
{
"epoch": 2.63,
"grad_norm": 1.0621395111083984,
"learning_rate": 7.624841647325565e-07,
"loss": 0.3692,
"step": 1065
},
{
"epoch": 2.64,
"grad_norm": 1.0444269180297852,
"learning_rate": 7.522771486061642e-07,
"loss": 0.3676,
"step": 1066
},
{
"epoch": 2.64,
"grad_norm": 0.9791834950447083,
"learning_rate": 7.421362411709676e-07,
"loss": 0.3695,
"step": 1067
},
{
"epoch": 2.64,
"grad_norm": 1.0859102010726929,
"learning_rate": 7.320615149207177e-07,
"loss": 0.3987,
"step": 1068
},
{
"epoch": 2.64,
"grad_norm": 0.981833279132843,
"learning_rate": 7.220530418760597e-07,
"loss": 0.3744,
"step": 1069
},
{
"epoch": 2.65,
"grad_norm": 1.0273011922836304,
"learning_rate": 7.121108935840193e-07,
"loss": 0.36,
"step": 1070
},
{
"epoch": 2.65,
"grad_norm": 0.9487125873565674,
"learning_rate": 7.022351411174866e-07,
"loss": 0.3683,
"step": 1071
},
{
"epoch": 2.65,
"grad_norm": 1.0154640674591064,
"learning_rate": 6.924258550747154e-07,
"loss": 0.384,
"step": 1072
},
{
"epoch": 2.65,
"grad_norm": 1.018172264099121,
"learning_rate": 6.826831055788119e-07,
"loss": 0.4203,
"step": 1073
},
{
"epoch": 2.66,
"grad_norm": 0.9754199385643005,
"learning_rate": 6.730069622772373e-07,
"loss": 0.405,
"step": 1074
},
{
"epoch": 2.66,
"grad_norm": 1.1086993217468262,
"learning_rate": 6.633974943413113e-07,
"loss": 0.3739,
"step": 1075
},
{
"epoch": 2.66,
"grad_norm": 1.0852959156036377,
"learning_rate": 6.538547704657094e-07,
"loss": 0.4029,
"step": 1076
},
{
"epoch": 2.66,
"grad_norm": 1.1574333906173706,
"learning_rate": 6.443788588679823e-07,
"loss": 0.3849,
"step": 1077
},
{
"epoch": 2.67,
"grad_norm": 1.0261621475219727,
"learning_rate": 6.349698272880588e-07,
"loss": 0.3691,
"step": 1078
},
{
"epoch": 2.67,
"grad_norm": 1.0623788833618164,
"learning_rate": 6.256277429877711e-07,
"loss": 0.342,
"step": 1079
},
{
"epoch": 2.67,
"grad_norm": 0.9742769002914429,
"learning_rate": 6.163526727503688e-07,
"loss": 0.3574,
"step": 1080
},
{
"epoch": 2.67,
"grad_norm": 1.0071436166763306,
"learning_rate": 6.071446828800353e-07,
"loss": 0.3781,
"step": 1081
},
{
"epoch": 2.68,
"grad_norm": 0.9264128804206848,
"learning_rate": 5.980038392014309e-07,
"loss": 0.3502,
"step": 1082
},
{
"epoch": 2.68,
"grad_norm": 1.1676849126815796,
"learning_rate": 5.889302070591985e-07,
"loss": 0.3902,
"step": 1083
},
{
"epoch": 2.68,
"grad_norm": 0.9769526720046997,
"learning_rate": 5.79923851317521e-07,
"loss": 0.3435,
"step": 1084
},
{
"epoch": 2.68,
"grad_norm": 1.0769532918930054,
"learning_rate": 5.709848363596404e-07,
"loss": 0.3773,
"step": 1085
},
{
"epoch": 2.69,
"grad_norm": 1.0605096817016602,
"learning_rate": 5.621132260874051e-07,
"loss": 0.3717,
"step": 1086
},
{
"epoch": 2.69,
"grad_norm": 1.0418657064437866,
"learning_rate": 5.533090839208133e-07,
"loss": 0.344,
"step": 1087
},
{
"epoch": 2.69,
"grad_norm": 0.8912917971611023,
"learning_rate": 5.445724727975498e-07,
"loss": 0.3305,
"step": 1088
},
{
"epoch": 2.69,
"grad_norm": 1.045654296875,
"learning_rate": 5.359034551725517e-07,
"loss": 0.4082,
"step": 1089
},
{
"epoch": 2.7,
"grad_norm": 1.1446118354797363,
"learning_rate": 5.273020930175543e-07,
"loss": 0.3402,
"step": 1090
},
{
"epoch": 2.7,
"grad_norm": 0.9397068619728088,
"learning_rate": 5.187684478206412e-07,
"loss": 0.3751,
"step": 1091
},
{
"epoch": 2.7,
"grad_norm": 1.055282711982727,
"learning_rate": 5.103025805858197e-07,
"loss": 0.3857,
"step": 1092
},
{
"epoch": 2.7,
"grad_norm": 1.0485235452651978,
"learning_rate": 5.019045518325693e-07,
"loss": 0.3787,
"step": 1093
},
{
"epoch": 2.71,
"grad_norm": 1.0545977354049683,
"learning_rate": 4.935744215954197e-07,
"loss": 0.3758,
"step": 1094
},
{
"epoch": 2.71,
"grad_norm": 1.0440723896026611,
"learning_rate": 4.853122494235207e-07,
"loss": 0.3521,
"step": 1095
},
{
"epoch": 2.71,
"grad_norm": 1.0499279499053955,
"learning_rate": 4.77118094380209e-07,
"loss": 0.3803,
"step": 1096
},
{
"epoch": 2.71,
"grad_norm": 1.0537831783294678,
"learning_rate": 4.6899201504259196e-07,
"loss": 0.3497,
"step": 1097
},
{
"epoch": 2.72,
"grad_norm": 1.1298884153366089,
"learning_rate": 4.609340695011311e-07,
"loss": 0.3606,
"step": 1098
},
{
"epoch": 2.72,
"grad_norm": 1.1490843296051025,
"learning_rate": 4.5294431535922166e-07,
"loss": 0.378,
"step": 1099
},
{
"epoch": 2.72,
"grad_norm": 1.051404356956482,
"learning_rate": 4.4502280973278135e-07,
"loss": 0.3925,
"step": 1100
},
{
"epoch": 2.72,
"grad_norm": 1.0292401313781738,
"learning_rate": 4.3716960924984566e-07,
"loss": 0.3421,
"step": 1101
},
{
"epoch": 2.73,
"grad_norm": 1.0107548236846924,
"learning_rate": 4.2938477005015853e-07,
"loss": 0.3451,
"step": 1102
},
{
"epoch": 2.73,
"grad_norm": 1.0043538808822632,
"learning_rate": 4.2166834778477717e-07,
"loss": 0.3601,
"step": 1103
},
{
"epoch": 2.73,
"grad_norm": 1.0446064472198486,
"learning_rate": 4.140203976156665e-07,
"loss": 0.3921,
"step": 1104
},
{
"epoch": 2.73,
"grad_norm": 1.2105978727340698,
"learning_rate": 4.064409742153097e-07,
"loss": 0.3689,
"step": 1105
},
{
"epoch": 2.74,
"grad_norm": 1.1314482688903809,
"learning_rate": 3.9893013176631636e-07,
"loss": 0.3859,
"step": 1106
},
{
"epoch": 2.74,
"grad_norm": 1.1071857213974,
"learning_rate": 3.914879239610392e-07,
"loss": 0.3551,
"step": 1107
},
{
"epoch": 2.74,
"grad_norm": 1.0189298391342163,
"learning_rate": 3.8411440400117685e-07,
"loss": 0.3886,
"step": 1108
},
{
"epoch": 2.74,
"grad_norm": 1.129767656326294,
"learning_rate": 3.768096245974129e-07,
"loss": 0.3446,
"step": 1109
},
{
"epoch": 2.74,
"grad_norm": 1.0028159618377686,
"learning_rate": 3.69573637969024e-07,
"loss": 0.3489,
"step": 1110
},
{
"epoch": 2.75,
"grad_norm": 0.9922955632209778,
"learning_rate": 3.6240649584351137e-07,
"loss": 0.3524,
"step": 1111
},
{
"epoch": 2.75,
"grad_norm": 0.9887750744819641,
"learning_rate": 3.553082494562354e-07,
"loss": 0.3677,
"step": 1112
},
{
"epoch": 2.75,
"grad_norm": 1.0813145637512207,
"learning_rate": 3.4827894955003825e-07,
"loss": 0.3819,
"step": 1113
},
{
"epoch": 2.75,
"grad_norm": 1.1129223108291626,
"learning_rate": 3.413186463748941e-07,
"loss": 0.315,
"step": 1114
},
{
"epoch": 2.76,
"grad_norm": 1.0134004354476929,
"learning_rate": 3.3442738968754164e-07,
"loss": 0.3586,
"step": 1115
},
{
"epoch": 2.76,
"grad_norm": 0.9864435791969299,
"learning_rate": 3.276052287511333e-07,
"loss": 0.3953,
"step": 1116
},
{
"epoch": 2.76,
"grad_norm": 0.9584245085716248,
"learning_rate": 3.2085221233487564e-07,
"loss": 0.3791,
"step": 1117
},
{
"epoch": 2.76,
"grad_norm": 0.9986026287078857,
"learning_rate": 3.1416838871368925e-07,
"loss": 0.3478,
"step": 1118
},
{
"epoch": 2.77,
"grad_norm": 1.9416661262512207,
"learning_rate": 3.0755380566785955e-07,
"loss": 0.4271,
"step": 1119
},
{
"epoch": 2.77,
"grad_norm": 1.0283199548721313,
"learning_rate": 3.010085104826932e-07,
"loss": 0.3685,
"step": 1120
},
{
"epoch": 2.77,
"grad_norm": 1.0230189561843872,
"learning_rate": 2.945325499481855e-07,
"loss": 0.3593,
"step": 1121
},
{
"epoch": 2.77,
"grad_norm": 1.1526145935058594,
"learning_rate": 2.881259703586814e-07,
"loss": 0.3692,
"step": 1122
},
{
"epoch": 2.78,
"grad_norm": 1.2891979217529297,
"learning_rate": 2.817888175125472e-07,
"loss": 0.407,
"step": 1123
},
{
"epoch": 2.78,
"grad_norm": 1.2529091835021973,
"learning_rate": 2.7552113671184264e-07,
"loss": 0.3489,
"step": 1124
},
{
"epoch": 2.78,
"grad_norm": 1.004999041557312,
"learning_rate": 2.693229727619906e-07,
"loss": 0.322,
"step": 1125
},
{
"epoch": 2.78,
"grad_norm": 1.0034558773040771,
"learning_rate": 2.631943699714712e-07,
"loss": 0.3658,
"step": 1126
},
{
"epoch": 2.79,
"grad_norm": 1.0214273929595947,
"learning_rate": 2.571353721514913e-07,
"loss": 0.3813,
"step": 1127
},
{
"epoch": 2.79,
"grad_norm": 1.0095939636230469,
"learning_rate": 2.51146022615677e-07,
"loss": 0.3516,
"step": 1128
},
{
"epoch": 2.79,
"grad_norm": 1.0672303438186646,
"learning_rate": 2.452263641797659e-07,
"loss": 0.3813,
"step": 1129
},
{
"epoch": 2.79,
"grad_norm": 0.959056556224823,
"learning_rate": 2.3937643916129404e-07,
"loss": 0.3743,
"step": 1130
},
{
"epoch": 2.8,
"grad_norm": 1.0662682056427002,
"learning_rate": 2.3359628937930422e-07,
"loss": 0.381,
"step": 1131
},
{
"epoch": 2.8,
"grad_norm": 1.0703846216201782,
"learning_rate": 2.2788595615403475e-07,
"loss": 0.4138,
"step": 1132
},
{
"epoch": 2.8,
"grad_norm": 1.0766559839248657,
"learning_rate": 2.222454803066332e-07,
"loss": 0.3711,
"step": 1133
},
{
"epoch": 2.8,
"grad_norm": 0.9815179109573364,
"learning_rate": 2.16674902158861e-07,
"loss": 0.3602,
"step": 1134
},
{
"epoch": 2.81,
"grad_norm": 1.1038676500320435,
"learning_rate": 2.111742615328083e-07,
"loss": 0.3954,
"step": 1135
},
{
"epoch": 2.81,
"grad_norm": 1.0331889390945435,
"learning_rate": 2.057435977506028e-07,
"loss": 0.3847,
"step": 1136
},
{
"epoch": 2.81,
"grad_norm": 1.0882453918457031,
"learning_rate": 2.0038294963413251e-07,
"loss": 0.4082,
"step": 1137
},
{
"epoch": 2.81,
"grad_norm": 1.0436729192733765,
"learning_rate": 1.9509235550477123e-07,
"loss": 0.3623,
"step": 1138
},
{
"epoch": 2.82,
"grad_norm": 1.161468744277954,
"learning_rate": 1.8987185318310009e-07,
"loss": 0.3537,
"step": 1139
},
{
"epoch": 2.82,
"grad_norm": 1.13321852684021,
"learning_rate": 1.8472147998863877e-07,
"loss": 0.3521,
"step": 1140
},
{
"epoch": 2.82,
"grad_norm": 1.0659288167953491,
"learning_rate": 1.796412727395802e-07,
"loss": 0.3931,
"step": 1141
},
{
"epoch": 2.82,
"grad_norm": 1.0503995418548584,
"learning_rate": 1.7463126775252192e-07,
"loss": 0.3681,
"step": 1142
},
{
"epoch": 2.83,
"grad_norm": 1.183427095413208,
"learning_rate": 1.6969150084221399e-07,
"loss": 0.3357,
"step": 1143
},
{
"epoch": 2.83,
"grad_norm": 1.158530592918396,
"learning_rate": 1.6482200732129804e-07,
"loss": 0.3683,
"step": 1144
},
{
"epoch": 2.83,
"grad_norm": 0.953620433807373,
"learning_rate": 1.600228220000577e-07,
"loss": 0.3479,
"step": 1145
},
{
"epoch": 2.83,
"grad_norm": 1.0016752481460571,
"learning_rate": 1.552939791861663e-07,
"loss": 0.3587,
"step": 1146
},
{
"epoch": 2.84,
"grad_norm": 1.0718989372253418,
"learning_rate": 1.5063551268444275e-07,
"loss": 0.4021,
"step": 1147
},
{
"epoch": 2.84,
"grad_norm": 1.0193805694580078,
"learning_rate": 1.4604745579661405e-07,
"loss": 0.3593,
"step": 1148
},
{
"epoch": 2.84,
"grad_norm": 1.106675148010254,
"learning_rate": 1.4152984132106972e-07,
"loss": 0.3293,
"step": 1149
},
{
"epoch": 2.84,
"grad_norm": 0.987933874130249,
"learning_rate": 1.370827015526355e-07,
"loss": 0.4167,
"step": 1150
},
{
"epoch": 2.85,
"grad_norm": 1.0538557767868042,
"learning_rate": 1.3270606828233668e-07,
"loss": 0.3581,
"step": 1151
},
{
"epoch": 2.85,
"grad_norm": 1.105463981628418,
"learning_rate": 1.2839997279717075e-07,
"loss": 0.3861,
"step": 1152
},
{
"epoch": 2.85,
"grad_norm": 0.9823542237281799,
"learning_rate": 1.241644458798885e-07,
"loss": 0.3428,
"step": 1153
},
{
"epoch": 2.85,
"grad_norm": 1.0500560998916626,
"learning_rate": 1.1999951780876872e-07,
"loss": 0.4134,
"step": 1154
},
{
"epoch": 2.86,
"grad_norm": 1.0864903926849365,
"learning_rate": 1.159052183574072e-07,
"loss": 0.3789,
"step": 1155
},
{
"epoch": 2.86,
"grad_norm": 0.9418767690658569,
"learning_rate": 1.1188157679449585e-07,
"loss": 0.3673,
"step": 1156
},
{
"epoch": 2.86,
"grad_norm": 0.9977820515632629,
"learning_rate": 1.0792862188362396e-07,
"loss": 0.384,
"step": 1157
},
{
"epoch": 2.86,
"grad_norm": 0.9629470109939575,
"learning_rate": 1.0404638188306504e-07,
"loss": 0.373,
"step": 1158
},
{
"epoch": 2.87,
"grad_norm": 0.983284592628479,
"learning_rate": 1.002348845455725e-07,
"loss": 0.3716,
"step": 1159
},
{
"epoch": 2.87,
"grad_norm": 1.0521374940872192,
"learning_rate": 9.64941571181921e-08,
"loss": 0.3407,
"step": 1160
},
{
"epoch": 2.87,
"grad_norm": 0.9713572263717651,
"learning_rate": 9.282422634205645e-08,
"loss": 0.3835,
"step": 1161
},
{
"epoch": 2.87,
"grad_norm": 1.1258341073989868,
"learning_rate": 8.922511845219972e-08,
"loss": 0.3906,
"step": 1162
},
{
"epoch": 2.88,
"grad_norm": 1.0587928295135498,
"learning_rate": 8.569685917736659e-08,
"loss": 0.3664,
"step": 1163
},
{
"epoch": 2.88,
"grad_norm": 1.0492050647735596,
"learning_rate": 8.223947373983354e-08,
"loss": 0.3806,
"step": 1164
},
{
"epoch": 2.88,
"grad_norm": 1.0404672622680664,
"learning_rate": 7.885298685522235e-08,
"loss": 0.3596,
"step": 1165
},
{
"epoch": 2.88,
"grad_norm": 0.98375403881073,
"learning_rate": 7.553742273232578e-08,
"loss": 0.3733,
"step": 1166
},
{
"epoch": 2.89,
"grad_norm": 1.1184265613555908,
"learning_rate": 7.229280507293657e-08,
"loss": 0.402,
"step": 1167
},
{
"epoch": 2.89,
"grad_norm": 1.1216493844985962,
"learning_rate": 6.911915707167538e-08,
"loss": 0.3718,
"step": 1168
},
{
"epoch": 2.89,
"grad_norm": 1.0226802825927734,
"learning_rate": 6.601650141582649e-08,
"loss": 0.3855,
"step": 1169
},
{
"epoch": 2.89,
"grad_norm": 1.0417250394821167,
"learning_rate": 6.29848602851768e-08,
"loss": 0.3915,
"step": 1170
},
{
"epoch": 2.9,
"grad_norm": 1.1957062482833862,
"learning_rate": 6.002425535185041e-08,
"loss": 0.3532,
"step": 1171
},
{
"epoch": 2.9,
"grad_norm": 1.027915596961975,
"learning_rate": 5.713470778016539e-08,
"loss": 0.3529,
"step": 1172
},
{
"epoch": 2.9,
"grad_norm": 0.9913766384124756,
"learning_rate": 5.4316238226469476e-08,
"loss": 0.3272,
"step": 1173
},
{
"epoch": 2.9,
"grad_norm": 1.1681790351867676,
"learning_rate": 5.1568866839003525e-08,
"loss": 0.375,
"step": 1174
},
{
"epoch": 2.91,
"grad_norm": 1.0217177867889404,
"learning_rate": 4.889261325775163e-08,
"loss": 0.4097,
"step": 1175
},
{
"epoch": 2.91,
"grad_norm": 1.0957375764846802,
"learning_rate": 4.628749661430121e-08,
"loss": 0.3522,
"step": 1176
},
{
"epoch": 2.91,
"grad_norm": 1.0801223516464233,
"learning_rate": 4.375353553170647e-08,
"loss": 0.357,
"step": 1177
},
{
"epoch": 2.91,
"grad_norm": 1.0419552326202393,
"learning_rate": 4.1290748124358513e-08,
"loss": 0.3433,
"step": 1178
},
{
"epoch": 2.92,
"grad_norm": 1.117267370223999,
"learning_rate": 3.889915199784877e-08,
"loss": 0.3631,
"step": 1179
},
{
"epoch": 2.92,
"grad_norm": 1.0114556550979614,
"learning_rate": 3.657876424885243e-08,
"loss": 0.3939,
"step": 1180
},
{
"epoch": 2.92,
"grad_norm": 1.162495732307434,
"learning_rate": 3.432960146499631e-08,
"loss": 0.3512,
"step": 1181
},
{
"epoch": 2.92,
"grad_norm": 1.0782825946807861,
"learning_rate": 3.2151679724748974e-08,
"loss": 0.3726,
"step": 1182
},
{
"epoch": 2.93,
"grad_norm": 1.1750308275222778,
"learning_rate": 3.0045014597299695e-08,
"loss": 0.3905,
"step": 1183
},
{
"epoch": 2.93,
"grad_norm": 1.0197083950042725,
"learning_rate": 2.800962114245076e-08,
"loss": 0.3661,
"step": 1184
},
{
"epoch": 2.93,
"grad_norm": 1.0369820594787598,
"learning_rate": 2.6045513910509802e-08,
"loss": 0.3995,
"step": 1185
},
{
"epoch": 2.93,
"grad_norm": 1.0298311710357666,
"learning_rate": 2.415270694217986e-08,
"loss": 0.368,
"step": 1186
},
{
"epoch": 2.94,
"grad_norm": 0.9938005805015564,
"learning_rate": 2.2331213768468363e-08,
"loss": 0.3792,
"step": 1187
},
{
"epoch": 2.94,
"grad_norm": 0.9757488369941711,
"learning_rate": 2.0581047410583865e-08,
"loss": 0.3454,
"step": 1188
},
{
"epoch": 2.94,
"grad_norm": 0.9685208201408386,
"learning_rate": 1.8902220379846125e-08,
"loss": 0.3504,
"step": 1189
},
{
"epoch": 2.94,
"grad_norm": 1.1232110261917114,
"learning_rate": 1.7294744677591733e-08,
"loss": 0.361,
"step": 1190
},
{
"epoch": 2.95,
"grad_norm": 0.9797727465629578,
"learning_rate": 1.57586317950964e-08,
"loss": 0.3333,
"step": 1191
},
{
"epoch": 2.95,
"grad_norm": 1.0392558574676514,
"learning_rate": 1.4293892713486135e-08,
"loss": 0.3813,
"step": 1192
},
{
"epoch": 2.95,
"grad_norm": 1.0265522003173828,
"learning_rate": 1.2900537903660637e-08,
"loss": 0.3655,
"step": 1193
},
{
"epoch": 2.95,
"grad_norm": 1.1100423336029053,
"learning_rate": 1.157857732622003e-08,
"loss": 0.4057,
"step": 1194
},
{
"epoch": 2.96,
"grad_norm": 0.9980720281600952,
"learning_rate": 1.0328020431391583e-08,
"loss": 0.3694,
"step": 1195
},
{
"epoch": 2.96,
"grad_norm": 1.0084683895111084,
"learning_rate": 9.148876158961983e-09,
"loss": 0.3646,
"step": 1196
},
{
"epoch": 2.96,
"grad_norm": 0.9384503960609436,
"learning_rate": 8.041152938216278e-09,
"loss": 0.3825,
"step": 1197
},
{
"epoch": 2.96,
"grad_norm": 1.0136579275131226,
"learning_rate": 7.004858687874594e-09,
"loss": 0.3425,
"step": 1198
},
{
"epoch": 2.97,
"grad_norm": 0.8865569829940796,
"learning_rate": 6.040000816037728e-09,
"loss": 0.3204,
"step": 1199
},
{
"epoch": 2.97,
"grad_norm": 1.067499041557312,
"learning_rate": 5.146586220131644e-09,
"loss": 0.3571,
"step": 1200
},
{
"epoch": 2.97,
"grad_norm": 1.09268057346344,
"learning_rate": 4.324621286861952e-09,
"loss": 0.3801,
"step": 1201
},
{
"epoch": 2.97,
"grad_norm": 1.0590449571609497,
"learning_rate": 3.5741118921628346e-09,
"loss": 0.3827,
"step": 1202
},
{
"epoch": 2.97,
"grad_norm": 0.9990357160568237,
"learning_rate": 2.895063401160414e-09,
"loss": 0.3529,
"step": 1203
},
{
"epoch": 2.98,
"grad_norm": 0.9654290080070496,
"learning_rate": 2.2874806681305593e-09,
"loss": 0.3551,
"step": 1204
},
{
"epoch": 2.98,
"grad_norm": 1.145081639289856,
"learning_rate": 1.7513680364689145e-09,
"loss": 0.3953,
"step": 1205
},
{
"epoch": 2.98,
"grad_norm": 1.1687543392181396,
"learning_rate": 1.2867293386531476e-09,
"loss": 0.3482,
"step": 1206
},
{
"epoch": 2.98,
"grad_norm": 1.138846516609192,
"learning_rate": 8.935678962196381e-10,
"loss": 0.3827,
"step": 1207
},
{
"epoch": 2.99,
"grad_norm": 1.009304404258728,
"learning_rate": 5.718865197423817e-10,
"loss": 0.3386,
"step": 1208
},
{
"epoch": 2.99,
"grad_norm": 0.9322947859764099,
"learning_rate": 3.2168750880634537e-10,
"loss": 0.3711,
"step": 1209
},
{
"epoch": 2.99,
"grad_norm": 0.9357113838195801,
"learning_rate": 1.4297265199414434e-10,
"loss": 0.3978,
"step": 1210
},
{
"epoch": 2.99,
"grad_norm": 1.1130071878433228,
"learning_rate": 3.57432268771607e-11,
"loss": 0.3536,
"step": 1211
},
{
"epoch": 3.0,
"grad_norm": 0.9601169228553772,
"learning_rate": 0.0,
"loss": 0.3487,
"step": 1212
},
{
"epoch": 3.0,
"step": 1212,
"total_flos": 6.037597788831744e+17,
"train_loss": 0.6512430794168227,
"train_runtime": 17166.6584,
"train_samples_per_second": 9.045,
"train_steps_per_second": 0.071
}
],
"logging_steps": 1.0,
"max_steps": 1212,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"total_flos": 6.037597788831744e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}