{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7850629613547546,
"eval_steps": 500,
"global_step": 1130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006947459834997829,
"grad_norm": 2.5,
"learning_rate": 5.780346820809248e-07,
"loss": 4.1915,
"step": 1
},
{
"epoch": 0.0013894919669995658,
"grad_norm": 4.21875,
"learning_rate": 1.1560693641618497e-06,
"loss": 3.7445,
"step": 2
},
{
"epoch": 0.0020842379504993486,
"grad_norm": 1.4140625,
"learning_rate": 1.7341040462427746e-06,
"loss": 3.9999,
"step": 3
},
{
"epoch": 0.0027789839339991316,
"grad_norm": 1.1484375,
"learning_rate": 2.3121387283236993e-06,
"loss": 4.0444,
"step": 4
},
{
"epoch": 0.0034737299174989146,
"grad_norm": 1.3359375,
"learning_rate": 2.8901734104046244e-06,
"loss": 4.1148,
"step": 5
},
{
"epoch": 0.004168475900998697,
"grad_norm": 1.7578125,
"learning_rate": 3.468208092485549e-06,
"loss": 4.5023,
"step": 6
},
{
"epoch": 0.004863221884498481,
"grad_norm": 6.90625,
"learning_rate": 4.046242774566474e-06,
"loss": 4.0132,
"step": 7
},
{
"epoch": 0.005557967867998263,
"grad_norm": 1.3046875,
"learning_rate": 4.624277456647399e-06,
"loss": 3.7643,
"step": 8
},
{
"epoch": 0.006252713851498046,
"grad_norm": 1.5234375,
"learning_rate": 5.202312138728324e-06,
"loss": 3.3901,
"step": 9
},
{
"epoch": 0.006947459834997829,
"grad_norm": 0.8984375,
"learning_rate": 5.780346820809249e-06,
"loss": 3.249,
"step": 10
},
{
"epoch": 0.007642205818497612,
"grad_norm": 2.28125,
"learning_rate": 6.358381502890173e-06,
"loss": 3.8669,
"step": 11
},
{
"epoch": 0.008336951801997394,
"grad_norm": 1.40625,
"learning_rate": 6.936416184971098e-06,
"loss": 3.9794,
"step": 12
},
{
"epoch": 0.009031697785497178,
"grad_norm": 0.8515625,
"learning_rate": 7.514450867052024e-06,
"loss": 3.3814,
"step": 13
},
{
"epoch": 0.009726443768996961,
"grad_norm": 1.1953125,
"learning_rate": 8.092485549132949e-06,
"loss": 3.7746,
"step": 14
},
{
"epoch": 0.010421189752496743,
"grad_norm": 1.7265625,
"learning_rate": 8.670520231213873e-06,
"loss": 3.0555,
"step": 15
},
{
"epoch": 0.011115935735996526,
"grad_norm": 1.1796875,
"learning_rate": 9.248554913294797e-06,
"loss": 3.6958,
"step": 16
},
{
"epoch": 0.01181068171949631,
"grad_norm": 2.25,
"learning_rate": 9.826589595375723e-06,
"loss": 5.0544,
"step": 17
},
{
"epoch": 0.012505427702996091,
"grad_norm": 4.875,
"learning_rate": 1.0404624277456647e-05,
"loss": 5.395,
"step": 18
},
{
"epoch": 0.013200173686495875,
"grad_norm": 0.9375,
"learning_rate": 1.0982658959537573e-05,
"loss": 3.3306,
"step": 19
},
{
"epoch": 0.013894919669995658,
"grad_norm": 1.625,
"learning_rate": 1.1560693641618498e-05,
"loss": 4.0474,
"step": 20
},
{
"epoch": 0.01458966565349544,
"grad_norm": 1.65625,
"learning_rate": 1.2138728323699422e-05,
"loss": 3.6259,
"step": 21
},
{
"epoch": 0.015284411636995223,
"grad_norm": 3.0,
"learning_rate": 1.2716763005780346e-05,
"loss": 4.0321,
"step": 22
},
{
"epoch": 0.015979157620495007,
"grad_norm": 1.9140625,
"learning_rate": 1.329479768786127e-05,
"loss": 4.1047,
"step": 23
},
{
"epoch": 0.01667390360399479,
"grad_norm": 2.375,
"learning_rate": 1.3872832369942197e-05,
"loss": 3.4635,
"step": 24
},
{
"epoch": 0.017368649587494574,
"grad_norm": 3.03125,
"learning_rate": 1.4450867052023123e-05,
"loss": 5.1757,
"step": 25
},
{
"epoch": 0.018063395570994355,
"grad_norm": 7.0,
"learning_rate": 1.5028901734104049e-05,
"loss": 4.4566,
"step": 26
},
{
"epoch": 0.018758141554494137,
"grad_norm": 2.015625,
"learning_rate": 1.5606936416184973e-05,
"loss": 3.9157,
"step": 27
},
{
"epoch": 0.019452887537993922,
"grad_norm": 2.40625,
"learning_rate": 1.6184971098265897e-05,
"loss": 3.5971,
"step": 28
},
{
"epoch": 0.020147633521493704,
"grad_norm": 2.84375,
"learning_rate": 1.676300578034682e-05,
"loss": 3.1815,
"step": 29
},
{
"epoch": 0.020842379504993486,
"grad_norm": 2.515625,
"learning_rate": 1.7341040462427746e-05,
"loss": 3.5147,
"step": 30
},
{
"epoch": 0.02153712548849327,
"grad_norm": 2.625,
"learning_rate": 1.791907514450867e-05,
"loss": 3.4658,
"step": 31
},
{
"epoch": 0.022231871471993053,
"grad_norm": 4.34375,
"learning_rate": 1.8497109826589594e-05,
"loss": 3.2493,
"step": 32
},
{
"epoch": 0.022926617455492834,
"grad_norm": 1.75,
"learning_rate": 1.907514450867052e-05,
"loss": 2.5355,
"step": 33
},
{
"epoch": 0.02362136343899262,
"grad_norm": 2.234375,
"learning_rate": 1.9653179190751446e-05,
"loss": 3.1388,
"step": 34
},
{
"epoch": 0.0243161094224924,
"grad_norm": 5.40625,
"learning_rate": 2.023121387283237e-05,
"loss": 2.7128,
"step": 35
},
{
"epoch": 0.025010855405992183,
"grad_norm": 2.78125,
"learning_rate": 2.0809248554913295e-05,
"loss": 2.9951,
"step": 36
},
{
"epoch": 0.025705601389491968,
"grad_norm": 1.9453125,
"learning_rate": 2.1387283236994223e-05,
"loss": 2.938,
"step": 37
},
{
"epoch": 0.02640034737299175,
"grad_norm": 3.21875,
"learning_rate": 2.1965317919075147e-05,
"loss": 2.8222,
"step": 38
},
{
"epoch": 0.02709509335649153,
"grad_norm": 0.8359375,
"learning_rate": 2.254335260115607e-05,
"loss": 2.4152,
"step": 39
},
{
"epoch": 0.027789839339991317,
"grad_norm": 1.7890625,
"learning_rate": 2.3121387283236996e-05,
"loss": 2.5405,
"step": 40
},
{
"epoch": 0.028484585323491098,
"grad_norm": 1.2421875,
"learning_rate": 2.369942196531792e-05,
"loss": 2.6256,
"step": 41
},
{
"epoch": 0.02917933130699088,
"grad_norm": 1.0078125,
"learning_rate": 2.4277456647398844e-05,
"loss": 2.646,
"step": 42
},
{
"epoch": 0.029874077290490665,
"grad_norm": 0.84375,
"learning_rate": 2.485549132947977e-05,
"loss": 2.3207,
"step": 43
},
{
"epoch": 0.030568823273990447,
"grad_norm": 0.91015625,
"learning_rate": 2.5433526011560693e-05,
"loss": 2.3893,
"step": 44
},
{
"epoch": 0.03126356925749023,
"grad_norm": 1.546875,
"learning_rate": 2.6011560693641617e-05,
"loss": 1.8784,
"step": 45
},
{
"epoch": 0.031958315240990014,
"grad_norm": 1.3515625,
"learning_rate": 2.658959537572254e-05,
"loss": 2.0503,
"step": 46
},
{
"epoch": 0.0326530612244898,
"grad_norm": 0.8515625,
"learning_rate": 2.7167630057803466e-05,
"loss": 2.1115,
"step": 47
},
{
"epoch": 0.03334780720798958,
"grad_norm": 1.1171875,
"learning_rate": 2.7745664739884393e-05,
"loss": 1.92,
"step": 48
},
{
"epoch": 0.03404255319148936,
"grad_norm": 0.83984375,
"learning_rate": 2.832369942196532e-05,
"loss": 2.1723,
"step": 49
},
{
"epoch": 0.03473729917498915,
"grad_norm": 0.78515625,
"learning_rate": 2.8901734104046245e-05,
"loss": 2.1771,
"step": 50
},
{
"epoch": 0.035432045158488926,
"grad_norm": 1.96875,
"learning_rate": 2.947976878612717e-05,
"loss": 2.5537,
"step": 51
},
{
"epoch": 0.03612679114198871,
"grad_norm": 0.9140625,
"learning_rate": 3.0057803468208097e-05,
"loss": 2.0183,
"step": 52
},
{
"epoch": 0.036821537125488496,
"grad_norm": 0.6171875,
"learning_rate": 3.063583815028902e-05,
"loss": 2.2641,
"step": 53
},
{
"epoch": 0.037516283108988274,
"grad_norm": 1.2890625,
"learning_rate": 3.1213872832369946e-05,
"loss": 2.2391,
"step": 54
},
{
"epoch": 0.03821102909248806,
"grad_norm": 0.92578125,
"learning_rate": 3.179190751445087e-05,
"loss": 2.0064,
"step": 55
},
{
"epoch": 0.038905775075987845,
"grad_norm": 1.171875,
"learning_rate": 3.2369942196531794e-05,
"loss": 1.6102,
"step": 56
},
{
"epoch": 0.03960052105948762,
"grad_norm": 0.4765625,
"learning_rate": 3.294797687861272e-05,
"loss": 2.1865,
"step": 57
},
{
"epoch": 0.04029526704298741,
"grad_norm": 1.1484375,
"learning_rate": 3.352601156069364e-05,
"loss": 1.9878,
"step": 58
},
{
"epoch": 0.04099001302648719,
"grad_norm": 1.0234375,
"learning_rate": 3.410404624277457e-05,
"loss": 2.1405,
"step": 59
},
{
"epoch": 0.04168475900998697,
"grad_norm": 1.625,
"learning_rate": 3.468208092485549e-05,
"loss": 2.3983,
"step": 60
},
{
"epoch": 0.04237950499348676,
"grad_norm": 0.96875,
"learning_rate": 3.5260115606936416e-05,
"loss": 2.3009,
"step": 61
},
{
"epoch": 0.04307425097698654,
"grad_norm": 1.0,
"learning_rate": 3.583815028901734e-05,
"loss": 1.8959,
"step": 62
},
{
"epoch": 0.04376899696048632,
"grad_norm": 1.0390625,
"learning_rate": 3.6416184971098265e-05,
"loss": 1.9368,
"step": 63
},
{
"epoch": 0.044463742943986105,
"grad_norm": 1.046875,
"learning_rate": 3.699421965317919e-05,
"loss": 2.1604,
"step": 64
},
{
"epoch": 0.04515848892748589,
"grad_norm": 0.703125,
"learning_rate": 3.757225433526011e-05,
"loss": 2.034,
"step": 65
},
{
"epoch": 0.04585323491098567,
"grad_norm": 0.8515625,
"learning_rate": 3.815028901734104e-05,
"loss": 2.3586,
"step": 66
},
{
"epoch": 0.046547980894485454,
"grad_norm": 0.62109375,
"learning_rate": 3.872832369942196e-05,
"loss": 1.8835,
"step": 67
},
{
"epoch": 0.04724272687798524,
"grad_norm": 0.6328125,
"learning_rate": 3.930635838150289e-05,
"loss": 2.1474,
"step": 68
},
{
"epoch": 0.04793747286148502,
"grad_norm": 0.7578125,
"learning_rate": 3.988439306358382e-05,
"loss": 1.988,
"step": 69
},
{
"epoch": 0.0486322188449848,
"grad_norm": 0.6953125,
"learning_rate": 4.046242774566474e-05,
"loss": 2.2501,
"step": 70
},
{
"epoch": 0.04932696482848459,
"grad_norm": 1.125,
"learning_rate": 4.1040462427745666e-05,
"loss": 1.6597,
"step": 71
},
{
"epoch": 0.050021710811984366,
"grad_norm": 0.90234375,
"learning_rate": 4.161849710982659e-05,
"loss": 2.2616,
"step": 72
},
{
"epoch": 0.05071645679548415,
"grad_norm": 1.0390625,
"learning_rate": 4.2196531791907514e-05,
"loss": 1.8914,
"step": 73
},
{
"epoch": 0.051411202778983936,
"grad_norm": 1.7421875,
"learning_rate": 4.2774566473988445e-05,
"loss": 2.0235,
"step": 74
},
{
"epoch": 0.052105948762483714,
"grad_norm": 0.66015625,
"learning_rate": 4.335260115606937e-05,
"loss": 2.1633,
"step": 75
},
{
"epoch": 0.0528006947459835,
"grad_norm": 0.68359375,
"learning_rate": 4.3930635838150294e-05,
"loss": 2.1997,
"step": 76
},
{
"epoch": 0.053495440729483285,
"grad_norm": 0.98828125,
"learning_rate": 4.450867052023122e-05,
"loss": 2.2325,
"step": 77
},
{
"epoch": 0.05419018671298306,
"grad_norm": 0.95703125,
"learning_rate": 4.508670520231214e-05,
"loss": 1.6797,
"step": 78
},
{
"epoch": 0.05488493269648285,
"grad_norm": 0.68359375,
"learning_rate": 4.566473988439307e-05,
"loss": 2.0388,
"step": 79
},
{
"epoch": 0.05557967867998263,
"grad_norm": 1.34375,
"learning_rate": 4.624277456647399e-05,
"loss": 1.8112,
"step": 80
},
{
"epoch": 0.05627442466348241,
"grad_norm": 1.2578125,
"learning_rate": 4.6820809248554915e-05,
"loss": 1.925,
"step": 81
},
{
"epoch": 0.056969170646982197,
"grad_norm": 0.80859375,
"learning_rate": 4.739884393063584e-05,
"loss": 1.8969,
"step": 82
},
{
"epoch": 0.05766391663048198,
"grad_norm": 1.1171875,
"learning_rate": 4.7976878612716764e-05,
"loss": 2.1033,
"step": 83
},
{
"epoch": 0.05835866261398176,
"grad_norm": 0.90234375,
"learning_rate": 4.855491329479769e-05,
"loss": 2.0978,
"step": 84
},
{
"epoch": 0.059053408597481545,
"grad_norm": 0.51953125,
"learning_rate": 4.913294797687861e-05,
"loss": 2.0516,
"step": 85
},
{
"epoch": 0.05974815458098133,
"grad_norm": 0.474609375,
"learning_rate": 4.971098265895954e-05,
"loss": 2.0648,
"step": 86
},
{
"epoch": 0.06044290056448111,
"grad_norm": 5.1875,
"learning_rate": 5.028901734104047e-05,
"loss": 2.098,
"step": 87
},
{
"epoch": 0.061137646547980894,
"grad_norm": 1.25,
"learning_rate": 5.0867052023121385e-05,
"loss": 2.1498,
"step": 88
},
{
"epoch": 0.06183239253148068,
"grad_norm": 1.375,
"learning_rate": 5.1445086705202317e-05,
"loss": 1.8586,
"step": 89
},
{
"epoch": 0.06252713851498046,
"grad_norm": 0.83984375,
"learning_rate": 5.2023121387283234e-05,
"loss": 1.6702,
"step": 90
},
{
"epoch": 0.06322188449848025,
"grad_norm": 0.734375,
"learning_rate": 5.2601156069364165e-05,
"loss": 2.1599,
"step": 91
},
{
"epoch": 0.06391663048198003,
"grad_norm": 0.85546875,
"learning_rate": 5.317919075144508e-05,
"loss": 2.0213,
"step": 92
},
{
"epoch": 0.0646113764654798,
"grad_norm": 0.71875,
"learning_rate": 5.3757225433526014e-05,
"loss": 2.3254,
"step": 93
},
{
"epoch": 0.0653061224489796,
"grad_norm": 0.89453125,
"learning_rate": 5.433526011560693e-05,
"loss": 2.0617,
"step": 94
},
{
"epoch": 0.06600086843247938,
"grad_norm": 0.62890625,
"learning_rate": 5.491329479768786e-05,
"loss": 1.8925,
"step": 95
},
{
"epoch": 0.06669561441597915,
"grad_norm": 0.58203125,
"learning_rate": 5.5491329479768787e-05,
"loss": 1.886,
"step": 96
},
{
"epoch": 0.06739036039947895,
"grad_norm": 0.984375,
"learning_rate": 5.606936416184971e-05,
"loss": 2.2635,
"step": 97
},
{
"epoch": 0.06808510638297872,
"grad_norm": 1.1796875,
"learning_rate": 5.664739884393064e-05,
"loss": 1.8094,
"step": 98
},
{
"epoch": 0.0687798523664785,
"grad_norm": 0.8203125,
"learning_rate": 5.722543352601156e-05,
"loss": 1.7222,
"step": 99
},
{
"epoch": 0.0694745983499783,
"grad_norm": 0.5390625,
"learning_rate": 5.780346820809249e-05,
"loss": 2.1751,
"step": 100
},
{
"epoch": 0.07016934433347807,
"grad_norm": 0.64453125,
"learning_rate": 5.8381502890173415e-05,
"loss": 2.0186,
"step": 101
},
{
"epoch": 0.07086409031697785,
"grad_norm": 0.92578125,
"learning_rate": 5.895953757225434e-05,
"loss": 1.7453,
"step": 102
},
{
"epoch": 0.07155883630047764,
"grad_norm": 0.55078125,
"learning_rate": 5.9537572254335263e-05,
"loss": 2.1655,
"step": 103
},
{
"epoch": 0.07225358228397742,
"grad_norm": 0.578125,
"learning_rate": 6.0115606936416195e-05,
"loss": 2.0565,
"step": 104
},
{
"epoch": 0.0729483282674772,
"grad_norm": 0.84765625,
"learning_rate": 6.069364161849711e-05,
"loss": 1.9825,
"step": 105
},
{
"epoch": 0.07364307425097699,
"grad_norm": 1.515625,
"learning_rate": 6.127167630057804e-05,
"loss": 1.4917,
"step": 106
},
{
"epoch": 0.07433782023447677,
"grad_norm": 1.546875,
"learning_rate": 6.184971098265896e-05,
"loss": 1.7809,
"step": 107
},
{
"epoch": 0.07503256621797655,
"grad_norm": 1.640625,
"learning_rate": 6.242774566473989e-05,
"loss": 2.2905,
"step": 108
},
{
"epoch": 0.07572731220147634,
"grad_norm": 0.71875,
"learning_rate": 6.300578034682081e-05,
"loss": 1.8123,
"step": 109
},
{
"epoch": 0.07642205818497612,
"grad_norm": 0.81640625,
"learning_rate": 6.358381502890174e-05,
"loss": 2.1268,
"step": 110
},
{
"epoch": 0.0771168041684759,
"grad_norm": 0.9921875,
"learning_rate": 6.416184971098266e-05,
"loss": 1.9522,
"step": 111
},
{
"epoch": 0.07781155015197569,
"grad_norm": 0.66796875,
"learning_rate": 6.473988439306359e-05,
"loss": 2.0203,
"step": 112
},
{
"epoch": 0.07850629613547547,
"grad_norm": 1.7734375,
"learning_rate": 6.53179190751445e-05,
"loss": 2.4639,
"step": 113
},
{
"epoch": 0.07920104211897525,
"grad_norm": 0.828125,
"learning_rate": 6.589595375722544e-05,
"loss": 2.1491,
"step": 114
},
{
"epoch": 0.07989578810247504,
"grad_norm": 0.96484375,
"learning_rate": 6.647398843930635e-05,
"loss": 2.0459,
"step": 115
},
{
"epoch": 0.08059053408597482,
"grad_norm": 1.6640625,
"learning_rate": 6.705202312138729e-05,
"loss": 2.0957,
"step": 116
},
{
"epoch": 0.0812852800694746,
"grad_norm": 0.69921875,
"learning_rate": 6.763005780346822e-05,
"loss": 2.1087,
"step": 117
},
{
"epoch": 0.08198002605297439,
"grad_norm": 0.96875,
"learning_rate": 6.820809248554913e-05,
"loss": 1.6713,
"step": 118
},
{
"epoch": 0.08267477203647416,
"grad_norm": 2.21875,
"learning_rate": 6.878612716763007e-05,
"loss": 2.0883,
"step": 119
},
{
"epoch": 0.08336951801997394,
"grad_norm": 1.015625,
"learning_rate": 6.936416184971098e-05,
"loss": 1.8738,
"step": 120
},
{
"epoch": 0.08406426400347373,
"grad_norm": 0.73046875,
"learning_rate": 6.994219653179191e-05,
"loss": 2.0907,
"step": 121
},
{
"epoch": 0.08475900998697351,
"grad_norm": 0.80078125,
"learning_rate": 7.052023121387283e-05,
"loss": 1.7412,
"step": 122
},
{
"epoch": 0.08545375597047329,
"grad_norm": 0.87890625,
"learning_rate": 7.109826589595376e-05,
"loss": 1.9133,
"step": 123
},
{
"epoch": 0.08614850195397308,
"grad_norm": 0.60546875,
"learning_rate": 7.167630057803468e-05,
"loss": 1.9658,
"step": 124
},
{
"epoch": 0.08684324793747286,
"grad_norm": 1.3203125,
"learning_rate": 7.225433526011561e-05,
"loss": 1.947,
"step": 125
},
{
"epoch": 0.08753799392097264,
"grad_norm": 1.1484375,
"learning_rate": 7.283236994219653e-05,
"loss": 1.912,
"step": 126
},
{
"epoch": 0.08823273990447243,
"grad_norm": 0.431640625,
"learning_rate": 7.341040462427746e-05,
"loss": 2.0971,
"step": 127
},
{
"epoch": 0.08892748588797221,
"grad_norm": 0.6953125,
"learning_rate": 7.398843930635838e-05,
"loss": 2.044,
"step": 128
},
{
"epoch": 0.08962223187147199,
"grad_norm": 0.8046875,
"learning_rate": 7.456647398843931e-05,
"loss": 2.0081,
"step": 129
},
{
"epoch": 0.09031697785497178,
"grad_norm": 1.109375,
"learning_rate": 7.514450867052023e-05,
"loss": 1.8501,
"step": 130
},
{
"epoch": 0.09101172383847156,
"grad_norm": 0.74609375,
"learning_rate": 7.572254335260116e-05,
"loss": 1.7543,
"step": 131
},
{
"epoch": 0.09170646982197134,
"grad_norm": 0.5703125,
"learning_rate": 7.630057803468207e-05,
"loss": 1.9667,
"step": 132
},
{
"epoch": 0.09240121580547113,
"grad_norm": 0.82421875,
"learning_rate": 7.6878612716763e-05,
"loss": 2.1222,
"step": 133
},
{
"epoch": 0.09309596178897091,
"grad_norm": 0.8828125,
"learning_rate": 7.745664739884392e-05,
"loss": 1.5583,
"step": 134
},
{
"epoch": 0.09379070777247069,
"grad_norm": 1.625,
"learning_rate": 7.803468208092485e-05,
"loss": 2.2327,
"step": 135
},
{
"epoch": 0.09448545375597048,
"grad_norm": 1.0,
"learning_rate": 7.861271676300579e-05,
"loss": 2.3996,
"step": 136
},
{
"epoch": 0.09518019973947026,
"grad_norm": 1.0859375,
"learning_rate": 7.91907514450867e-05,
"loss": 2.1297,
"step": 137
},
{
"epoch": 0.09587494572297003,
"grad_norm": 1.5390625,
"learning_rate": 7.976878612716763e-05,
"loss": 1.5923,
"step": 138
},
{
"epoch": 0.09656969170646983,
"grad_norm": 1.171875,
"learning_rate": 8.034682080924855e-05,
"loss": 1.671,
"step": 139
},
{
"epoch": 0.0972644376899696,
"grad_norm": 0.82421875,
"learning_rate": 8.092485549132948e-05,
"loss": 1.9664,
"step": 140
},
{
"epoch": 0.09795918367346938,
"grad_norm": 1.484375,
"learning_rate": 8.15028901734104e-05,
"loss": 1.8026,
"step": 141
},
{
"epoch": 0.09865392965696917,
"grad_norm": 2.171875,
"learning_rate": 8.208092485549133e-05,
"loss": 2.1695,
"step": 142
},
{
"epoch": 0.09934867564046895,
"grad_norm": 0.75390625,
"learning_rate": 8.265895953757226e-05,
"loss": 1.9029,
"step": 143
},
{
"epoch": 0.10004342162396873,
"grad_norm": 1.0078125,
"learning_rate": 8.323699421965318e-05,
"loss": 1.6349,
"step": 144
},
{
"epoch": 0.10073816760746852,
"grad_norm": 0.73828125,
"learning_rate": 8.381502890173411e-05,
"loss": 2.1295,
"step": 145
},
{
"epoch": 0.1014329135909683,
"grad_norm": 2.703125,
"learning_rate": 8.439306358381503e-05,
"loss": 1.9088,
"step": 146
},
{
"epoch": 0.10212765957446808,
"grad_norm": 1.34375,
"learning_rate": 8.497109826589596e-05,
"loss": 2.0262,
"step": 147
},
{
"epoch": 0.10282240555796787,
"grad_norm": 4.15625,
"learning_rate": 8.554913294797689e-05,
"loss": 1.7243,
"step": 148
},
{
"epoch": 0.10351715154146765,
"grad_norm": 0.9921875,
"learning_rate": 8.612716763005781e-05,
"loss": 2.2122,
"step": 149
},
{
"epoch": 0.10421189752496743,
"grad_norm": 0.7109375,
"learning_rate": 8.670520231213874e-05,
"loss": 2.3888,
"step": 150
},
{
"epoch": 0.10490664350846722,
"grad_norm": 0.734375,
"learning_rate": 8.728323699421966e-05,
"loss": 1.8788,
"step": 151
},
{
"epoch": 0.105601389491967,
"grad_norm": 1.3203125,
"learning_rate": 8.786127167630059e-05,
"loss": 1.9623,
"step": 152
},
{
"epoch": 0.10629613547546678,
"grad_norm": 1.25,
"learning_rate": 8.84393063583815e-05,
"loss": 1.7922,
"step": 153
},
{
"epoch": 0.10699088145896657,
"grad_norm": 1.3359375,
"learning_rate": 8.901734104046244e-05,
"loss": 2.0573,
"step": 154
},
{
"epoch": 0.10768562744246635,
"grad_norm": 1.4609375,
"learning_rate": 8.959537572254337e-05,
"loss": 1.978,
"step": 155
},
{
"epoch": 0.10838037342596613,
"grad_norm": 0.81640625,
"learning_rate": 9.017341040462428e-05,
"loss": 2.4477,
"step": 156
},
{
"epoch": 0.10907511940946592,
"grad_norm": 0.8125,
"learning_rate": 9.075144508670522e-05,
"loss": 2.1113,
"step": 157
},
{
"epoch": 0.1097698653929657,
"grad_norm": 1.0078125,
"learning_rate": 9.132947976878613e-05,
"loss": 1.7494,
"step": 158
},
{
"epoch": 0.11046461137646547,
"grad_norm": 0.81640625,
"learning_rate": 9.190751445086706e-05,
"loss": 2.1112,
"step": 159
},
{
"epoch": 0.11115935735996527,
"grad_norm": 0.9765625,
"learning_rate": 9.248554913294798e-05,
"loss": 1.9607,
"step": 160
},
{
"epoch": 0.11185410334346504,
"grad_norm": 0.84765625,
"learning_rate": 9.306358381502891e-05,
"loss": 1.8264,
"step": 161
},
{
"epoch": 0.11254884932696482,
"grad_norm": 0.8984375,
"learning_rate": 9.364161849710983e-05,
"loss": 1.9532,
"step": 162
},
{
"epoch": 0.11324359531046461,
"grad_norm": 1.1796875,
"learning_rate": 9.421965317919076e-05,
"loss": 1.9819,
"step": 163
},
{
"epoch": 0.11393834129396439,
"grad_norm": 0.70703125,
"learning_rate": 9.479768786127168e-05,
"loss": 2.0391,
"step": 164
},
{
"epoch": 0.11463308727746417,
"grad_norm": 0.92578125,
"learning_rate": 9.537572254335261e-05,
"loss": 2.027,
"step": 165
},
{
"epoch": 0.11532783326096396,
"grad_norm": 0.76953125,
"learning_rate": 9.595375722543353e-05,
"loss": 1.5242,
"step": 166
},
{
"epoch": 0.11602257924446374,
"grad_norm": 0.77734375,
"learning_rate": 9.653179190751446e-05,
"loss": 1.8081,
"step": 167
},
{
"epoch": 0.11671732522796352,
"grad_norm": 1.2734375,
"learning_rate": 9.710982658959538e-05,
"loss": 1.8001,
"step": 168
},
{
"epoch": 0.11741207121146331,
"grad_norm": 0.82421875,
"learning_rate": 9.768786127167631e-05,
"loss": 1.8917,
"step": 169
},
{
"epoch": 0.11810681719496309,
"grad_norm": 0.57421875,
"learning_rate": 9.826589595375723e-05,
"loss": 2.0557,
"step": 170
},
{
"epoch": 0.11880156317846287,
"grad_norm": 0.8046875,
"learning_rate": 9.884393063583816e-05,
"loss": 2.0773,
"step": 171
},
{
"epoch": 0.11949630916196266,
"grad_norm": 1.015625,
"learning_rate": 9.942196531791907e-05,
"loss": 1.566,
"step": 172
},
{
"epoch": 0.12019105514546244,
"grad_norm": 0.78125,
"learning_rate": 0.0001,
"loss": 2.0485,
"step": 173
},
{
"epoch": 0.12088580112896222,
"grad_norm": 0.87109375,
"learning_rate": 0.00010057803468208094,
"loss": 1.7415,
"step": 174
},
{
"epoch": 0.12158054711246201,
"grad_norm": 0.75390625,
"learning_rate": 0.00010115606936416187,
"loss": 2.2481,
"step": 175
},
{
"epoch": 0.12227529309596179,
"grad_norm": 1.0,
"learning_rate": 0.00010173410404624277,
"loss": 1.878,
"step": 176
},
{
"epoch": 0.12297003907946157,
"grad_norm": 1.1328125,
"learning_rate": 0.0001023121387283237,
"loss": 1.5244,
"step": 177
},
{
"epoch": 0.12366478506296136,
"grad_norm": 0.57421875,
"learning_rate": 0.00010289017341040463,
"loss": 2.1072,
"step": 178
},
{
"epoch": 0.12435953104646114,
"grad_norm": 0.81640625,
"learning_rate": 0.00010346820809248556,
"loss": 1.9914,
"step": 179
},
{
"epoch": 0.12505427702996091,
"grad_norm": 0.828125,
"learning_rate": 0.00010404624277456647,
"loss": 1.3949,
"step": 180
},
{
"epoch": 0.1257490230134607,
"grad_norm": 0.71484375,
"learning_rate": 0.0001046242774566474,
"loss": 1.7482,
"step": 181
},
{
"epoch": 0.1264437689969605,
"grad_norm": 1.1015625,
"learning_rate": 0.00010520231213872833,
"loss": 1.6729,
"step": 182
},
{
"epoch": 0.12713851498046028,
"grad_norm": 0.58984375,
"learning_rate": 0.00010578034682080926,
"loss": 1.7445,
"step": 183
},
{
"epoch": 0.12783326096396005,
"grad_norm": 0.462890625,
"learning_rate": 0.00010635838150289017,
"loss": 2.1268,
"step": 184
},
{
"epoch": 0.12852800694745983,
"grad_norm": 0.9453125,
"learning_rate": 0.0001069364161849711,
"loss": 1.6518,
"step": 185
},
{
"epoch": 0.1292227529309596,
"grad_norm": 0.8515625,
"learning_rate": 0.00010751445086705203,
"loss": 1.9006,
"step": 186
},
{
"epoch": 0.1299174989144594,
"grad_norm": 17.25,
"learning_rate": 0.00010809248554913296,
"loss": 2.0701,
"step": 187
},
{
"epoch": 0.1306122448979592,
"grad_norm": 0.828125,
"learning_rate": 0.00010867052023121386,
"loss": 2.0978,
"step": 188
},
{
"epoch": 0.13130699088145897,
"grad_norm": 1.1015625,
"learning_rate": 0.0001092485549132948,
"loss": 1.8128,
"step": 189
},
{
"epoch": 0.13200173686495875,
"grad_norm": 0.90625,
"learning_rate": 0.00010982658959537572,
"loss": 1.9241,
"step": 190
},
{
"epoch": 0.13269648284845853,
"grad_norm": 0.5859375,
"learning_rate": 0.00011040462427745666,
"loss": 1.7011,
"step": 191
},
{
"epoch": 0.1333912288319583,
"grad_norm": 0.6796875,
"learning_rate": 0.00011098265895953757,
"loss": 2.2784,
"step": 192
},
{
"epoch": 0.1340859748154581,
"grad_norm": 3.078125,
"learning_rate": 0.00011156069364161849,
"loss": 2.16,
"step": 193
},
{
"epoch": 0.1347807207989579,
"grad_norm": 0.7421875,
"learning_rate": 0.00011213872832369942,
"loss": 1.734,
"step": 194
},
{
"epoch": 0.13547546678245767,
"grad_norm": 0.734375,
"learning_rate": 0.00011271676300578035,
"loss": 1.6558,
"step": 195
},
{
"epoch": 0.13617021276595745,
"grad_norm": 0.70703125,
"learning_rate": 0.00011329479768786128,
"loss": 1.7766,
"step": 196
},
{
"epoch": 0.13686495874945723,
"grad_norm": 0.72265625,
"learning_rate": 0.0001138728323699422,
"loss": 1.6735,
"step": 197
},
{
"epoch": 0.137559704732957,
"grad_norm": 1.1640625,
"learning_rate": 0.00011445086705202312,
"loss": 2.2215,
"step": 198
},
{
"epoch": 0.13825445071645678,
"grad_norm": 0.6484375,
"learning_rate": 0.00011502890173410405,
"loss": 1.8671,
"step": 199
},
{
"epoch": 0.1389491966999566,
"grad_norm": 1.40625,
"learning_rate": 0.00011560693641618498,
"loss": 1.9517,
"step": 200
},
{
"epoch": 0.13964394268345637,
"grad_norm": 0.67578125,
"learning_rate": 0.0001161849710982659,
"loss": 2.0064,
"step": 201
},
{
"epoch": 0.14033868866695615,
"grad_norm": 1.0234375,
"learning_rate": 0.00011676300578034683,
"loss": 1.7618,
"step": 202
},
{
"epoch": 0.14103343465045592,
"grad_norm": 1.3828125,
"learning_rate": 0.00011734104046242775,
"loss": 1.746,
"step": 203
},
{
"epoch": 0.1417281806339557,
"grad_norm": 1.453125,
"learning_rate": 0.00011791907514450868,
"loss": 2.4571,
"step": 204
},
{
"epoch": 0.14242292661745548,
"grad_norm": 1.171875,
"learning_rate": 0.0001184971098265896,
"loss": 2.0719,
"step": 205
},
{
"epoch": 0.1431176726009553,
"grad_norm": 0.93359375,
"learning_rate": 0.00011907514450867053,
"loss": 1.892,
"step": 206
},
{
"epoch": 0.14381241858445507,
"grad_norm": 0.9375,
"learning_rate": 0.00011965317919075146,
"loss": 1.9472,
"step": 207
},
{
"epoch": 0.14450716456795484,
"grad_norm": 0.78515625,
"learning_rate": 0.00012023121387283239,
"loss": 1.9143,
"step": 208
},
{
"epoch": 0.14520191055145462,
"grad_norm": 0.4296875,
"learning_rate": 0.00012080924855491329,
"loss": 1.9386,
"step": 209
},
{
"epoch": 0.1458966565349544,
"grad_norm": 4.40625,
"learning_rate": 0.00012138728323699422,
"loss": 1.6159,
"step": 210
},
{
"epoch": 0.14659140251845418,
"grad_norm": 0.59375,
"learning_rate": 0.00012196531791907516,
"loss": 1.9722,
"step": 211
},
{
"epoch": 0.14728614850195398,
"grad_norm": 1.0703125,
"learning_rate": 0.00012254335260115609,
"loss": 2.2088,
"step": 212
},
{
"epoch": 0.14798089448545376,
"grad_norm": 0.416015625,
"learning_rate": 0.00012312138728323702,
"loss": 2.3268,
"step": 213
},
{
"epoch": 0.14867564046895354,
"grad_norm": 0.87109375,
"learning_rate": 0.00012369942196531792,
"loss": 1.9869,
"step": 214
},
{
"epoch": 0.14937038645245332,
"grad_norm": 0.82421875,
"learning_rate": 0.00012427745664739885,
"loss": 1.6529,
"step": 215
},
{
"epoch": 0.1500651324359531,
"grad_norm": 1.1796875,
"learning_rate": 0.00012485549132947978,
"loss": 1.788,
"step": 216
},
{
"epoch": 0.15075987841945288,
"grad_norm": 0.80078125,
"learning_rate": 0.00012543352601156071,
"loss": 2.1698,
"step": 217
},
{
"epoch": 0.15145462440295268,
"grad_norm": 0.9296875,
"learning_rate": 0.00012601156069364162,
"loss": 1.8094,
"step": 218
},
{
"epoch": 0.15214937038645246,
"grad_norm": 0.859375,
"learning_rate": 0.00012658959537572255,
"loss": 1.5737,
"step": 219
},
{
"epoch": 0.15284411636995224,
"grad_norm": 0.75,
"learning_rate": 0.00012716763005780348,
"loss": 2.0224,
"step": 220
},
{
"epoch": 0.15353886235345202,
"grad_norm": 0.9140625,
"learning_rate": 0.0001277456647398844,
"loss": 1.9444,
"step": 221
},
{
"epoch": 0.1542336083369518,
"grad_norm": 1.5078125,
"learning_rate": 0.00012832369942196532,
"loss": 1.8394,
"step": 222
},
{
"epoch": 0.15492835432045157,
"grad_norm": 0.9609375,
"learning_rate": 0.00012890173410404625,
"loss": 1.9949,
"step": 223
},
{
"epoch": 0.15562310030395138,
"grad_norm": 0.515625,
"learning_rate": 0.00012947976878612718,
"loss": 1.8063,
"step": 224
},
{
"epoch": 0.15631784628745116,
"grad_norm": 0.76171875,
"learning_rate": 0.0001300578034682081,
"loss": 2.108,
"step": 225
},
{
"epoch": 0.15701259227095093,
"grad_norm": 0.8203125,
"learning_rate": 0.000130635838150289,
"loss": 1.865,
"step": 226
},
{
"epoch": 0.1577073382544507,
"grad_norm": 1.171875,
"learning_rate": 0.00013121387283236994,
"loss": 1.9677,
"step": 227
},
{
"epoch": 0.1584020842379505,
"grad_norm": 0.875,
"learning_rate": 0.00013179190751445087,
"loss": 1.5685,
"step": 228
},
{
"epoch": 0.15909683022145027,
"grad_norm": 0.609375,
"learning_rate": 0.0001323699421965318,
"loss": 1.7492,
"step": 229
},
{
"epoch": 0.15979157620495008,
"grad_norm": 1.609375,
"learning_rate": 0.0001329479768786127,
"loss": 2.3405,
"step": 230
},
{
"epoch": 0.16048632218844985,
"grad_norm": 1.3125,
"learning_rate": 0.00013352601156069364,
"loss": 1.8523,
"step": 231
},
{
"epoch": 0.16118106817194963,
"grad_norm": 0.7890625,
"learning_rate": 0.00013410404624277457,
"loss": 1.6507,
"step": 232
},
{
"epoch": 0.1618758141554494,
"grad_norm": 0.70703125,
"learning_rate": 0.0001346820809248555,
"loss": 1.9661,
"step": 233
},
{
"epoch": 0.1625705601389492,
"grad_norm": 0.65234375,
"learning_rate": 0.00013526011560693643,
"loss": 1.8777,
"step": 234
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.78125,
"learning_rate": 0.00013583815028901734,
"loss": 1.7511,
"step": 235
},
{
"epoch": 0.16396005210594877,
"grad_norm": 0.69140625,
"learning_rate": 0.00013641618497109827,
"loss": 2.1641,
"step": 236
},
{
"epoch": 0.16465479808944855,
"grad_norm": 0.77734375,
"learning_rate": 0.0001369942196531792,
"loss": 2.0807,
"step": 237
},
{
"epoch": 0.16534954407294833,
"grad_norm": 0.93359375,
"learning_rate": 0.00013757225433526013,
"loss": 1.5565,
"step": 238
},
{
"epoch": 0.1660442900564481,
"grad_norm": 0.58984375,
"learning_rate": 0.00013815028901734104,
"loss": 2.0807,
"step": 239
},
{
"epoch": 0.16673903603994789,
"grad_norm": 0.5078125,
"learning_rate": 0.00013872832369942197,
"loss": 1.7981,
"step": 240
},
{
"epoch": 0.16743378202344766,
"grad_norm": 0.578125,
"learning_rate": 0.0001393063583815029,
"loss": 1.6482,
"step": 241
},
{
"epoch": 0.16812852800694747,
"grad_norm": 0.8046875,
"learning_rate": 0.00013988439306358383,
"loss": 1.8768,
"step": 242
},
{
"epoch": 0.16882327399044725,
"grad_norm": 1.328125,
"learning_rate": 0.00014046242774566473,
"loss": 1.6573,
"step": 243
},
{
"epoch": 0.16951801997394703,
"grad_norm": 1.0546875,
"learning_rate": 0.00014104046242774566,
"loss": 1.6465,
"step": 244
},
{
"epoch": 0.1702127659574468,
"grad_norm": 0.859375,
"learning_rate": 0.0001416184971098266,
"loss": 2.1823,
"step": 245
},
{
"epoch": 0.17090751194094658,
"grad_norm": 1.078125,
"learning_rate": 0.00014219653179190753,
"loss": 2.1286,
"step": 246
},
{
"epoch": 0.17160225792444636,
"grad_norm": 0.64453125,
"learning_rate": 0.00014277456647398843,
"loss": 2.0725,
"step": 247
},
{
"epoch": 0.17229700390794617,
"grad_norm": 1.0,
"learning_rate": 0.00014335260115606936,
"loss": 1.9063,
"step": 248
},
{
"epoch": 0.17299174989144595,
"grad_norm": 0.71484375,
"learning_rate": 0.0001439306358381503,
"loss": 2.1774,
"step": 249
},
{
"epoch": 0.17368649587494572,
"grad_norm": 0.640625,
"learning_rate": 0.00014450867052023122,
"loss": 1.8539,
"step": 250
},
{
"epoch": 0.1743812418584455,
"grad_norm": 0.89453125,
"learning_rate": 0.00014508670520231215,
"loss": 1.8404,
"step": 251
},
{
"epoch": 0.17507598784194528,
"grad_norm": 0.84375,
"learning_rate": 0.00014566473988439306,
"loss": 1.8873,
"step": 252
},
{
"epoch": 0.17577073382544506,
"grad_norm": 0.6953125,
"learning_rate": 0.000146242774566474,
"loss": 1.9706,
"step": 253
},
{
"epoch": 0.17646547980894486,
"grad_norm": 0.94921875,
"learning_rate": 0.00014682080924855492,
"loss": 1.9709,
"step": 254
},
{
"epoch": 0.17716022579244464,
"grad_norm": 0.765625,
"learning_rate": 0.00014739884393063585,
"loss": 1.8306,
"step": 255
},
{
"epoch": 0.17785497177594442,
"grad_norm": 1.0,
"learning_rate": 0.00014797687861271676,
"loss": 2.1021,
"step": 256
},
{
"epoch": 0.1785497177594442,
"grad_norm": 0.62890625,
"learning_rate": 0.00014855491329479769,
"loss": 2.1095,
"step": 257
},
{
"epoch": 0.17924446374294398,
"grad_norm": 0.6328125,
"learning_rate": 0.00014913294797687862,
"loss": 2.0039,
"step": 258
},
{
"epoch": 0.17993920972644378,
"grad_norm": 1.4765625,
"learning_rate": 0.00014971098265895955,
"loss": 1.7203,
"step": 259
},
{
"epoch": 0.18063395570994356,
"grad_norm": 0.72265625,
"learning_rate": 0.00015028901734104045,
"loss": 1.6399,
"step": 260
},
{
"epoch": 0.18132870169344334,
"grad_norm": 0.90625,
"learning_rate": 0.00015086705202312138,
"loss": 1.6332,
"step": 261
},
{
"epoch": 0.18202344767694312,
"grad_norm": 1.3359375,
"learning_rate": 0.00015144508670520231,
"loss": 1.9613,
"step": 262
},
{
"epoch": 0.1827181936604429,
"grad_norm": 1.2265625,
"learning_rate": 0.00015202312138728325,
"loss": 1.7818,
"step": 263
},
{
"epoch": 0.18341293964394267,
"grad_norm": 1.5625,
"learning_rate": 0.00015260115606936415,
"loss": 2.2765,
"step": 264
},
{
"epoch": 0.18410768562744248,
"grad_norm": 1.625,
"learning_rate": 0.00015317919075144508,
"loss": 1.7086,
"step": 265
},
{
"epoch": 0.18480243161094226,
"grad_norm": 1.203125,
"learning_rate": 0.000153757225433526,
"loss": 1.7275,
"step": 266
},
{
"epoch": 0.18549717759444204,
"grad_norm": 0.7890625,
"learning_rate": 0.00015433526011560694,
"loss": 2.0655,
"step": 267
},
{
"epoch": 0.18619192357794181,
"grad_norm": 1.1796875,
"learning_rate": 0.00015491329479768785,
"loss": 1.5989,
"step": 268
},
{
"epoch": 0.1868866695614416,
"grad_norm": 1.546875,
"learning_rate": 0.00015549132947976878,
"loss": 1.1796,
"step": 269
},
{
"epoch": 0.18758141554494137,
"grad_norm": 0.859375,
"learning_rate": 0.0001560693641618497,
"loss": 1.8115,
"step": 270
},
{
"epoch": 0.18827616152844118,
"grad_norm": 1.359375,
"learning_rate": 0.00015664739884393064,
"loss": 1.9066,
"step": 271
},
{
"epoch": 0.18897090751194096,
"grad_norm": 1.1875,
"learning_rate": 0.00015722543352601157,
"loss": 2.0165,
"step": 272
},
{
"epoch": 0.18966565349544073,
"grad_norm": 1.4765625,
"learning_rate": 0.00015780346820809248,
"loss": 2.3133,
"step": 273
},
{
"epoch": 0.1903603994789405,
"grad_norm": 1.1953125,
"learning_rate": 0.0001583815028901734,
"loss": 2.0332,
"step": 274
},
{
"epoch": 0.1910551454624403,
"grad_norm": 3.734375,
"learning_rate": 0.00015895953757225434,
"loss": 1.9067,
"step": 275
},
{
"epoch": 0.19174989144594007,
"grad_norm": 0.83203125,
"learning_rate": 0.00015953757225433527,
"loss": 1.9406,
"step": 276
},
{
"epoch": 0.19244463742943987,
"grad_norm": 0.66796875,
"learning_rate": 0.00016011560693641617,
"loss": 1.8024,
"step": 277
},
{
"epoch": 0.19313938341293965,
"grad_norm": 0.76953125,
"learning_rate": 0.0001606936416184971,
"loss": 1.803,
"step": 278
},
{
"epoch": 0.19383412939643943,
"grad_norm": 0.84375,
"learning_rate": 0.00016127167630057803,
"loss": 2.1169,
"step": 279
},
{
"epoch": 0.1945288753799392,
"grad_norm": 0.75390625,
"learning_rate": 0.00016184971098265897,
"loss": 1.8095,
"step": 280
},
{
"epoch": 0.195223621363439,
"grad_norm": 0.671875,
"learning_rate": 0.0001624277456647399,
"loss": 2.1282,
"step": 281
},
{
"epoch": 0.19591836734693877,
"grad_norm": 0.92578125,
"learning_rate": 0.0001630057803468208,
"loss": 1.7845,
"step": 282
},
{
"epoch": 0.19661311333043857,
"grad_norm": 1.15625,
"learning_rate": 0.00016358381502890173,
"loss": 1.8735,
"step": 283
},
{
"epoch": 0.19730785931393835,
"grad_norm": 1.2265625,
"learning_rate": 0.00016416184971098266,
"loss": 1.9958,
"step": 284
},
{
"epoch": 0.19800260529743813,
"grad_norm": 0.796875,
"learning_rate": 0.0001647398843930636,
"loss": 2.1224,
"step": 285
},
{
"epoch": 0.1986973512809379,
"grad_norm": 1.0390625,
"learning_rate": 0.00016531791907514452,
"loss": 2.1101,
"step": 286
},
{
"epoch": 0.19939209726443768,
"grad_norm": 0.69140625,
"learning_rate": 0.00016589595375722543,
"loss": 2.1356,
"step": 287
},
{
"epoch": 0.20008684324793746,
"grad_norm": 1.2109375,
"learning_rate": 0.00016647398843930636,
"loss": 1.6759,
"step": 288
},
{
"epoch": 0.20078158923143727,
"grad_norm": 0.94140625,
"learning_rate": 0.0001670520231213873,
"loss": 2.1158,
"step": 289
},
{
"epoch": 0.20147633521493705,
"grad_norm": 0.66796875,
"learning_rate": 0.00016763005780346822,
"loss": 2.1826,
"step": 290
},
{
"epoch": 0.20217108119843683,
"grad_norm": 0.91015625,
"learning_rate": 0.00016820809248554915,
"loss": 1.8656,
"step": 291
},
{
"epoch": 0.2028658271819366,
"grad_norm": 1.03125,
"learning_rate": 0.00016878612716763006,
"loss": 2.2818,
"step": 292
},
{
"epoch": 0.20356057316543638,
"grad_norm": 0.7109375,
"learning_rate": 0.000169364161849711,
"loss": 1.6402,
"step": 293
},
{
"epoch": 0.20425531914893616,
"grad_norm": 5.40625,
"learning_rate": 0.00016994219653179192,
"loss": 1.8538,
"step": 294
},
{
"epoch": 0.20495006513243597,
"grad_norm": 2.859375,
"learning_rate": 0.00017052023121387285,
"loss": 1.6352,
"step": 295
},
{
"epoch": 0.20564481111593574,
"grad_norm": 0.9453125,
"learning_rate": 0.00017109826589595378,
"loss": 1.7506,
"step": 296
},
{
"epoch": 0.20633955709943552,
"grad_norm": 0.78125,
"learning_rate": 0.0001716763005780347,
"loss": 1.889,
"step": 297
},
{
"epoch": 0.2070343030829353,
"grad_norm": 0.703125,
"learning_rate": 0.00017225433526011562,
"loss": 2.0453,
"step": 298
},
{
"epoch": 0.20772904906643508,
"grad_norm": 1.125,
"learning_rate": 0.00017283236994219655,
"loss": 2.182,
"step": 299
},
{
"epoch": 0.20842379504993486,
"grad_norm": 0.73828125,
"learning_rate": 0.00017341040462427748,
"loss": 1.943,
"step": 300
},
{
"epoch": 0.20911854103343466,
"grad_norm": 0.93359375,
"learning_rate": 0.0001739884393063584,
"loss": 1.9609,
"step": 301
},
{
"epoch": 0.20981328701693444,
"grad_norm": 0.93359375,
"learning_rate": 0.0001745664739884393,
"loss": 1.8991,
"step": 302
},
{
"epoch": 0.21050803300043422,
"grad_norm": 1.328125,
"learning_rate": 0.00017514450867052024,
"loss": 1.8573,
"step": 303
},
{
"epoch": 0.211202778983934,
"grad_norm": 0.84375,
"learning_rate": 0.00017572254335260118,
"loss": 2.1771,
"step": 304
},
{
"epoch": 0.21189752496743378,
"grad_norm": 0.9375,
"learning_rate": 0.0001763005780346821,
"loss": 1.9985,
"step": 305
},
{
"epoch": 0.21259227095093355,
"grad_norm": 0.62890625,
"learning_rate": 0.000176878612716763,
"loss": 1.5911,
"step": 306
},
{
"epoch": 0.21328701693443336,
"grad_norm": 0.91796875,
"learning_rate": 0.00017745664739884394,
"loss": 2.0792,
"step": 307
},
{
"epoch": 0.21398176291793314,
"grad_norm": 0.8046875,
"learning_rate": 0.00017803468208092487,
"loss": 1.5303,
"step": 308
},
{
"epoch": 0.21467650890143292,
"grad_norm": 0.55078125,
"learning_rate": 0.0001786127167630058,
"loss": 2.1808,
"step": 309
},
{
"epoch": 0.2153712548849327,
"grad_norm": 0.84765625,
"learning_rate": 0.00017919075144508673,
"loss": 1.5205,
"step": 310
},
{
"epoch": 0.21606600086843247,
"grad_norm": 0.96875,
"learning_rate": 0.00017976878612716764,
"loss": 1.8961,
"step": 311
},
{
"epoch": 0.21676074685193225,
"grad_norm": 0.9921875,
"learning_rate": 0.00018034682080924857,
"loss": 1.634,
"step": 312
},
{
"epoch": 0.21745549283543206,
"grad_norm": 1.1953125,
"learning_rate": 0.0001809248554913295,
"loss": 1.9757,
"step": 313
},
{
"epoch": 0.21815023881893184,
"grad_norm": 1.2109375,
"learning_rate": 0.00018150289017341043,
"loss": 1.6177,
"step": 314
},
{
"epoch": 0.2188449848024316,
"grad_norm": 0.76953125,
"learning_rate": 0.00018208092485549134,
"loss": 1.5741,
"step": 315
},
{
"epoch": 0.2195397307859314,
"grad_norm": 0.61328125,
"learning_rate": 0.00018265895953757227,
"loss": 1.8792,
"step": 316
},
{
"epoch": 0.22023447676943117,
"grad_norm": 0.984375,
"learning_rate": 0.0001832369942196532,
"loss": 1.973,
"step": 317
},
{
"epoch": 0.22092922275293095,
"grad_norm": 1.0703125,
"learning_rate": 0.00018381502890173413,
"loss": 2.0379,
"step": 318
},
{
"epoch": 0.22162396873643075,
"grad_norm": 0.84375,
"learning_rate": 0.00018439306358381503,
"loss": 1.9269,
"step": 319
},
{
"epoch": 0.22231871471993053,
"grad_norm": 0.8359375,
"learning_rate": 0.00018497109826589596,
"loss": 2.2882,
"step": 320
},
{
"epoch": 0.2230134607034303,
"grad_norm": 1.09375,
"learning_rate": 0.0001855491329479769,
"loss": 1.7531,
"step": 321
},
{
"epoch": 0.2237082066869301,
"grad_norm": 0.84765625,
"learning_rate": 0.00018612716763005783,
"loss": 2.011,
"step": 322
},
{
"epoch": 0.22440295267042987,
"grad_norm": 0.61328125,
"learning_rate": 0.00018670520231213873,
"loss": 1.7293,
"step": 323
},
{
"epoch": 0.22509769865392965,
"grad_norm": 1.328125,
"learning_rate": 0.00018728323699421966,
"loss": 1.8176,
"step": 324
},
{
"epoch": 0.22579244463742945,
"grad_norm": 0.75,
"learning_rate": 0.0001878612716763006,
"loss": 2.1176,
"step": 325
},
{
"epoch": 0.22648719062092923,
"grad_norm": 0.859375,
"learning_rate": 0.00018843930635838152,
"loss": 1.9007,
"step": 326
},
{
"epoch": 0.227181936604429,
"grad_norm": 0.7421875,
"learning_rate": 0.00018901734104046245,
"loss": 1.9553,
"step": 327
},
{
"epoch": 0.22787668258792879,
"grad_norm": 2.78125,
"learning_rate": 0.00018959537572254336,
"loss": 1.7696,
"step": 328
},
{
"epoch": 0.22857142857142856,
"grad_norm": 1.6484375,
"learning_rate": 0.0001901734104046243,
"loss": 2.1033,
"step": 329
},
{
"epoch": 0.22926617455492834,
"grad_norm": 0.703125,
"learning_rate": 0.00019075144508670522,
"loss": 2.0961,
"step": 330
},
{
"epoch": 0.22996092053842815,
"grad_norm": 1.0078125,
"learning_rate": 0.00019132947976878615,
"loss": 1.8456,
"step": 331
},
{
"epoch": 0.23065566652192793,
"grad_norm": 0.81640625,
"learning_rate": 0.00019190751445086706,
"loss": 1.6038,
"step": 332
},
{
"epoch": 0.2313504125054277,
"grad_norm": 0.625,
"learning_rate": 0.000192485549132948,
"loss": 1.9724,
"step": 333
},
{
"epoch": 0.23204515848892748,
"grad_norm": 0.71484375,
"learning_rate": 0.00019306358381502892,
"loss": 1.9463,
"step": 334
},
{
"epoch": 0.23273990447242726,
"grad_norm": 0.65234375,
"learning_rate": 0.00019364161849710985,
"loss": 1.8556,
"step": 335
},
{
"epoch": 0.23343465045592704,
"grad_norm": 1.375,
"learning_rate": 0.00019421965317919075,
"loss": 2.1624,
"step": 336
},
{
"epoch": 0.23412939643942685,
"grad_norm": 0.88671875,
"learning_rate": 0.00019479768786127168,
"loss": 1.8647,
"step": 337
},
{
"epoch": 0.23482414242292662,
"grad_norm": 0.91796875,
"learning_rate": 0.00019537572254335262,
"loss": 1.5004,
"step": 338
},
{
"epoch": 0.2355188884064264,
"grad_norm": 0.79296875,
"learning_rate": 0.00019595375722543355,
"loss": 2.1977,
"step": 339
},
{
"epoch": 0.23621363438992618,
"grad_norm": 0.68359375,
"learning_rate": 0.00019653179190751445,
"loss": 1.8312,
"step": 340
},
{
"epoch": 0.23690838037342596,
"grad_norm": 0.609375,
"learning_rate": 0.00019710982658959538,
"loss": 1.9086,
"step": 341
},
{
"epoch": 0.23760312635692574,
"grad_norm": 1.453125,
"learning_rate": 0.0001976878612716763,
"loss": 1.7357,
"step": 342
},
{
"epoch": 0.23829787234042554,
"grad_norm": 0.828125,
"learning_rate": 0.00019826589595375724,
"loss": 1.7354,
"step": 343
},
{
"epoch": 0.23899261832392532,
"grad_norm": 0.8203125,
"learning_rate": 0.00019884393063583815,
"loss": 1.867,
"step": 344
},
{
"epoch": 0.2396873643074251,
"grad_norm": 0.94921875,
"learning_rate": 0.00019942196531791908,
"loss": 1.9156,
"step": 345
},
{
"epoch": 0.24038211029092488,
"grad_norm": 0.953125,
"learning_rate": 0.0002,
"loss": 1.9306,
"step": 346
},
{
"epoch": 0.24107685627442466,
"grad_norm": 0.69140625,
"learning_rate": 0.00019994963485268195,
"loss": 2.1757,
"step": 347
},
{
"epoch": 0.24177160225792443,
"grad_norm": 4.09375,
"learning_rate": 0.0001998992697053639,
"loss": 1.7667,
"step": 348
},
{
"epoch": 0.24246634824142424,
"grad_norm": 0.875,
"learning_rate": 0.00019984890455804585,
"loss": 1.9543,
"step": 349
},
{
"epoch": 0.24316109422492402,
"grad_norm": 1.234375,
"learning_rate": 0.00019979853941072778,
"loss": 1.5744,
"step": 350
},
{
"epoch": 0.2438558402084238,
"grad_norm": 0.83203125,
"learning_rate": 0.00019974817426340972,
"loss": 1.9601,
"step": 351
},
{
"epoch": 0.24455058619192357,
"grad_norm": 1.1015625,
"learning_rate": 0.00019969780911609168,
"loss": 2.0014,
"step": 352
},
{
"epoch": 0.24524533217542335,
"grad_norm": 0.6484375,
"learning_rate": 0.00019964744396877362,
"loss": 1.8605,
"step": 353
},
{
"epoch": 0.24594007815892313,
"grad_norm": 1.0859375,
"learning_rate": 0.00019959707882145555,
"loss": 1.9851,
"step": 354
},
{
"epoch": 0.24663482414242294,
"grad_norm": 0.69921875,
"learning_rate": 0.00019954671367413752,
"loss": 2.1791,
"step": 355
},
{
"epoch": 0.24732957012592272,
"grad_norm": 0.640625,
"learning_rate": 0.00019949634852681945,
"loss": 1.8084,
"step": 356
},
{
"epoch": 0.2480243161094225,
"grad_norm": 0.78125,
"learning_rate": 0.0001994459833795014,
"loss": 1.9735,
"step": 357
},
{
"epoch": 0.24871906209292227,
"grad_norm": 0.9609375,
"learning_rate": 0.00019939561823218333,
"loss": 2.0874,
"step": 358
},
{
"epoch": 0.24941380807642205,
"grad_norm": 0.91015625,
"learning_rate": 0.0001993452530848653,
"loss": 2.0039,
"step": 359
},
{
"epoch": 0.25010855405992183,
"grad_norm": 0.734375,
"learning_rate": 0.00019929488793754723,
"loss": 1.8483,
"step": 360
},
{
"epoch": 0.2508033000434216,
"grad_norm": 0.97265625,
"learning_rate": 0.00019924452279022916,
"loss": 1.7729,
"step": 361
},
{
"epoch": 0.2514980460269214,
"grad_norm": 0.90234375,
"learning_rate": 0.00019919415764291113,
"loss": 1.8132,
"step": 362
},
{
"epoch": 0.25219279201042116,
"grad_norm": 1.015625,
"learning_rate": 0.00019914379249559306,
"loss": 1.5844,
"step": 363
},
{
"epoch": 0.252887537993921,
"grad_norm": 1.4453125,
"learning_rate": 0.000199093427348275,
"loss": 2.1404,
"step": 364
},
{
"epoch": 0.2535822839774208,
"grad_norm": 2.796875,
"learning_rate": 0.00019904306220095693,
"loss": 2.0058,
"step": 365
},
{
"epoch": 0.25427702996092055,
"grad_norm": 0.8359375,
"learning_rate": 0.0001989926970536389,
"loss": 1.795,
"step": 366
},
{
"epoch": 0.25497177594442033,
"grad_norm": 0.83203125,
"learning_rate": 0.00019894233190632083,
"loss": 2.0315,
"step": 367
},
{
"epoch": 0.2556665219279201,
"grad_norm": 0.66796875,
"learning_rate": 0.0001988919667590028,
"loss": 1.8603,
"step": 368
},
{
"epoch": 0.2563612679114199,
"grad_norm": 0.6171875,
"learning_rate": 0.00019884160161168473,
"loss": 2.0518,
"step": 369
},
{
"epoch": 0.25705601389491967,
"grad_norm": 0.62890625,
"learning_rate": 0.00019879123646436667,
"loss": 1.4851,
"step": 370
},
{
"epoch": 0.25775075987841944,
"grad_norm": 1.2265625,
"learning_rate": 0.0001987408713170486,
"loss": 1.5712,
"step": 371
},
{
"epoch": 0.2584455058619192,
"grad_norm": 1.0390625,
"learning_rate": 0.00019869050616973054,
"loss": 2.2505,
"step": 372
},
{
"epoch": 0.259140251845419,
"grad_norm": 0.93359375,
"learning_rate": 0.0001986401410224125,
"loss": 2.3709,
"step": 373
},
{
"epoch": 0.2598349978289188,
"grad_norm": 0.671875,
"learning_rate": 0.00019858977587509444,
"loss": 2.1194,
"step": 374
},
{
"epoch": 0.26052974381241856,
"grad_norm": 1.0546875,
"learning_rate": 0.0001985394107277764,
"loss": 1.5963,
"step": 375
},
{
"epoch": 0.2612244897959184,
"grad_norm": 0.56640625,
"learning_rate": 0.00019848904558045834,
"loss": 2.1682,
"step": 376
},
{
"epoch": 0.26191923577941817,
"grad_norm": 0.66015625,
"learning_rate": 0.00019843868043314028,
"loss": 1.848,
"step": 377
},
{
"epoch": 0.26261398176291795,
"grad_norm": 0.55078125,
"learning_rate": 0.00019838831528582222,
"loss": 1.5301,
"step": 378
},
{
"epoch": 0.2633087277464177,
"grad_norm": 0.83984375,
"learning_rate": 0.00019833795013850415,
"loss": 1.6342,
"step": 379
},
{
"epoch": 0.2640034737299175,
"grad_norm": 0.8203125,
"learning_rate": 0.00019828758499118611,
"loss": 1.7481,
"step": 380
},
{
"epoch": 0.2646982197134173,
"grad_norm": 0.875,
"learning_rate": 0.00019823721984386805,
"loss": 2.0733,
"step": 381
},
{
"epoch": 0.26539296569691706,
"grad_norm": 0.85546875,
"learning_rate": 0.00019818685469655001,
"loss": 2.2002,
"step": 382
},
{
"epoch": 0.26608771168041684,
"grad_norm": 4.0,
"learning_rate": 0.00019813648954923195,
"loss": 1.8585,
"step": 383
},
{
"epoch": 0.2667824576639166,
"grad_norm": 0.83203125,
"learning_rate": 0.0001980861244019139,
"loss": 1.8138,
"step": 384
},
{
"epoch": 0.2674772036474164,
"grad_norm": 1.234375,
"learning_rate": 0.00019803575925459582,
"loss": 2.0099,
"step": 385
},
{
"epoch": 0.2681719496309162,
"grad_norm": 0.91015625,
"learning_rate": 0.00019798539410727776,
"loss": 1.9675,
"step": 386
},
{
"epoch": 0.26886669561441595,
"grad_norm": 0.87109375,
"learning_rate": 0.00019793502895995972,
"loss": 1.6869,
"step": 387
},
{
"epoch": 0.2695614415979158,
"grad_norm": 0.64453125,
"learning_rate": 0.00019788466381264166,
"loss": 1.9112,
"step": 388
},
{
"epoch": 0.27025618758141556,
"grad_norm": 1.0859375,
"learning_rate": 0.00019783429866532362,
"loss": 1.7991,
"step": 389
},
{
"epoch": 0.27095093356491534,
"grad_norm": 0.8828125,
"learning_rate": 0.00019778393351800556,
"loss": 1.7384,
"step": 390
},
{
"epoch": 0.2716456795484151,
"grad_norm": 0.66015625,
"learning_rate": 0.0001977335683706875,
"loss": 1.763,
"step": 391
},
{
"epoch": 0.2723404255319149,
"grad_norm": 1.3671875,
"learning_rate": 0.00019768320322336943,
"loss": 1.837,
"step": 392
},
{
"epoch": 0.2730351715154147,
"grad_norm": 1.4140625,
"learning_rate": 0.00019763283807605137,
"loss": 1.9485,
"step": 393
},
{
"epoch": 0.27372991749891445,
"grad_norm": 0.8203125,
"learning_rate": 0.00019758247292873333,
"loss": 2.2162,
"step": 394
},
{
"epoch": 0.27442466348241423,
"grad_norm": 1.171875,
"learning_rate": 0.00019753210778141527,
"loss": 2.1763,
"step": 395
},
{
"epoch": 0.275119409465914,
"grad_norm": 0.85546875,
"learning_rate": 0.00019748174263409723,
"loss": 1.8636,
"step": 396
},
{
"epoch": 0.2758141554494138,
"grad_norm": 1.09375,
"learning_rate": 0.00019743137748677917,
"loss": 2.0849,
"step": 397
},
{
"epoch": 0.27650890143291357,
"grad_norm": 1.0078125,
"learning_rate": 0.0001973810123394611,
"loss": 2.1198,
"step": 398
},
{
"epoch": 0.2772036474164134,
"grad_norm": 0.5546875,
"learning_rate": 0.00019733064719214304,
"loss": 1.8171,
"step": 399
},
{
"epoch": 0.2778983933999132,
"grad_norm": 0.5625,
"learning_rate": 0.00019728028204482498,
"loss": 2.1271,
"step": 400
},
{
"epoch": 0.27859313938341296,
"grad_norm": 0.80859375,
"learning_rate": 0.00019722991689750694,
"loss": 1.8004,
"step": 401
},
{
"epoch": 0.27928788536691274,
"grad_norm": 1.1328125,
"learning_rate": 0.00019717955175018888,
"loss": 2.0326,
"step": 402
},
{
"epoch": 0.2799826313504125,
"grad_norm": 0.8359375,
"learning_rate": 0.00019712918660287084,
"loss": 1.6586,
"step": 403
},
{
"epoch": 0.2806773773339123,
"grad_norm": 0.60546875,
"learning_rate": 0.00019707882145555278,
"loss": 1.7802,
"step": 404
},
{
"epoch": 0.28137212331741207,
"grad_norm": 0.59375,
"learning_rate": 0.0001970284563082347,
"loss": 1.9849,
"step": 405
},
{
"epoch": 0.28206686930091185,
"grad_norm": 0.73828125,
"learning_rate": 0.00019697809116091665,
"loss": 2.3161,
"step": 406
},
{
"epoch": 0.2827616152844116,
"grad_norm": 0.97265625,
"learning_rate": 0.00019692772601359858,
"loss": 1.7711,
"step": 407
},
{
"epoch": 0.2834563612679114,
"grad_norm": 0.66015625,
"learning_rate": 0.00019687736086628055,
"loss": 1.8916,
"step": 408
},
{
"epoch": 0.2841511072514112,
"grad_norm": 0.62890625,
"learning_rate": 0.00019682699571896248,
"loss": 2.0297,
"step": 409
},
{
"epoch": 0.28484585323491096,
"grad_norm": 0.8359375,
"learning_rate": 0.00019677663057164445,
"loss": 1.8274,
"step": 410
},
{
"epoch": 0.2855405992184108,
"grad_norm": 0.69140625,
"learning_rate": 0.00019672626542432638,
"loss": 1.5261,
"step": 411
},
{
"epoch": 0.2862353452019106,
"grad_norm": 0.4921875,
"learning_rate": 0.00019667590027700832,
"loss": 1.9996,
"step": 412
},
{
"epoch": 0.28693009118541035,
"grad_norm": 1.3671875,
"learning_rate": 0.00019662553512969026,
"loss": 2.1384,
"step": 413
},
{
"epoch": 0.28762483716891013,
"grad_norm": 0.90234375,
"learning_rate": 0.0001965751699823722,
"loss": 1.6071,
"step": 414
},
{
"epoch": 0.2883195831524099,
"grad_norm": 0.890625,
"learning_rate": 0.00019652480483505416,
"loss": 2.268,
"step": 415
},
{
"epoch": 0.2890143291359097,
"grad_norm": 0.515625,
"learning_rate": 0.0001964744396877361,
"loss": 1.6901,
"step": 416
},
{
"epoch": 0.28970907511940946,
"grad_norm": 0.953125,
"learning_rate": 0.00019642407454041806,
"loss": 2.0621,
"step": 417
},
{
"epoch": 0.29040382110290924,
"grad_norm": 1.234375,
"learning_rate": 0.0001963737093931,
"loss": 1.7457,
"step": 418
},
{
"epoch": 0.291098567086409,
"grad_norm": 0.84765625,
"learning_rate": 0.00019632334424578193,
"loss": 1.8165,
"step": 419
},
{
"epoch": 0.2917933130699088,
"grad_norm": 0.75390625,
"learning_rate": 0.00019627297909846387,
"loss": 1.8413,
"step": 420
},
{
"epoch": 0.2924880590534086,
"grad_norm": 0.8046875,
"learning_rate": 0.0001962226139511458,
"loss": 2.0921,
"step": 421
},
{
"epoch": 0.29318280503690836,
"grad_norm": 0.72265625,
"learning_rate": 0.00019617224880382777,
"loss": 1.9443,
"step": 422
},
{
"epoch": 0.2938775510204082,
"grad_norm": 2.390625,
"learning_rate": 0.0001961218836565097,
"loss": 1.8594,
"step": 423
},
{
"epoch": 0.29457229700390797,
"grad_norm": 1.6328125,
"learning_rate": 0.00019607151850919166,
"loss": 1.8851,
"step": 424
},
{
"epoch": 0.29526704298740775,
"grad_norm": 0.73828125,
"learning_rate": 0.0001960211533618736,
"loss": 2.0707,
"step": 425
},
{
"epoch": 0.2959617889709075,
"grad_norm": 0.9296875,
"learning_rate": 0.00019597078821455554,
"loss": 2.2316,
"step": 426
},
{
"epoch": 0.2966565349544073,
"grad_norm": 0.84765625,
"learning_rate": 0.00019592042306723747,
"loss": 1.8408,
"step": 427
},
{
"epoch": 0.2973512809379071,
"grad_norm": 0.8515625,
"learning_rate": 0.0001958700579199194,
"loss": 1.893,
"step": 428
},
{
"epoch": 0.29804602692140686,
"grad_norm": 0.7109375,
"learning_rate": 0.00019581969277260137,
"loss": 1.929,
"step": 429
},
{
"epoch": 0.29874077290490664,
"grad_norm": 0.68359375,
"learning_rate": 0.0001957693276252833,
"loss": 1.9385,
"step": 430
},
{
"epoch": 0.2994355188884064,
"grad_norm": 0.875,
"learning_rate": 0.00019571896247796527,
"loss": 1.9914,
"step": 431
},
{
"epoch": 0.3001302648719062,
"grad_norm": 1.1640625,
"learning_rate": 0.0001956685973306472,
"loss": 1.7315,
"step": 432
},
{
"epoch": 0.30082501085540597,
"grad_norm": 0.8046875,
"learning_rate": 0.00019561823218332915,
"loss": 1.5899,
"step": 433
},
{
"epoch": 0.30151975683890575,
"grad_norm": 0.53515625,
"learning_rate": 0.00019556786703601108,
"loss": 2.0665,
"step": 434
},
{
"epoch": 0.3022145028224056,
"grad_norm": 1.25,
"learning_rate": 0.00019551750188869305,
"loss": 1.7969,
"step": 435
},
{
"epoch": 0.30290924880590536,
"grad_norm": 0.6953125,
"learning_rate": 0.00019546713674137498,
"loss": 1.8387,
"step": 436
},
{
"epoch": 0.30360399478940514,
"grad_norm": 1.1015625,
"learning_rate": 0.00019541677159405692,
"loss": 1.8154,
"step": 437
},
{
"epoch": 0.3042987407729049,
"grad_norm": 0.890625,
"learning_rate": 0.00019536640644673888,
"loss": 1.9395,
"step": 438
},
{
"epoch": 0.3049934867564047,
"grad_norm": 0.60546875,
"learning_rate": 0.00019531604129942082,
"loss": 1.8653,
"step": 439
},
{
"epoch": 0.3056882327399045,
"grad_norm": 0.7109375,
"learning_rate": 0.00019526567615210275,
"loss": 1.9524,
"step": 440
},
{
"epoch": 0.30638297872340425,
"grad_norm": 0.75390625,
"learning_rate": 0.0001952153110047847,
"loss": 1.5947,
"step": 441
},
{
"epoch": 0.30707772470690403,
"grad_norm": 1.4765625,
"learning_rate": 0.00019516494585746665,
"loss": 2.1353,
"step": 442
},
{
"epoch": 0.3077724706904038,
"grad_norm": 1.453125,
"learning_rate": 0.0001951145807101486,
"loss": 1.6529,
"step": 443
},
{
"epoch": 0.3084672166739036,
"grad_norm": 1.0234375,
"learning_rate": 0.00019506421556283053,
"loss": 1.9062,
"step": 444
},
{
"epoch": 0.30916196265740337,
"grad_norm": 1.1484375,
"learning_rate": 0.0001950138504155125,
"loss": 2.128,
"step": 445
},
{
"epoch": 0.30985670864090314,
"grad_norm": 0.8203125,
"learning_rate": 0.0001949634852681944,
"loss": 1.9101,
"step": 446
},
{
"epoch": 0.310551454624403,
"grad_norm": 0.73046875,
"learning_rate": 0.00019491312012087636,
"loss": 1.6201,
"step": 447
},
{
"epoch": 0.31124620060790276,
"grad_norm": 1.296875,
"learning_rate": 0.0001948627549735583,
"loss": 2.1286,
"step": 448
},
{
"epoch": 0.31194094659140253,
"grad_norm": 0.65625,
"learning_rate": 0.00019481238982624026,
"loss": 1.7674,
"step": 449
},
{
"epoch": 0.3126356925749023,
"grad_norm": 0.9609375,
"learning_rate": 0.0001947620246789222,
"loss": 2.2924,
"step": 450
},
{
"epoch": 0.3133304385584021,
"grad_norm": 0.50390625,
"learning_rate": 0.00019471165953160413,
"loss": 2.1334,
"step": 451
},
{
"epoch": 0.31402518454190187,
"grad_norm": 1.4140625,
"learning_rate": 0.0001946612943842861,
"loss": 2.1055,
"step": 452
},
{
"epoch": 0.31471993052540165,
"grad_norm": 0.98828125,
"learning_rate": 0.000194610929236968,
"loss": 2.0685,
"step": 453
},
{
"epoch": 0.3154146765089014,
"grad_norm": 0.91796875,
"learning_rate": 0.00019456056408964997,
"loss": 2.2465,
"step": 454
},
{
"epoch": 0.3161094224924012,
"grad_norm": 0.9921875,
"learning_rate": 0.0001945101989423319,
"loss": 1.4865,
"step": 455
},
{
"epoch": 0.316804168475901,
"grad_norm": 0.9453125,
"learning_rate": 0.00019445983379501387,
"loss": 1.9726,
"step": 456
},
{
"epoch": 0.31749891445940076,
"grad_norm": 2.984375,
"learning_rate": 0.0001944094686476958,
"loss": 1.8845,
"step": 457
},
{
"epoch": 0.31819366044290054,
"grad_norm": 0.90625,
"learning_rate": 0.00019435910350037774,
"loss": 1.9736,
"step": 458
},
{
"epoch": 0.3188884064264004,
"grad_norm": 0.53515625,
"learning_rate": 0.0001943087383530597,
"loss": 2.0074,
"step": 459
},
{
"epoch": 0.31958315240990015,
"grad_norm": 0.56640625,
"learning_rate": 0.00019425837320574162,
"loss": 2.0931,
"step": 460
},
{
"epoch": 0.32027789839339993,
"grad_norm": 1.0,
"learning_rate": 0.00019420800805842358,
"loss": 1.9506,
"step": 461
},
{
"epoch": 0.3209726443768997,
"grad_norm": 0.90234375,
"learning_rate": 0.00019415764291110552,
"loss": 2.0672,
"step": 462
},
{
"epoch": 0.3216673903603995,
"grad_norm": 0.94140625,
"learning_rate": 0.00019410727776378748,
"loss": 2.1114,
"step": 463
},
{
"epoch": 0.32236213634389926,
"grad_norm": 1.4296875,
"learning_rate": 0.00019405691261646942,
"loss": 1.8596,
"step": 464
},
{
"epoch": 0.32305688232739904,
"grad_norm": 0.7890625,
"learning_rate": 0.00019400654746915138,
"loss": 2.2623,
"step": 465
},
{
"epoch": 0.3237516283108988,
"grad_norm": 0.515625,
"learning_rate": 0.00019395618232183331,
"loss": 1.9356,
"step": 466
},
{
"epoch": 0.3244463742943986,
"grad_norm": 0.72265625,
"learning_rate": 0.00019390581717451522,
"loss": 2.1368,
"step": 467
},
{
"epoch": 0.3251411202778984,
"grad_norm": 1.046875,
"learning_rate": 0.0001938554520271972,
"loss": 1.9551,
"step": 468
},
{
"epoch": 0.32583586626139815,
"grad_norm": 0.9375,
"learning_rate": 0.00019380508687987912,
"loss": 1.7151,
"step": 469
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.58984375,
"learning_rate": 0.0001937547217325611,
"loss": 1.6639,
"step": 470
},
{
"epoch": 0.32722535822839777,
"grad_norm": 1.265625,
"learning_rate": 0.00019370435658524302,
"loss": 1.9154,
"step": 471
},
{
"epoch": 0.32792010421189755,
"grad_norm": 1.3984375,
"learning_rate": 0.000193653991437925,
"loss": 1.6919,
"step": 472
},
{
"epoch": 0.3286148501953973,
"grad_norm": 1.1328125,
"learning_rate": 0.00019360362629060692,
"loss": 2.0988,
"step": 473
},
{
"epoch": 0.3293095961788971,
"grad_norm": 1.171875,
"learning_rate": 0.00019355326114328883,
"loss": 2.12,
"step": 474
},
{
"epoch": 0.3300043421623969,
"grad_norm": 0.94140625,
"learning_rate": 0.0001935028959959708,
"loss": 2.1701,
"step": 475
},
{
"epoch": 0.33069908814589666,
"grad_norm": 0.9296875,
"learning_rate": 0.00019345253084865273,
"loss": 2.0642,
"step": 476
},
{
"epoch": 0.33139383412939644,
"grad_norm": 0.85546875,
"learning_rate": 0.0001934021657013347,
"loss": 1.9536,
"step": 477
},
{
"epoch": 0.3320885801128962,
"grad_norm": 1.4453125,
"learning_rate": 0.00019335180055401663,
"loss": 2.023,
"step": 478
},
{
"epoch": 0.332783326096396,
"grad_norm": 0.734375,
"learning_rate": 0.0001933014354066986,
"loss": 1.9748,
"step": 479
},
{
"epoch": 0.33347807207989577,
"grad_norm": 4.15625,
"learning_rate": 0.0001932510702593805,
"loss": 1.7722,
"step": 480
},
{
"epoch": 0.33417281806339555,
"grad_norm": 0.703125,
"learning_rate": 0.00019320070511206244,
"loss": 2.1057,
"step": 481
},
{
"epoch": 0.3348675640468953,
"grad_norm": 0.66796875,
"learning_rate": 0.0001931503399647444,
"loss": 1.6782,
"step": 482
},
{
"epoch": 0.33556231003039516,
"grad_norm": 1.1640625,
"learning_rate": 0.00019309997481742634,
"loss": 2.1179,
"step": 483
},
{
"epoch": 0.33625705601389494,
"grad_norm": 0.9375,
"learning_rate": 0.0001930496096701083,
"loss": 2.192,
"step": 484
},
{
"epoch": 0.3369518019973947,
"grad_norm": 0.77734375,
"learning_rate": 0.00019299924452279024,
"loss": 1.8594,
"step": 485
},
{
"epoch": 0.3376465479808945,
"grad_norm": 0.6484375,
"learning_rate": 0.0001929488793754722,
"loss": 1.9312,
"step": 486
},
{
"epoch": 0.3383412939643943,
"grad_norm": 1.4375,
"learning_rate": 0.0001928985142281541,
"loss": 1.9013,
"step": 487
},
{
"epoch": 0.33903603994789405,
"grad_norm": 0.859375,
"learning_rate": 0.00019284814908083605,
"loss": 1.9257,
"step": 488
},
{
"epoch": 0.33973078593139383,
"grad_norm": 0.7265625,
"learning_rate": 0.000192797783933518,
"loss": 2.0159,
"step": 489
},
{
"epoch": 0.3404255319148936,
"grad_norm": 1.203125,
"learning_rate": 0.00019274741878619995,
"loss": 1.5344,
"step": 490
},
{
"epoch": 0.3411202778983934,
"grad_norm": 0.6953125,
"learning_rate": 0.0001926970536388819,
"loss": 1.5615,
"step": 491
},
{
"epoch": 0.34181502388189317,
"grad_norm": 0.81640625,
"learning_rate": 0.00019264668849156385,
"loss": 1.8554,
"step": 492
},
{
"epoch": 0.34250976986539294,
"grad_norm": 0.79296875,
"learning_rate": 0.0001925963233442458,
"loss": 1.8949,
"step": 493
},
{
"epoch": 0.3432045158488927,
"grad_norm": 1.078125,
"learning_rate": 0.00019254595819692772,
"loss": 1.8137,
"step": 494
},
{
"epoch": 0.34389926183239256,
"grad_norm": 1.2734375,
"learning_rate": 0.00019249559304960968,
"loss": 2.0595,
"step": 495
},
{
"epoch": 0.34459400781589233,
"grad_norm": 0.91015625,
"learning_rate": 0.00019244522790229162,
"loss": 2.2088,
"step": 496
},
{
"epoch": 0.3452887537993921,
"grad_norm": 0.89453125,
"learning_rate": 0.00019239486275497356,
"loss": 1.789,
"step": 497
},
{
"epoch": 0.3459834997828919,
"grad_norm": 1.5078125,
"learning_rate": 0.00019234449760765552,
"loss": 1.7053,
"step": 498
},
{
"epoch": 0.34667824576639167,
"grad_norm": 0.87109375,
"learning_rate": 0.00019229413246033746,
"loss": 1.985,
"step": 499
},
{
"epoch": 0.34737299174989145,
"grad_norm": 0.79296875,
"learning_rate": 0.00019224376731301942,
"loss": 2.0731,
"step": 500
},
{
"epoch": 0.3480677377333912,
"grad_norm": 0.56640625,
"learning_rate": 0.00019219340216570133,
"loss": 2.3603,
"step": 501
},
{
"epoch": 0.348762483716891,
"grad_norm": 1.0703125,
"learning_rate": 0.0001921430370183833,
"loss": 2.2065,
"step": 502
},
{
"epoch": 0.3494572297003908,
"grad_norm": 1.0234375,
"learning_rate": 0.00019209267187106523,
"loss": 2.1569,
"step": 503
},
{
"epoch": 0.35015197568389056,
"grad_norm": 0.71484375,
"learning_rate": 0.00019204230672374717,
"loss": 1.8459,
"step": 504
},
{
"epoch": 0.35084672166739034,
"grad_norm": 0.6953125,
"learning_rate": 0.00019199194157642913,
"loss": 1.8065,
"step": 505
},
{
"epoch": 0.3515414676508901,
"grad_norm": 0.76171875,
"learning_rate": 0.00019194157642911107,
"loss": 2.2484,
"step": 506
},
{
"epoch": 0.35223621363438995,
"grad_norm": 0.6171875,
"learning_rate": 0.00019189121128179303,
"loss": 1.9146,
"step": 507
},
{
"epoch": 0.35293095961788973,
"grad_norm": 1.0390625,
"learning_rate": 0.00019184084613447494,
"loss": 1.7471,
"step": 508
},
{
"epoch": 0.3536257056013895,
"grad_norm": 0.87890625,
"learning_rate": 0.0001917904809871569,
"loss": 1.7203,
"step": 509
},
{
"epoch": 0.3543204515848893,
"grad_norm": 0.71875,
"learning_rate": 0.00019174011583983884,
"loss": 2.0287,
"step": 510
},
{
"epoch": 0.35501519756838906,
"grad_norm": 1.03125,
"learning_rate": 0.00019168975069252077,
"loss": 1.7405,
"step": 511
},
{
"epoch": 0.35570994355188884,
"grad_norm": 1.71875,
"learning_rate": 0.00019163938554520274,
"loss": 1.7935,
"step": 512
},
{
"epoch": 0.3564046895353886,
"grad_norm": 0.98828125,
"learning_rate": 0.00019158902039788467,
"loss": 1.7058,
"step": 513
},
{
"epoch": 0.3570994355188884,
"grad_norm": 1.2578125,
"learning_rate": 0.0001915386552505666,
"loss": 1.6631,
"step": 514
},
{
"epoch": 0.3577941815023882,
"grad_norm": 1.1640625,
"learning_rate": 0.00019148829010324855,
"loss": 2.0297,
"step": 515
},
{
"epoch": 0.35848892748588795,
"grad_norm": 0.69921875,
"learning_rate": 0.0001914379249559305,
"loss": 1.3673,
"step": 516
},
{
"epoch": 0.35918367346938773,
"grad_norm": 0.95703125,
"learning_rate": 0.00019138755980861245,
"loss": 1.7896,
"step": 517
},
{
"epoch": 0.35987841945288757,
"grad_norm": 0.890625,
"learning_rate": 0.00019133719466129438,
"loss": 2.2388,
"step": 518
},
{
"epoch": 0.36057316543638734,
"grad_norm": 0.75390625,
"learning_rate": 0.00019128682951397635,
"loss": 1.8827,
"step": 519
},
{
"epoch": 0.3612679114198871,
"grad_norm": 2.4375,
"learning_rate": 0.00019123646436665828,
"loss": 1.8957,
"step": 520
},
{
"epoch": 0.3619626574033869,
"grad_norm": 0.921875,
"learning_rate": 0.00019118609921934022,
"loss": 1.666,
"step": 521
},
{
"epoch": 0.3626574033868867,
"grad_norm": 0.71484375,
"learning_rate": 0.00019113573407202215,
"loss": 1.6648,
"step": 522
},
{
"epoch": 0.36335214937038646,
"grad_norm": 0.64453125,
"learning_rate": 0.00019108536892470412,
"loss": 1.9235,
"step": 523
},
{
"epoch": 0.36404689535388624,
"grad_norm": 0.65625,
"learning_rate": 0.00019103500377738605,
"loss": 2.1473,
"step": 524
},
{
"epoch": 0.364741641337386,
"grad_norm": 0.8828125,
"learning_rate": 0.000190984638630068,
"loss": 2.0953,
"step": 525
},
{
"epoch": 0.3654363873208858,
"grad_norm": 1.234375,
"learning_rate": 0.00019093427348274995,
"loss": 1.8025,
"step": 526
},
{
"epoch": 0.36613113330438557,
"grad_norm": 1.0546875,
"learning_rate": 0.0001908839083354319,
"loss": 2.0172,
"step": 527
},
{
"epoch": 0.36682587928788535,
"grad_norm": 1.3515625,
"learning_rate": 0.00019083354318811383,
"loss": 1.7116,
"step": 528
},
{
"epoch": 0.3675206252713851,
"grad_norm": 1.0078125,
"learning_rate": 0.00019078317804079576,
"loss": 2.0673,
"step": 529
},
{
"epoch": 0.36821537125488496,
"grad_norm": 1.09375,
"learning_rate": 0.00019073281289347773,
"loss": 2.1515,
"step": 530
},
{
"epoch": 0.36891011723838474,
"grad_norm": 0.6171875,
"learning_rate": 0.00019068244774615966,
"loss": 2.1945,
"step": 531
},
{
"epoch": 0.3696048632218845,
"grad_norm": 0.71484375,
"learning_rate": 0.00019063208259884163,
"loss": 1.9585,
"step": 532
},
{
"epoch": 0.3702996092053843,
"grad_norm": 0.7890625,
"learning_rate": 0.00019058171745152356,
"loss": 2.1528,
"step": 533
},
{
"epoch": 0.3709943551888841,
"grad_norm": 0.8203125,
"learning_rate": 0.0001905313523042055,
"loss": 1.8423,
"step": 534
},
{
"epoch": 0.37168910117238385,
"grad_norm": 1.1015625,
"learning_rate": 0.00019048098715688743,
"loss": 1.9329,
"step": 535
},
{
"epoch": 0.37238384715588363,
"grad_norm": 0.84765625,
"learning_rate": 0.00019043062200956937,
"loss": 1.7946,
"step": 536
},
{
"epoch": 0.3730785931393834,
"grad_norm": 0.84375,
"learning_rate": 0.00019038025686225133,
"loss": 2.1579,
"step": 537
},
{
"epoch": 0.3737733391228832,
"grad_norm": 0.8046875,
"learning_rate": 0.00019032989171493327,
"loss": 1.9882,
"step": 538
},
{
"epoch": 0.37446808510638296,
"grad_norm": 0.81640625,
"learning_rate": 0.00019027952656761523,
"loss": 1.6992,
"step": 539
},
{
"epoch": 0.37516283108988274,
"grad_norm": 0.62890625,
"learning_rate": 0.00019022916142029717,
"loss": 2.003,
"step": 540
},
{
"epoch": 0.3758575770733825,
"grad_norm": 1.5,
"learning_rate": 0.0001901787962729791,
"loss": 2.1709,
"step": 541
},
{
"epoch": 0.37655232305688235,
"grad_norm": 1.0546875,
"learning_rate": 0.00019012843112566104,
"loss": 1.9809,
"step": 542
},
{
"epoch": 0.37724706904038213,
"grad_norm": 0.7734375,
"learning_rate": 0.00019007806597834298,
"loss": 1.956,
"step": 543
},
{
"epoch": 0.3779418150238819,
"grad_norm": 0.953125,
"learning_rate": 0.00019002770083102494,
"loss": 1.619,
"step": 544
},
{
"epoch": 0.3786365610073817,
"grad_norm": 0.83984375,
"learning_rate": 0.00018997733568370688,
"loss": 1.7824,
"step": 545
},
{
"epoch": 0.37933130699088147,
"grad_norm": 0.984375,
"learning_rate": 0.00018992697053638884,
"loss": 1.885,
"step": 546
},
{
"epoch": 0.38002605297438125,
"grad_norm": 1.3125,
"learning_rate": 0.00018987660538907078,
"loss": 2.0227,
"step": 547
},
{
"epoch": 0.380720798957881,
"grad_norm": 0.79296875,
"learning_rate": 0.00018982624024175272,
"loss": 1.7396,
"step": 548
},
{
"epoch": 0.3814155449413808,
"grad_norm": 0.67578125,
"learning_rate": 0.00018977587509443465,
"loss": 1.8219,
"step": 549
},
{
"epoch": 0.3821102909248806,
"grad_norm": 4.65625,
"learning_rate": 0.0001897255099471166,
"loss": 2.2175,
"step": 550
},
{
"epoch": 0.38280503690838036,
"grad_norm": 0.81640625,
"learning_rate": 0.00018967514479979855,
"loss": 1.7872,
"step": 551
},
{
"epoch": 0.38349978289188014,
"grad_norm": 0.98046875,
"learning_rate": 0.0001896247796524805,
"loss": 1.6591,
"step": 552
},
{
"epoch": 0.3841945288753799,
"grad_norm": 1.2421875,
"learning_rate": 0.00018957441450516245,
"loss": 2.0484,
"step": 553
},
{
"epoch": 0.38488927485887975,
"grad_norm": 0.90625,
"learning_rate": 0.0001895240493578444,
"loss": 1.8777,
"step": 554
},
{
"epoch": 0.3855840208423795,
"grad_norm": 0.921875,
"learning_rate": 0.00018947368421052632,
"loss": 2.1238,
"step": 555
},
{
"epoch": 0.3862787668258793,
"grad_norm": 0.60546875,
"learning_rate": 0.00018942331906320826,
"loss": 1.9607,
"step": 556
},
{
"epoch": 0.3869735128093791,
"grad_norm": 0.8125,
"learning_rate": 0.0001893729539158902,
"loss": 1.6038,
"step": 557
},
{
"epoch": 0.38766825879287886,
"grad_norm": 1.359375,
"learning_rate": 0.00018932258876857216,
"loss": 1.7207,
"step": 558
},
{
"epoch": 0.38836300477637864,
"grad_norm": 0.81640625,
"learning_rate": 0.0001892722236212541,
"loss": 1.94,
"step": 559
},
{
"epoch": 0.3890577507598784,
"grad_norm": 1.546875,
"learning_rate": 0.00018922185847393606,
"loss": 1.6762,
"step": 560
},
{
"epoch": 0.3897524967433782,
"grad_norm": 0.8125,
"learning_rate": 0.000189171493326618,
"loss": 1.8562,
"step": 561
},
{
"epoch": 0.390447242726878,
"grad_norm": 0.734375,
"learning_rate": 0.00018912112817929993,
"loss": 2.252,
"step": 562
},
{
"epoch": 0.39114198871037775,
"grad_norm": 1.3359375,
"learning_rate": 0.00018907076303198187,
"loss": 1.7797,
"step": 563
},
{
"epoch": 0.39183673469387753,
"grad_norm": 0.87890625,
"learning_rate": 0.0001890203978846638,
"loss": 2.028,
"step": 564
},
{
"epoch": 0.3925314806773773,
"grad_norm": 0.8515625,
"learning_rate": 0.00018897003273734577,
"loss": 1.9406,
"step": 565
},
{
"epoch": 0.39322622666087714,
"grad_norm": 1.25,
"learning_rate": 0.0001889196675900277,
"loss": 1.4522,
"step": 566
},
{
"epoch": 0.3939209726443769,
"grad_norm": 1.53125,
"learning_rate": 0.00018886930244270967,
"loss": 2.1899,
"step": 567
},
{
"epoch": 0.3946157186278767,
"grad_norm": 0.8828125,
"learning_rate": 0.0001888189372953916,
"loss": 2.0125,
"step": 568
},
{
"epoch": 0.3953104646113765,
"grad_norm": 0.80078125,
"learning_rate": 0.00018876857214807354,
"loss": 1.8498,
"step": 569
},
{
"epoch": 0.39600521059487626,
"grad_norm": 0.85546875,
"learning_rate": 0.00018871820700075548,
"loss": 2.1336,
"step": 570
},
{
"epoch": 0.39669995657837603,
"grad_norm": 0.953125,
"learning_rate": 0.0001886678418534374,
"loss": 1.9395,
"step": 571
},
{
"epoch": 0.3973947025618758,
"grad_norm": 0.8515625,
"learning_rate": 0.00018861747670611938,
"loss": 1.9798,
"step": 572
},
{
"epoch": 0.3980894485453756,
"grad_norm": 2.375,
"learning_rate": 0.0001885671115588013,
"loss": 1.9257,
"step": 573
},
{
"epoch": 0.39878419452887537,
"grad_norm": 1.125,
"learning_rate": 0.00018851674641148328,
"loss": 1.7834,
"step": 574
},
{
"epoch": 0.39947894051237515,
"grad_norm": 1.09375,
"learning_rate": 0.0001884663812641652,
"loss": 2.0443,
"step": 575
},
{
"epoch": 0.4001736864958749,
"grad_norm": 1.109375,
"learning_rate": 0.00018841601611684715,
"loss": 1.6684,
"step": 576
},
{
"epoch": 0.4008684324793747,
"grad_norm": 0.79296875,
"learning_rate": 0.00018836565096952908,
"loss": 1.6101,
"step": 577
},
{
"epoch": 0.40156317846287454,
"grad_norm": 0.7265625,
"learning_rate": 0.00018831528582221102,
"loss": 1.5633,
"step": 578
},
{
"epoch": 0.4022579244463743,
"grad_norm": 0.7890625,
"learning_rate": 0.00018826492067489298,
"loss": 1.9582,
"step": 579
},
{
"epoch": 0.4029526704298741,
"grad_norm": 0.73828125,
"learning_rate": 0.00018821455552757492,
"loss": 2.1624,
"step": 580
},
{
"epoch": 0.40364741641337387,
"grad_norm": 1.078125,
"learning_rate": 0.00018816419038025688,
"loss": 1.8846,
"step": 581
},
{
"epoch": 0.40434216239687365,
"grad_norm": 1.1484375,
"learning_rate": 0.00018811382523293882,
"loss": 1.6138,
"step": 582
},
{
"epoch": 0.40503690838037343,
"grad_norm": 0.93359375,
"learning_rate": 0.00018806346008562076,
"loss": 1.6372,
"step": 583
},
{
"epoch": 0.4057316543638732,
"grad_norm": 0.66796875,
"learning_rate": 0.0001880130949383027,
"loss": 2.1048,
"step": 584
},
{
"epoch": 0.406426400347373,
"grad_norm": 0.94921875,
"learning_rate": 0.00018796272979098463,
"loss": 2.3389,
"step": 585
},
{
"epoch": 0.40712114633087276,
"grad_norm": 0.625,
"learning_rate": 0.0001879123646436666,
"loss": 2.0405,
"step": 586
},
{
"epoch": 0.40781589231437254,
"grad_norm": 0.79296875,
"learning_rate": 0.00018786199949634853,
"loss": 1.8406,
"step": 587
},
{
"epoch": 0.4085106382978723,
"grad_norm": 0.88671875,
"learning_rate": 0.0001878116343490305,
"loss": 2.2409,
"step": 588
},
{
"epoch": 0.4092053842813721,
"grad_norm": 2.421875,
"learning_rate": 0.00018776126920171243,
"loss": 1.8545,
"step": 589
},
{
"epoch": 0.40990013026487193,
"grad_norm": 0.765625,
"learning_rate": 0.00018771090405439437,
"loss": 1.5796,
"step": 590
},
{
"epoch": 0.4105948762483717,
"grad_norm": 0.91015625,
"learning_rate": 0.0001876605389070763,
"loss": 1.3451,
"step": 591
},
{
"epoch": 0.4112896222318715,
"grad_norm": 3.015625,
"learning_rate": 0.00018761017375975824,
"loss": 2.2266,
"step": 592
},
{
"epoch": 0.41198436821537127,
"grad_norm": 0.66796875,
"learning_rate": 0.0001875598086124402,
"loss": 1.9017,
"step": 593
},
{
"epoch": 0.41267911419887104,
"grad_norm": 0.96875,
"learning_rate": 0.00018750944346512214,
"loss": 1.6085,
"step": 594
},
{
"epoch": 0.4133738601823708,
"grad_norm": 0.71484375,
"learning_rate": 0.0001874590783178041,
"loss": 2.0503,
"step": 595
},
{
"epoch": 0.4140686061658706,
"grad_norm": 0.77734375,
"learning_rate": 0.00018740871317048604,
"loss": 1.9612,
"step": 596
},
{
"epoch": 0.4147633521493704,
"grad_norm": 0.6796875,
"learning_rate": 0.00018735834802316797,
"loss": 1.6432,
"step": 597
},
{
"epoch": 0.41545809813287016,
"grad_norm": 0.61328125,
"learning_rate": 0.0001873079828758499,
"loss": 1.855,
"step": 598
},
{
"epoch": 0.41615284411636994,
"grad_norm": 1.2265625,
"learning_rate": 0.00018725761772853187,
"loss": 1.7901,
"step": 599
},
{
"epoch": 0.4168475900998697,
"grad_norm": 0.828125,
"learning_rate": 0.0001872072525812138,
"loss": 1.6167,
"step": 600
},
{
"epoch": 0.4175423360833695,
"grad_norm": 1.046875,
"learning_rate": 0.00018715688743389575,
"loss": 1.6097,
"step": 601
},
{
"epoch": 0.4182370820668693,
"grad_norm": 0.8046875,
"learning_rate": 0.0001871065222865777,
"loss": 1.8758,
"step": 602
},
{
"epoch": 0.4189318280503691,
"grad_norm": 0.84375,
"learning_rate": 0.00018705615713925965,
"loss": 1.8387,
"step": 603
},
{
"epoch": 0.4196265740338689,
"grad_norm": 1.125,
"learning_rate": 0.00018700579199194158,
"loss": 1.5821,
"step": 604
},
{
"epoch": 0.42032132001736866,
"grad_norm": 1.2421875,
"learning_rate": 0.00018695542684462352,
"loss": 1.8596,
"step": 605
},
{
"epoch": 0.42101606600086844,
"grad_norm": 0.8828125,
"learning_rate": 0.00018690506169730548,
"loss": 1.6423,
"step": 606
},
{
"epoch": 0.4217108119843682,
"grad_norm": 0.640625,
"learning_rate": 0.00018685469654998742,
"loss": 1.9369,
"step": 607
},
{
"epoch": 0.422405557967868,
"grad_norm": 0.83984375,
"learning_rate": 0.00018680433140266935,
"loss": 1.7383,
"step": 608
},
{
"epoch": 0.4231003039513678,
"grad_norm": 1.078125,
"learning_rate": 0.00018675396625535132,
"loss": 1.5791,
"step": 609
},
{
"epoch": 0.42379504993486755,
"grad_norm": 1.0234375,
"learning_rate": 0.00018670360110803325,
"loss": 1.8138,
"step": 610
},
{
"epoch": 0.42448979591836733,
"grad_norm": 1.328125,
"learning_rate": 0.0001866532359607152,
"loss": 1.8374,
"step": 611
},
{
"epoch": 0.4251845419018671,
"grad_norm": 1.046875,
"learning_rate": 0.00018660287081339713,
"loss": 2.1108,
"step": 612
},
{
"epoch": 0.4258792878853669,
"grad_norm": 1.0,
"learning_rate": 0.0001865525056660791,
"loss": 1.7101,
"step": 613
},
{
"epoch": 0.4265740338688667,
"grad_norm": 0.7734375,
"learning_rate": 0.00018650214051876103,
"loss": 1.8065,
"step": 614
},
{
"epoch": 0.4272687798523665,
"grad_norm": 0.75390625,
"learning_rate": 0.00018645177537144296,
"loss": 1.7606,
"step": 615
},
{
"epoch": 0.4279635258358663,
"grad_norm": 0.953125,
"learning_rate": 0.00018640141022412493,
"loss": 1.9831,
"step": 616
},
{
"epoch": 0.42865827181936605,
"grad_norm": 1.171875,
"learning_rate": 0.00018635104507680686,
"loss": 2.1788,
"step": 617
},
{
"epoch": 0.42935301780286583,
"grad_norm": 1.0,
"learning_rate": 0.0001863006799294888,
"loss": 1.7741,
"step": 618
},
{
"epoch": 0.4300477637863656,
"grad_norm": 0.55078125,
"learning_rate": 0.00018625031478217073,
"loss": 1.6123,
"step": 619
},
{
"epoch": 0.4307425097698654,
"grad_norm": 1.171875,
"learning_rate": 0.0001861999496348527,
"loss": 2.0256,
"step": 620
},
{
"epoch": 0.43143725575336517,
"grad_norm": 1.3671875,
"learning_rate": 0.00018614958448753463,
"loss": 2.1769,
"step": 621
},
{
"epoch": 0.43213200173686495,
"grad_norm": 0.5859375,
"learning_rate": 0.00018609921934021657,
"loss": 1.765,
"step": 622
},
{
"epoch": 0.4328267477203647,
"grad_norm": 0.75390625,
"learning_rate": 0.00018604885419289853,
"loss": 2.0414,
"step": 623
},
{
"epoch": 0.4335214937038645,
"grad_norm": 0.9375,
"learning_rate": 0.00018599848904558047,
"loss": 1.823,
"step": 624
},
{
"epoch": 0.4342162396873643,
"grad_norm": 0.64453125,
"learning_rate": 0.0001859481238982624,
"loss": 1.9846,
"step": 625
},
{
"epoch": 0.4349109856708641,
"grad_norm": 0.69140625,
"learning_rate": 0.00018589775875094434,
"loss": 2.1152,
"step": 626
},
{
"epoch": 0.4356057316543639,
"grad_norm": 0.91796875,
"learning_rate": 0.0001858473936036263,
"loss": 1.8232,
"step": 627
},
{
"epoch": 0.43630047763786367,
"grad_norm": 0.96484375,
"learning_rate": 0.00018579702845630824,
"loss": 1.4993,
"step": 628
},
{
"epoch": 0.43699522362136345,
"grad_norm": 0.91796875,
"learning_rate": 0.0001857466633089902,
"loss": 1.7799,
"step": 629
},
{
"epoch": 0.4376899696048632,
"grad_norm": 0.75,
"learning_rate": 0.00018569629816167214,
"loss": 1.5612,
"step": 630
},
{
"epoch": 0.438384715588363,
"grad_norm": 1.1015625,
"learning_rate": 0.00018564593301435408,
"loss": 2.114,
"step": 631
},
{
"epoch": 0.4390794615718628,
"grad_norm": 0.74609375,
"learning_rate": 0.00018559556786703602,
"loss": 1.8577,
"step": 632
},
{
"epoch": 0.43977420755536256,
"grad_norm": 1.1015625,
"learning_rate": 0.00018554520271971795,
"loss": 1.9846,
"step": 633
},
{
"epoch": 0.44046895353886234,
"grad_norm": 1.7734375,
"learning_rate": 0.00018549483757239991,
"loss": 1.9394,
"step": 634
},
{
"epoch": 0.4411636995223621,
"grad_norm": 0.71875,
"learning_rate": 0.00018544447242508185,
"loss": 1.6845,
"step": 635
},
{
"epoch": 0.4418584455058619,
"grad_norm": 1.046875,
"learning_rate": 0.00018539410727776381,
"loss": 1.9656,
"step": 636
},
{
"epoch": 0.4425531914893617,
"grad_norm": 0.61328125,
"learning_rate": 0.00018534374213044575,
"loss": 2.2499,
"step": 637
},
{
"epoch": 0.4432479374728615,
"grad_norm": 1.0234375,
"learning_rate": 0.0001852933769831277,
"loss": 1.9494,
"step": 638
},
{
"epoch": 0.4439426834563613,
"grad_norm": 1.1875,
"learning_rate": 0.00018524301183580962,
"loss": 1.2849,
"step": 639
},
{
"epoch": 0.44463742943986106,
"grad_norm": 1.046875,
"learning_rate": 0.00018519264668849156,
"loss": 1.8238,
"step": 640
},
{
"epoch": 0.44533217542336084,
"grad_norm": 0.68359375,
"learning_rate": 0.00018514228154117352,
"loss": 1.8664,
"step": 641
},
{
"epoch": 0.4460269214068606,
"grad_norm": 0.6953125,
"learning_rate": 0.00018509191639385546,
"loss": 1.8457,
"step": 642
},
{
"epoch": 0.4467216673903604,
"grad_norm": 1.1640625,
"learning_rate": 0.00018504155124653742,
"loss": 1.9668,
"step": 643
},
{
"epoch": 0.4474164133738602,
"grad_norm": 0.79296875,
"learning_rate": 0.00018499118609921936,
"loss": 1.5451,
"step": 644
},
{
"epoch": 0.44811115935735996,
"grad_norm": 1.09375,
"learning_rate": 0.0001849408209519013,
"loss": 2.2038,
"step": 645
},
{
"epoch": 0.44880590534085973,
"grad_norm": 0.474609375,
"learning_rate": 0.00018489045580458323,
"loss": 1.7649,
"step": 646
},
{
"epoch": 0.4495006513243595,
"grad_norm": 2.5625,
"learning_rate": 0.00018484009065726517,
"loss": 1.9661,
"step": 647
},
{
"epoch": 0.4501953973078593,
"grad_norm": 0.75390625,
"learning_rate": 0.00018478972550994713,
"loss": 1.7701,
"step": 648
},
{
"epoch": 0.4508901432913591,
"grad_norm": 0.6953125,
"learning_rate": 0.00018473936036262907,
"loss": 1.7554,
"step": 649
},
{
"epoch": 0.4515848892748589,
"grad_norm": 0.5859375,
"learning_rate": 0.00018468899521531103,
"loss": 1.9016,
"step": 650
},
{
"epoch": 0.4522796352583587,
"grad_norm": 0.51953125,
"learning_rate": 0.00018463863006799297,
"loss": 1.9223,
"step": 651
},
{
"epoch": 0.45297438124185846,
"grad_norm": 1.09375,
"learning_rate": 0.0001845882649206749,
"loss": 1.7449,
"step": 652
},
{
"epoch": 0.45366912722535824,
"grad_norm": 0.62890625,
"learning_rate": 0.00018453789977335684,
"loss": 1.9779,
"step": 653
},
{
"epoch": 0.454363873208858,
"grad_norm": 0.8125,
"learning_rate": 0.00018448753462603878,
"loss": 1.7003,
"step": 654
},
{
"epoch": 0.4550586191923578,
"grad_norm": 1.015625,
"learning_rate": 0.00018443716947872074,
"loss": 1.6883,
"step": 655
},
{
"epoch": 0.45575336517585757,
"grad_norm": 0.97265625,
"learning_rate": 0.00018438680433140268,
"loss": 1.9557,
"step": 656
},
{
"epoch": 0.45644811115935735,
"grad_norm": 2.578125,
"learning_rate": 0.00018433643918408464,
"loss": 2.0112,
"step": 657
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.63671875,
"learning_rate": 0.00018428607403676658,
"loss": 2.0644,
"step": 658
},
{
"epoch": 0.4578376031263569,
"grad_norm": 0.796875,
"learning_rate": 0.0001842357088894485,
"loss": 2.2675,
"step": 659
},
{
"epoch": 0.4585323491098567,
"grad_norm": 0.7578125,
"learning_rate": 0.00018418534374213045,
"loss": 2.0472,
"step": 660
},
{
"epoch": 0.4592270950933565,
"grad_norm": 0.703125,
"learning_rate": 0.00018413497859481238,
"loss": 1.7624,
"step": 661
},
{
"epoch": 0.4599218410768563,
"grad_norm": 1.28125,
"learning_rate": 0.00018408461344749435,
"loss": 1.9001,
"step": 662
},
{
"epoch": 0.4606165870603561,
"grad_norm": 0.5625,
"learning_rate": 0.00018403424830017628,
"loss": 2.1622,
"step": 663
},
{
"epoch": 0.46131133304385585,
"grad_norm": 0.90625,
"learning_rate": 0.00018398388315285825,
"loss": 2.0592,
"step": 664
},
{
"epoch": 0.46200607902735563,
"grad_norm": 0.92578125,
"learning_rate": 0.00018393351800554018,
"loss": 1.97,
"step": 665
},
{
"epoch": 0.4627008250108554,
"grad_norm": 0.8203125,
"learning_rate": 0.00018388315285822212,
"loss": 1.7164,
"step": 666
},
{
"epoch": 0.4633955709943552,
"grad_norm": 1.21875,
"learning_rate": 0.00018383278771090406,
"loss": 2.0458,
"step": 667
},
{
"epoch": 0.46409031697785497,
"grad_norm": 0.90625,
"learning_rate": 0.000183782422563586,
"loss": 2.0149,
"step": 668
},
{
"epoch": 0.46478506296135474,
"grad_norm": 0.9453125,
"learning_rate": 0.00018373205741626796,
"loss": 2.3638,
"step": 669
},
{
"epoch": 0.4654798089448545,
"grad_norm": 1.1796875,
"learning_rate": 0.0001836816922689499,
"loss": 2.0574,
"step": 670
},
{
"epoch": 0.4661745549283543,
"grad_norm": 0.87109375,
"learning_rate": 0.00018363132712163186,
"loss": 1.7857,
"step": 671
},
{
"epoch": 0.4668693009118541,
"grad_norm": 0.734375,
"learning_rate": 0.0001835809619743138,
"loss": 1.8794,
"step": 672
},
{
"epoch": 0.4675640468953539,
"grad_norm": 0.625,
"learning_rate": 0.00018353059682699573,
"loss": 1.7081,
"step": 673
},
{
"epoch": 0.4682587928788537,
"grad_norm": 0.765625,
"learning_rate": 0.00018348023167967767,
"loss": 1.5881,
"step": 674
},
{
"epoch": 0.46895353886235347,
"grad_norm": 1.0546875,
"learning_rate": 0.0001834298665323596,
"loss": 2.0648,
"step": 675
},
{
"epoch": 0.46964828484585325,
"grad_norm": 0.93359375,
"learning_rate": 0.00018337950138504156,
"loss": 1.9419,
"step": 676
},
{
"epoch": 0.470343030829353,
"grad_norm": 0.83984375,
"learning_rate": 0.0001833291362377235,
"loss": 2.1462,
"step": 677
},
{
"epoch": 0.4710377768128528,
"grad_norm": 0.88671875,
"learning_rate": 0.00018327877109040546,
"loss": 2.0603,
"step": 678
},
{
"epoch": 0.4717325227963526,
"grad_norm": 1.296875,
"learning_rate": 0.0001832284059430874,
"loss": 1.9721,
"step": 679
},
{
"epoch": 0.47242726877985236,
"grad_norm": 0.859375,
"learning_rate": 0.00018317804079576934,
"loss": 1.5208,
"step": 680
},
{
"epoch": 0.47312201476335214,
"grad_norm": 1.125,
"learning_rate": 0.00018312767564845127,
"loss": 2.3462,
"step": 681
},
{
"epoch": 0.4738167607468519,
"grad_norm": 0.7890625,
"learning_rate": 0.0001830773105011332,
"loss": 1.8435,
"step": 682
},
{
"epoch": 0.4745115067303517,
"grad_norm": 1.0546875,
"learning_rate": 0.00018302694535381517,
"loss": 2.2692,
"step": 683
},
{
"epoch": 0.4752062527138515,
"grad_norm": 0.91015625,
"learning_rate": 0.0001829765802064971,
"loss": 2.1077,
"step": 684
},
{
"epoch": 0.4759009986973513,
"grad_norm": 0.796875,
"learning_rate": 0.00018292621505917907,
"loss": 1.9386,
"step": 685
},
{
"epoch": 0.4765957446808511,
"grad_norm": 0.78125,
"learning_rate": 0.000182875849911861,
"loss": 1.7375,
"step": 686
},
{
"epoch": 0.47729049066435086,
"grad_norm": 0.875,
"learning_rate": 0.00018282548476454295,
"loss": 1.8864,
"step": 687
},
{
"epoch": 0.47798523664785064,
"grad_norm": 0.7734375,
"learning_rate": 0.00018277511961722488,
"loss": 2.2137,
"step": 688
},
{
"epoch": 0.4786799826313504,
"grad_norm": 1.140625,
"learning_rate": 0.00018272475446990682,
"loss": 2.0135,
"step": 689
},
{
"epoch": 0.4793747286148502,
"grad_norm": 0.66015625,
"learning_rate": 0.00018267438932258878,
"loss": 1.9841,
"step": 690
},
{
"epoch": 0.48006947459835,
"grad_norm": 0.984375,
"learning_rate": 0.00018262402417527072,
"loss": 1.8652,
"step": 691
},
{
"epoch": 0.48076422058184975,
"grad_norm": 0.94921875,
"learning_rate": 0.00018257365902795268,
"loss": 1.533,
"step": 692
},
{
"epoch": 0.48145896656534953,
"grad_norm": 0.97265625,
"learning_rate": 0.00018252329388063462,
"loss": 1.7846,
"step": 693
},
{
"epoch": 0.4821537125488493,
"grad_norm": 0.890625,
"learning_rate": 0.00018247292873331655,
"loss": 1.8461,
"step": 694
},
{
"epoch": 0.4828484585323491,
"grad_norm": 0.953125,
"learning_rate": 0.0001824225635859985,
"loss": 1.8622,
"step": 695
},
{
"epoch": 0.48354320451584887,
"grad_norm": 1.125,
"learning_rate": 0.00018237219843868045,
"loss": 2.1919,
"step": 696
},
{
"epoch": 0.4842379504993487,
"grad_norm": 0.94921875,
"learning_rate": 0.0001823218332913624,
"loss": 1.9689,
"step": 697
},
{
"epoch": 0.4849326964828485,
"grad_norm": 0.953125,
"learning_rate": 0.00018227146814404433,
"loss": 1.5199,
"step": 698
},
{
"epoch": 0.48562744246634826,
"grad_norm": 1.484375,
"learning_rate": 0.0001822211029967263,
"loss": 2.2635,
"step": 699
},
{
"epoch": 0.48632218844984804,
"grad_norm": 0.98828125,
"learning_rate": 0.00018217073784940823,
"loss": 2.2775,
"step": 700
},
{
"epoch": 0.4870169344333478,
"grad_norm": 0.62109375,
"learning_rate": 0.00018212037270209016,
"loss": 1.9399,
"step": 701
},
{
"epoch": 0.4877116804168476,
"grad_norm": 2.9375,
"learning_rate": 0.0001820700075547721,
"loss": 2.0305,
"step": 702
},
{
"epoch": 0.48840642640034737,
"grad_norm": 1.0078125,
"learning_rate": 0.00018201964240745406,
"loss": 2.098,
"step": 703
},
{
"epoch": 0.48910117238384715,
"grad_norm": 1.2578125,
"learning_rate": 0.000181969277260136,
"loss": 2.0633,
"step": 704
},
{
"epoch": 0.4897959183673469,
"grad_norm": 3.0625,
"learning_rate": 0.00018191891211281793,
"loss": 1.8335,
"step": 705
},
{
"epoch": 0.4904906643508467,
"grad_norm": 0.8515625,
"learning_rate": 0.0001818685469654999,
"loss": 1.4018,
"step": 706
},
{
"epoch": 0.4911854103343465,
"grad_norm": 0.69921875,
"learning_rate": 0.00018181818181818183,
"loss": 1.5096,
"step": 707
},
{
"epoch": 0.49188015631784626,
"grad_norm": 0.62890625,
"learning_rate": 0.00018176781667086377,
"loss": 2.0383,
"step": 708
},
{
"epoch": 0.4925749023013461,
"grad_norm": 0.8515625,
"learning_rate": 0.0001817174515235457,
"loss": 2.0676,
"step": 709
},
{
"epoch": 0.4932696482848459,
"grad_norm": 0.9765625,
"learning_rate": 0.00018166708637622767,
"loss": 2.1933,
"step": 710
},
{
"epoch": 0.49396439426834565,
"grad_norm": 0.6953125,
"learning_rate": 0.0001816167212289096,
"loss": 1.7498,
"step": 711
},
{
"epoch": 0.49465914025184543,
"grad_norm": 0.8125,
"learning_rate": 0.00018156635608159154,
"loss": 1.9815,
"step": 712
},
{
"epoch": 0.4953538862353452,
"grad_norm": 0.90625,
"learning_rate": 0.0001815159909342735,
"loss": 2.2162,
"step": 713
},
{
"epoch": 0.496048632218845,
"grad_norm": 1.34375,
"learning_rate": 0.00018146562578695542,
"loss": 1.8575,
"step": 714
},
{
"epoch": 0.49674337820234477,
"grad_norm": 0.78125,
"learning_rate": 0.00018141526063963738,
"loss": 1.8807,
"step": 715
},
{
"epoch": 0.49743812418584454,
"grad_norm": 0.58203125,
"learning_rate": 0.00018136489549231932,
"loss": 1.1918,
"step": 716
},
{
"epoch": 0.4981328701693443,
"grad_norm": 1.03125,
"learning_rate": 0.00018131453034500128,
"loss": 2.1739,
"step": 717
},
{
"epoch": 0.4988276161528441,
"grad_norm": 0.6796875,
"learning_rate": 0.00018126416519768321,
"loss": 1.9931,
"step": 718
},
{
"epoch": 0.4995223621363439,
"grad_norm": 0.99609375,
"learning_rate": 0.00018121380005036515,
"loss": 1.8006,
"step": 719
},
{
"epoch": 0.5002171081198437,
"grad_norm": 0.7109375,
"learning_rate": 0.00018116343490304711,
"loss": 1.9731,
"step": 720
},
{
"epoch": 0.5009118541033435,
"grad_norm": 1.0390625,
"learning_rate": 0.00018111306975572902,
"loss": 1.9277,
"step": 721
},
{
"epoch": 0.5016066000868432,
"grad_norm": 1.8515625,
"learning_rate": 0.000181062704608411,
"loss": 1.8397,
"step": 722
},
{
"epoch": 0.502301346070343,
"grad_norm": 0.90625,
"learning_rate": 0.00018101233946109292,
"loss": 1.6404,
"step": 723
},
{
"epoch": 0.5029960920538428,
"grad_norm": 0.69921875,
"learning_rate": 0.0001809619743137749,
"loss": 1.6856,
"step": 724
},
{
"epoch": 0.5036908380373426,
"grad_norm": 1.1171875,
"learning_rate": 0.00018091160916645682,
"loss": 1.8246,
"step": 725
},
{
"epoch": 0.5043855840208423,
"grad_norm": 0.68359375,
"learning_rate": 0.00018086124401913876,
"loss": 1.9523,
"step": 726
},
{
"epoch": 0.5050803300043422,
"grad_norm": 0.81640625,
"learning_rate": 0.00018081087887182072,
"loss": 1.8332,
"step": 727
},
{
"epoch": 0.505775075987842,
"grad_norm": 0.9296875,
"learning_rate": 0.00018076051372450263,
"loss": 2.008,
"step": 728
},
{
"epoch": 0.5064698219713417,
"grad_norm": 1.0078125,
"learning_rate": 0.0001807101485771846,
"loss": 2.0791,
"step": 729
},
{
"epoch": 0.5071645679548415,
"grad_norm": 0.890625,
"learning_rate": 0.00018065978342986653,
"loss": 2.2381,
"step": 730
},
{
"epoch": 0.5078593139383413,
"grad_norm": 0.7109375,
"learning_rate": 0.0001806094182825485,
"loss": 1.6686,
"step": 731
},
{
"epoch": 0.5085540599218411,
"grad_norm": 1.0859375,
"learning_rate": 0.00018055905313523043,
"loss": 1.9747,
"step": 732
},
{
"epoch": 0.5092488059053408,
"grad_norm": 1.03125,
"learning_rate": 0.0001805086879879124,
"loss": 1.8613,
"step": 733
},
{
"epoch": 0.5099435518888407,
"grad_norm": 1.1953125,
"learning_rate": 0.00018045832284059433,
"loss": 1.6721,
"step": 734
},
{
"epoch": 0.5106382978723404,
"grad_norm": 0.7265625,
"learning_rate": 0.00018040795769327624,
"loss": 1.9698,
"step": 735
},
{
"epoch": 0.5113330438558402,
"grad_norm": 1.703125,
"learning_rate": 0.0001803575925459582,
"loss": 1.9346,
"step": 736
},
{
"epoch": 0.5120277898393399,
"grad_norm": 1.7421875,
"learning_rate": 0.00018030722739864014,
"loss": 1.6338,
"step": 737
},
{
"epoch": 0.5127225358228398,
"grad_norm": 0.828125,
"learning_rate": 0.0001802568622513221,
"loss": 1.7765,
"step": 738
},
{
"epoch": 0.5134172818063396,
"grad_norm": 0.9765625,
"learning_rate": 0.00018020649710400404,
"loss": 1.6058,
"step": 739
},
{
"epoch": 0.5141120277898393,
"grad_norm": 0.80078125,
"learning_rate": 0.000180156131956686,
"loss": 1.7684,
"step": 740
},
{
"epoch": 0.5148067737733392,
"grad_norm": 1.890625,
"learning_rate": 0.00018010576680936794,
"loss": 1.7943,
"step": 741
},
{
"epoch": 0.5155015197568389,
"grad_norm": 1.0625,
"learning_rate": 0.00018005540166204985,
"loss": 2.029,
"step": 742
},
{
"epoch": 0.5161962657403387,
"grad_norm": 13.1875,
"learning_rate": 0.0001800050365147318,
"loss": 2.0415,
"step": 743
},
{
"epoch": 0.5168910117238384,
"grad_norm": 1.0703125,
"learning_rate": 0.00017995467136741375,
"loss": 2.0677,
"step": 744
},
{
"epoch": 0.5175857577073383,
"grad_norm": 0.9609375,
"learning_rate": 0.0001799043062200957,
"loss": 1.7026,
"step": 745
},
{
"epoch": 0.518280503690838,
"grad_norm": 1.21875,
"learning_rate": 0.00017985394107277765,
"loss": 1.7546,
"step": 746
},
{
"epoch": 0.5189752496743378,
"grad_norm": 0.75,
"learning_rate": 0.0001798035759254596,
"loss": 1.7259,
"step": 747
},
{
"epoch": 0.5196699956578376,
"grad_norm": 1.1015625,
"learning_rate": 0.00017975321077814152,
"loss": 1.8416,
"step": 748
},
{
"epoch": 0.5203647416413374,
"grad_norm": 1.0703125,
"learning_rate": 0.00017970284563082346,
"loss": 2.3549,
"step": 749
},
{
"epoch": 0.5210594876248371,
"grad_norm": 1.140625,
"learning_rate": 0.00017965248048350542,
"loss": 1.573,
"step": 750
},
{
"epoch": 0.521754233608337,
"grad_norm": 0.73828125,
"learning_rate": 0.00017960211533618736,
"loss": 1.5468,
"step": 751
},
{
"epoch": 0.5224489795918368,
"grad_norm": 0.91796875,
"learning_rate": 0.00017955175018886932,
"loss": 1.8732,
"step": 752
},
{
"epoch": 0.5231437255753365,
"grad_norm": 0.73046875,
"learning_rate": 0.00017950138504155126,
"loss": 1.8024,
"step": 753
},
{
"epoch": 0.5238384715588363,
"grad_norm": 2.015625,
"learning_rate": 0.00017945101989423322,
"loss": 1.8361,
"step": 754
},
{
"epoch": 0.5245332175423361,
"grad_norm": 36.75,
"learning_rate": 0.00017940065474691513,
"loss": 2.3295,
"step": 755
},
{
"epoch": 0.5252279635258359,
"grad_norm": 2.25,
"learning_rate": 0.00017935028959959707,
"loss": 2.1014,
"step": 756
},
{
"epoch": 0.5259227095093356,
"grad_norm": 0.6484375,
"learning_rate": 0.00017929992445227903,
"loss": 1.6904,
"step": 757
},
{
"epoch": 0.5266174554928355,
"grad_norm": 3.25,
"learning_rate": 0.00017924955930496097,
"loss": 1.8936,
"step": 758
},
{
"epoch": 0.5273122014763352,
"grad_norm": 0.83203125,
"learning_rate": 0.00017919919415764293,
"loss": 2.0644,
"step": 759
},
{
"epoch": 0.528006947459835,
"grad_norm": 1.1484375,
"learning_rate": 0.00017914882901032486,
"loss": 1.9939,
"step": 760
},
{
"epoch": 0.5287016934433347,
"grad_norm": 0.86328125,
"learning_rate": 0.00017909846386300683,
"loss": 1.992,
"step": 761
},
{
"epoch": 0.5293964394268346,
"grad_norm": 2.0625,
"learning_rate": 0.00017904809871568874,
"loss": 2.1399,
"step": 762
},
{
"epoch": 0.5300911854103344,
"grad_norm": 1.09375,
"learning_rate": 0.0001789977335683707,
"loss": 1.4181,
"step": 763
},
{
"epoch": 0.5307859313938341,
"grad_norm": 1.109375,
"learning_rate": 0.00017894736842105264,
"loss": 1.8635,
"step": 764
},
{
"epoch": 0.531480677377334,
"grad_norm": 1.0390625,
"learning_rate": 0.00017889700327373457,
"loss": 1.6296,
"step": 765
},
{
"epoch": 0.5321754233608337,
"grad_norm": 1.0,
"learning_rate": 0.00017884663812641654,
"loss": 2.0959,
"step": 766
},
{
"epoch": 0.5328701693443335,
"grad_norm": 0.91796875,
"learning_rate": 0.00017879627297909847,
"loss": 1.6697,
"step": 767
},
{
"epoch": 0.5335649153278332,
"grad_norm": 1.0703125,
"learning_rate": 0.00017874590783178044,
"loss": 2.4245,
"step": 768
},
{
"epoch": 0.5342596613113331,
"grad_norm": 1.0859375,
"learning_rate": 0.00017869554268446235,
"loss": 1.5763,
"step": 769
},
{
"epoch": 0.5349544072948328,
"grad_norm": 1.125,
"learning_rate": 0.0001786451775371443,
"loss": 1.9295,
"step": 770
},
{
"epoch": 0.5356491532783326,
"grad_norm": 1.1171875,
"learning_rate": 0.00017859481238982625,
"loss": 1.3931,
"step": 771
},
{
"epoch": 0.5363438992618323,
"grad_norm": 0.98046875,
"learning_rate": 0.00017854444724250818,
"loss": 2.1037,
"step": 772
},
{
"epoch": 0.5370386452453322,
"grad_norm": 0.74609375,
"learning_rate": 0.00017849408209519015,
"loss": 1.7615,
"step": 773
},
{
"epoch": 0.5377333912288319,
"grad_norm": 0.91015625,
"learning_rate": 0.00017844371694787208,
"loss": 1.7058,
"step": 774
},
{
"epoch": 0.5384281372123317,
"grad_norm": 1.109375,
"learning_rate": 0.00017839335180055405,
"loss": 1.9699,
"step": 775
},
{
"epoch": 0.5391228831958316,
"grad_norm": 0.69921875,
"learning_rate": 0.00017834298665323595,
"loss": 1.9709,
"step": 776
},
{
"epoch": 0.5398176291793313,
"grad_norm": 0.88671875,
"learning_rate": 0.00017829262150591792,
"loss": 1.9188,
"step": 777
},
{
"epoch": 0.5405123751628311,
"grad_norm": 0.6796875,
"learning_rate": 0.00017824225635859985,
"loss": 1.658,
"step": 778
},
{
"epoch": 0.5412071211463308,
"grad_norm": 1.046875,
"learning_rate": 0.0001781918912112818,
"loss": 1.9932,
"step": 779
},
{
"epoch": 0.5419018671298307,
"grad_norm": 1.3359375,
"learning_rate": 0.00017814152606396375,
"loss": 1.9009,
"step": 780
},
{
"epoch": 0.5425966131133304,
"grad_norm": 0.8046875,
"learning_rate": 0.0001780911609166457,
"loss": 1.734,
"step": 781
},
{
"epoch": 0.5432913590968302,
"grad_norm": 1.40625,
"learning_rate": 0.00017804079576932763,
"loss": 2.1049,
"step": 782
},
{
"epoch": 0.54398610508033,
"grad_norm": 0.8203125,
"learning_rate": 0.00017799043062200956,
"loss": 1.6874,
"step": 783
},
{
"epoch": 0.5446808510638298,
"grad_norm": 0.73046875,
"learning_rate": 0.00017794006547469153,
"loss": 1.9051,
"step": 784
},
{
"epoch": 0.5453755970473295,
"grad_norm": 1.25,
"learning_rate": 0.00017788970032737346,
"loss": 1.8446,
"step": 785
},
{
"epoch": 0.5460703430308294,
"grad_norm": 0.71484375,
"learning_rate": 0.0001778393351800554,
"loss": 1.9275,
"step": 786
},
{
"epoch": 0.5467650890143292,
"grad_norm": 1.0,
"learning_rate": 0.00017778897003273736,
"loss": 2.1717,
"step": 787
},
{
"epoch": 0.5474598349978289,
"grad_norm": 0.84375,
"learning_rate": 0.0001777386048854193,
"loss": 2.5151,
"step": 788
},
{
"epoch": 0.5481545809813287,
"grad_norm": 1.1015625,
"learning_rate": 0.00017768823973810123,
"loss": 1.7945,
"step": 789
},
{
"epoch": 0.5488493269648285,
"grad_norm": 0.98828125,
"learning_rate": 0.00017763787459078317,
"loss": 1.9144,
"step": 790
},
{
"epoch": 0.5495440729483283,
"grad_norm": 1.59375,
"learning_rate": 0.00017758750944346513,
"loss": 1.7331,
"step": 791
},
{
"epoch": 0.550238818931828,
"grad_norm": 0.62109375,
"learning_rate": 0.00017753714429614707,
"loss": 1.8174,
"step": 792
},
{
"epoch": 0.5509335649153279,
"grad_norm": 0.72265625,
"learning_rate": 0.00017748677914882903,
"loss": 2.1623,
"step": 793
},
{
"epoch": 0.5516283108988276,
"grad_norm": 1.015625,
"learning_rate": 0.00017743641400151097,
"loss": 2.0982,
"step": 794
},
{
"epoch": 0.5523230568823274,
"grad_norm": 0.62890625,
"learning_rate": 0.0001773860488541929,
"loss": 1.9725,
"step": 795
},
{
"epoch": 0.5530178028658271,
"grad_norm": 1.1640625,
"learning_rate": 0.00017733568370687484,
"loss": 1.9904,
"step": 796
},
{
"epoch": 0.553712548849327,
"grad_norm": 1.046875,
"learning_rate": 0.00017728531855955678,
"loss": 2.066,
"step": 797
},
{
"epoch": 0.5544072948328268,
"grad_norm": 0.73046875,
"learning_rate": 0.00017723495341223874,
"loss": 1.9711,
"step": 798
},
{
"epoch": 0.5551020408163265,
"grad_norm": 0.59375,
"learning_rate": 0.00017718458826492068,
"loss": 1.64,
"step": 799
},
{
"epoch": 0.5557967867998264,
"grad_norm": 0.88671875,
"learning_rate": 0.00017713422311760264,
"loss": 1.3968,
"step": 800
},
{
"epoch": 0.5564915327833261,
"grad_norm": 1.203125,
"learning_rate": 0.00017708385797028458,
"loss": 2.1326,
"step": 801
},
{
"epoch": 0.5571862787668259,
"grad_norm": 0.5,
"learning_rate": 0.00017703349282296652,
"loss": 1.8806,
"step": 802
},
{
"epoch": 0.5578810247503256,
"grad_norm": 1.015625,
"learning_rate": 0.00017698312767564845,
"loss": 2.022,
"step": 803
},
{
"epoch": 0.5585757707338255,
"grad_norm": 1.1171875,
"learning_rate": 0.0001769327625283304,
"loss": 1.79,
"step": 804
},
{
"epoch": 0.5592705167173252,
"grad_norm": 0.8984375,
"learning_rate": 0.00017688239738101235,
"loss": 2.2328,
"step": 805
},
{
"epoch": 0.559965262700825,
"grad_norm": 0.76953125,
"learning_rate": 0.0001768320322336943,
"loss": 2.0766,
"step": 806
},
{
"epoch": 0.5606600086843248,
"grad_norm": 1.5234375,
"learning_rate": 0.00017678166708637625,
"loss": 2.4812,
"step": 807
},
{
"epoch": 0.5613547546678246,
"grad_norm": 1.03125,
"learning_rate": 0.0001767313019390582,
"loss": 1.9864,
"step": 808
},
{
"epoch": 0.5620495006513243,
"grad_norm": 0.82421875,
"learning_rate": 0.00017668093679174012,
"loss": 2.1901,
"step": 809
},
{
"epoch": 0.5627442466348241,
"grad_norm": 1.2109375,
"learning_rate": 0.00017663057164442206,
"loss": 1.9752,
"step": 810
},
{
"epoch": 0.563438992618324,
"grad_norm": 0.89453125,
"learning_rate": 0.000176580206497104,
"loss": 1.5796,
"step": 811
},
{
"epoch": 0.5641337386018237,
"grad_norm": 0.76171875,
"learning_rate": 0.00017652984134978596,
"loss": 2.244,
"step": 812
},
{
"epoch": 0.5648284845853235,
"grad_norm": 0.89453125,
"learning_rate": 0.0001764794762024679,
"loss": 1.7504,
"step": 813
},
{
"epoch": 0.5655232305688233,
"grad_norm": 1.015625,
"learning_rate": 0.00017642911105514986,
"loss": 2.24,
"step": 814
},
{
"epoch": 0.5662179765523231,
"grad_norm": 0.65234375,
"learning_rate": 0.0001763787459078318,
"loss": 2.1324,
"step": 815
},
{
"epoch": 0.5669127225358228,
"grad_norm": 1.0078125,
"learning_rate": 0.00017632838076051373,
"loss": 1.9129,
"step": 816
},
{
"epoch": 0.5676074685193226,
"grad_norm": 0.81640625,
"learning_rate": 0.00017627801561319567,
"loss": 1.8645,
"step": 817
},
{
"epoch": 0.5683022145028224,
"grad_norm": 0.80078125,
"learning_rate": 0.0001762276504658776,
"loss": 1.8864,
"step": 818
},
{
"epoch": 0.5689969604863222,
"grad_norm": 0.82421875,
"learning_rate": 0.00017617728531855957,
"loss": 1.9661,
"step": 819
},
{
"epoch": 0.5696917064698219,
"grad_norm": 0.9453125,
"learning_rate": 0.0001761269201712415,
"loss": 2.0306,
"step": 820
},
{
"epoch": 0.5703864524533218,
"grad_norm": 0.83984375,
"learning_rate": 0.00017607655502392347,
"loss": 1.8562,
"step": 821
},
{
"epoch": 0.5710811984368216,
"grad_norm": 1.2265625,
"learning_rate": 0.0001760261898766054,
"loss": 2.1271,
"step": 822
},
{
"epoch": 0.5717759444203213,
"grad_norm": 1.03125,
"learning_rate": 0.00017597582472928734,
"loss": 2.0361,
"step": 823
},
{
"epoch": 0.5724706904038211,
"grad_norm": 0.8359375,
"learning_rate": 0.00017592545958196928,
"loss": 1.5519,
"step": 824
},
{
"epoch": 0.5731654363873209,
"grad_norm": 0.7734375,
"learning_rate": 0.0001758750944346512,
"loss": 2.0971,
"step": 825
},
{
"epoch": 0.5738601823708207,
"grad_norm": 1.46875,
"learning_rate": 0.00017582472928733318,
"loss": 1.9318,
"step": 826
},
{
"epoch": 0.5745549283543204,
"grad_norm": 0.8046875,
"learning_rate": 0.0001757743641400151,
"loss": 2.0558,
"step": 827
},
{
"epoch": 0.5752496743378203,
"grad_norm": 0.91796875,
"learning_rate": 0.00017572399899269708,
"loss": 1.7626,
"step": 828
},
{
"epoch": 0.57594442032132,
"grad_norm": 1.125,
"learning_rate": 0.000175673633845379,
"loss": 2.0998,
"step": 829
},
{
"epoch": 0.5766391663048198,
"grad_norm": 0.97265625,
"learning_rate": 0.00017562326869806095,
"loss": 1.6269,
"step": 830
},
{
"epoch": 0.5773339122883195,
"grad_norm": 1.3671875,
"learning_rate": 0.00017557290355074288,
"loss": 2.3553,
"step": 831
},
{
"epoch": 0.5780286582718194,
"grad_norm": 0.83203125,
"learning_rate": 0.00017552253840342482,
"loss": 1.9309,
"step": 832
},
{
"epoch": 0.5787234042553191,
"grad_norm": 0.55859375,
"learning_rate": 0.00017547217325610678,
"loss": 1.7086,
"step": 833
},
{
"epoch": 0.5794181502388189,
"grad_norm": 0.84765625,
"learning_rate": 0.00017542180810878872,
"loss": 1.8288,
"step": 834
},
{
"epoch": 0.5801128962223188,
"grad_norm": 0.70703125,
"learning_rate": 0.00017537144296147068,
"loss": 1.6589,
"step": 835
},
{
"epoch": 0.5808076422058185,
"grad_norm": 1.53125,
"learning_rate": 0.00017532107781415262,
"loss": 1.7197,
"step": 836
},
{
"epoch": 0.5815023881893183,
"grad_norm": 0.8359375,
"learning_rate": 0.00017527071266683456,
"loss": 2.2181,
"step": 837
},
{
"epoch": 0.582197134172818,
"grad_norm": 0.8984375,
"learning_rate": 0.0001752203475195165,
"loss": 2.1823,
"step": 838
},
{
"epoch": 0.5828918801563179,
"grad_norm": 0.6328125,
"learning_rate": 0.00017516998237219843,
"loss": 1.3199,
"step": 839
},
{
"epoch": 0.5835866261398176,
"grad_norm": 0.87890625,
"learning_rate": 0.0001751196172248804,
"loss": 1.8895,
"step": 840
},
{
"epoch": 0.5842813721233174,
"grad_norm": 1.3046875,
"learning_rate": 0.00017506925207756233,
"loss": 2.1302,
"step": 841
},
{
"epoch": 0.5849761181068172,
"grad_norm": 0.83984375,
"learning_rate": 0.0001750188869302443,
"loss": 1.9623,
"step": 842
},
{
"epoch": 0.585670864090317,
"grad_norm": 1.375,
"learning_rate": 0.00017496852178292623,
"loss": 1.61,
"step": 843
},
{
"epoch": 0.5863656100738167,
"grad_norm": 0.9921875,
"learning_rate": 0.00017491815663560817,
"loss": 1.8907,
"step": 844
},
{
"epoch": 0.5870603560573165,
"grad_norm": 1.4375,
"learning_rate": 0.0001748677914882901,
"loss": 1.7086,
"step": 845
},
{
"epoch": 0.5877551020408164,
"grad_norm": 1.0703125,
"learning_rate": 0.00017481742634097204,
"loss": 1.718,
"step": 846
},
{
"epoch": 0.5884498480243161,
"grad_norm": 0.90234375,
"learning_rate": 0.000174767061193654,
"loss": 2.0364,
"step": 847
},
{
"epoch": 0.5891445940078159,
"grad_norm": 1.28125,
"learning_rate": 0.00017471669604633594,
"loss": 2.1759,
"step": 848
},
{
"epoch": 0.5898393399913157,
"grad_norm": 1.5,
"learning_rate": 0.0001746663308990179,
"loss": 2.1323,
"step": 849
},
{
"epoch": 0.5905340859748155,
"grad_norm": 1.28125,
"learning_rate": 0.00017461596575169984,
"loss": 1.9511,
"step": 850
},
{
"epoch": 0.5912288319583152,
"grad_norm": 0.625,
"learning_rate": 0.00017456560060438177,
"loss": 2.0314,
"step": 851
},
{
"epoch": 0.591923577941815,
"grad_norm": 0.90234375,
"learning_rate": 0.0001745152354570637,
"loss": 1.5775,
"step": 852
},
{
"epoch": 0.5926183239253148,
"grad_norm": 0.81640625,
"learning_rate": 0.00017446487030974565,
"loss": 2.047,
"step": 853
},
{
"epoch": 0.5933130699088146,
"grad_norm": 1.1328125,
"learning_rate": 0.0001744145051624276,
"loss": 2.1235,
"step": 854
},
{
"epoch": 0.5940078158923143,
"grad_norm": 0.80859375,
"learning_rate": 0.00017436414001510955,
"loss": 1.7731,
"step": 855
},
{
"epoch": 0.5947025618758142,
"grad_norm": 1.1796875,
"learning_rate": 0.0001743137748677915,
"loss": 1.9088,
"step": 856
},
{
"epoch": 0.5953973078593139,
"grad_norm": 0.97265625,
"learning_rate": 0.00017426340972047345,
"loss": 1.6199,
"step": 857
},
{
"epoch": 0.5960920538428137,
"grad_norm": 1.7109375,
"learning_rate": 0.00017421304457315538,
"loss": 1.6654,
"step": 858
},
{
"epoch": 0.5967867998263136,
"grad_norm": 1.2734375,
"learning_rate": 0.00017416267942583732,
"loss": 2.0971,
"step": 859
},
{
"epoch": 0.5974815458098133,
"grad_norm": 0.87109375,
"learning_rate": 0.00017411231427851928,
"loss": 1.8046,
"step": 860
},
{
"epoch": 0.5981762917933131,
"grad_norm": 0.97265625,
"learning_rate": 0.00017406194913120122,
"loss": 2.2793,
"step": 861
},
{
"epoch": 0.5988710377768128,
"grad_norm": 1.4375,
"learning_rate": 0.00017401158398388315,
"loss": 1.8008,
"step": 862
},
{
"epoch": 0.5995657837603127,
"grad_norm": 1.1953125,
"learning_rate": 0.00017396121883656512,
"loss": 2.0149,
"step": 863
},
{
"epoch": 0.6002605297438124,
"grad_norm": 0.80078125,
"learning_rate": 0.00017391085368924705,
"loss": 2.0395,
"step": 864
},
{
"epoch": 0.6009552757273122,
"grad_norm": 1.0625,
"learning_rate": 0.000173860488541929,
"loss": 2.0035,
"step": 865
},
{
"epoch": 0.6016500217108119,
"grad_norm": 1.2578125,
"learning_rate": 0.00017381012339461093,
"loss": 1.856,
"step": 866
},
{
"epoch": 0.6023447676943118,
"grad_norm": 4.40625,
"learning_rate": 0.0001737597582472929,
"loss": 1.8616,
"step": 867
},
{
"epoch": 0.6030395136778115,
"grad_norm": 1.359375,
"learning_rate": 0.00017370939309997483,
"loss": 1.829,
"step": 868
},
{
"epoch": 0.6037342596613113,
"grad_norm": 1.203125,
"learning_rate": 0.00017365902795265676,
"loss": 2.1977,
"step": 869
},
{
"epoch": 0.6044290056448112,
"grad_norm": 2.609375,
"learning_rate": 0.00017360866280533873,
"loss": 2.1831,
"step": 870
},
{
"epoch": 0.6051237516283109,
"grad_norm": 0.7890625,
"learning_rate": 0.00017355829765802066,
"loss": 1.5552,
"step": 871
},
{
"epoch": 0.6058184976118107,
"grad_norm": 0.88671875,
"learning_rate": 0.0001735079325107026,
"loss": 1.977,
"step": 872
},
{
"epoch": 0.6065132435953104,
"grad_norm": 0.703125,
"learning_rate": 0.00017345756736338453,
"loss": 1.9865,
"step": 873
},
{
"epoch": 0.6072079895788103,
"grad_norm": 1.1796875,
"learning_rate": 0.0001734072022160665,
"loss": 1.9972,
"step": 874
},
{
"epoch": 0.60790273556231,
"grad_norm": 0.74609375,
"learning_rate": 0.00017335683706874843,
"loss": 1.7239,
"step": 875
},
{
"epoch": 0.6085974815458098,
"grad_norm": 2.046875,
"learning_rate": 0.00017330647192143037,
"loss": 1.948,
"step": 876
},
{
"epoch": 0.6092922275293096,
"grad_norm": 0.9609375,
"learning_rate": 0.00017325610677411233,
"loss": 1.8617,
"step": 877
},
{
"epoch": 0.6099869735128094,
"grad_norm": 0.953125,
"learning_rate": 0.00017320574162679427,
"loss": 2.1794,
"step": 878
},
{
"epoch": 0.6106817194963091,
"grad_norm": 1.0078125,
"learning_rate": 0.0001731553764794762,
"loss": 1.9517,
"step": 879
},
{
"epoch": 0.611376465479809,
"grad_norm": 0.98828125,
"learning_rate": 0.00017310501133215814,
"loss": 1.8475,
"step": 880
},
{
"epoch": 0.6120712114633087,
"grad_norm": 1.9921875,
"learning_rate": 0.0001730546461848401,
"loss": 2.4872,
"step": 881
},
{
"epoch": 0.6127659574468085,
"grad_norm": 0.62109375,
"learning_rate": 0.00017300428103752204,
"loss": 1.7795,
"step": 882
},
{
"epoch": 0.6134607034303083,
"grad_norm": 0.890625,
"learning_rate": 0.00017295391589020398,
"loss": 1.8401,
"step": 883
},
{
"epoch": 0.6141554494138081,
"grad_norm": 1.0703125,
"learning_rate": 0.00017290355074288594,
"loss": 1.6861,
"step": 884
},
{
"epoch": 0.6148501953973079,
"grad_norm": 0.65625,
"learning_rate": 0.00017285318559556788,
"loss": 2.1298,
"step": 885
},
{
"epoch": 0.6155449413808076,
"grad_norm": 1.0390625,
"learning_rate": 0.00017280282044824982,
"loss": 1.9718,
"step": 886
},
{
"epoch": 0.6162396873643075,
"grad_norm": 0.87109375,
"learning_rate": 0.00017275245530093175,
"loss": 2.0941,
"step": 887
},
{
"epoch": 0.6169344333478072,
"grad_norm": 1.0703125,
"learning_rate": 0.00017270209015361371,
"loss": 1.972,
"step": 888
},
{
"epoch": 0.617629179331307,
"grad_norm": 1.3125,
"learning_rate": 0.00017265172500629565,
"loss": 1.6474,
"step": 889
},
{
"epoch": 0.6183239253148067,
"grad_norm": 0.703125,
"learning_rate": 0.0001726013598589776,
"loss": 1.9091,
"step": 890
},
{
"epoch": 0.6190186712983066,
"grad_norm": 1.2109375,
"learning_rate": 0.00017255099471165955,
"loss": 1.6085,
"step": 891
},
{
"epoch": 0.6197134172818063,
"grad_norm": 0.69921875,
"learning_rate": 0.0001725006295643415,
"loss": 1.8403,
"step": 892
},
{
"epoch": 0.6204081632653061,
"grad_norm": 1.0703125,
"learning_rate": 0.00017245026441702342,
"loss": 1.6317,
"step": 893
},
{
"epoch": 0.621102909248806,
"grad_norm": 1.125,
"learning_rate": 0.00017239989926970536,
"loss": 1.6257,
"step": 894
},
{
"epoch": 0.6217976552323057,
"grad_norm": 1.953125,
"learning_rate": 0.00017234953412238732,
"loss": 2.384,
"step": 895
},
{
"epoch": 0.6224924012158055,
"grad_norm": 0.765625,
"learning_rate": 0.00017229916897506926,
"loss": 1.8285,
"step": 896
},
{
"epoch": 0.6231871471993052,
"grad_norm": 1.03125,
"learning_rate": 0.00017224880382775122,
"loss": 1.9576,
"step": 897
},
{
"epoch": 0.6238818931828051,
"grad_norm": 1.0234375,
"learning_rate": 0.00017219843868043316,
"loss": 1.8259,
"step": 898
},
{
"epoch": 0.6245766391663048,
"grad_norm": 0.9375,
"learning_rate": 0.0001721480735331151,
"loss": 2.2432,
"step": 899
},
{
"epoch": 0.6252713851498046,
"grad_norm": 0.83984375,
"learning_rate": 0.00017209770838579703,
"loss": 1.6558,
"step": 900
},
{
"epoch": 0.6259661311333043,
"grad_norm": 0.7421875,
"learning_rate": 0.00017204734323847897,
"loss": 2.0904,
"step": 901
},
{
"epoch": 0.6266608771168042,
"grad_norm": 0.8671875,
"learning_rate": 0.00017199697809116093,
"loss": 1.8032,
"step": 902
},
{
"epoch": 0.6273556231003039,
"grad_norm": 0.92578125,
"learning_rate": 0.00017194661294384287,
"loss": 1.7511,
"step": 903
},
{
"epoch": 0.6280503690838037,
"grad_norm": 0.6484375,
"learning_rate": 0.00017189624779652483,
"loss": 1.8939,
"step": 904
},
{
"epoch": 0.6287451150673035,
"grad_norm": 0.80078125,
"learning_rate": 0.00017184588264920677,
"loss": 1.9086,
"step": 905
},
{
"epoch": 0.6294398610508033,
"grad_norm": 0.83984375,
"learning_rate": 0.0001717955175018887,
"loss": 1.6633,
"step": 906
},
{
"epoch": 0.6301346070343031,
"grad_norm": 0.83984375,
"learning_rate": 0.00017174515235457064,
"loss": 1.9224,
"step": 907
},
{
"epoch": 0.6308293530178029,
"grad_norm": 0.91015625,
"learning_rate": 0.00017169478720725258,
"loss": 2.0203,
"step": 908
},
{
"epoch": 0.6315240990013027,
"grad_norm": 0.6796875,
"learning_rate": 0.00017164442205993454,
"loss": 1.9564,
"step": 909
},
{
"epoch": 0.6322188449848024,
"grad_norm": 1.0546875,
"learning_rate": 0.00017159405691261648,
"loss": 1.8416,
"step": 910
},
{
"epoch": 0.6329135909683022,
"grad_norm": 1.1640625,
"learning_rate": 0.00017154369176529844,
"loss": 1.9112,
"step": 911
},
{
"epoch": 0.633608336951802,
"grad_norm": 0.765625,
"learning_rate": 0.00017149332661798038,
"loss": 1.9129,
"step": 912
},
{
"epoch": 0.6343030829353018,
"grad_norm": 0.984375,
"learning_rate": 0.0001714429614706623,
"loss": 1.7555,
"step": 913
},
{
"epoch": 0.6349978289188015,
"grad_norm": 2.0625,
"learning_rate": 0.00017139259632334425,
"loss": 1.7537,
"step": 914
},
{
"epoch": 0.6356925749023014,
"grad_norm": 0.71484375,
"learning_rate": 0.00017134223117602618,
"loss": 2.1289,
"step": 915
},
{
"epoch": 0.6363873208858011,
"grad_norm": 1.2734375,
"learning_rate": 0.00017129186602870815,
"loss": 2.0833,
"step": 916
},
{
"epoch": 0.6370820668693009,
"grad_norm": 1.3984375,
"learning_rate": 0.00017124150088139008,
"loss": 2.0708,
"step": 917
},
{
"epoch": 0.6377768128528007,
"grad_norm": 0.73046875,
"learning_rate": 0.00017119113573407205,
"loss": 1.9677,
"step": 918
},
{
"epoch": 0.6384715588363005,
"grad_norm": 1.265625,
"learning_rate": 0.00017114077058675398,
"loss": 1.7507,
"step": 919
},
{
"epoch": 0.6391663048198003,
"grad_norm": 0.85546875,
"learning_rate": 0.00017109040543943592,
"loss": 2.2454,
"step": 920
},
{
"epoch": 0.6398610508033,
"grad_norm": 0.9296875,
"learning_rate": 0.00017104004029211786,
"loss": 2.0876,
"step": 921
},
{
"epoch": 0.6405557967867999,
"grad_norm": 1.515625,
"learning_rate": 0.0001709896751447998,
"loss": 1.7415,
"step": 922
},
{
"epoch": 0.6412505427702996,
"grad_norm": 0.95703125,
"learning_rate": 0.00017093930999748176,
"loss": 2.4147,
"step": 923
},
{
"epoch": 0.6419452887537994,
"grad_norm": 0.86328125,
"learning_rate": 0.0001708889448501637,
"loss": 1.6442,
"step": 924
},
{
"epoch": 0.6426400347372991,
"grad_norm": 1.0,
"learning_rate": 0.00017083857970284566,
"loss": 1.9067,
"step": 925
},
{
"epoch": 0.643334780720799,
"grad_norm": 1.1015625,
"learning_rate": 0.0001707882145555276,
"loss": 2.1339,
"step": 926
},
{
"epoch": 0.6440295267042987,
"grad_norm": 0.828125,
"learning_rate": 0.00017073784940820953,
"loss": 1.7962,
"step": 927
},
{
"epoch": 0.6447242726877985,
"grad_norm": 0.96875,
"learning_rate": 0.00017068748426089147,
"loss": 2.1669,
"step": 928
},
{
"epoch": 0.6454190186712984,
"grad_norm": 0.56640625,
"learning_rate": 0.0001706371191135734,
"loss": 1.9113,
"step": 929
},
{
"epoch": 0.6461137646547981,
"grad_norm": 0.90625,
"learning_rate": 0.00017058675396625536,
"loss": 1.9345,
"step": 930
},
{
"epoch": 0.6468085106382979,
"grad_norm": 0.8984375,
"learning_rate": 0.0001705363888189373,
"loss": 2.1483,
"step": 931
},
{
"epoch": 0.6475032566217976,
"grad_norm": 0.87109375,
"learning_rate": 0.00017048602367161926,
"loss": 2.1292,
"step": 932
},
{
"epoch": 0.6481980026052975,
"grad_norm": 0.83984375,
"learning_rate": 0.0001704356585243012,
"loss": 1.855,
"step": 933
},
{
"epoch": 0.6488927485887972,
"grad_norm": 0.86328125,
"learning_rate": 0.00017038529337698314,
"loss": 1.9374,
"step": 934
},
{
"epoch": 0.649587494572297,
"grad_norm": 0.859375,
"learning_rate": 0.00017033492822966507,
"loss": 1.9404,
"step": 935
},
{
"epoch": 0.6502822405557968,
"grad_norm": 0.66796875,
"learning_rate": 0.000170284563082347,
"loss": 1.6083,
"step": 936
},
{
"epoch": 0.6509769865392966,
"grad_norm": 1.046875,
"learning_rate": 0.00017023419793502897,
"loss": 1.8623,
"step": 937
},
{
"epoch": 0.6516717325227963,
"grad_norm": 1.265625,
"learning_rate": 0.0001701838327877109,
"loss": 2.0822,
"step": 938
},
{
"epoch": 0.6523664785062961,
"grad_norm": 0.70703125,
"learning_rate": 0.00017013346764039287,
"loss": 1.6943,
"step": 939
},
{
"epoch": 0.6530612244897959,
"grad_norm": 1.1328125,
"learning_rate": 0.0001700831024930748,
"loss": 1.745,
"step": 940
},
{
"epoch": 0.6537559704732957,
"grad_norm": 0.96484375,
"learning_rate": 0.00017003273734575675,
"loss": 1.7084,
"step": 941
},
{
"epoch": 0.6544507164567955,
"grad_norm": 0.87109375,
"learning_rate": 0.00016998237219843868,
"loss": 1.6061,
"step": 942
},
{
"epoch": 0.6551454624402953,
"grad_norm": 0.94140625,
"learning_rate": 0.00016993200705112062,
"loss": 2.2639,
"step": 943
},
{
"epoch": 0.6558402084237951,
"grad_norm": 0.7890625,
"learning_rate": 0.00016988164190380258,
"loss": 1.9709,
"step": 944
},
{
"epoch": 0.6565349544072948,
"grad_norm": 1.0390625,
"learning_rate": 0.00016983127675648452,
"loss": 1.9258,
"step": 945
},
{
"epoch": 0.6572297003907946,
"grad_norm": 1.3046875,
"learning_rate": 0.00016978091160916648,
"loss": 2.153,
"step": 946
},
{
"epoch": 0.6579244463742944,
"grad_norm": 1.015625,
"learning_rate": 0.00016973054646184842,
"loss": 1.7945,
"step": 947
},
{
"epoch": 0.6586191923577942,
"grad_norm": 0.9453125,
"learning_rate": 0.00016968018131453035,
"loss": 1.9769,
"step": 948
},
{
"epoch": 0.6593139383412939,
"grad_norm": 0.8203125,
"learning_rate": 0.0001696298161672123,
"loss": 1.9792,
"step": 949
},
{
"epoch": 0.6600086843247938,
"grad_norm": 1.1640625,
"learning_rate": 0.00016957945101989423,
"loss": 1.9845,
"step": 950
},
{
"epoch": 0.6607034303082935,
"grad_norm": 1.0234375,
"learning_rate": 0.0001695290858725762,
"loss": 1.9359,
"step": 951
},
{
"epoch": 0.6613981762917933,
"grad_norm": 4.1875,
"learning_rate": 0.00016947872072525813,
"loss": 1.9572,
"step": 952
},
{
"epoch": 0.6620929222752931,
"grad_norm": 0.70703125,
"learning_rate": 0.0001694283555779401,
"loss": 1.3144,
"step": 953
},
{
"epoch": 0.6627876682587929,
"grad_norm": 0.76171875,
"learning_rate": 0.00016937799043062203,
"loss": 1.4918,
"step": 954
},
{
"epoch": 0.6634824142422927,
"grad_norm": 0.80859375,
"learning_rate": 0.00016932762528330396,
"loss": 2.0,
"step": 955
},
{
"epoch": 0.6641771602257924,
"grad_norm": 0.7890625,
"learning_rate": 0.0001692772601359859,
"loss": 1.7969,
"step": 956
},
{
"epoch": 0.6648719062092923,
"grad_norm": 0.62109375,
"learning_rate": 0.00016922689498866783,
"loss": 1.7975,
"step": 957
},
{
"epoch": 0.665566652192792,
"grad_norm": 0.703125,
"learning_rate": 0.0001691765298413498,
"loss": 1.5022,
"step": 958
},
{
"epoch": 0.6662613981762918,
"grad_norm": 0.921875,
"learning_rate": 0.00016912616469403173,
"loss": 1.7859,
"step": 959
},
{
"epoch": 0.6669561441597915,
"grad_norm": 0.8671875,
"learning_rate": 0.0001690757995467137,
"loss": 2.1235,
"step": 960
},
{
"epoch": 0.6676508901432914,
"grad_norm": 0.7421875,
"learning_rate": 0.00016902543439939563,
"loss": 1.8601,
"step": 961
},
{
"epoch": 0.6683456361267911,
"grad_norm": 0.73828125,
"learning_rate": 0.00016897506925207757,
"loss": 2.0707,
"step": 962
},
{
"epoch": 0.6690403821102909,
"grad_norm": 0.87109375,
"learning_rate": 0.0001689247041047595,
"loss": 1.9595,
"step": 963
},
{
"epoch": 0.6697351280937907,
"grad_norm": 1.8671875,
"learning_rate": 0.00016887433895744147,
"loss": 2.1069,
"step": 964
},
{
"epoch": 0.6704298740772905,
"grad_norm": 0.921875,
"learning_rate": 0.0001688239738101234,
"loss": 1.6447,
"step": 965
},
{
"epoch": 0.6711246200607903,
"grad_norm": 0.8203125,
"learning_rate": 0.00016877360866280534,
"loss": 1.8459,
"step": 966
},
{
"epoch": 0.67181936604429,
"grad_norm": 2.53125,
"learning_rate": 0.0001687232435154873,
"loss": 1.9345,
"step": 967
},
{
"epoch": 0.6725141120277899,
"grad_norm": 1.625,
"learning_rate": 0.00016867287836816924,
"loss": 1.8392,
"step": 968
},
{
"epoch": 0.6732088580112896,
"grad_norm": 0.93359375,
"learning_rate": 0.00016862251322085118,
"loss": 1.8335,
"step": 969
},
{
"epoch": 0.6739036039947894,
"grad_norm": 0.80859375,
"learning_rate": 0.00016857214807353312,
"loss": 1.8878,
"step": 970
},
{
"epoch": 0.6745983499782892,
"grad_norm": 3.390625,
"learning_rate": 0.00016852178292621508,
"loss": 2.0614,
"step": 971
},
{
"epoch": 0.675293095961789,
"grad_norm": 0.80078125,
"learning_rate": 0.00016847141777889701,
"loss": 1.7292,
"step": 972
},
{
"epoch": 0.6759878419452887,
"grad_norm": 1.34375,
"learning_rate": 0.00016842105263157895,
"loss": 2.096,
"step": 973
},
{
"epoch": 0.6766825879287885,
"grad_norm": 1.28125,
"learning_rate": 0.00016837068748426091,
"loss": 1.7483,
"step": 974
},
{
"epoch": 0.6773773339122883,
"grad_norm": 0.77734375,
"learning_rate": 0.00016832032233694282,
"loss": 1.8725,
"step": 975
},
{
"epoch": 0.6780720798957881,
"grad_norm": 0.9296875,
"learning_rate": 0.0001682699571896248,
"loss": 2.1252,
"step": 976
},
{
"epoch": 0.6787668258792879,
"grad_norm": 0.89453125,
"learning_rate": 0.00016821959204230672,
"loss": 1.6795,
"step": 977
},
{
"epoch": 0.6794615718627877,
"grad_norm": 1.0859375,
"learning_rate": 0.0001681692268949887,
"loss": 1.7347,
"step": 978
},
{
"epoch": 0.6801563178462875,
"grad_norm": 0.7734375,
"learning_rate": 0.00016811886174767062,
"loss": 2.0712,
"step": 979
},
{
"epoch": 0.6808510638297872,
"grad_norm": 0.8125,
"learning_rate": 0.00016806849660035256,
"loss": 1.7589,
"step": 980
},
{
"epoch": 0.681545809813287,
"grad_norm": 1.1015625,
"learning_rate": 0.00016801813145303452,
"loss": 2.2129,
"step": 981
},
{
"epoch": 0.6822405557967868,
"grad_norm": 0.9375,
"learning_rate": 0.00016796776630571643,
"loss": 2.141,
"step": 982
},
{
"epoch": 0.6829353017802866,
"grad_norm": 0.97265625,
"learning_rate": 0.0001679174011583984,
"loss": 1.8133,
"step": 983
},
{
"epoch": 0.6836300477637863,
"grad_norm": 0.76953125,
"learning_rate": 0.00016786703601108033,
"loss": 1.7274,
"step": 984
},
{
"epoch": 0.6843247937472862,
"grad_norm": 0.65625,
"learning_rate": 0.0001678166708637623,
"loss": 1.7442,
"step": 985
},
{
"epoch": 0.6850195397307859,
"grad_norm": 0.80859375,
"learning_rate": 0.00016776630571644423,
"loss": 1.7292,
"step": 986
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.875,
"learning_rate": 0.00016771594056912617,
"loss": 1.8515,
"step": 987
},
{
"epoch": 0.6864090316977854,
"grad_norm": 0.58984375,
"learning_rate": 0.00016766557542180813,
"loss": 1.3847,
"step": 988
},
{
"epoch": 0.6871037776812853,
"grad_norm": 0.703125,
"learning_rate": 0.00016761521027449004,
"loss": 1.9493,
"step": 989
},
{
"epoch": 0.6877985236647851,
"grad_norm": 0.765625,
"learning_rate": 0.000167564845127172,
"loss": 1.8116,
"step": 990
},
{
"epoch": 0.6884932696482848,
"grad_norm": 1.015625,
"learning_rate": 0.00016751447997985394,
"loss": 1.5132,
"step": 991
},
{
"epoch": 0.6891880156317847,
"grad_norm": 0.94921875,
"learning_rate": 0.0001674641148325359,
"loss": 2.2143,
"step": 992
},
{
"epoch": 0.6898827616152844,
"grad_norm": 0.859375,
"learning_rate": 0.00016741374968521784,
"loss": 1.8619,
"step": 993
},
{
"epoch": 0.6905775075987842,
"grad_norm": 0.81640625,
"learning_rate": 0.0001673633845378998,
"loss": 1.8431,
"step": 994
},
{
"epoch": 0.691272253582284,
"grad_norm": 0.75,
"learning_rate": 0.00016731301939058174,
"loss": 1.942,
"step": 995
},
{
"epoch": 0.6919669995657838,
"grad_norm": 0.953125,
"learning_rate": 0.00016726265424326365,
"loss": 2.0124,
"step": 996
},
{
"epoch": 0.6926617455492835,
"grad_norm": 0.9453125,
"learning_rate": 0.0001672122890959456,
"loss": 1.9465,
"step": 997
},
{
"epoch": 0.6933564915327833,
"grad_norm": 0.8984375,
"learning_rate": 0.00016716192394862755,
"loss": 1.6335,
"step": 998
},
{
"epoch": 0.6940512375162831,
"grad_norm": 0.89453125,
"learning_rate": 0.0001671115588013095,
"loss": 1.8172,
"step": 999
},
{
"epoch": 0.6947459834997829,
"grad_norm": 1.234375,
"learning_rate": 0.00016706119365399145,
"loss": 1.8174,
"step": 1000
},
{
"epoch": 0.6954407294832827,
"grad_norm": 0.703125,
"learning_rate": 0.0001670108285066734,
"loss": 1.8297,
"step": 1001
},
{
"epoch": 0.6961354754667824,
"grad_norm": 0.73828125,
"learning_rate": 0.00016696046335935535,
"loss": 1.9633,
"step": 1002
},
{
"epoch": 0.6968302214502823,
"grad_norm": 1.1171875,
"learning_rate": 0.00016691009821203726,
"loss": 2.1313,
"step": 1003
},
{
"epoch": 0.697524967433782,
"grad_norm": 0.83203125,
"learning_rate": 0.00016685973306471922,
"loss": 1.6867,
"step": 1004
},
{
"epoch": 0.6982197134172818,
"grad_norm": 0.71484375,
"learning_rate": 0.00016680936791740116,
"loss": 1.4534,
"step": 1005
},
{
"epoch": 0.6989144594007816,
"grad_norm": 1.3515625,
"learning_rate": 0.00016675900277008312,
"loss": 2.0626,
"step": 1006
},
{
"epoch": 0.6996092053842814,
"grad_norm": 0.859375,
"learning_rate": 0.00016670863762276506,
"loss": 2.0868,
"step": 1007
},
{
"epoch": 0.7003039513677811,
"grad_norm": 0.8984375,
"learning_rate": 0.00016665827247544702,
"loss": 1.6758,
"step": 1008
},
{
"epoch": 0.700998697351281,
"grad_norm": 0.71484375,
"learning_rate": 0.00016660790732812893,
"loss": 2.0535,
"step": 1009
},
{
"epoch": 0.7016934433347807,
"grad_norm": 1.234375,
"learning_rate": 0.00016655754218081087,
"loss": 1.7197,
"step": 1010
},
{
"epoch": 0.7023881893182805,
"grad_norm": 1.8359375,
"learning_rate": 0.00016650717703349283,
"loss": 2.23,
"step": 1011
},
{
"epoch": 0.7030829353017802,
"grad_norm": 0.9296875,
"learning_rate": 0.00016645681188617477,
"loss": 1.4958,
"step": 1012
},
{
"epoch": 0.7037776812852801,
"grad_norm": 1.078125,
"learning_rate": 0.00016640644673885673,
"loss": 1.569,
"step": 1013
},
{
"epoch": 0.7044724272687799,
"grad_norm": 1.3671875,
"learning_rate": 0.00016635608159153866,
"loss": 1.9083,
"step": 1014
},
{
"epoch": 0.7051671732522796,
"grad_norm": 1.015625,
"learning_rate": 0.00016630571644422063,
"loss": 2.0236,
"step": 1015
},
{
"epoch": 0.7058619192357795,
"grad_norm": 1.109375,
"learning_rate": 0.00016625535129690254,
"loss": 1.8555,
"step": 1016
},
{
"epoch": 0.7065566652192792,
"grad_norm": 1.1796875,
"learning_rate": 0.00016620498614958447,
"loss": 2.0917,
"step": 1017
},
{
"epoch": 0.707251411202779,
"grad_norm": 0.92578125,
"learning_rate": 0.00016615462100226644,
"loss": 1.9695,
"step": 1018
},
{
"epoch": 0.7079461571862787,
"grad_norm": 0.486328125,
"learning_rate": 0.00016610425585494837,
"loss": 1.951,
"step": 1019
},
{
"epoch": 0.7086409031697786,
"grad_norm": 0.82421875,
"learning_rate": 0.00016605389070763034,
"loss": 2.0722,
"step": 1020
},
{
"epoch": 0.7093356491532783,
"grad_norm": 1.109375,
"learning_rate": 0.00016600352556031227,
"loss": 1.7612,
"step": 1021
},
{
"epoch": 0.7100303951367781,
"grad_norm": 0.8828125,
"learning_rate": 0.00016595316041299424,
"loss": 1.5708,
"step": 1022
},
{
"epoch": 0.7107251411202778,
"grad_norm": 0.9140625,
"learning_rate": 0.00016590279526567615,
"loss": 2.0463,
"step": 1023
},
{
"epoch": 0.7114198871037777,
"grad_norm": 1.453125,
"learning_rate": 0.00016585243011835808,
"loss": 1.9702,
"step": 1024
},
{
"epoch": 0.7121146330872775,
"grad_norm": 1.015625,
"learning_rate": 0.00016580206497104005,
"loss": 1.6529,
"step": 1025
},
{
"epoch": 0.7128093790707772,
"grad_norm": 0.890625,
"learning_rate": 0.00016575169982372198,
"loss": 1.8015,
"step": 1026
},
{
"epoch": 0.7135041250542771,
"grad_norm": 0.78125,
"learning_rate": 0.00016570133467640395,
"loss": 2.2328,
"step": 1027
},
{
"epoch": 0.7141988710377768,
"grad_norm": 0.9375,
"learning_rate": 0.00016565096952908588,
"loss": 1.9973,
"step": 1028
},
{
"epoch": 0.7148936170212766,
"grad_norm": 1.5703125,
"learning_rate": 0.00016560060438176784,
"loss": 2.0212,
"step": 1029
},
{
"epoch": 0.7155883630047764,
"grad_norm": 1.078125,
"learning_rate": 0.00016555023923444975,
"loss": 1.986,
"step": 1030
},
{
"epoch": 0.7162831089882762,
"grad_norm": 0.62890625,
"learning_rate": 0.00016549987408713172,
"loss": 2.0747,
"step": 1031
},
{
"epoch": 0.7169778549717759,
"grad_norm": 0.79296875,
"learning_rate": 0.00016544950893981365,
"loss": 1.8655,
"step": 1032
},
{
"epoch": 0.7176726009552757,
"grad_norm": 1.7109375,
"learning_rate": 0.0001653991437924956,
"loss": 2.2013,
"step": 1033
},
{
"epoch": 0.7183673469387755,
"grad_norm": 1.1875,
"learning_rate": 0.00016534877864517755,
"loss": 2.1917,
"step": 1034
},
{
"epoch": 0.7190620929222753,
"grad_norm": 0.94140625,
"learning_rate": 0.0001652984134978595,
"loss": 1.8524,
"step": 1035
},
{
"epoch": 0.7197568389057751,
"grad_norm": 0.63671875,
"learning_rate": 0.00016524804835054145,
"loss": 2.0421,
"step": 1036
},
{
"epoch": 0.7204515848892749,
"grad_norm": 0.8515625,
"learning_rate": 0.00016519768320322336,
"loss": 1.6317,
"step": 1037
},
{
"epoch": 0.7211463308727747,
"grad_norm": 1.859375,
"learning_rate": 0.00016514731805590533,
"loss": 1.6442,
"step": 1038
},
{
"epoch": 0.7218410768562744,
"grad_norm": 0.9453125,
"learning_rate": 0.00016509695290858726,
"loss": 1.8236,
"step": 1039
},
{
"epoch": 0.7225358228397742,
"grad_norm": 1.328125,
"learning_rate": 0.0001650465877612692,
"loss": 1.6599,
"step": 1040
},
{
"epoch": 0.723230568823274,
"grad_norm": 0.69921875,
"learning_rate": 0.00016499622261395116,
"loss": 2.0284,
"step": 1041
},
{
"epoch": 0.7239253148067738,
"grad_norm": 3.515625,
"learning_rate": 0.0001649458574666331,
"loss": 1.6601,
"step": 1042
},
{
"epoch": 0.7246200607902735,
"grad_norm": 0.58203125,
"learning_rate": 0.00016489549231931503,
"loss": 1.3995,
"step": 1043
},
{
"epoch": 0.7253148067737734,
"grad_norm": 1.3671875,
"learning_rate": 0.00016484512717199697,
"loss": 1.9502,
"step": 1044
},
{
"epoch": 0.7260095527572731,
"grad_norm": 0.6484375,
"learning_rate": 0.00016479476202467893,
"loss": 1.7405,
"step": 1045
},
{
"epoch": 0.7267042987407729,
"grad_norm": 0.79296875,
"learning_rate": 0.00016474439687736087,
"loss": 1.6638,
"step": 1046
},
{
"epoch": 0.7273990447242726,
"grad_norm": 1.546875,
"learning_rate": 0.0001646940317300428,
"loss": 1.9068,
"step": 1047
},
{
"epoch": 0.7280937907077725,
"grad_norm": 0.5546875,
"learning_rate": 0.00016464366658272477,
"loss": 1.9891,
"step": 1048
},
{
"epoch": 0.7287885366912723,
"grad_norm": 1.328125,
"learning_rate": 0.0001645933014354067,
"loss": 1.6797,
"step": 1049
},
{
"epoch": 0.729483282674772,
"grad_norm": 0.859375,
"learning_rate": 0.00016454293628808864,
"loss": 2.1491,
"step": 1050
},
{
"epoch": 0.7301780286582719,
"grad_norm": 0.79296875,
"learning_rate": 0.00016449257114077058,
"loss": 1.6768,
"step": 1051
},
{
"epoch": 0.7308727746417716,
"grad_norm": 1.21875,
"learning_rate": 0.00016444220599345254,
"loss": 1.6134,
"step": 1052
},
{
"epoch": 0.7315675206252714,
"grad_norm": 0.98046875,
"learning_rate": 0.00016439184084613448,
"loss": 1.4917,
"step": 1053
},
{
"epoch": 0.7322622666087711,
"grad_norm": 1.0546875,
"learning_rate": 0.00016434147569881642,
"loss": 1.9061,
"step": 1054
},
{
"epoch": 0.732957012592271,
"grad_norm": 0.765625,
"learning_rate": 0.00016429111055149838,
"loss": 2.0705,
"step": 1055
},
{
"epoch": 0.7336517585757707,
"grad_norm": 0.99609375,
"learning_rate": 0.00016424074540418031,
"loss": 1.7986,
"step": 1056
},
{
"epoch": 0.7343465045592705,
"grad_norm": 1.203125,
"learning_rate": 0.00016419038025686225,
"loss": 1.5941,
"step": 1057
},
{
"epoch": 0.7350412505427703,
"grad_norm": 1.03125,
"learning_rate": 0.0001641400151095442,
"loss": 2.0374,
"step": 1058
},
{
"epoch": 0.7357359965262701,
"grad_norm": 0.8984375,
"learning_rate": 0.00016408964996222615,
"loss": 1.8155,
"step": 1059
},
{
"epoch": 0.7364307425097699,
"grad_norm": 1.703125,
"learning_rate": 0.0001640392848149081,
"loss": 1.7869,
"step": 1060
},
{
"epoch": 0.7371254884932696,
"grad_norm": 0.9609375,
"learning_rate": 0.00016398891966759005,
"loss": 2.0092,
"step": 1061
},
{
"epoch": 0.7378202344767695,
"grad_norm": 0.8125,
"learning_rate": 0.000163938554520272,
"loss": 1.8563,
"step": 1062
},
{
"epoch": 0.7385149804602692,
"grad_norm": 3.453125,
"learning_rate": 0.00016388818937295392,
"loss": 1.93,
"step": 1063
},
{
"epoch": 0.739209726443769,
"grad_norm": 1.234375,
"learning_rate": 0.00016383782422563586,
"loss": 1.957,
"step": 1064
},
{
"epoch": 0.7399044724272688,
"grad_norm": 1.25,
"learning_rate": 0.0001637874590783178,
"loss": 1.6903,
"step": 1065
},
{
"epoch": 0.7405992184107686,
"grad_norm": 1.0546875,
"learning_rate": 0.00016373709393099976,
"loss": 1.7496,
"step": 1066
},
{
"epoch": 0.7412939643942683,
"grad_norm": 1.15625,
"learning_rate": 0.0001636867287836817,
"loss": 2.2642,
"step": 1067
},
{
"epoch": 0.7419887103777681,
"grad_norm": 0.83203125,
"learning_rate": 0.00016363636363636366,
"loss": 1.8255,
"step": 1068
},
{
"epoch": 0.7426834563612679,
"grad_norm": 1.0625,
"learning_rate": 0.0001635859984890456,
"loss": 2.0617,
"step": 1069
},
{
"epoch": 0.7433782023447677,
"grad_norm": 0.65234375,
"learning_rate": 0.00016353563334172753,
"loss": 2.2496,
"step": 1070
},
{
"epoch": 0.7440729483282674,
"grad_norm": 1.5390625,
"learning_rate": 0.00016348526819440947,
"loss": 2.2341,
"step": 1071
},
{
"epoch": 0.7447676943117673,
"grad_norm": 0.65234375,
"learning_rate": 0.0001634349030470914,
"loss": 2.0895,
"step": 1072
},
{
"epoch": 0.7454624402952671,
"grad_norm": 1.125,
"learning_rate": 0.00016338453789977337,
"loss": 1.3802,
"step": 1073
},
{
"epoch": 0.7461571862787668,
"grad_norm": 0.84765625,
"learning_rate": 0.0001633341727524553,
"loss": 1.874,
"step": 1074
},
{
"epoch": 0.7468519322622666,
"grad_norm": 1.03125,
"learning_rate": 0.00016328380760513727,
"loss": 2.2254,
"step": 1075
},
{
"epoch": 0.7475466782457664,
"grad_norm": 0.9609375,
"learning_rate": 0.0001632334424578192,
"loss": 1.6083,
"step": 1076
},
{
"epoch": 0.7482414242292662,
"grad_norm": 1.0625,
"learning_rate": 0.00016318307731050114,
"loss": 2.1731,
"step": 1077
},
{
"epoch": 0.7489361702127659,
"grad_norm": 0.71875,
"learning_rate": 0.00016313271216318308,
"loss": 1.7249,
"step": 1078
},
{
"epoch": 0.7496309161962658,
"grad_norm": 0.66015625,
"learning_rate": 0.000163082347015865,
"loss": 1.9163,
"step": 1079
},
{
"epoch": 0.7503256621797655,
"grad_norm": 0.65625,
"learning_rate": 0.00016303198186854698,
"loss": 1.8562,
"step": 1080
},
{
"epoch": 0.7510204081632653,
"grad_norm": 0.6171875,
"learning_rate": 0.0001629816167212289,
"loss": 1.7651,
"step": 1081
},
{
"epoch": 0.751715154146765,
"grad_norm": 1.28125,
"learning_rate": 0.00016293125157391088,
"loss": 2.4086,
"step": 1082
},
{
"epoch": 0.7524099001302649,
"grad_norm": 0.6015625,
"learning_rate": 0.0001628808864265928,
"loss": 1.6701,
"step": 1083
},
{
"epoch": 0.7531046461137647,
"grad_norm": 0.83203125,
"learning_rate": 0.00016283052127927475,
"loss": 1.7093,
"step": 1084
},
{
"epoch": 0.7537993920972644,
"grad_norm": 2.1875,
"learning_rate": 0.00016278015613195668,
"loss": 1.8675,
"step": 1085
},
{
"epoch": 0.7544941380807643,
"grad_norm": 0.73828125,
"learning_rate": 0.00016272979098463862,
"loss": 1.8314,
"step": 1086
},
{
"epoch": 0.755188884064264,
"grad_norm": 0.8046875,
"learning_rate": 0.00016267942583732058,
"loss": 1.9986,
"step": 1087
},
{
"epoch": 0.7558836300477638,
"grad_norm": 0.73828125,
"learning_rate": 0.00016262906069000252,
"loss": 2.0108,
"step": 1088
},
{
"epoch": 0.7565783760312635,
"grad_norm": 0.68359375,
"learning_rate": 0.00016257869554268448,
"loss": 1.6761,
"step": 1089
},
{
"epoch": 0.7572731220147634,
"grad_norm": 0.76953125,
"learning_rate": 0.00016252833039536642,
"loss": 1.9731,
"step": 1090
},
{
"epoch": 0.7579678679982631,
"grad_norm": 1.15625,
"learning_rate": 0.00016247796524804836,
"loss": 1.6051,
"step": 1091
},
{
"epoch": 0.7586626139817629,
"grad_norm": 0.78515625,
"learning_rate": 0.0001624276001007303,
"loss": 2.0622,
"step": 1092
},
{
"epoch": 0.7593573599652627,
"grad_norm": 0.76171875,
"learning_rate": 0.00016237723495341223,
"loss": 1.8382,
"step": 1093
},
{
"epoch": 0.7600521059487625,
"grad_norm": 0.8046875,
"learning_rate": 0.0001623268698060942,
"loss": 2.0037,
"step": 1094
},
{
"epoch": 0.7607468519322622,
"grad_norm": 0.734375,
"learning_rate": 0.00016227650465877613,
"loss": 2.0415,
"step": 1095
},
{
"epoch": 0.761441597915762,
"grad_norm": 0.8515625,
"learning_rate": 0.0001622261395114581,
"loss": 2.0115,
"step": 1096
},
{
"epoch": 0.7621363438992619,
"grad_norm": 0.77734375,
"learning_rate": 0.00016217577436414003,
"loss": 2.1002,
"step": 1097
},
{
"epoch": 0.7628310898827616,
"grad_norm": 0.9140625,
"learning_rate": 0.00016212540921682196,
"loss": 1.3513,
"step": 1098
},
{
"epoch": 0.7635258358662614,
"grad_norm": 0.65625,
"learning_rate": 0.0001620750440695039,
"loss": 1.8398,
"step": 1099
},
{
"epoch": 0.7642205818497612,
"grad_norm": 1.3828125,
"learning_rate": 0.00016202467892218584,
"loss": 2.1121,
"step": 1100
},
{
"epoch": 0.764915327833261,
"grad_norm": 0.90234375,
"learning_rate": 0.0001619743137748678,
"loss": 1.3797,
"step": 1101
},
{
"epoch": 0.7656100738167607,
"grad_norm": 1.046875,
"learning_rate": 0.00016192394862754974,
"loss": 2.0456,
"step": 1102
},
{
"epoch": 0.7663048198002606,
"grad_norm": 0.6875,
"learning_rate": 0.0001618735834802317,
"loss": 1.7388,
"step": 1103
},
{
"epoch": 0.7669995657837603,
"grad_norm": 0.9609375,
"learning_rate": 0.00016182321833291364,
"loss": 2.0159,
"step": 1104
},
{
"epoch": 0.7676943117672601,
"grad_norm": 0.921875,
"learning_rate": 0.00016177285318559557,
"loss": 1.6245,
"step": 1105
},
{
"epoch": 0.7683890577507598,
"grad_norm": 1.15625,
"learning_rate": 0.0001617224880382775,
"loss": 1.9893,
"step": 1106
},
{
"epoch": 0.7690838037342597,
"grad_norm": 1.015625,
"learning_rate": 0.00016167212289095945,
"loss": 1.8131,
"step": 1107
},
{
"epoch": 0.7697785497177595,
"grad_norm": 0.89453125,
"learning_rate": 0.0001616217577436414,
"loss": 2.1454,
"step": 1108
},
{
"epoch": 0.7704732957012592,
"grad_norm": 0.93359375,
"learning_rate": 0.00016157139259632335,
"loss": 1.9464,
"step": 1109
},
{
"epoch": 0.771168041684759,
"grad_norm": 0.68359375,
"learning_rate": 0.0001615210274490053,
"loss": 1.5576,
"step": 1110
},
{
"epoch": 0.7718627876682588,
"grad_norm": 0.8203125,
"learning_rate": 0.00016147066230168725,
"loss": 1.524,
"step": 1111
},
{
"epoch": 0.7725575336517586,
"grad_norm": 0.6484375,
"learning_rate": 0.00016142029715436918,
"loss": 1.597,
"step": 1112
},
{
"epoch": 0.7732522796352583,
"grad_norm": 1.1953125,
"learning_rate": 0.00016136993200705112,
"loss": 1.8816,
"step": 1113
},
{
"epoch": 0.7739470256187582,
"grad_norm": 0.68359375,
"learning_rate": 0.00016131956685973305,
"loss": 2.019,
"step": 1114
},
{
"epoch": 0.7746417716022579,
"grad_norm": 0.7890625,
"learning_rate": 0.00016126920171241502,
"loss": 1.7259,
"step": 1115
},
{
"epoch": 0.7753365175857577,
"grad_norm": 0.90625,
"learning_rate": 0.00016121883656509695,
"loss": 1.7952,
"step": 1116
},
{
"epoch": 0.7760312635692574,
"grad_norm": 2.0,
"learning_rate": 0.00016116847141777892,
"loss": 1.8621,
"step": 1117
},
{
"epoch": 0.7767260095527573,
"grad_norm": 1.015625,
"learning_rate": 0.00016111810627046085,
"loss": 2.1855,
"step": 1118
},
{
"epoch": 0.777420755536257,
"grad_norm": 1.0234375,
"learning_rate": 0.0001610677411231428,
"loss": 1.9703,
"step": 1119
},
{
"epoch": 0.7781155015197568,
"grad_norm": 0.94921875,
"learning_rate": 0.00016101737597582473,
"loss": 2.0817,
"step": 1120
},
{
"epoch": 0.7788102475032567,
"grad_norm": 1.2421875,
"learning_rate": 0.00016096701082850666,
"loss": 2.18,
"step": 1121
},
{
"epoch": 0.7795049934867564,
"grad_norm": 1.03125,
"learning_rate": 0.00016091664568118863,
"loss": 1.8113,
"step": 1122
},
{
"epoch": 0.7801997394702562,
"grad_norm": 0.94140625,
"learning_rate": 0.00016086628053387056,
"loss": 1.9625,
"step": 1123
},
{
"epoch": 0.780894485453756,
"grad_norm": 0.8203125,
"learning_rate": 0.00016081591538655253,
"loss": 1.6948,
"step": 1124
},
{
"epoch": 0.7815892314372558,
"grad_norm": 0.99609375,
"learning_rate": 0.00016076555023923446,
"loss": 2.1205,
"step": 1125
},
{
"epoch": 0.7822839774207555,
"grad_norm": 1.1015625,
"learning_rate": 0.0001607151850919164,
"loss": 1.8704,
"step": 1126
},
{
"epoch": 0.7829787234042553,
"grad_norm": 0.87109375,
"learning_rate": 0.00016066481994459833,
"loss": 1.6526,
"step": 1127
},
{
"epoch": 0.7836734693877551,
"grad_norm": 0.69140625,
"learning_rate": 0.0001606144547972803,
"loss": 2.0315,
"step": 1128
},
{
"epoch": 0.7843682153712549,
"grad_norm": 0.9296875,
"learning_rate": 0.00016056408964996223,
"loss": 2.0635,
"step": 1129
},
{
"epoch": 0.7850629613547546,
"grad_norm": 0.859375,
"learning_rate": 0.00016051372450264417,
"loss": 1.9912,
"step": 1130
}
],
"logging_steps": 1,
"max_steps": 4317,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"total_flos": 7.924900854625124e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}