llama3-8b-sft-full-mn / trainer_state.json
Dynosaur's picture
Model save
01f7fd2 verified
raw
history blame contribute delete
No virus
165 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999075907590759,
"eval_steps": 500,
"global_step": 946,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001056105610561056,
"grad_norm": 25.502913554245602,
"learning_rate": 2.105263157894737e-07,
"loss": 1.6603,
"step": 1
},
{
"epoch": 0.002112211221122112,
"grad_norm": 24.684662166189923,
"learning_rate": 4.210526315789474e-07,
"loss": 1.6776,
"step": 2
},
{
"epoch": 0.0031683168316831685,
"grad_norm": 25.937156208692112,
"learning_rate": 6.315789473684211e-07,
"loss": 1.7195,
"step": 3
},
{
"epoch": 0.004224422442244224,
"grad_norm": 25.37329937507709,
"learning_rate": 8.421052631578948e-07,
"loss": 1.6376,
"step": 4
},
{
"epoch": 0.005280528052805281,
"grad_norm": 22.914455804002028,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.6745,
"step": 5
},
{
"epoch": 0.006336633663366337,
"grad_norm": 18.71489420172178,
"learning_rate": 1.2631578947368422e-06,
"loss": 1.5926,
"step": 6
},
{
"epoch": 0.007392739273927393,
"grad_norm": 17.442510603544665,
"learning_rate": 1.4736842105263159e-06,
"loss": 1.5996,
"step": 7
},
{
"epoch": 0.008448844884488448,
"grad_norm": 11.035439167252106,
"learning_rate": 1.6842105263157895e-06,
"loss": 1.4839,
"step": 8
},
{
"epoch": 0.009504950495049505,
"grad_norm": 10.359727005326327,
"learning_rate": 1.8947368421052634e-06,
"loss": 1.444,
"step": 9
},
{
"epoch": 0.010561056105610561,
"grad_norm": 8.728411134919265,
"learning_rate": 2.105263157894737e-06,
"loss": 1.4256,
"step": 10
},
{
"epoch": 0.011617161716171618,
"grad_norm": 4.818945629690555,
"learning_rate": 2.3157894736842105e-06,
"loss": 1.3524,
"step": 11
},
{
"epoch": 0.012673267326732674,
"grad_norm": 4.27046680140822,
"learning_rate": 2.5263157894736844e-06,
"loss": 1.3469,
"step": 12
},
{
"epoch": 0.013729372937293729,
"grad_norm": 3.794294200313254,
"learning_rate": 2.7368421052631583e-06,
"loss": 1.3905,
"step": 13
},
{
"epoch": 0.014785478547854785,
"grad_norm": 2.2340045129056976,
"learning_rate": 2.9473684210526317e-06,
"loss": 1.3265,
"step": 14
},
{
"epoch": 0.015841584158415842,
"grad_norm": 2.087270218969115,
"learning_rate": 3.157894736842105e-06,
"loss": 1.2932,
"step": 15
},
{
"epoch": 0.016897689768976897,
"grad_norm": 1.8996187727979443,
"learning_rate": 3.368421052631579e-06,
"loss": 1.3021,
"step": 16
},
{
"epoch": 0.017953795379537955,
"grad_norm": 1.601408336569392,
"learning_rate": 3.578947368421053e-06,
"loss": 1.2886,
"step": 17
},
{
"epoch": 0.01900990099009901,
"grad_norm": 1.3273459437302755,
"learning_rate": 3.789473684210527e-06,
"loss": 1.2075,
"step": 18
},
{
"epoch": 0.020066006600660068,
"grad_norm": 1.5140067280164757,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2416,
"step": 19
},
{
"epoch": 0.021122112211221122,
"grad_norm": 1.2762306307100535,
"learning_rate": 4.210526315789474e-06,
"loss": 1.209,
"step": 20
},
{
"epoch": 0.022178217821782177,
"grad_norm": 1.3732608692671324,
"learning_rate": 4.4210526315789476e-06,
"loss": 1.2203,
"step": 21
},
{
"epoch": 0.023234323432343235,
"grad_norm": 1.1927447535405533,
"learning_rate": 4.631578947368421e-06,
"loss": 1.1774,
"step": 22
},
{
"epoch": 0.02429042904290429,
"grad_norm": 1.1312372728980389,
"learning_rate": 4.842105263157895e-06,
"loss": 1.2149,
"step": 23
},
{
"epoch": 0.025346534653465348,
"grad_norm": 0.9297200194995964,
"learning_rate": 5.052631578947369e-06,
"loss": 1.1829,
"step": 24
},
{
"epoch": 0.026402640264026403,
"grad_norm": 0.84338772082761,
"learning_rate": 5.263157894736842e-06,
"loss": 1.1688,
"step": 25
},
{
"epoch": 0.027458745874587458,
"grad_norm": 1.0117469132601131,
"learning_rate": 5.4736842105263165e-06,
"loss": 1.1664,
"step": 26
},
{
"epoch": 0.028514851485148516,
"grad_norm": 0.840878824216462,
"learning_rate": 5.68421052631579e-06,
"loss": 1.1894,
"step": 27
},
{
"epoch": 0.02957095709570957,
"grad_norm": 0.8695262241575115,
"learning_rate": 5.8947368421052634e-06,
"loss": 1.2137,
"step": 28
},
{
"epoch": 0.030627062706270625,
"grad_norm": 0.9522702264944957,
"learning_rate": 6.105263157894738e-06,
"loss": 1.1908,
"step": 29
},
{
"epoch": 0.031683168316831684,
"grad_norm": 0.7477754943944812,
"learning_rate": 6.31578947368421e-06,
"loss": 1.1583,
"step": 30
},
{
"epoch": 0.03273927392739274,
"grad_norm": 0.874775633870445,
"learning_rate": 6.526315789473685e-06,
"loss": 1.1982,
"step": 31
},
{
"epoch": 0.03379537953795379,
"grad_norm": 0.7973527566984862,
"learning_rate": 6.736842105263158e-06,
"loss": 1.1876,
"step": 32
},
{
"epoch": 0.034851485148514855,
"grad_norm": 0.812120195425406,
"learning_rate": 6.947368421052632e-06,
"loss": 1.1597,
"step": 33
},
{
"epoch": 0.03590759075907591,
"grad_norm": 0.8114493228466256,
"learning_rate": 7.157894736842106e-06,
"loss": 1.1348,
"step": 34
},
{
"epoch": 0.036963696369636964,
"grad_norm": 0.7550443626683123,
"learning_rate": 7.368421052631579e-06,
"loss": 1.1609,
"step": 35
},
{
"epoch": 0.03801980198019802,
"grad_norm": 0.7891867751175521,
"learning_rate": 7.578947368421054e-06,
"loss": 1.1492,
"step": 36
},
{
"epoch": 0.039075907590759074,
"grad_norm": 0.7334134231688958,
"learning_rate": 7.789473684210526e-06,
"loss": 1.1249,
"step": 37
},
{
"epoch": 0.040132013201320135,
"grad_norm": 0.7049287018133161,
"learning_rate": 8.000000000000001e-06,
"loss": 1.1631,
"step": 38
},
{
"epoch": 0.04118811881188119,
"grad_norm": 0.6980781134363873,
"learning_rate": 8.210526315789475e-06,
"loss": 1.1601,
"step": 39
},
{
"epoch": 0.042244224422442245,
"grad_norm": 0.7462769750300746,
"learning_rate": 8.421052631578948e-06,
"loss": 1.1838,
"step": 40
},
{
"epoch": 0.0433003300330033,
"grad_norm": 0.7474655673611977,
"learning_rate": 8.631578947368422e-06,
"loss": 1.2002,
"step": 41
},
{
"epoch": 0.044356435643564354,
"grad_norm": 0.8147941874253506,
"learning_rate": 8.842105263157895e-06,
"loss": 1.1636,
"step": 42
},
{
"epoch": 0.045412541254125416,
"grad_norm": 0.7023263887773031,
"learning_rate": 9.05263157894737e-06,
"loss": 1.156,
"step": 43
},
{
"epoch": 0.04646864686468647,
"grad_norm": 0.8018367270542662,
"learning_rate": 9.263157894736842e-06,
"loss": 1.1534,
"step": 44
},
{
"epoch": 0.047524752475247525,
"grad_norm": 1.1669111038248,
"learning_rate": 9.473684210526315e-06,
"loss": 1.1454,
"step": 45
},
{
"epoch": 0.04858085808580858,
"grad_norm": 0.7243210860470609,
"learning_rate": 9.68421052631579e-06,
"loss": 1.1254,
"step": 46
},
{
"epoch": 0.049636963696369635,
"grad_norm": 0.6461690746933314,
"learning_rate": 9.894736842105264e-06,
"loss": 1.1288,
"step": 47
},
{
"epoch": 0.050693069306930696,
"grad_norm": 0.7848888465495927,
"learning_rate": 1.0105263157894738e-05,
"loss": 1.1542,
"step": 48
},
{
"epoch": 0.05174917491749175,
"grad_norm": 0.7834236627733278,
"learning_rate": 1.0315789473684213e-05,
"loss": 1.1844,
"step": 49
},
{
"epoch": 0.052805280528052806,
"grad_norm": 0.7246046670057416,
"learning_rate": 1.0526315789473684e-05,
"loss": 1.1178,
"step": 50
},
{
"epoch": 0.05386138613861386,
"grad_norm": 0.7373929105064881,
"learning_rate": 1.073684210526316e-05,
"loss": 1.1484,
"step": 51
},
{
"epoch": 0.054917491749174915,
"grad_norm": 0.784356077606316,
"learning_rate": 1.0947368421052633e-05,
"loss": 1.1416,
"step": 52
},
{
"epoch": 0.05597359735973598,
"grad_norm": 0.8177227416157161,
"learning_rate": 1.1157894736842105e-05,
"loss": 1.1315,
"step": 53
},
{
"epoch": 0.05702970297029703,
"grad_norm": 0.9715104804921632,
"learning_rate": 1.136842105263158e-05,
"loss": 1.1653,
"step": 54
},
{
"epoch": 0.058085808580858087,
"grad_norm": 0.8170838220966891,
"learning_rate": 1.1578947368421053e-05,
"loss": 1.1073,
"step": 55
},
{
"epoch": 0.05914191419141914,
"grad_norm": 0.7462537399553113,
"learning_rate": 1.1789473684210527e-05,
"loss": 1.1354,
"step": 56
},
{
"epoch": 0.060198019801980196,
"grad_norm": 0.7206352583245051,
"learning_rate": 1.2e-05,
"loss": 1.1272,
"step": 57
},
{
"epoch": 0.06125412541254125,
"grad_norm": 0.687562855659723,
"learning_rate": 1.2210526315789475e-05,
"loss": 1.1067,
"step": 58
},
{
"epoch": 0.06231023102310231,
"grad_norm": 0.7329563291742486,
"learning_rate": 1.2421052631578949e-05,
"loss": 1.1926,
"step": 59
},
{
"epoch": 0.06336633663366337,
"grad_norm": 0.7814530654271682,
"learning_rate": 1.263157894736842e-05,
"loss": 1.1438,
"step": 60
},
{
"epoch": 0.06442244224422443,
"grad_norm": 0.7851402185734687,
"learning_rate": 1.2842105263157896e-05,
"loss": 1.1573,
"step": 61
},
{
"epoch": 0.06547854785478548,
"grad_norm": 0.7139229981495742,
"learning_rate": 1.305263157894737e-05,
"loss": 1.1564,
"step": 62
},
{
"epoch": 0.06653465346534654,
"grad_norm": 0.6930887110463827,
"learning_rate": 1.3263157894736843e-05,
"loss": 1.0996,
"step": 63
},
{
"epoch": 0.06759075907590759,
"grad_norm": 0.6886226242482307,
"learning_rate": 1.3473684210526316e-05,
"loss": 1.112,
"step": 64
},
{
"epoch": 0.06864686468646865,
"grad_norm": 0.7692500021912094,
"learning_rate": 1.3684210526315791e-05,
"loss": 1.119,
"step": 65
},
{
"epoch": 0.06970297029702971,
"grad_norm": 0.7832518455809727,
"learning_rate": 1.3894736842105265e-05,
"loss": 1.1066,
"step": 66
},
{
"epoch": 0.07075907590759076,
"grad_norm": 0.6798486391668644,
"learning_rate": 1.4105263157894738e-05,
"loss": 1.156,
"step": 67
},
{
"epoch": 0.07181518151815182,
"grad_norm": 0.7474176818473223,
"learning_rate": 1.4315789473684212e-05,
"loss": 1.1641,
"step": 68
},
{
"epoch": 0.07287128712871287,
"grad_norm": 0.7727287470735991,
"learning_rate": 1.4526315789473687e-05,
"loss": 1.1455,
"step": 69
},
{
"epoch": 0.07392739273927393,
"grad_norm": 0.7223969480674177,
"learning_rate": 1.4736842105263159e-05,
"loss": 1.1662,
"step": 70
},
{
"epoch": 0.07498349834983499,
"grad_norm": 0.7393742928142755,
"learning_rate": 1.4947368421052632e-05,
"loss": 1.1116,
"step": 71
},
{
"epoch": 0.07603960396039604,
"grad_norm": 0.7332475867080597,
"learning_rate": 1.5157894736842107e-05,
"loss": 1.1325,
"step": 72
},
{
"epoch": 0.0770957095709571,
"grad_norm": 0.7410998060858934,
"learning_rate": 1.536842105263158e-05,
"loss": 1.1318,
"step": 73
},
{
"epoch": 0.07815181518151815,
"grad_norm": 0.7308502656202672,
"learning_rate": 1.5578947368421052e-05,
"loss": 1.0867,
"step": 74
},
{
"epoch": 0.07920792079207921,
"grad_norm": 0.7161183811893117,
"learning_rate": 1.578947368421053e-05,
"loss": 1.1168,
"step": 75
},
{
"epoch": 0.08026402640264027,
"grad_norm": 0.6893829461588648,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.1038,
"step": 76
},
{
"epoch": 0.08132013201320132,
"grad_norm": 0.7435959885435712,
"learning_rate": 1.6210526315789473e-05,
"loss": 1.1448,
"step": 77
},
{
"epoch": 0.08237623762376238,
"grad_norm": 0.8277082310122869,
"learning_rate": 1.642105263157895e-05,
"loss": 1.1797,
"step": 78
},
{
"epoch": 0.08343234323432343,
"grad_norm": 0.7677063538524401,
"learning_rate": 1.6631578947368423e-05,
"loss": 1.1223,
"step": 79
},
{
"epoch": 0.08448844884488449,
"grad_norm": 0.8400416024557994,
"learning_rate": 1.6842105263157896e-05,
"loss": 1.1381,
"step": 80
},
{
"epoch": 0.08554455445544555,
"grad_norm": 0.7326231238640358,
"learning_rate": 1.705263157894737e-05,
"loss": 1.1648,
"step": 81
},
{
"epoch": 0.0866006600660066,
"grad_norm": 0.8095424544573007,
"learning_rate": 1.7263157894736843e-05,
"loss": 1.115,
"step": 82
},
{
"epoch": 0.08765676567656766,
"grad_norm": 0.9063959786843001,
"learning_rate": 1.7473684210526317e-05,
"loss": 1.1213,
"step": 83
},
{
"epoch": 0.08871287128712871,
"grad_norm": 0.9956404471188963,
"learning_rate": 1.768421052631579e-05,
"loss": 1.1658,
"step": 84
},
{
"epoch": 0.08976897689768977,
"grad_norm": 0.7852074707162467,
"learning_rate": 1.7894736842105264e-05,
"loss": 1.0973,
"step": 85
},
{
"epoch": 0.09082508250825083,
"grad_norm": 0.8627994602873862,
"learning_rate": 1.810526315789474e-05,
"loss": 1.1654,
"step": 86
},
{
"epoch": 0.09188118811881188,
"grad_norm": 0.9671934003004978,
"learning_rate": 1.831578947368421e-05,
"loss": 1.1188,
"step": 87
},
{
"epoch": 0.09293729372937294,
"grad_norm": 1.00871689868374,
"learning_rate": 1.8526315789473684e-05,
"loss": 1.133,
"step": 88
},
{
"epoch": 0.09399339933993399,
"grad_norm": 0.8036355391896645,
"learning_rate": 1.873684210526316e-05,
"loss": 1.1588,
"step": 89
},
{
"epoch": 0.09504950495049505,
"grad_norm": 0.7378986406248232,
"learning_rate": 1.894736842105263e-05,
"loss": 1.1446,
"step": 90
},
{
"epoch": 0.09610561056105611,
"grad_norm": 1.0045392025555848,
"learning_rate": 1.9157894736842108e-05,
"loss": 1.1341,
"step": 91
},
{
"epoch": 0.09716171617161716,
"grad_norm": 1.063405415184342,
"learning_rate": 1.936842105263158e-05,
"loss": 1.0834,
"step": 92
},
{
"epoch": 0.09821782178217822,
"grad_norm": 0.8643175593742832,
"learning_rate": 1.9578947368421055e-05,
"loss": 1.1016,
"step": 93
},
{
"epoch": 0.09927392739273927,
"grad_norm": 0.8070337934394064,
"learning_rate": 1.9789473684210528e-05,
"loss": 1.1524,
"step": 94
},
{
"epoch": 0.10033003300330033,
"grad_norm": 0.9929097664518711,
"learning_rate": 2e-05,
"loss": 1.0985,
"step": 95
},
{
"epoch": 0.10138613861386139,
"grad_norm": 0.842000698402938,
"learning_rate": 1.9999931858743692e-05,
"loss": 1.1738,
"step": 96
},
{
"epoch": 0.10244224422442244,
"grad_norm": 1.0082898792698418,
"learning_rate": 1.9999727435903407e-05,
"loss": 1.1616,
"step": 97
},
{
"epoch": 0.1034983498349835,
"grad_norm": 0.9860524503399334,
"learning_rate": 1.9999386734265068e-05,
"loss": 1.1251,
"step": 98
},
{
"epoch": 0.10455445544554455,
"grad_norm": 0.9738327136961432,
"learning_rate": 1.999890975847185e-05,
"loss": 1.1282,
"step": 99
},
{
"epoch": 0.10561056105610561,
"grad_norm": 0.8023621741730762,
"learning_rate": 1.99982965150241e-05,
"loss": 1.116,
"step": 100
},
{
"epoch": 0.10666666666666667,
"grad_norm": 0.8828608595704724,
"learning_rate": 1.9997547012279245e-05,
"loss": 1.1464,
"step": 101
},
{
"epoch": 0.10772277227722772,
"grad_norm": 0.8062169100202478,
"learning_rate": 1.9996661260451706e-05,
"loss": 1.1352,
"step": 102
},
{
"epoch": 0.10877887788778878,
"grad_norm": 0.8629471318004416,
"learning_rate": 1.999563927161272e-05,
"loss": 1.1188,
"step": 103
},
{
"epoch": 0.10983498349834983,
"grad_norm": 0.8765875675743416,
"learning_rate": 1.9994481059690223e-05,
"loss": 1.127,
"step": 104
},
{
"epoch": 0.11089108910891089,
"grad_norm": 0.7914109008408529,
"learning_rate": 1.9993186640468603e-05,
"loss": 1.0927,
"step": 105
},
{
"epoch": 0.11194719471947195,
"grad_norm": 0.8426098697824409,
"learning_rate": 1.9991756031588542e-05,
"loss": 1.1501,
"step": 106
},
{
"epoch": 0.113003300330033,
"grad_norm": 0.9446394478273581,
"learning_rate": 1.999018925254673e-05,
"loss": 1.1368,
"step": 107
},
{
"epoch": 0.11405940594059406,
"grad_norm": 0.9489607108993651,
"learning_rate": 1.998848632469563e-05,
"loss": 1.1095,
"step": 108
},
{
"epoch": 0.11511551155115511,
"grad_norm": 0.7243197709299908,
"learning_rate": 1.9986647271243163e-05,
"loss": 1.0991,
"step": 109
},
{
"epoch": 0.11617161716171617,
"grad_norm": 0.8601676454918876,
"learning_rate": 1.9984672117252425e-05,
"loss": 1.1592,
"step": 110
},
{
"epoch": 0.11722772277227722,
"grad_norm": 0.7777059319738133,
"learning_rate": 1.9982560889641294e-05,
"loss": 1.1498,
"step": 111
},
{
"epoch": 0.11828382838283828,
"grad_norm": 0.9418512705124117,
"learning_rate": 1.998031361718213e-05,
"loss": 1.1469,
"step": 112
},
{
"epoch": 0.11933993399339934,
"grad_norm": 0.9785135656756279,
"learning_rate": 1.9977930330501308e-05,
"loss": 1.1132,
"step": 113
},
{
"epoch": 0.12039603960396039,
"grad_norm": 0.80940426303579,
"learning_rate": 1.9975411062078867e-05,
"loss": 1.1349,
"step": 114
},
{
"epoch": 0.12145214521452145,
"grad_norm": 0.8660301122699662,
"learning_rate": 1.9972755846248032e-05,
"loss": 1.1283,
"step": 115
},
{
"epoch": 0.1225082508250825,
"grad_norm": 0.9011445183116514,
"learning_rate": 1.9969964719194745e-05,
"loss": 1.1223,
"step": 116
},
{
"epoch": 0.12356435643564356,
"grad_norm": 0.8830251058382526,
"learning_rate": 1.9967037718957193e-05,
"loss": 1.1135,
"step": 117
},
{
"epoch": 0.12462046204620462,
"grad_norm": 1.2009903569482427,
"learning_rate": 1.9963974885425267e-05,
"loss": 1.1996,
"step": 118
},
{
"epoch": 0.1256765676567657,
"grad_norm": 0.9130659346547352,
"learning_rate": 1.996077626034003e-05,
"loss": 1.1278,
"step": 119
},
{
"epoch": 0.12673267326732673,
"grad_norm": 0.9066587801732933,
"learning_rate": 1.9957441887293157e-05,
"loss": 1.1124,
"step": 120
},
{
"epoch": 0.12778877887788778,
"grad_norm": 0.8166936091990771,
"learning_rate": 1.995397181172631e-05,
"loss": 1.1355,
"step": 121
},
{
"epoch": 0.12884488448844886,
"grad_norm": 0.9877185760283339,
"learning_rate": 1.9950366080930557e-05,
"loss": 1.1065,
"step": 122
},
{
"epoch": 0.1299009900990099,
"grad_norm": 0.8555210819794496,
"learning_rate": 1.9946624744045706e-05,
"loss": 1.1407,
"step": 123
},
{
"epoch": 0.13095709570957095,
"grad_norm": 0.8125459388527925,
"learning_rate": 1.994274785205963e-05,
"loss": 1.1266,
"step": 124
},
{
"epoch": 0.132013201320132,
"grad_norm": 0.8297829918999193,
"learning_rate": 1.993873545780759e-05,
"loss": 1.1094,
"step": 125
},
{
"epoch": 0.13306930693069308,
"grad_norm": 0.7626251371647725,
"learning_rate": 1.993458761597151e-05,
"loss": 1.1165,
"step": 126
},
{
"epoch": 0.13412541254125412,
"grad_norm": 0.7625431749902333,
"learning_rate": 1.9930304383079204e-05,
"loss": 1.1072,
"step": 127
},
{
"epoch": 0.13518151815181517,
"grad_norm": 0.768212533249399,
"learning_rate": 1.992588581750366e-05,
"loss": 1.1107,
"step": 128
},
{
"epoch": 0.13623762376237625,
"grad_norm": 0.7286268116154822,
"learning_rate": 1.9921331979462198e-05,
"loss": 1.1168,
"step": 129
},
{
"epoch": 0.1372937293729373,
"grad_norm": 0.7658565174679054,
"learning_rate": 1.991664293101566e-05,
"loss": 1.1198,
"step": 130
},
{
"epoch": 0.13834983498349834,
"grad_norm": 0.8093486591292915,
"learning_rate": 1.9911818736067586e-05,
"loss": 1.1412,
"step": 131
},
{
"epoch": 0.13940594059405942,
"grad_norm": 0.7915424098308419,
"learning_rate": 1.9906859460363307e-05,
"loss": 1.1763,
"step": 132
},
{
"epoch": 0.14046204620462047,
"grad_norm": 0.8510305456386904,
"learning_rate": 1.990176517148909e-05,
"loss": 1.1194,
"step": 133
},
{
"epoch": 0.14151815181518151,
"grad_norm": 0.7327834677886498,
"learning_rate": 1.989653593887117e-05,
"loss": 1.1539,
"step": 134
},
{
"epoch": 0.14257425742574256,
"grad_norm": 0.8423811352540258,
"learning_rate": 1.9891171833774856e-05,
"loss": 1.1252,
"step": 135
},
{
"epoch": 0.14363036303630364,
"grad_norm": 0.760275777306678,
"learning_rate": 1.988567292930351e-05,
"loss": 1.1617,
"step": 136
},
{
"epoch": 0.14468646864686469,
"grad_norm": 0.7840443204959701,
"learning_rate": 1.988003930039759e-05,
"loss": 1.1486,
"step": 137
},
{
"epoch": 0.14574257425742573,
"grad_norm": 0.7643254385708373,
"learning_rate": 1.9874271023833604e-05,
"loss": 1.1471,
"step": 138
},
{
"epoch": 0.1467986798679868,
"grad_norm": 0.7187784086293781,
"learning_rate": 1.9868368178223078e-05,
"loss": 1.055,
"step": 139
},
{
"epoch": 0.14785478547854786,
"grad_norm": 0.7246693772843589,
"learning_rate": 1.986233084401147e-05,
"loss": 1.0596,
"step": 140
},
{
"epoch": 0.1489108910891089,
"grad_norm": 0.9568239624311117,
"learning_rate": 1.9856159103477085e-05,
"loss": 1.0909,
"step": 141
},
{
"epoch": 0.14996699669966998,
"grad_norm": 0.7802965707535912,
"learning_rate": 1.984985304072996e-05,
"loss": 1.1187,
"step": 142
},
{
"epoch": 0.15102310231023103,
"grad_norm": 0.7554563358834009,
"learning_rate": 1.9843412741710706e-05,
"loss": 1.0966,
"step": 143
},
{
"epoch": 0.15207920792079208,
"grad_norm": 0.9111375558823462,
"learning_rate": 1.9836838294189325e-05,
"loss": 1.1002,
"step": 144
},
{
"epoch": 0.15313531353135312,
"grad_norm": 0.7729960566363951,
"learning_rate": 1.983012978776405e-05,
"loss": 1.1173,
"step": 145
},
{
"epoch": 0.1541914191419142,
"grad_norm": 0.731690771128178,
"learning_rate": 1.9823287313860088e-05,
"loss": 1.1201,
"step": 146
},
{
"epoch": 0.15524752475247525,
"grad_norm": 0.8267490425574995,
"learning_rate": 1.981631096572839e-05,
"loss": 1.1229,
"step": 147
},
{
"epoch": 0.1563036303630363,
"grad_norm": 0.8137005291755369,
"learning_rate": 1.9809200838444382e-05,
"loss": 1.1729,
"step": 148
},
{
"epoch": 0.15735973597359737,
"grad_norm": 0.7578890351808334,
"learning_rate": 1.980195702890667e-05,
"loss": 1.1362,
"step": 149
},
{
"epoch": 0.15841584158415842,
"grad_norm": 0.8253010602730352,
"learning_rate": 1.9794579635835705e-05,
"loss": 1.1247,
"step": 150
},
{
"epoch": 0.15947194719471947,
"grad_norm": 0.7187439221446463,
"learning_rate": 1.9787068759772458e-05,
"loss": 1.1606,
"step": 151
},
{
"epoch": 0.16052805280528054,
"grad_norm": 0.8611603170138422,
"learning_rate": 1.9779424503077033e-05,
"loss": 1.13,
"step": 152
},
{
"epoch": 0.1615841584158416,
"grad_norm": 0.8074304089091228,
"learning_rate": 1.977164696992728e-05,
"loss": 1.1291,
"step": 153
},
{
"epoch": 0.16264026402640264,
"grad_norm": 0.8229234532876638,
"learning_rate": 1.9763736266317374e-05,
"loss": 1.121,
"step": 154
},
{
"epoch": 0.16369636963696368,
"grad_norm": 0.8957549904435468,
"learning_rate": 1.9755692500056376e-05,
"loss": 1.086,
"step": 155
},
{
"epoch": 0.16475247524752476,
"grad_norm": 0.8151639982091587,
"learning_rate": 1.974751578076675e-05,
"loss": 1.1272,
"step": 156
},
{
"epoch": 0.1658085808580858,
"grad_norm": 0.7126947850301165,
"learning_rate": 1.9739206219882884e-05,
"loss": 1.129,
"step": 157
},
{
"epoch": 0.16686468646864686,
"grad_norm": 0.8287103468581603,
"learning_rate": 1.9730763930649556e-05,
"loss": 1.0917,
"step": 158
},
{
"epoch": 0.16792079207920793,
"grad_norm": 0.7814337127284832,
"learning_rate": 1.9722189028120412e-05,
"loss": 1.138,
"step": 159
},
{
"epoch": 0.16897689768976898,
"grad_norm": 0.7619669454412544,
"learning_rate": 1.9713481629156373e-05,
"loss": 1.1769,
"step": 160
},
{
"epoch": 0.17003300330033003,
"grad_norm": 0.7697120455188076,
"learning_rate": 1.970464185242406e-05,
"loss": 1.1156,
"step": 161
},
{
"epoch": 0.1710891089108911,
"grad_norm": 3.0818342946796857,
"learning_rate": 1.9695669818394178e-05,
"loss": 1.1011,
"step": 162
},
{
"epoch": 0.17214521452145215,
"grad_norm": 0.7994973447567014,
"learning_rate": 1.968656564933985e-05,
"loss": 1.1173,
"step": 163
},
{
"epoch": 0.1732013201320132,
"grad_norm": 0.6806813797677709,
"learning_rate": 1.967732946933499e-05,
"loss": 1.067,
"step": 164
},
{
"epoch": 0.17425742574257425,
"grad_norm": 0.8109552734031079,
"learning_rate": 1.9667961404252575e-05,
"loss": 1.0799,
"step": 165
},
{
"epoch": 0.17531353135313532,
"grad_norm": 0.8233943422170855,
"learning_rate": 1.9658461581762948e-05,
"loss": 1.1421,
"step": 166
},
{
"epoch": 0.17636963696369637,
"grad_norm": 0.7651233542522439,
"learning_rate": 1.964883013133208e-05,
"loss": 1.1458,
"step": 167
},
{
"epoch": 0.17742574257425742,
"grad_norm": 0.776638650301697,
"learning_rate": 1.9639067184219796e-05,
"loss": 1.2248,
"step": 168
},
{
"epoch": 0.1784818481848185,
"grad_norm": 0.730863090604111,
"learning_rate": 1.9629172873477995e-05,
"loss": 1.1153,
"step": 169
},
{
"epoch": 0.17953795379537954,
"grad_norm": 0.8088972089910246,
"learning_rate": 1.9619147333948826e-05,
"loss": 1.0992,
"step": 170
},
{
"epoch": 0.1805940594059406,
"grad_norm": 0.7490097861255294,
"learning_rate": 1.960899070226286e-05,
"loss": 1.1253,
"step": 171
},
{
"epoch": 0.18165016501650166,
"grad_norm": 0.8209388589676341,
"learning_rate": 1.9598703116837232e-05,
"loss": 1.1087,
"step": 172
},
{
"epoch": 0.1827062706270627,
"grad_norm": 0.8142624083071723,
"learning_rate": 1.9588284717873738e-05,
"loss": 1.0853,
"step": 173
},
{
"epoch": 0.18376237623762376,
"grad_norm": 0.7912029767928781,
"learning_rate": 1.957773564735693e-05,
"loss": 1.1081,
"step": 174
},
{
"epoch": 0.1848184818481848,
"grad_norm": 0.7887318365145909,
"learning_rate": 1.95670560490522e-05,
"loss": 1.0843,
"step": 175
},
{
"epoch": 0.18587458745874588,
"grad_norm": 0.8526865055884639,
"learning_rate": 1.9556246068503796e-05,
"loss": 1.1293,
"step": 176
},
{
"epoch": 0.18693069306930693,
"grad_norm": 0.7412154672884173,
"learning_rate": 1.954530585303285e-05,
"loss": 1.1281,
"step": 177
},
{
"epoch": 0.18798679867986798,
"grad_norm": 0.8431817637192267,
"learning_rate": 1.953423555173536e-05,
"loss": 1.1161,
"step": 178
},
{
"epoch": 0.18904290429042905,
"grad_norm": 0.7182423193466202,
"learning_rate": 1.952303531548018e-05,
"loss": 1.138,
"step": 179
},
{
"epoch": 0.1900990099009901,
"grad_norm": 0.7284270118780684,
"learning_rate": 1.9511705296906944e-05,
"loss": 1.1037,
"step": 180
},
{
"epoch": 0.19115511551155115,
"grad_norm": 0.7201100466883832,
"learning_rate": 1.950024565042399e-05,
"loss": 1.0729,
"step": 181
},
{
"epoch": 0.19221122112211222,
"grad_norm": 0.7758043839654428,
"learning_rate": 1.948865653220626e-05,
"loss": 1.1094,
"step": 182
},
{
"epoch": 0.19326732673267327,
"grad_norm": 0.8348595971336878,
"learning_rate": 1.9476938100193166e-05,
"loss": 1.0834,
"step": 183
},
{
"epoch": 0.19432343234323432,
"grad_norm": 0.7257299420195784,
"learning_rate": 1.9465090514086448e-05,
"loss": 1.0932,
"step": 184
},
{
"epoch": 0.19537953795379537,
"grad_norm": 0.8532725017422013,
"learning_rate": 1.9453113935347984e-05,
"loss": 1.0931,
"step": 185
},
{
"epoch": 0.19643564356435644,
"grad_norm": 0.8234257750118776,
"learning_rate": 1.9441008527197603e-05,
"loss": 1.1609,
"step": 186
},
{
"epoch": 0.1974917491749175,
"grad_norm": 0.7459373300176395,
"learning_rate": 1.9428774454610845e-05,
"loss": 1.0773,
"step": 187
},
{
"epoch": 0.19854785478547854,
"grad_norm": 0.9796104021208152,
"learning_rate": 1.9416411884316725e-05,
"loss": 1.1493,
"step": 188
},
{
"epoch": 0.19960396039603961,
"grad_norm": 0.8444393255178346,
"learning_rate": 1.9403920984795453e-05,
"loss": 1.1023,
"step": 189
},
{
"epoch": 0.20066006600660066,
"grad_norm": 1.101615286479999,
"learning_rate": 1.9391301926276157e-05,
"loss": 1.1121,
"step": 190
},
{
"epoch": 0.2017161716171617,
"grad_norm": 0.7795093325580587,
"learning_rate": 1.9378554880734527e-05,
"loss": 1.1435,
"step": 191
},
{
"epoch": 0.20277227722772279,
"grad_norm": 0.7915705826025128,
"learning_rate": 1.9365680021890508e-05,
"loss": 1.1191,
"step": 192
},
{
"epoch": 0.20382838283828383,
"grad_norm": 0.8565349057548476,
"learning_rate": 1.935267752520591e-05,
"loss": 1.1304,
"step": 193
},
{
"epoch": 0.20488448844884488,
"grad_norm": 0.8118025251236799,
"learning_rate": 1.9339547567882023e-05,
"loss": 1.0859,
"step": 194
},
{
"epoch": 0.20594059405940593,
"grad_norm": 0.7804166192871781,
"learning_rate": 1.9326290328857212e-05,
"loss": 1.0987,
"step": 195
},
{
"epoch": 0.206996699669967,
"grad_norm": 0.7787593613628443,
"learning_rate": 1.931290598880445e-05,
"loss": 1.12,
"step": 196
},
{
"epoch": 0.20805280528052805,
"grad_norm": 0.840493061055616,
"learning_rate": 1.9299394730128896e-05,
"loss": 1.1223,
"step": 197
},
{
"epoch": 0.2091089108910891,
"grad_norm": 0.7632277660937049,
"learning_rate": 1.928575673696537e-05,
"loss": 1.1434,
"step": 198
},
{
"epoch": 0.21016501650165018,
"grad_norm": 0.8139168175146491,
"learning_rate": 1.9271992195175875e-05,
"loss": 1.119,
"step": 199
},
{
"epoch": 0.21122112211221122,
"grad_norm": 0.8243115984060624,
"learning_rate": 1.9258101292347042e-05,
"loss": 1.1196,
"step": 200
},
{
"epoch": 0.21227722772277227,
"grad_norm": 0.7058242124533236,
"learning_rate": 1.9244084217787587e-05,
"loss": 1.1194,
"step": 201
},
{
"epoch": 0.21333333333333335,
"grad_norm": 0.749899154333577,
"learning_rate": 1.9229941162525727e-05,
"loss": 1.1043,
"step": 202
},
{
"epoch": 0.2143894389438944,
"grad_norm": 0.8277102106164448,
"learning_rate": 1.9215672319306566e-05,
"loss": 1.1159,
"step": 203
},
{
"epoch": 0.21544554455445544,
"grad_norm": 0.836072058709913,
"learning_rate": 1.920127788258949e-05,
"loss": 1.1567,
"step": 204
},
{
"epoch": 0.2165016501650165,
"grad_norm": 0.7353498909562172,
"learning_rate": 1.9186758048545497e-05,
"loss": 1.1221,
"step": 205
},
{
"epoch": 0.21755775577557757,
"grad_norm": 0.7116311277710015,
"learning_rate": 1.917211301505453e-05,
"loss": 1.1466,
"step": 206
},
{
"epoch": 0.2186138613861386,
"grad_norm": 0.7513794548965178,
"learning_rate": 1.9157342981702792e-05,
"loss": 1.1155,
"step": 207
},
{
"epoch": 0.21966996699669966,
"grad_norm": 0.725628703759022,
"learning_rate": 1.914244814978001e-05,
"loss": 1.1026,
"step": 208
},
{
"epoch": 0.22072607260726074,
"grad_norm": 0.7259161832149346,
"learning_rate": 1.9127428722276686e-05,
"loss": 1.0965,
"step": 209
},
{
"epoch": 0.22178217821782178,
"grad_norm": 0.787330129005164,
"learning_rate": 1.911228490388136e-05,
"loss": 1.1341,
"step": 210
},
{
"epoch": 0.22283828382838283,
"grad_norm": 0.7511259214928903,
"learning_rate": 1.909701690097779e-05,
"loss": 1.0637,
"step": 211
},
{
"epoch": 0.2238943894389439,
"grad_norm": 0.8274088874770148,
"learning_rate": 1.9081624921642156e-05,
"loss": 1.1023,
"step": 212
},
{
"epoch": 0.22495049504950496,
"grad_norm": 0.7839372561547511,
"learning_rate": 1.9066109175640225e-05,
"loss": 1.1286,
"step": 213
},
{
"epoch": 0.226006600660066,
"grad_norm": 0.9405054196473267,
"learning_rate": 1.9050469874424477e-05,
"loss": 1.0968,
"step": 214
},
{
"epoch": 0.22706270627062705,
"grad_norm": 0.7249063662233165,
"learning_rate": 1.903470723113124e-05,
"loss": 1.1389,
"step": 215
},
{
"epoch": 0.22811881188118813,
"grad_norm": 0.7811152611940463,
"learning_rate": 1.9018821460577776e-05,
"loss": 1.0835,
"step": 216
},
{
"epoch": 0.22917491749174917,
"grad_norm": 0.7328710082741681,
"learning_rate": 1.9002812779259364e-05,
"loss": 1.0633,
"step": 217
},
{
"epoch": 0.23023102310231022,
"grad_norm": 0.7227732574244917,
"learning_rate": 1.8986681405346323e-05,
"loss": 1.1049,
"step": 218
},
{
"epoch": 0.2312871287128713,
"grad_norm": 0.8919105276269641,
"learning_rate": 1.8970427558681083e-05,
"loss": 1.0959,
"step": 219
},
{
"epoch": 0.23234323432343235,
"grad_norm": 0.7286623354846008,
"learning_rate": 1.895405146077514e-05,
"loss": 1.1379,
"step": 220
},
{
"epoch": 0.2333993399339934,
"grad_norm": 0.7950855296975947,
"learning_rate": 1.8937553334806077e-05,
"loss": 1.091,
"step": 221
},
{
"epoch": 0.23445544554455444,
"grad_norm": 0.7605891483004984,
"learning_rate": 1.8920933405614497e-05,
"loss": 1.0875,
"step": 222
},
{
"epoch": 0.23551155115511552,
"grad_norm": 0.7851988888416784,
"learning_rate": 1.8904191899700978e-05,
"loss": 1.0839,
"step": 223
},
{
"epoch": 0.23656765676567656,
"grad_norm": 0.8371891437966121,
"learning_rate": 1.888732904522296e-05,
"loss": 1.1204,
"step": 224
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.702832871603263,
"learning_rate": 1.8870345071991662e-05,
"loss": 1.0829,
"step": 225
},
{
"epoch": 0.2386798679867987,
"grad_norm": 0.8040713620787583,
"learning_rate": 1.8853240211468945e-05,
"loss": 1.1477,
"step": 226
},
{
"epoch": 0.23973597359735974,
"grad_norm": 0.7741863685452455,
"learning_rate": 1.883601469676414e-05,
"loss": 1.1002,
"step": 227
},
{
"epoch": 0.24079207920792078,
"grad_norm": 0.6856533659381394,
"learning_rate": 1.881866876263089e-05,
"loss": 1.1134,
"step": 228
},
{
"epoch": 0.24184818481848186,
"grad_norm": 0.7296426643097129,
"learning_rate": 1.880120264546395e-05,
"loss": 1.134,
"step": 229
},
{
"epoch": 0.2429042904290429,
"grad_norm": 0.7285384484710288,
"learning_rate": 1.8783616583295942e-05,
"loss": 1.1136,
"step": 230
},
{
"epoch": 0.24396039603960396,
"grad_norm": 0.7665799831821392,
"learning_rate": 1.8765910815794152e-05,
"loss": 1.1406,
"step": 231
},
{
"epoch": 0.245016501650165,
"grad_norm": 0.7146131411731325,
"learning_rate": 1.874808558425722e-05,
"loss": 1.1236,
"step": 232
},
{
"epoch": 0.24607260726072608,
"grad_norm": 0.7043764792134646,
"learning_rate": 1.8730141131611882e-05,
"loss": 1.1309,
"step": 233
},
{
"epoch": 0.24712871287128713,
"grad_norm": 0.7141175251922907,
"learning_rate": 1.871207770240965e-05,
"loss": 1.0821,
"step": 234
},
{
"epoch": 0.24818481848184817,
"grad_norm": 0.7240560361081955,
"learning_rate": 1.8693895542823477e-05,
"loss": 1.1353,
"step": 235
},
{
"epoch": 0.24924092409240925,
"grad_norm": 0.7114333346757202,
"learning_rate": 1.8675594900644395e-05,
"loss": 1.116,
"step": 236
},
{
"epoch": 0.2502970297029703,
"grad_norm": 0.6947007074015005,
"learning_rate": 1.8657176025278163e-05,
"loss": 1.0972,
"step": 237
},
{
"epoch": 0.2513531353135314,
"grad_norm": 0.7382016270454009,
"learning_rate": 1.8638639167741836e-05,
"loss": 1.1421,
"step": 238
},
{
"epoch": 0.2524092409240924,
"grad_norm": 0.7219520168324991,
"learning_rate": 1.8619984580660365e-05,
"loss": 1.0945,
"step": 239
},
{
"epoch": 0.25346534653465347,
"grad_norm": 0.6773701287397652,
"learning_rate": 1.8601212518263157e-05,
"loss": 1.1194,
"step": 240
},
{
"epoch": 0.25452145214521454,
"grad_norm": 0.7613899903809591,
"learning_rate": 1.858232323638059e-05,
"loss": 1.1137,
"step": 241
},
{
"epoch": 0.25557755775577556,
"grad_norm": 0.7505998081567404,
"learning_rate": 1.8563316992440545e-05,
"loss": 1.1337,
"step": 242
},
{
"epoch": 0.25663366336633664,
"grad_norm": 0.7701939577260392,
"learning_rate": 1.8544194045464888e-05,
"loss": 1.0726,
"step": 243
},
{
"epoch": 0.2576897689768977,
"grad_norm": 0.7447757726093532,
"learning_rate": 1.8524954656065944e-05,
"loss": 1.1271,
"step": 244
},
{
"epoch": 0.25874587458745874,
"grad_norm": 0.7015183997568638,
"learning_rate": 1.8505599086442956e-05,
"loss": 1.103,
"step": 245
},
{
"epoch": 0.2598019801980198,
"grad_norm": 0.7632139318945852,
"learning_rate": 1.848612760037848e-05,
"loss": 1.1534,
"step": 246
},
{
"epoch": 0.26085808580858083,
"grad_norm": 0.770906298616509,
"learning_rate": 1.846654046323482e-05,
"loss": 1.1091,
"step": 247
},
{
"epoch": 0.2619141914191419,
"grad_norm": 0.6644801023889668,
"learning_rate": 1.844683794195041e-05,
"loss": 1.1009,
"step": 248
},
{
"epoch": 0.262970297029703,
"grad_norm": 0.7404691323171753,
"learning_rate": 1.8427020305036158e-05,
"loss": 1.1251,
"step": 249
},
{
"epoch": 0.264026402640264,
"grad_norm": 0.7391557745385032,
"learning_rate": 1.8407087822571794e-05,
"loss": 1.1014,
"step": 250
},
{
"epoch": 0.2650825082508251,
"grad_norm": 0.7412509249702083,
"learning_rate": 1.83870407662022e-05,
"loss": 1.1084,
"step": 251
},
{
"epoch": 0.26613861386138615,
"grad_norm": 0.726581092966362,
"learning_rate": 1.8366879409133703e-05,
"loss": 1.1242,
"step": 252
},
{
"epoch": 0.2671947194719472,
"grad_norm": 0.7389045635296936,
"learning_rate": 1.8346604026130335e-05,
"loss": 1.1523,
"step": 253
},
{
"epoch": 0.26825082508250825,
"grad_norm": 0.7183540198170827,
"learning_rate": 1.8326214893510115e-05,
"loss": 1.0529,
"step": 254
},
{
"epoch": 0.2693069306930693,
"grad_norm": 0.6617247858223898,
"learning_rate": 1.8305712289141266e-05,
"loss": 1.0761,
"step": 255
},
{
"epoch": 0.27036303630363034,
"grad_norm": 0.7341191597515787,
"learning_rate": 1.8285096492438424e-05,
"loss": 1.0825,
"step": 256
},
{
"epoch": 0.2714191419141914,
"grad_norm": 0.6803880666086052,
"learning_rate": 1.8264367784358856e-05,
"loss": 1.1213,
"step": 257
},
{
"epoch": 0.2724752475247525,
"grad_norm": 0.7395407173441846,
"learning_rate": 1.8243526447398595e-05,
"loss": 1.0653,
"step": 258
},
{
"epoch": 0.2735313531353135,
"grad_norm": 0.67390790954467,
"learning_rate": 1.8222572765588626e-05,
"loss": 1.1167,
"step": 259
},
{
"epoch": 0.2745874587458746,
"grad_norm": 0.7300101358875658,
"learning_rate": 1.8201507024490986e-05,
"loss": 1.1118,
"step": 260
},
{
"epoch": 0.27564356435643567,
"grad_norm": 0.7190182909346549,
"learning_rate": 1.818032951119489e-05,
"loss": 1.1115,
"step": 261
},
{
"epoch": 0.2766996699669967,
"grad_norm": 0.6826526899970481,
"learning_rate": 1.8159040514312805e-05,
"loss": 1.1334,
"step": 262
},
{
"epoch": 0.27775577557755776,
"grad_norm": 0.74333078221306,
"learning_rate": 1.8137640323976536e-05,
"loss": 1.1161,
"step": 263
},
{
"epoch": 0.27881188118811884,
"grad_norm": 0.6583069425400893,
"learning_rate": 1.8116129231833247e-05,
"loss": 1.1093,
"step": 264
},
{
"epoch": 0.27986798679867986,
"grad_norm": 0.7099976818200693,
"learning_rate": 1.8094507531041516e-05,
"loss": 1.1198,
"step": 265
},
{
"epoch": 0.28092409240924093,
"grad_norm": 0.7008484142021361,
"learning_rate": 1.8072775516267306e-05,
"loss": 1.0709,
"step": 266
},
{
"epoch": 0.28198019801980195,
"grad_norm": 0.7196712908309428,
"learning_rate": 1.8050933483679974e-05,
"loss": 1.1018,
"step": 267
},
{
"epoch": 0.28303630363036303,
"grad_norm": 0.7180107273478451,
"learning_rate": 1.8028981730948238e-05,
"loss": 1.1456,
"step": 268
},
{
"epoch": 0.2840924092409241,
"grad_norm": 0.7760467285880182,
"learning_rate": 1.800692055723609e-05,
"loss": 1.0996,
"step": 269
},
{
"epoch": 0.2851485148514851,
"grad_norm": 0.7221670879554789,
"learning_rate": 1.7984750263198752e-05,
"loss": 1.0671,
"step": 270
},
{
"epoch": 0.2862046204620462,
"grad_norm": 0.7769777392766594,
"learning_rate": 1.7962471150978565e-05,
"loss": 1.1347,
"step": 271
},
{
"epoch": 0.2872607260726073,
"grad_norm": 0.7943944865167477,
"learning_rate": 1.794008352420086e-05,
"loss": 1.0988,
"step": 272
},
{
"epoch": 0.2883168316831683,
"grad_norm": 0.7955672848702172,
"learning_rate": 1.7917587687969847e-05,
"loss": 1.0973,
"step": 273
},
{
"epoch": 0.28937293729372937,
"grad_norm": 0.6401076365831395,
"learning_rate": 1.789498394886443e-05,
"loss": 1.1312,
"step": 274
},
{
"epoch": 0.29042904290429045,
"grad_norm": 0.7462335971121532,
"learning_rate": 1.7872272614934053e-05,
"loss": 1.0976,
"step": 275
},
{
"epoch": 0.29148514851485147,
"grad_norm": 0.654414454441831,
"learning_rate": 1.784945399569447e-05,
"loss": 1.1158,
"step": 276
},
{
"epoch": 0.29254125412541254,
"grad_norm": 0.7697662744274827,
"learning_rate": 1.7826528402123565e-05,
"loss": 1.0959,
"step": 277
},
{
"epoch": 0.2935973597359736,
"grad_norm": 0.7338237066997852,
"learning_rate": 1.7803496146657086e-05,
"loss": 1.1519,
"step": 278
},
{
"epoch": 0.29465346534653464,
"grad_norm": 0.7757353337504138,
"learning_rate": 1.7780357543184396e-05,
"loss": 1.1193,
"step": 279
},
{
"epoch": 0.2957095709570957,
"grad_norm": 0.6948395842000579,
"learning_rate": 1.77571129070442e-05,
"loss": 1.0713,
"step": 280
},
{
"epoch": 0.2967656765676568,
"grad_norm": 0.6723340107663477,
"learning_rate": 1.7733762555020235e-05,
"loss": 1.0914,
"step": 281
},
{
"epoch": 0.2978217821782178,
"grad_norm": 0.7832989501682481,
"learning_rate": 1.7710306805336973e-05,
"loss": 1.094,
"step": 282
},
{
"epoch": 0.2988778877887789,
"grad_norm": 0.6942791711248667,
"learning_rate": 1.7686745977655254e-05,
"loss": 1.0987,
"step": 283
},
{
"epoch": 0.29993399339933996,
"grad_norm": 0.78182017024036,
"learning_rate": 1.766308039306797e-05,
"loss": 1.0998,
"step": 284
},
{
"epoch": 0.300990099009901,
"grad_norm": 0.7605053024925295,
"learning_rate": 1.7639310374095645e-05,
"loss": 1.1474,
"step": 285
},
{
"epoch": 0.30204620462046206,
"grad_norm": 0.7533682382759994,
"learning_rate": 1.7615436244682068e-05,
"loss": 1.1033,
"step": 286
},
{
"epoch": 0.3031023102310231,
"grad_norm": 0.6966082167864567,
"learning_rate": 1.759145833018988e-05,
"loss": 1.0873,
"step": 287
},
{
"epoch": 0.30415841584158415,
"grad_norm": 0.77351689360323,
"learning_rate": 1.7567376957396124e-05,
"loss": 1.0896,
"step": 288
},
{
"epoch": 0.3052145214521452,
"grad_norm": 0.769081614706101,
"learning_rate": 1.7543192454487793e-05,
"loss": 1.1026,
"step": 289
},
{
"epoch": 0.30627062706270625,
"grad_norm": 0.7461942610501254,
"learning_rate": 1.751890515105738e-05,
"loss": 1.119,
"step": 290
},
{
"epoch": 0.3073267326732673,
"grad_norm": 0.7018336126907382,
"learning_rate": 1.7494515378098348e-05,
"loss": 1.0672,
"step": 291
},
{
"epoch": 0.3083828382838284,
"grad_norm": 0.8503539268099102,
"learning_rate": 1.7470023468000655e-05,
"loss": 1.1198,
"step": 292
},
{
"epoch": 0.3094389438943894,
"grad_norm": 0.7174366806252889,
"learning_rate": 1.744542975454621e-05,
"loss": 1.0916,
"step": 293
},
{
"epoch": 0.3104950495049505,
"grad_norm": 0.8057997605492729,
"learning_rate": 1.742073457290431e-05,
"loss": 1.1053,
"step": 294
},
{
"epoch": 0.31155115511551157,
"grad_norm": 0.7240874272669294,
"learning_rate": 1.7395938259627102e-05,
"loss": 1.0691,
"step": 295
},
{
"epoch": 0.3126072607260726,
"grad_norm": 0.7830986396758909,
"learning_rate": 1.7371041152644975e-05,
"loss": 1.0995,
"step": 296
},
{
"epoch": 0.31366336633663366,
"grad_norm": 0.7943782245355901,
"learning_rate": 1.7346043591261958e-05,
"loss": 1.1293,
"step": 297
},
{
"epoch": 0.31471947194719474,
"grad_norm": 0.7183388442438656,
"learning_rate": 1.7320945916151092e-05,
"loss": 1.0748,
"step": 298
},
{
"epoch": 0.31577557755775576,
"grad_norm": 0.778736200623747,
"learning_rate": 1.7295748469349805e-05,
"loss": 1.0982,
"step": 299
},
{
"epoch": 0.31683168316831684,
"grad_norm": 0.7320565509792677,
"learning_rate": 1.7270451594255232e-05,
"loss": 1.092,
"step": 300
},
{
"epoch": 0.3178877887788779,
"grad_norm": 0.8034747292740897,
"learning_rate": 1.7245055635619543e-05,
"loss": 1.0845,
"step": 301
},
{
"epoch": 0.31894389438943893,
"grad_norm": 0.6617151402029439,
"learning_rate": 1.7219560939545246e-05,
"loss": 1.0614,
"step": 302
},
{
"epoch": 0.32,
"grad_norm": 0.7793454458969497,
"learning_rate": 1.719396785348046e-05,
"loss": 1.107,
"step": 303
},
{
"epoch": 0.3210561056105611,
"grad_norm": 0.7008834786688865,
"learning_rate": 1.7168276726214198e-05,
"loss": 1.0787,
"step": 304
},
{
"epoch": 0.3221122112211221,
"grad_norm": 0.7734087474626543,
"learning_rate": 1.7142487907871595e-05,
"loss": 1.0969,
"step": 305
},
{
"epoch": 0.3231683168316832,
"grad_norm": 0.6939131020225262,
"learning_rate": 1.711660174990914e-05,
"loss": 1.0912,
"step": 306
},
{
"epoch": 0.3242244224422442,
"grad_norm": 0.7470818698693782,
"learning_rate": 1.7090618605109913e-05,
"loss": 1.1036,
"step": 307
},
{
"epoch": 0.3252805280528053,
"grad_norm": 0.705753089594604,
"learning_rate": 1.7064538827578724e-05,
"loss": 1.0755,
"step": 308
},
{
"epoch": 0.32633663366336635,
"grad_norm": 0.8009231875636987,
"learning_rate": 1.703836277273735e-05,
"loss": 1.0898,
"step": 309
},
{
"epoch": 0.32739273927392737,
"grad_norm": 0.7009480286818737,
"learning_rate": 1.701209079731963e-05,
"loss": 1.1402,
"step": 310
},
{
"epoch": 0.32844884488448844,
"grad_norm": 0.7038791800197457,
"learning_rate": 1.698572325936665e-05,
"loss": 1.0998,
"step": 311
},
{
"epoch": 0.3295049504950495,
"grad_norm": 0.7512953551242046,
"learning_rate": 1.6959260518221844e-05,
"loss": 1.1362,
"step": 312
},
{
"epoch": 0.33056105610561054,
"grad_norm": 0.6501206340577776,
"learning_rate": 1.69327029345261e-05,
"loss": 1.0783,
"step": 313
},
{
"epoch": 0.3316171617161716,
"grad_norm": 0.7287631481760563,
"learning_rate": 1.6906050870212834e-05,
"loss": 1.1152,
"step": 314
},
{
"epoch": 0.3326732673267327,
"grad_norm": 0.6660156857131329,
"learning_rate": 1.6879304688503088e-05,
"loss": 1.1262,
"step": 315
},
{
"epoch": 0.3337293729372937,
"grad_norm": 0.7127069908206151,
"learning_rate": 1.685246475390053e-05,
"loss": 1.11,
"step": 316
},
{
"epoch": 0.3347854785478548,
"grad_norm": 0.6844715289023341,
"learning_rate": 1.6825531432186545e-05,
"loss": 1.0995,
"step": 317
},
{
"epoch": 0.33584158415841586,
"grad_norm": 0.7558240758806871,
"learning_rate": 1.67985050904152e-05,
"loss": 1.1277,
"step": 318
},
{
"epoch": 0.3368976897689769,
"grad_norm": 0.6548389494201281,
"learning_rate": 1.677138609690828e-05,
"loss": 1.1198,
"step": 319
},
{
"epoch": 0.33795379537953796,
"grad_norm": 0.7223326498468008,
"learning_rate": 1.6744174821250237e-05,
"loss": 1.0942,
"step": 320
},
{
"epoch": 0.33900990099009903,
"grad_norm": 0.6981276944999266,
"learning_rate": 1.6716871634283173e-05,
"loss": 1.076,
"step": 321
},
{
"epoch": 0.34006600660066005,
"grad_norm": 0.6863175028888309,
"learning_rate": 1.6689476908101784e-05,
"loss": 1.0768,
"step": 322
},
{
"epoch": 0.34112211221122113,
"grad_norm": 0.7035323767190387,
"learning_rate": 1.6661991016048275e-05,
"loss": 1.116,
"step": 323
},
{
"epoch": 0.3421782178217822,
"grad_norm": 0.6612295886827857,
"learning_rate": 1.6634414332707297e-05,
"loss": 1.0678,
"step": 324
},
{
"epoch": 0.3432343234323432,
"grad_norm": 0.7408384014139404,
"learning_rate": 1.6606747233900816e-05,
"loss": 1.0634,
"step": 325
},
{
"epoch": 0.3442904290429043,
"grad_norm": 0.6525772496771438,
"learning_rate": 1.6578990096683005e-05,
"loss": 1.0545,
"step": 326
},
{
"epoch": 0.3453465346534653,
"grad_norm": 0.7628586104840287,
"learning_rate": 1.6551143299335107e-05,
"loss": 1.1221,
"step": 327
},
{
"epoch": 0.3464026402640264,
"grad_norm": 0.6392841262980653,
"learning_rate": 1.6523207221360267e-05,
"loss": 1.1081,
"step": 328
},
{
"epoch": 0.34745874587458747,
"grad_norm": 0.6769437709914546,
"learning_rate": 1.6495182243478382e-05,
"loss": 1.0956,
"step": 329
},
{
"epoch": 0.3485148514851485,
"grad_norm": 0.7062180639093689,
"learning_rate": 1.6467068747620888e-05,
"loss": 1.0942,
"step": 330
},
{
"epoch": 0.34957095709570957,
"grad_norm": 0.6650935241647871,
"learning_rate": 1.6438867116925572e-05,
"loss": 1.0662,
"step": 331
},
{
"epoch": 0.35062706270627064,
"grad_norm": 0.737267900754825,
"learning_rate": 1.6410577735731346e-05,
"loss": 1.0729,
"step": 332
},
{
"epoch": 0.35168316831683166,
"grad_norm": 0.7220718539987816,
"learning_rate": 1.6382200989572997e-05,
"loss": 1.0887,
"step": 333
},
{
"epoch": 0.35273927392739274,
"grad_norm": 0.7142930222207629,
"learning_rate": 1.6353737265175963e-05,
"loss": 1.1013,
"step": 334
},
{
"epoch": 0.3537953795379538,
"grad_norm": 0.666907919962204,
"learning_rate": 1.632518695045102e-05,
"loss": 1.1176,
"step": 335
},
{
"epoch": 0.35485148514851483,
"grad_norm": 0.6902276034428666,
"learning_rate": 1.629655043448904e-05,
"loss": 1.1165,
"step": 336
},
{
"epoch": 0.3559075907590759,
"grad_norm": 0.7060277985912837,
"learning_rate": 1.626782810755565e-05,
"loss": 1.0755,
"step": 337
},
{
"epoch": 0.356963696369637,
"grad_norm": 0.654598677432983,
"learning_rate": 1.6239020361085947e-05,
"loss": 1.0722,
"step": 338
},
{
"epoch": 0.358019801980198,
"grad_norm": 0.6499861559788678,
"learning_rate": 1.621012758767913e-05,
"loss": 1.1159,
"step": 339
},
{
"epoch": 0.3590759075907591,
"grad_norm": 0.6693339178949881,
"learning_rate": 1.618115018109318e-05,
"loss": 1.1087,
"step": 340
},
{
"epoch": 0.36013201320132016,
"grad_norm": 0.6794045929041601,
"learning_rate": 1.615208853623947e-05,
"loss": 1.1214,
"step": 341
},
{
"epoch": 0.3611881188118812,
"grad_norm": 0.6872109015417777,
"learning_rate": 1.6122943049177403e-05,
"loss": 1.0853,
"step": 342
},
{
"epoch": 0.36224422442244225,
"grad_norm": 0.6749858887871915,
"learning_rate": 1.6093714117109e-05,
"loss": 1.0871,
"step": 343
},
{
"epoch": 0.3633003300330033,
"grad_norm": 0.6357991629501155,
"learning_rate": 1.6064402138373488e-05,
"loss": 1.0531,
"step": 344
},
{
"epoch": 0.36435643564356435,
"grad_norm": 0.6490690346386233,
"learning_rate": 1.6035007512441883e-05,
"loss": 1.1124,
"step": 345
},
{
"epoch": 0.3654125412541254,
"grad_norm": 0.6345817957796211,
"learning_rate": 1.6005530639911525e-05,
"loss": 1.1037,
"step": 346
},
{
"epoch": 0.36646864686468644,
"grad_norm": 0.6527609533545764,
"learning_rate": 1.5975971922500643e-05,
"loss": 1.1026,
"step": 347
},
{
"epoch": 0.3675247524752475,
"grad_norm": 0.6576274962389469,
"learning_rate": 1.594633176304287e-05,
"loss": 1.1034,
"step": 348
},
{
"epoch": 0.3685808580858086,
"grad_norm": 0.7064980398358128,
"learning_rate": 1.5916610565481737e-05,
"loss": 1.1245,
"step": 349
},
{
"epoch": 0.3696369636963696,
"grad_norm": 0.6713891888197137,
"learning_rate": 1.5886808734865202e-05,
"loss": 1.086,
"step": 350
},
{
"epoch": 0.3706930693069307,
"grad_norm": 0.6598157663934,
"learning_rate": 1.5856926677340093e-05,
"loss": 1.1055,
"step": 351
},
{
"epoch": 0.37174917491749176,
"grad_norm": 0.780243809125385,
"learning_rate": 1.58269648001466e-05,
"loss": 1.1146,
"step": 352
},
{
"epoch": 0.3728052805280528,
"grad_norm": 0.6615597933234442,
"learning_rate": 1.579692351161272e-05,
"loss": 1.0992,
"step": 353
},
{
"epoch": 0.37386138613861386,
"grad_norm": 0.7479427839490941,
"learning_rate": 1.5766803221148676e-05,
"loss": 1.0981,
"step": 354
},
{
"epoch": 0.37491749174917494,
"grad_norm": 0.7333748361267198,
"learning_rate": 1.573660433924135e-05,
"loss": 1.0994,
"step": 355
},
{
"epoch": 0.37597359735973596,
"grad_norm": 0.7219990763500711,
"learning_rate": 1.57063272774487e-05,
"loss": 1.071,
"step": 356
},
{
"epoch": 0.37702970297029703,
"grad_norm": 0.7994336955147575,
"learning_rate": 1.5675972448394126e-05,
"loss": 1.0759,
"step": 357
},
{
"epoch": 0.3780858085808581,
"grad_norm": 0.7134074193132752,
"learning_rate": 1.5645540265760874e-05,
"loss": 1.1016,
"step": 358
},
{
"epoch": 0.3791419141914191,
"grad_norm": 0.739592021602517,
"learning_rate": 1.5615031144286364e-05,
"loss": 1.1025,
"step": 359
},
{
"epoch": 0.3801980198019802,
"grad_norm": 0.6875099894028719,
"learning_rate": 1.5584445499756577e-05,
"loss": 1.0625,
"step": 360
},
{
"epoch": 0.3812541254125413,
"grad_norm": 0.775166546640488,
"learning_rate": 1.5553783749000363e-05,
"loss": 1.1471,
"step": 361
},
{
"epoch": 0.3823102310231023,
"grad_norm": 0.6748406639884176,
"learning_rate": 1.5523046309883757e-05,
"loss": 1.1072,
"step": 362
},
{
"epoch": 0.3833663366336634,
"grad_norm": 0.7479763082912592,
"learning_rate": 1.5492233601304313e-05,
"loss": 1.0932,
"step": 363
},
{
"epoch": 0.38442244224422445,
"grad_norm": 0.6872666626339455,
"learning_rate": 1.5461346043185358e-05,
"loss": 1.115,
"step": 364
},
{
"epoch": 0.38547854785478547,
"grad_norm": 0.7410608320578218,
"learning_rate": 1.5430384056470294e-05,
"loss": 1.1319,
"step": 365
},
{
"epoch": 0.38653465346534654,
"grad_norm": 0.6648975548116297,
"learning_rate": 1.5399348063116858e-05,
"loss": 1.1093,
"step": 366
},
{
"epoch": 0.38759075907590756,
"grad_norm": 0.690463073646725,
"learning_rate": 1.5368238486091372e-05,
"loss": 1.1134,
"step": 367
},
{
"epoch": 0.38864686468646864,
"grad_norm": 0.6892734837125382,
"learning_rate": 1.5337055749362957e-05,
"loss": 1.1215,
"step": 368
},
{
"epoch": 0.3897029702970297,
"grad_norm": 0.6769119632112021,
"learning_rate": 1.530580027789779e-05,
"loss": 1.0895,
"step": 369
},
{
"epoch": 0.39075907590759074,
"grad_norm": 0.6450434616022525,
"learning_rate": 1.527447249765329e-05,
"loss": 1.1112,
"step": 370
},
{
"epoch": 0.3918151815181518,
"grad_norm": 0.6478773579790501,
"learning_rate": 1.5243072835572319e-05,
"loss": 1.1019,
"step": 371
},
{
"epoch": 0.3928712871287129,
"grad_norm": 0.6617247721122851,
"learning_rate": 1.5211601719577358e-05,
"loss": 1.0545,
"step": 372
},
{
"epoch": 0.3939273927392739,
"grad_norm": 0.6456390472590054,
"learning_rate": 1.518005957856468e-05,
"loss": 1.0691,
"step": 373
},
{
"epoch": 0.394983498349835,
"grad_norm": 0.6985514520312006,
"learning_rate": 1.5148446842398518e-05,
"loss": 1.1149,
"step": 374
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.6619613470097063,
"learning_rate": 1.5116763941905177e-05,
"loss": 1.0828,
"step": 375
},
{
"epoch": 0.3970957095709571,
"grad_norm": 0.6993558838232781,
"learning_rate": 1.5085011308867183e-05,
"loss": 1.0969,
"step": 376
},
{
"epoch": 0.39815181518151815,
"grad_norm": 0.7702836648944029,
"learning_rate": 1.5053189376017407e-05,
"loss": 1.0478,
"step": 377
},
{
"epoch": 0.39920792079207923,
"grad_norm": 0.6760422492135052,
"learning_rate": 1.5021298577033134e-05,
"loss": 1.0923,
"step": 378
},
{
"epoch": 0.40026402640264025,
"grad_norm": 0.6952201486005894,
"learning_rate": 1.4989339346530197e-05,
"loss": 1.0963,
"step": 379
},
{
"epoch": 0.4013201320132013,
"grad_norm": 0.6766834812107589,
"learning_rate": 1.4957312120057006e-05,
"loss": 1.1025,
"step": 380
},
{
"epoch": 0.4023762376237624,
"grad_norm": 0.6743168791634542,
"learning_rate": 1.4925217334088662e-05,
"loss": 1.1274,
"step": 381
},
{
"epoch": 0.4034323432343234,
"grad_norm": 0.7123937571719575,
"learning_rate": 1.4893055426020969e-05,
"loss": 1.0986,
"step": 382
},
{
"epoch": 0.4044884488448845,
"grad_norm": 0.6695911757932282,
"learning_rate": 1.4860826834164489e-05,
"loss": 1.0666,
"step": 383
},
{
"epoch": 0.40554455445544557,
"grad_norm": 0.6787795068658586,
"learning_rate": 1.4828531997738574e-05,
"loss": 1.0606,
"step": 384
},
{
"epoch": 0.4066006600660066,
"grad_norm": 0.6699120272225082,
"learning_rate": 1.4796171356865362e-05,
"loss": 1.033,
"step": 385
},
{
"epoch": 0.40765676567656767,
"grad_norm": 0.678802141382895,
"learning_rate": 1.4763745352563805e-05,
"loss": 1.0626,
"step": 386
},
{
"epoch": 0.4087128712871287,
"grad_norm": 0.6822394635390534,
"learning_rate": 1.4731254426743633e-05,
"loss": 1.1028,
"step": 387
},
{
"epoch": 0.40976897689768976,
"grad_norm": 0.7038707571594772,
"learning_rate": 1.469869902219935e-05,
"loss": 1.0808,
"step": 388
},
{
"epoch": 0.41082508250825084,
"grad_norm": 0.6559345705211832,
"learning_rate": 1.4666079582604185e-05,
"loss": 1.1126,
"step": 389
},
{
"epoch": 0.41188118811881186,
"grad_norm": 0.6307815347996475,
"learning_rate": 1.4633396552504062e-05,
"loss": 1.0848,
"step": 390
},
{
"epoch": 0.41293729372937293,
"grad_norm": 0.6680698774967592,
"learning_rate": 1.4600650377311523e-05,
"loss": 1.0794,
"step": 391
},
{
"epoch": 0.413993399339934,
"grad_norm": 0.6390665153207211,
"learning_rate": 1.4567841503299673e-05,
"loss": 1.06,
"step": 392
},
{
"epoch": 0.41504950495049503,
"grad_norm": 0.6377177285205933,
"learning_rate": 1.4534970377596089e-05,
"loss": 1.0505,
"step": 393
},
{
"epoch": 0.4161056105610561,
"grad_norm": 0.6367095687723813,
"learning_rate": 1.4502037448176734e-05,
"loss": 1.0364,
"step": 394
},
{
"epoch": 0.4171617161716172,
"grad_norm": 0.6909756668761791,
"learning_rate": 1.4469043163859844e-05,
"loss": 1.1228,
"step": 395
},
{
"epoch": 0.4182178217821782,
"grad_norm": 0.6707554941725189,
"learning_rate": 1.4435987974299815e-05,
"loss": 1.1044,
"step": 396
},
{
"epoch": 0.4192739273927393,
"grad_norm": 0.6541534416842573,
"learning_rate": 1.4402872329981077e-05,
"loss": 1.0547,
"step": 397
},
{
"epoch": 0.42033003300330035,
"grad_norm": 0.6646687606940533,
"learning_rate": 1.4369696682211948e-05,
"loss": 1.1011,
"step": 398
},
{
"epoch": 0.42138613861386137,
"grad_norm": 0.657658096698119,
"learning_rate": 1.4336461483118498e-05,
"loss": 1.0585,
"step": 399
},
{
"epoch": 0.42244224422442245,
"grad_norm": 0.677347722123811,
"learning_rate": 1.4303167185638367e-05,
"loss": 1.1137,
"step": 400
},
{
"epoch": 0.4234983498349835,
"grad_norm": 0.6411654283958523,
"learning_rate": 1.4269814243514608e-05,
"loss": 1.0705,
"step": 401
},
{
"epoch": 0.42455445544554454,
"grad_norm": 0.6301130191003985,
"learning_rate": 1.4236403111289494e-05,
"loss": 1.0724,
"step": 402
},
{
"epoch": 0.4256105610561056,
"grad_norm": 0.6815021910608711,
"learning_rate": 1.420293424429833e-05,
"loss": 1.122,
"step": 403
},
{
"epoch": 0.4266666666666667,
"grad_norm": 0.6698873904254661,
"learning_rate": 1.4169408098663249e-05,
"loss": 1.0676,
"step": 404
},
{
"epoch": 0.4277227722772277,
"grad_norm": 0.6665708485173318,
"learning_rate": 1.4135825131286984e-05,
"loss": 1.0525,
"step": 405
},
{
"epoch": 0.4287788778877888,
"grad_norm": 0.6534707277549673,
"learning_rate": 1.4102185799846652e-05,
"loss": 1.0734,
"step": 406
},
{
"epoch": 0.4298349834983498,
"grad_norm": 0.7106562945034592,
"learning_rate": 1.406849056278752e-05,
"loss": 1.0843,
"step": 407
},
{
"epoch": 0.4308910891089109,
"grad_norm": 0.6338149210338043,
"learning_rate": 1.4034739879316737e-05,
"loss": 1.0965,
"step": 408
},
{
"epoch": 0.43194719471947196,
"grad_norm": 0.6744853001120149,
"learning_rate": 1.4000934209397103e-05,
"loss": 1.0457,
"step": 409
},
{
"epoch": 0.433003300330033,
"grad_norm": 0.6487682041680315,
"learning_rate": 1.396707401374078e-05,
"loss": 1.0625,
"step": 410
},
{
"epoch": 0.43405940594059406,
"grad_norm": 0.7243685988056301,
"learning_rate": 1.3933159753803021e-05,
"loss": 1.0283,
"step": 411
},
{
"epoch": 0.43511551155115513,
"grad_norm": 0.636845442234845,
"learning_rate": 1.3899191891775885e-05,
"loss": 1.103,
"step": 412
},
{
"epoch": 0.43617161716171615,
"grad_norm": 0.758442357635514,
"learning_rate": 1.3865170890581925e-05,
"loss": 1.0909,
"step": 413
},
{
"epoch": 0.4372277227722772,
"grad_norm": 0.6690226546024075,
"learning_rate": 1.3831097213867902e-05,
"loss": 1.088,
"step": 414
},
{
"epoch": 0.4382838283828383,
"grad_norm": 0.6854450164962684,
"learning_rate": 1.3796971325998434e-05,
"loss": 1.0719,
"step": 415
},
{
"epoch": 0.4393399339933993,
"grad_norm": 0.6437858338481635,
"learning_rate": 1.3762793692049702e-05,
"loss": 1.1117,
"step": 416
},
{
"epoch": 0.4403960396039604,
"grad_norm": 0.637083816055767,
"learning_rate": 1.3728564777803089e-05,
"loss": 1.085,
"step": 417
},
{
"epoch": 0.4414521452145215,
"grad_norm": 0.7061566863290087,
"learning_rate": 1.3694285049738833e-05,
"loss": 1.0956,
"step": 418
},
{
"epoch": 0.4425082508250825,
"grad_norm": 0.729429954435777,
"learning_rate": 1.365995497502969e-05,
"loss": 1.0441,
"step": 419
},
{
"epoch": 0.44356435643564357,
"grad_norm": 0.6475900555433982,
"learning_rate": 1.3625575021534536e-05,
"loss": 1.0871,
"step": 420
},
{
"epoch": 0.44462046204620465,
"grad_norm": 0.6600082285768424,
"learning_rate": 1.3591145657792018e-05,
"loss": 1.0488,
"step": 421
},
{
"epoch": 0.44567656765676567,
"grad_norm": 0.6340592392790095,
"learning_rate": 1.3556667353014159e-05,
"loss": 1.1029,
"step": 422
},
{
"epoch": 0.44673267326732674,
"grad_norm": 0.6670694212532478,
"learning_rate": 1.3522140577079954e-05,
"loss": 1.0844,
"step": 423
},
{
"epoch": 0.4477887788778878,
"grad_norm": 0.650960082002708,
"learning_rate": 1.348756580052899e-05,
"loss": 1.0595,
"step": 424
},
{
"epoch": 0.44884488448844884,
"grad_norm": 0.6516516894541583,
"learning_rate": 1.3452943494554998e-05,
"loss": 1.1071,
"step": 425
},
{
"epoch": 0.4499009900990099,
"grad_norm": 0.6632355776734516,
"learning_rate": 1.3418274130999468e-05,
"loss": 1.0569,
"step": 426
},
{
"epoch": 0.45095709570957093,
"grad_norm": 0.6169716342225784,
"learning_rate": 1.3383558182345203e-05,
"loss": 1.0531,
"step": 427
},
{
"epoch": 0.452013201320132,
"grad_norm": 0.6125169016440121,
"learning_rate": 1.3348796121709862e-05,
"loss": 1.0326,
"step": 428
},
{
"epoch": 0.4530693069306931,
"grad_norm": 0.6657583684665047,
"learning_rate": 1.3313988422839552e-05,
"loss": 1.0668,
"step": 429
},
{
"epoch": 0.4541254125412541,
"grad_norm": 0.6776745636316159,
"learning_rate": 1.3279135560102336e-05,
"loss": 1.0945,
"step": 430
},
{
"epoch": 0.4551815181518152,
"grad_norm": 0.7048253495382116,
"learning_rate": 1.3244238008481785e-05,
"loss": 1.1016,
"step": 431
},
{
"epoch": 0.45623762376237625,
"grad_norm": 0.6821294041692253,
"learning_rate": 1.32092962435705e-05,
"loss": 1.0575,
"step": 432
},
{
"epoch": 0.4572937293729373,
"grad_norm": 0.6726975166666695,
"learning_rate": 1.317431074156363e-05,
"loss": 1.0932,
"step": 433
},
{
"epoch": 0.45834983498349835,
"grad_norm": 0.6545387408102209,
"learning_rate": 1.3139281979252394e-05,
"loss": 1.0618,
"step": 434
},
{
"epoch": 0.4594059405940594,
"grad_norm": 0.7039848947715922,
"learning_rate": 1.310421043401756e-05,
"loss": 1.0868,
"step": 435
},
{
"epoch": 0.46046204620462045,
"grad_norm": 0.6376171705375021,
"learning_rate": 1.306909658382296e-05,
"loss": 1.0548,
"step": 436
},
{
"epoch": 0.4615181518151815,
"grad_norm": 0.6843019714453976,
"learning_rate": 1.303394090720897e-05,
"loss": 1.0883,
"step": 437
},
{
"epoch": 0.4625742574257426,
"grad_norm": 0.6163145194667164,
"learning_rate": 1.299874388328598e-05,
"loss": 1.0741,
"step": 438
},
{
"epoch": 0.4636303630363036,
"grad_norm": 0.635005107539355,
"learning_rate": 1.2963505991727878e-05,
"loss": 1.0634,
"step": 439
},
{
"epoch": 0.4646864686468647,
"grad_norm": 0.9709870514103477,
"learning_rate": 1.2928227712765504e-05,
"loss": 1.0518,
"step": 440
},
{
"epoch": 0.46574257425742577,
"grad_norm": 0.6900682108034367,
"learning_rate": 1.289290952718011e-05,
"loss": 1.1063,
"step": 441
},
{
"epoch": 0.4667986798679868,
"grad_norm": 0.6632591952516493,
"learning_rate": 1.28575519162968e-05,
"loss": 1.064,
"step": 442
},
{
"epoch": 0.46785478547854786,
"grad_norm": 0.6623377746790227,
"learning_rate": 1.2822155361977977e-05,
"loss": 1.0948,
"step": 443
},
{
"epoch": 0.4689108910891089,
"grad_norm": 0.6885461079200218,
"learning_rate": 1.2786720346616784e-05,
"loss": 1.0781,
"step": 444
},
{
"epoch": 0.46996699669966996,
"grad_norm": 0.67075610341075,
"learning_rate": 1.2751247353130507e-05,
"loss": 1.0994,
"step": 445
},
{
"epoch": 0.47102310231023103,
"grad_norm": 0.6756174815586324,
"learning_rate": 1.2715736864954017e-05,
"loss": 1.0561,
"step": 446
},
{
"epoch": 0.47207920792079205,
"grad_norm": 0.6583272836245984,
"learning_rate": 1.2680189366033173e-05,
"loss": 1.097,
"step": 447
},
{
"epoch": 0.47313531353135313,
"grad_norm": 0.6054623297072593,
"learning_rate": 1.2644605340818217e-05,
"loss": 1.0157,
"step": 448
},
{
"epoch": 0.4741914191419142,
"grad_norm": 0.7055802594697554,
"learning_rate": 1.2608985274257186e-05,
"loss": 1.0996,
"step": 449
},
{
"epoch": 0.4752475247524752,
"grad_norm": 0.6492420963045018,
"learning_rate": 1.25733296517893e-05,
"loss": 1.0626,
"step": 450
},
{
"epoch": 0.4763036303630363,
"grad_norm": 0.7333388275966733,
"learning_rate": 1.2537638959338336e-05,
"loss": 1.0973,
"step": 451
},
{
"epoch": 0.4773597359735974,
"grad_norm": 0.7320358480481357,
"learning_rate": 1.2501913683306026e-05,
"loss": 1.0731,
"step": 452
},
{
"epoch": 0.4784158415841584,
"grad_norm": 0.6895742285550112,
"learning_rate": 1.24661543105654e-05,
"loss": 1.0481,
"step": 453
},
{
"epoch": 0.47947194719471947,
"grad_norm": 0.7067970914032936,
"learning_rate": 1.243036132845418e-05,
"loss": 1.0627,
"step": 454
},
{
"epoch": 0.48052805280528055,
"grad_norm": 0.6926818694308146,
"learning_rate": 1.2394535224768118e-05,
"loss": 1.0614,
"step": 455
},
{
"epoch": 0.48158415841584157,
"grad_norm": 0.6945585565434185,
"learning_rate": 1.2358676487754355e-05,
"loss": 1.0988,
"step": 456
},
{
"epoch": 0.48264026402640264,
"grad_norm": 0.6604731085051239,
"learning_rate": 1.2322785606104772e-05,
"loss": 1.0964,
"step": 457
},
{
"epoch": 0.4836963696369637,
"grad_norm": 0.6394473053362363,
"learning_rate": 1.2286863068949322e-05,
"loss": 1.0331,
"step": 458
},
{
"epoch": 0.48475247524752474,
"grad_norm": 0.7183964304237911,
"learning_rate": 1.2250909365849366e-05,
"loss": 1.0897,
"step": 459
},
{
"epoch": 0.4858085808580858,
"grad_norm": 0.6579097091730257,
"learning_rate": 1.2214924986791003e-05,
"loss": 1.0908,
"step": 460
},
{
"epoch": 0.4868646864686469,
"grad_norm": 0.6531155391823701,
"learning_rate": 1.2178910422178394e-05,
"loss": 1.1104,
"step": 461
},
{
"epoch": 0.4879207920792079,
"grad_norm": 0.6580245142297171,
"learning_rate": 1.2142866162827074e-05,
"loss": 1.0617,
"step": 462
},
{
"epoch": 0.488976897689769,
"grad_norm": 0.6263777846873808,
"learning_rate": 1.2106792699957264e-05,
"loss": 1.0761,
"step": 463
},
{
"epoch": 0.49003300330033,
"grad_norm": 0.6640583409025862,
"learning_rate": 1.2070690525187186e-05,
"loss": 1.0465,
"step": 464
},
{
"epoch": 0.4910891089108911,
"grad_norm": 0.6748700147111397,
"learning_rate": 1.2034560130526341e-05,
"loss": 1.1002,
"step": 465
},
{
"epoch": 0.49214521452145216,
"grad_norm": 0.6505604114018649,
"learning_rate": 1.1998402008368828e-05,
"loss": 1.0947,
"step": 466
},
{
"epoch": 0.4932013201320132,
"grad_norm": 0.6729660446639165,
"learning_rate": 1.1962216651486624e-05,
"loss": 1.089,
"step": 467
},
{
"epoch": 0.49425742574257425,
"grad_norm": 0.6643910561648775,
"learning_rate": 1.1926004553022861e-05,
"loss": 1.0594,
"step": 468
},
{
"epoch": 0.49531353135313533,
"grad_norm": 0.6992911821661513,
"learning_rate": 1.1889766206485121e-05,
"loss": 1.1025,
"step": 469
},
{
"epoch": 0.49636963696369635,
"grad_norm": 0.6449322373454167,
"learning_rate": 1.1853502105738692e-05,
"loss": 1.0562,
"step": 470
},
{
"epoch": 0.4974257425742574,
"grad_norm": 0.6564386612451293,
"learning_rate": 1.1817212744999848e-05,
"loss": 1.0828,
"step": 471
},
{
"epoch": 0.4984818481848185,
"grad_norm": 0.7001578812716058,
"learning_rate": 1.1780898618829122e-05,
"loss": 1.1166,
"step": 472
},
{
"epoch": 0.4995379537953795,
"grad_norm": 1.52419069466188,
"learning_rate": 1.1744560222124541e-05,
"loss": 1.061,
"step": 473
},
{
"epoch": 0.5005940594059406,
"grad_norm": 0.6512775120226911,
"learning_rate": 1.1708198050114916e-05,
"loss": 1.1128,
"step": 474
},
{
"epoch": 0.5016501650165016,
"grad_norm": 0.6299866616636621,
"learning_rate": 1.1671812598353058e-05,
"loss": 1.0617,
"step": 475
},
{
"epoch": 0.5027062706270627,
"grad_norm": 0.7667550037383974,
"learning_rate": 1.1635404362709044e-05,
"loss": 1.0561,
"step": 476
},
{
"epoch": 0.5037623762376238,
"grad_norm": 0.6790528715380245,
"learning_rate": 1.1598973839363462e-05,
"loss": 1.0862,
"step": 477
},
{
"epoch": 0.5048184818481848,
"grad_norm": 0.6909498811370186,
"learning_rate": 1.1562521524800632e-05,
"loss": 1.0834,
"step": 478
},
{
"epoch": 0.5058745874587459,
"grad_norm": 0.7559493747137904,
"learning_rate": 1.1526047915801863e-05,
"loss": 1.0752,
"step": 479
},
{
"epoch": 0.5069306930693069,
"grad_norm": 0.6776150699452357,
"learning_rate": 1.1489553509438658e-05,
"loss": 1.0854,
"step": 480
},
{
"epoch": 0.507986798679868,
"grad_norm": 0.6532383455079253,
"learning_rate": 1.1453038803065958e-05,
"loss": 1.0704,
"step": 481
},
{
"epoch": 0.5090429042904291,
"grad_norm": 0.7532718213065208,
"learning_rate": 1.1416504294315358e-05,
"loss": 1.0683,
"step": 482
},
{
"epoch": 0.5100990099009901,
"grad_norm": 0.6193789910814789,
"learning_rate": 1.1379950481088318e-05,
"loss": 1.0164,
"step": 483
},
{
"epoch": 0.5111551155115511,
"grad_norm": 0.668810042552327,
"learning_rate": 1.1343377861549394e-05,
"loss": 1.0797,
"step": 484
},
{
"epoch": 0.5122112211221123,
"grad_norm": 0.6380506008747914,
"learning_rate": 1.1306786934119433e-05,
"loss": 1.0731,
"step": 485
},
{
"epoch": 0.5132673267326733,
"grad_norm": 0.6263275212686538,
"learning_rate": 1.1270178197468788e-05,
"loss": 1.0769,
"step": 486
},
{
"epoch": 0.5143234323432343,
"grad_norm": 0.6726331555093921,
"learning_rate": 1.1233552150510523e-05,
"loss": 1.0575,
"step": 487
},
{
"epoch": 0.5153795379537954,
"grad_norm": 0.6454739989020323,
"learning_rate": 1.11969092923936e-05,
"loss": 1.0679,
"step": 488
},
{
"epoch": 0.5164356435643565,
"grad_norm": 0.660210083615381,
"learning_rate": 1.1160250122496106e-05,
"loss": 1.099,
"step": 489
},
{
"epoch": 0.5174917491749175,
"grad_norm": 0.628835234007593,
"learning_rate": 1.1123575140418415e-05,
"loss": 1.0467,
"step": 490
},
{
"epoch": 0.5185478547854786,
"grad_norm": 0.7233986253009854,
"learning_rate": 1.1086884845976396e-05,
"loss": 1.0411,
"step": 491
},
{
"epoch": 0.5196039603960396,
"grad_norm": 0.6549133421776986,
"learning_rate": 1.1050179739194608e-05,
"loss": 1.0915,
"step": 492
},
{
"epoch": 0.5206600660066006,
"grad_norm": 0.658851734463155,
"learning_rate": 1.1013460320299459e-05,
"loss": 1.1102,
"step": 493
},
{
"epoch": 0.5217161716171617,
"grad_norm": 0.6307453013864768,
"learning_rate": 1.0976727089712424e-05,
"loss": 1.0752,
"step": 494
},
{
"epoch": 0.5227722772277228,
"grad_norm": 0.6810468818831488,
"learning_rate": 1.0939980548043194e-05,
"loss": 1.062,
"step": 495
},
{
"epoch": 0.5238283828382838,
"grad_norm": 0.6410648173319452,
"learning_rate": 1.0903221196082874e-05,
"loss": 1.0464,
"step": 496
},
{
"epoch": 0.5248844884488448,
"grad_norm": 0.6105520394215146,
"learning_rate": 1.086644953479715e-05,
"loss": 1.0438,
"step": 497
},
{
"epoch": 0.525940594059406,
"grad_norm": 1.016041467787214,
"learning_rate": 1.0829666065319457e-05,
"loss": 1.1111,
"step": 498
},
{
"epoch": 0.526996699669967,
"grad_norm": 0.6448914910111976,
"learning_rate": 1.0792871288944165e-05,
"loss": 1.0716,
"step": 499
},
{
"epoch": 0.528052805280528,
"grad_norm": 0.6273752088375579,
"learning_rate": 1.075606570711973e-05,
"loss": 1.1054,
"step": 500
},
{
"epoch": 0.5291089108910891,
"grad_norm": 0.6584746278570264,
"learning_rate": 1.0719249821441862e-05,
"loss": 1.1026,
"step": 501
},
{
"epoch": 0.5301650165016502,
"grad_norm": 0.6406394590411313,
"learning_rate": 1.0682424133646712e-05,
"loss": 1.078,
"step": 502
},
{
"epoch": 0.5312211221122112,
"grad_norm": 0.6554297358182956,
"learning_rate": 1.0645589145603998e-05,
"loss": 1.0564,
"step": 503
},
{
"epoch": 0.5322772277227723,
"grad_norm": 0.6712874502294911,
"learning_rate": 1.06087453593102e-05,
"loss": 1.0586,
"step": 504
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.6425196860025773,
"learning_rate": 1.0571893276881688e-05,
"loss": 1.0429,
"step": 505
},
{
"epoch": 0.5343894389438943,
"grad_norm": 0.6015124699013814,
"learning_rate": 1.05350334005479e-05,
"loss": 1.0955,
"step": 506
},
{
"epoch": 0.5354455445544555,
"grad_norm": 0.6356136162724818,
"learning_rate": 1.04981662326445e-05,
"loss": 1.0844,
"step": 507
},
{
"epoch": 0.5365016501650165,
"grad_norm": 0.6596086796938498,
"learning_rate": 1.0461292275606512e-05,
"loss": 1.0542,
"step": 508
},
{
"epoch": 0.5375577557755775,
"grad_norm": 0.8990225329901785,
"learning_rate": 1.0424412031961485e-05,
"loss": 1.0735,
"step": 509
},
{
"epoch": 0.5386138613861386,
"grad_norm": 0.6220607411553244,
"learning_rate": 1.038752600432265e-05,
"loss": 1.0874,
"step": 510
},
{
"epoch": 0.5396699669966997,
"grad_norm": 0.7063371513993385,
"learning_rate": 1.0350634695382054e-05,
"loss": 1.1192,
"step": 511
},
{
"epoch": 0.5407260726072607,
"grad_norm": 0.6699589084670984,
"learning_rate": 1.031373860790373e-05,
"loss": 1.0457,
"step": 512
},
{
"epoch": 0.5417821782178218,
"grad_norm": 0.6302570320473359,
"learning_rate": 1.0276838244716824e-05,
"loss": 1.0977,
"step": 513
},
{
"epoch": 0.5428382838283828,
"grad_norm": 0.6529430632720555,
"learning_rate": 1.0239934108708762e-05,
"loss": 1.06,
"step": 514
},
{
"epoch": 0.5438943894389439,
"grad_norm": 0.6466205602621441,
"learning_rate": 1.0203026702818383e-05,
"loss": 1.0843,
"step": 515
},
{
"epoch": 0.544950495049505,
"grad_norm": 0.6629230272608421,
"learning_rate": 1.0166116530029082e-05,
"loss": 1.096,
"step": 516
},
{
"epoch": 0.546006600660066,
"grad_norm": 0.6542374389650554,
"learning_rate": 1.0129204093361971e-05,
"loss": 1.0336,
"step": 517
},
{
"epoch": 0.547062706270627,
"grad_norm": 0.6942494889939974,
"learning_rate": 1.0092289895869011e-05,
"loss": 1.1299,
"step": 518
},
{
"epoch": 0.5481188118811882,
"grad_norm": 0.6092827039840173,
"learning_rate": 1.0055374440626162e-05,
"loss": 1.1157,
"step": 519
},
{
"epoch": 0.5491749174917492,
"grad_norm": 0.6373596973625276,
"learning_rate": 1.0018458230726524e-05,
"loss": 1.0807,
"step": 520
},
{
"epoch": 0.5502310231023102,
"grad_norm": 0.7122364704468821,
"learning_rate": 9.981541769273478e-06,
"loss": 1.1283,
"step": 521
},
{
"epoch": 0.5512871287128713,
"grad_norm": 0.622809330454637,
"learning_rate": 9.944625559373841e-06,
"loss": 1.0822,
"step": 522
},
{
"epoch": 0.5523432343234324,
"grad_norm": 0.6267990306163699,
"learning_rate": 9.90771010413099e-06,
"loss": 1.0706,
"step": 523
},
{
"epoch": 0.5533993399339934,
"grad_norm": 0.6754063743427098,
"learning_rate": 9.870795906638032e-06,
"loss": 1.1026,
"step": 524
},
{
"epoch": 0.5544554455445545,
"grad_norm": 0.6479794714724849,
"learning_rate": 9.833883469970924e-06,
"loss": 1.0491,
"step": 525
},
{
"epoch": 0.5555115511551155,
"grad_norm": 0.6278524227889138,
"learning_rate": 9.79697329718162e-06,
"loss": 1.0753,
"step": 526
},
{
"epoch": 0.5565676567656765,
"grad_norm": 0.6187469902783017,
"learning_rate": 9.760065891291241e-06,
"loss": 1.039,
"step": 527
},
{
"epoch": 0.5576237623762377,
"grad_norm": 0.670102175425111,
"learning_rate": 9.723161755283177e-06,
"loss": 1.1003,
"step": 528
},
{
"epoch": 0.5586798679867987,
"grad_norm": 0.6310968994804294,
"learning_rate": 9.686261392096275e-06,
"loss": 1.0854,
"step": 529
},
{
"epoch": 0.5597359735973597,
"grad_norm": 0.6643837973515145,
"learning_rate": 9.649365304617953e-06,
"loss": 1.0633,
"step": 530
},
{
"epoch": 0.5607920792079208,
"grad_norm": 0.6926668841540797,
"learning_rate": 9.612473995677354e-06,
"loss": 1.0754,
"step": 531
},
{
"epoch": 0.5618481848184819,
"grad_norm": 0.6477526851129435,
"learning_rate": 9.57558796803852e-06,
"loss": 1.073,
"step": 532
},
{
"epoch": 0.5629042904290429,
"grad_norm": 0.6378451282041759,
"learning_rate": 9.538707724393491e-06,
"loss": 1.0702,
"step": 533
},
{
"epoch": 0.5639603960396039,
"grad_norm": 0.6590852953187469,
"learning_rate": 9.501833767355502e-06,
"loss": 1.0517,
"step": 534
},
{
"epoch": 0.565016501650165,
"grad_norm": 0.6485001244215962,
"learning_rate": 9.464966599452103e-06,
"loss": 1.0564,
"step": 535
},
{
"epoch": 0.5660726072607261,
"grad_norm": 0.6999941396993619,
"learning_rate": 9.428106723118316e-06,
"loss": 1.0633,
"step": 536
},
{
"epoch": 0.5671287128712871,
"grad_norm": 0.6561912022396205,
"learning_rate": 9.391254640689806e-06,
"loss": 1.0643,
"step": 537
},
{
"epoch": 0.5681848184818482,
"grad_norm": 0.6717639341809902,
"learning_rate": 9.354410854396004e-06,
"loss": 1.0669,
"step": 538
},
{
"epoch": 0.5692409240924092,
"grad_norm": 0.666725215026084,
"learning_rate": 9.317575866353293e-06,
"loss": 1.0513,
"step": 539
},
{
"epoch": 0.5702970297029702,
"grad_norm": 0.6242914943482076,
"learning_rate": 9.28075017855814e-06,
"loss": 1.0872,
"step": 540
},
{
"epoch": 0.5713531353135314,
"grad_norm": 0.8650424834653854,
"learning_rate": 9.243934292880274e-06,
"loss": 1.0486,
"step": 541
},
{
"epoch": 0.5724092409240924,
"grad_norm": 0.6933139314339579,
"learning_rate": 9.20712871105584e-06,
"loss": 1.0313,
"step": 542
},
{
"epoch": 0.5734653465346534,
"grad_norm": 0.6551148415855093,
"learning_rate": 9.170333934680545e-06,
"loss": 1.0808,
"step": 543
},
{
"epoch": 0.5745214521452146,
"grad_norm": 0.6475466469373219,
"learning_rate": 9.133550465202855e-06,
"loss": 1.1091,
"step": 544
},
{
"epoch": 0.5755775577557756,
"grad_norm": 0.6593942569422725,
"learning_rate": 9.096778803917126e-06,
"loss": 1.0855,
"step": 545
},
{
"epoch": 0.5766336633663366,
"grad_norm": 0.6019221254726662,
"learning_rate": 9.06001945195681e-06,
"loss": 1.0518,
"step": 546
},
{
"epoch": 0.5776897689768977,
"grad_norm": 0.6225223543658038,
"learning_rate": 9.023272910287581e-06,
"loss": 1.0677,
"step": 547
},
{
"epoch": 0.5787458745874587,
"grad_norm": 0.6443907146340897,
"learning_rate": 8.986539679700543e-06,
"loss": 1.0926,
"step": 548
},
{
"epoch": 0.5798019801980198,
"grad_norm": 0.6552607126455486,
"learning_rate": 8.949820260805397e-06,
"loss": 1.0589,
"step": 549
},
{
"epoch": 0.5808580858085809,
"grad_norm": 0.6282256020658356,
"learning_rate": 8.913115154023604e-06,
"loss": 1.0866,
"step": 550
},
{
"epoch": 0.5819141914191419,
"grad_norm": 0.6493815083462167,
"learning_rate": 8.876424859581589e-06,
"loss": 1.0666,
"step": 551
},
{
"epoch": 0.5829702970297029,
"grad_norm": 0.7028490811889769,
"learning_rate": 8.839749877503899e-06,
"loss": 1.008,
"step": 552
},
{
"epoch": 0.5840264026402641,
"grad_norm": 0.6772559411479638,
"learning_rate": 8.803090707606403e-06,
"loss": 1.0853,
"step": 553
},
{
"epoch": 0.5850825082508251,
"grad_norm": 0.6526762052822437,
"learning_rate": 8.766447849489483e-06,
"loss": 1.1011,
"step": 554
},
{
"epoch": 0.5861386138613861,
"grad_norm": 0.6245726753579093,
"learning_rate": 8.729821802531213e-06,
"loss": 1.0451,
"step": 555
},
{
"epoch": 0.5871947194719472,
"grad_norm": 0.692008544739748,
"learning_rate": 8.693213065880568e-06,
"loss": 1.1019,
"step": 556
},
{
"epoch": 0.5882508250825083,
"grad_norm": 0.6471685295777168,
"learning_rate": 8.656622138450606e-06,
"loss": 1.0468,
"step": 557
},
{
"epoch": 0.5893069306930693,
"grad_norm": 0.6839589883882217,
"learning_rate": 8.620049518911683e-06,
"loss": 1.1174,
"step": 558
},
{
"epoch": 0.5903630363036304,
"grad_norm": 0.628833588880883,
"learning_rate": 8.583495705684647e-06,
"loss": 1.0969,
"step": 559
},
{
"epoch": 0.5914191419141914,
"grad_norm": 0.6348537469397999,
"learning_rate": 8.546961196934043e-06,
"loss": 1.0958,
"step": 560
},
{
"epoch": 0.5924752475247524,
"grad_norm": 1.0987589456999174,
"learning_rate": 8.510446490561344e-06,
"loss": 1.0567,
"step": 561
},
{
"epoch": 0.5935313531353136,
"grad_norm": 0.6354900524306181,
"learning_rate": 8.473952084198139e-06,
"loss": 1.0371,
"step": 562
},
{
"epoch": 0.5945874587458746,
"grad_norm": 0.7632801462181814,
"learning_rate": 8.43747847519937e-06,
"loss": 1.0942,
"step": 563
},
{
"epoch": 0.5956435643564356,
"grad_norm": 0.5992996830750966,
"learning_rate": 8.401026160636543e-06,
"loss": 1.0628,
"step": 564
},
{
"epoch": 0.5966996699669967,
"grad_norm": 0.597960232873711,
"learning_rate": 8.364595637290958e-06,
"loss": 1.0542,
"step": 565
},
{
"epoch": 0.5977557755775578,
"grad_norm": 0.6566424931719332,
"learning_rate": 8.328187401646947e-06,
"loss": 1.0556,
"step": 566
},
{
"epoch": 0.5988118811881188,
"grad_norm": 0.636839567352727,
"learning_rate": 8.291801949885085e-06,
"loss": 1.044,
"step": 567
},
{
"epoch": 0.5998679867986799,
"grad_norm": 0.6442680888084503,
"learning_rate": 8.25543977787546e-06,
"loss": 1.098,
"step": 568
},
{
"epoch": 0.6009240924092409,
"grad_norm": 0.6238064489974741,
"learning_rate": 8.21910138117088e-06,
"loss": 1.0367,
"step": 569
},
{
"epoch": 0.601980198019802,
"grad_norm": 0.6601045221799112,
"learning_rate": 8.182787255000155e-06,
"loss": 1.0634,
"step": 570
},
{
"epoch": 0.6030363036303631,
"grad_norm": 0.65229448316619,
"learning_rate": 8.146497894261313e-06,
"loss": 1.0795,
"step": 571
},
{
"epoch": 0.6040924092409241,
"grad_norm": 0.6072161377230995,
"learning_rate": 8.11023379351488e-06,
"loss": 0.9894,
"step": 572
},
{
"epoch": 0.6051485148514851,
"grad_norm": 0.6361463050351425,
"learning_rate": 8.07399544697714e-06,
"loss": 1.0714,
"step": 573
},
{
"epoch": 0.6062046204620462,
"grad_norm": 0.6723793588965653,
"learning_rate": 8.037783348513376e-06,
"loss": 1.0756,
"step": 574
},
{
"epoch": 0.6072607260726073,
"grad_norm": 0.6581626261505407,
"learning_rate": 8.001597991631175e-06,
"loss": 1.0459,
"step": 575
},
{
"epoch": 0.6083168316831683,
"grad_norm": 0.6897804522659089,
"learning_rate": 7.965439869473664e-06,
"loss": 1.0945,
"step": 576
},
{
"epoch": 0.6093729372937293,
"grad_norm": 0.6058459648411084,
"learning_rate": 7.929309474812818e-06,
"loss": 1.0428,
"step": 577
},
{
"epoch": 0.6104290429042905,
"grad_norm": 0.6284006466858604,
"learning_rate": 7.89320730004274e-06,
"loss": 1.063,
"step": 578
},
{
"epoch": 0.6114851485148515,
"grad_norm": 0.6457154650687478,
"learning_rate": 7.857133837172928e-06,
"loss": 1.0938,
"step": 579
},
{
"epoch": 0.6125412541254125,
"grad_norm": 0.6387228368627329,
"learning_rate": 7.82108957782161e-06,
"loss": 1.1058,
"step": 580
},
{
"epoch": 0.6135973597359736,
"grad_norm": 0.641340459873197,
"learning_rate": 7.785075013209e-06,
"loss": 1.0626,
"step": 581
},
{
"epoch": 0.6146534653465346,
"grad_norm": 0.6633774129559028,
"learning_rate": 7.749090634150636e-06,
"loss": 1.0527,
"step": 582
},
{
"epoch": 0.6157095709570957,
"grad_norm": 0.6214574701982329,
"learning_rate": 7.71313693105068e-06,
"loss": 1.0989,
"step": 583
},
{
"epoch": 0.6167656765676568,
"grad_norm": 0.640402254130247,
"learning_rate": 7.677214393895228e-06,
"loss": 1.0357,
"step": 584
},
{
"epoch": 0.6178217821782178,
"grad_norm": 0.7113797275498897,
"learning_rate": 7.641323512245647e-06,
"loss": 1.064,
"step": 585
},
{
"epoch": 0.6188778877887788,
"grad_norm": 0.5883095590159562,
"learning_rate": 7.605464775231885e-06,
"loss": 1.0774,
"step": 586
},
{
"epoch": 0.61993399339934,
"grad_norm": 0.6242214040477823,
"learning_rate": 7.569638671545823e-06,
"loss": 1.0754,
"step": 587
},
{
"epoch": 0.620990099009901,
"grad_norm": 0.6224016879726396,
"learning_rate": 7.533845689434602e-06,
"loss": 1.0806,
"step": 588
},
{
"epoch": 0.622046204620462,
"grad_norm": 0.6348552630665582,
"learning_rate": 7.498086316693977e-06,
"loss": 1.0896,
"step": 589
},
{
"epoch": 0.6231023102310231,
"grad_norm": 0.6085659355582564,
"learning_rate": 7.4623610406616674e-06,
"loss": 1.0584,
"step": 590
},
{
"epoch": 0.6241584158415842,
"grad_norm": 0.6097927597171036,
"learning_rate": 7.426670348210703e-06,
"loss": 1.0789,
"step": 591
},
{
"epoch": 0.6252145214521452,
"grad_norm": 0.5966109084143094,
"learning_rate": 7.3910147257428165e-06,
"loss": 1.0902,
"step": 592
},
{
"epoch": 0.6262706270627063,
"grad_norm": 0.6636998774487679,
"learning_rate": 7.355394659181787e-06,
"loss": 1.0852,
"step": 593
},
{
"epoch": 0.6273267326732673,
"grad_norm": 0.5947998782897114,
"learning_rate": 7.3198106339668285e-06,
"loss": 1.0245,
"step": 594
},
{
"epoch": 0.6283828382838283,
"grad_norm": 0.6271416063227222,
"learning_rate": 7.284263135045986e-06,
"loss": 1.0845,
"step": 595
},
{
"epoch": 0.6294389438943895,
"grad_norm": 0.6272277845299414,
"learning_rate": 7.248752646869495e-06,
"loss": 1.0834,
"step": 596
},
{
"epoch": 0.6304950495049505,
"grad_norm": 0.6568219702638607,
"learning_rate": 7.21327965338322e-06,
"loss": 1.072,
"step": 597
},
{
"epoch": 0.6315511551155115,
"grad_norm": 0.6099089406781558,
"learning_rate": 7.177844638022026e-06,
"loss": 1.0431,
"step": 598
},
{
"epoch": 0.6326072607260727,
"grad_norm": 0.6624644094145423,
"learning_rate": 7.142448083703204e-06,
"loss": 1.041,
"step": 599
},
{
"epoch": 0.6336633663366337,
"grad_norm": 0.6427543195515124,
"learning_rate": 7.107090472819895e-06,
"loss": 1.0809,
"step": 600
},
{
"epoch": 0.6347194719471947,
"grad_norm": 0.6366838117669377,
"learning_rate": 7.071772287234497e-06,
"loss": 1.0392,
"step": 601
},
{
"epoch": 0.6357755775577558,
"grad_norm": 0.6043301414836174,
"learning_rate": 7.036494008272124e-06,
"loss": 1.0992,
"step": 602
},
{
"epoch": 0.6368316831683168,
"grad_norm": 0.6217046349775222,
"learning_rate": 7.0012561167140215e-06,
"loss": 1.0268,
"step": 603
},
{
"epoch": 0.6378877887788779,
"grad_norm": 0.6299687652588752,
"learning_rate": 6.966059092791033e-06,
"loss": 1.0399,
"step": 604
},
{
"epoch": 0.638943894389439,
"grad_norm": 0.5836422956194852,
"learning_rate": 6.930903416177044e-06,
"loss": 1.0642,
"step": 605
},
{
"epoch": 0.64,
"grad_norm": 0.6193959383560707,
"learning_rate": 6.8957895659824424e-06,
"loss": 1.0208,
"step": 606
},
{
"epoch": 0.641056105610561,
"grad_norm": 0.6428907248857738,
"learning_rate": 6.86071802074761e-06,
"loss": 1.0863,
"step": 607
},
{
"epoch": 0.6421122112211222,
"grad_norm": 0.613700580260683,
"learning_rate": 6.82568925843637e-06,
"loss": 1.0548,
"step": 608
},
{
"epoch": 0.6431683168316832,
"grad_norm": 0.6793420964767515,
"learning_rate": 6.790703756429503e-06,
"loss": 1.0827,
"step": 609
},
{
"epoch": 0.6442244224422442,
"grad_norm": 0.6373696560243863,
"learning_rate": 6.7557619915182195e-06,
"loss": 1.0806,
"step": 610
},
{
"epoch": 0.6452805280528053,
"grad_norm": 0.5885138660061023,
"learning_rate": 6.720864439897667e-06,
"loss": 1.0489,
"step": 611
},
{
"epoch": 0.6463366336633664,
"grad_norm": 0.6240799958641428,
"learning_rate": 6.686011577160451e-06,
"loss": 1.0794,
"step": 612
},
{
"epoch": 0.6473927392739274,
"grad_norm": 0.6447109853779157,
"learning_rate": 6.651203878290139e-06,
"loss": 1.0481,
"step": 613
},
{
"epoch": 0.6484488448844884,
"grad_norm": 0.6163667666057177,
"learning_rate": 6.616441817654802e-06,
"loss": 1.0855,
"step": 614
},
{
"epoch": 0.6495049504950495,
"grad_norm": 0.6302669796473992,
"learning_rate": 6.581725869000536e-06,
"loss": 1.0737,
"step": 615
},
{
"epoch": 0.6505610561056105,
"grad_norm": 0.6308065398704275,
"learning_rate": 6.547056505445006e-06,
"loss": 1.039,
"step": 616
},
{
"epoch": 0.6516171617161716,
"grad_norm": 0.690539777018884,
"learning_rate": 6.512434199471016e-06,
"loss": 1.0804,
"step": 617
},
{
"epoch": 0.6526732673267327,
"grad_norm": 0.6733652331633804,
"learning_rate": 6.477859422920046e-06,
"loss": 1.0373,
"step": 618
},
{
"epoch": 0.6537293729372937,
"grad_norm": 0.6831160523150637,
"learning_rate": 6.4433326469858445e-06,
"loss": 1.1081,
"step": 619
},
{
"epoch": 0.6547854785478547,
"grad_norm": 0.6349575305293349,
"learning_rate": 6.408854342207983e-06,
"loss": 1.058,
"step": 620
},
{
"epoch": 0.6558415841584159,
"grad_norm": 0.5999091978743029,
"learning_rate": 6.374424978465467e-06,
"loss": 1.0199,
"step": 621
},
{
"epoch": 0.6568976897689769,
"grad_norm": 0.6089363106148836,
"learning_rate": 6.340045024970316e-06,
"loss": 1.0984,
"step": 622
},
{
"epoch": 0.6579537953795379,
"grad_norm": 0.6392504119805615,
"learning_rate": 6.305714950261168e-06,
"loss": 1.0571,
"step": 623
},
{
"epoch": 0.659009900990099,
"grad_norm": 0.5914840827087966,
"learning_rate": 6.2714352221969155e-06,
"loss": 1.067,
"step": 624
},
{
"epoch": 0.6600660066006601,
"grad_norm": 0.7219132711851788,
"learning_rate": 6.237206307950298e-06,
"loss": 1.0543,
"step": 625
},
{
"epoch": 0.6611221122112211,
"grad_norm": 0.6370208283754883,
"learning_rate": 6.203028674001568e-06,
"loss": 1.0919,
"step": 626
},
{
"epoch": 0.6621782178217822,
"grad_norm": 0.6194951584096007,
"learning_rate": 6.168902786132105e-06,
"loss": 1.0557,
"step": 627
},
{
"epoch": 0.6632343234323432,
"grad_norm": 0.6191095430164476,
"learning_rate": 6.1348291094180766e-06,
"loss": 1.065,
"step": 628
},
{
"epoch": 0.6642904290429043,
"grad_norm": 0.6146874885292317,
"learning_rate": 6.10080810822412e-06,
"loss": 1.0445,
"step": 629
},
{
"epoch": 0.6653465346534654,
"grad_norm": 0.7455943723312123,
"learning_rate": 6.0668402461969815e-06,
"loss": 1.0479,
"step": 630
},
{
"epoch": 0.6664026402640264,
"grad_norm": 0.6040612196637005,
"learning_rate": 6.032925986259224e-06,
"loss": 1.079,
"step": 631
},
{
"epoch": 0.6674587458745874,
"grad_norm": 0.6447777341454854,
"learning_rate": 5.9990657906029025e-06,
"loss": 1.0616,
"step": 632
},
{
"epoch": 0.6685148514851486,
"grad_norm": 0.6018104864850141,
"learning_rate": 5.965260120683265e-06,
"loss": 1.0701,
"step": 633
},
{
"epoch": 0.6695709570957096,
"grad_norm": 0.6477856132604624,
"learning_rate": 5.931509437212483e-06,
"loss": 1.0825,
"step": 634
},
{
"epoch": 0.6706270627062706,
"grad_norm": 0.6281016258607297,
"learning_rate": 5.897814200153349e-06,
"loss": 1.0839,
"step": 635
},
{
"epoch": 0.6716831683168317,
"grad_norm": 0.6302624934522487,
"learning_rate": 5.864174868713018e-06,
"loss": 1.0298,
"step": 636
},
{
"epoch": 0.6727392739273927,
"grad_norm": 0.6540549922471253,
"learning_rate": 5.830591901336753e-06,
"loss": 1.0611,
"step": 637
},
{
"epoch": 0.6737953795379538,
"grad_norm": 0.6129993818144718,
"learning_rate": 5.7970657557016706e-06,
"loss": 1.0517,
"step": 638
},
{
"epoch": 0.6748514851485149,
"grad_norm": 0.604212659124076,
"learning_rate": 5.763596888710513e-06,
"loss": 1.0367,
"step": 639
},
{
"epoch": 0.6759075907590759,
"grad_norm": 0.6143148089353159,
"learning_rate": 5.730185756485396e-06,
"loss": 1.0088,
"step": 640
},
{
"epoch": 0.6769636963696369,
"grad_norm": 0.5885458233445019,
"learning_rate": 5.696832814361635e-06,
"loss": 1.0372,
"step": 641
},
{
"epoch": 0.6780198019801981,
"grad_norm": 0.6242123362716384,
"learning_rate": 5.663538516881503e-06,
"loss": 1.0529,
"step": 642
},
{
"epoch": 0.6790759075907591,
"grad_norm": 0.64430708778126,
"learning_rate": 5.6303033177880525e-06,
"loss": 1.0584,
"step": 643
},
{
"epoch": 0.6801320132013201,
"grad_norm": 0.606541726445129,
"learning_rate": 5.597127670018927e-06,
"loss": 1.1114,
"step": 644
},
{
"epoch": 0.6811881188118812,
"grad_norm": 0.6316054222756604,
"learning_rate": 5.564012025700185e-06,
"loss": 1.0471,
"step": 645
},
{
"epoch": 0.6822442244224423,
"grad_norm": 0.598654438361347,
"learning_rate": 5.53095683614016e-06,
"loss": 1.0435,
"step": 646
},
{
"epoch": 0.6833003300330033,
"grad_norm": 0.6820604647126184,
"learning_rate": 5.497962551823266e-06,
"loss": 1.0866,
"step": 647
},
{
"epoch": 0.6843564356435644,
"grad_norm": 0.6386719190110759,
"learning_rate": 5.465029622403912e-06,
"loss": 1.0634,
"step": 648
},
{
"epoch": 0.6854125412541254,
"grad_norm": 0.593078417726103,
"learning_rate": 5.432158496700329e-06,
"loss": 1.012,
"step": 649
},
{
"epoch": 0.6864686468646864,
"grad_norm": 0.5921222365106154,
"learning_rate": 5.399349622688479e-06,
"loss": 1.0212,
"step": 650
},
{
"epoch": 0.6875247524752475,
"grad_norm": 0.6278487259824775,
"learning_rate": 5.366603447495942e-06,
"loss": 1.0671,
"step": 651
},
{
"epoch": 0.6885808580858086,
"grad_norm": 0.6048779311240523,
"learning_rate": 5.333920417395817e-06,
"loss": 1.0493,
"step": 652
},
{
"epoch": 0.6896369636963696,
"grad_norm": 0.5791853596604644,
"learning_rate": 5.3013009778006545e-06,
"loss": 1.1017,
"step": 653
},
{
"epoch": 0.6906930693069306,
"grad_norm": 0.61604028592432,
"learning_rate": 5.2687455732563665e-06,
"loss": 1.0674,
"step": 654
},
{
"epoch": 0.6917491749174918,
"grad_norm": 0.6280675549913735,
"learning_rate": 5.236254647436196e-06,
"loss": 1.0581,
"step": 655
},
{
"epoch": 0.6928052805280528,
"grad_norm": 0.6219099217802987,
"learning_rate": 5.203828643134643e-06,
"loss": 1.0463,
"step": 656
},
{
"epoch": 0.6938613861386138,
"grad_norm": 0.5920143059830639,
"learning_rate": 5.171468002261431e-06,
"loss": 1.0457,
"step": 657
},
{
"epoch": 0.6949174917491749,
"grad_norm": 0.6003415819321816,
"learning_rate": 5.139173165835514e-06,
"loss": 1.1186,
"step": 658
},
{
"epoch": 0.695973597359736,
"grad_norm": 0.6035994016011499,
"learning_rate": 5.106944573979034e-06,
"loss": 1.0535,
"step": 659
},
{
"epoch": 0.697029702970297,
"grad_norm": 0.5982854504892641,
"learning_rate": 5.074782665911341e-06,
"loss": 1.0445,
"step": 660
},
{
"epoch": 0.6980858085808581,
"grad_norm": 0.6071022064113362,
"learning_rate": 5.042687879942996e-06,
"loss": 1.0698,
"step": 661
},
{
"epoch": 0.6991419141914191,
"grad_norm": 0.6108513576377169,
"learning_rate": 5.01066065346981e-06,
"loss": 1.0723,
"step": 662
},
{
"epoch": 0.7001980198019802,
"grad_norm": 0.6038528550708323,
"learning_rate": 4.978701422966868e-06,
"loss": 1.0882,
"step": 663
},
{
"epoch": 0.7012541254125413,
"grad_norm": 0.6050857951969918,
"learning_rate": 4.946810623982595e-06,
"loss": 1.0461,
"step": 664
},
{
"epoch": 0.7023102310231023,
"grad_norm": 0.6176305312088001,
"learning_rate": 4.914988691132816e-06,
"loss": 1.071,
"step": 665
},
{
"epoch": 0.7033663366336633,
"grad_norm": 0.6038971458354055,
"learning_rate": 4.883236058094825e-06,
"loss": 1.0653,
"step": 666
},
{
"epoch": 0.7044224422442245,
"grad_norm": 0.636619361060515,
"learning_rate": 4.851553157601484e-06,
"loss": 1.0774,
"step": 667
},
{
"epoch": 0.7054785478547855,
"grad_norm": 0.6672994998650077,
"learning_rate": 4.819940421435321e-06,
"loss": 1.0831,
"step": 668
},
{
"epoch": 0.7065346534653465,
"grad_norm": 0.5785523434515408,
"learning_rate": 4.788398280422646e-06,
"loss": 1.0386,
"step": 669
},
{
"epoch": 0.7075907590759076,
"grad_norm": 0.6288939432817907,
"learning_rate": 4.756927164427685e-06,
"loss": 1.0351,
"step": 670
},
{
"epoch": 0.7086468646864686,
"grad_norm": 0.594766016694064,
"learning_rate": 4.725527502346708e-06,
"loss": 1.0196,
"step": 671
},
{
"epoch": 0.7097029702970297,
"grad_norm": 0.621443790986153,
"learning_rate": 4.694199722102213e-06,
"loss": 1.0878,
"step": 672
},
{
"epoch": 0.7107590759075908,
"grad_norm": 0.6245088291804015,
"learning_rate": 4.6629442506370485e-06,
"loss": 1.0901,
"step": 673
},
{
"epoch": 0.7118151815181518,
"grad_norm": 0.6050968511957825,
"learning_rate": 4.631761513908632e-06,
"loss": 1.0593,
"step": 674
},
{
"epoch": 0.7128712871287128,
"grad_norm": 0.6089174625929747,
"learning_rate": 4.600651936883143e-06,
"loss": 1.0314,
"step": 675
},
{
"epoch": 0.713927392739274,
"grad_norm": 0.601538918146824,
"learning_rate": 4.569615943529709e-06,
"loss": 1.0367,
"step": 676
},
{
"epoch": 0.714983498349835,
"grad_norm": 0.6130630086776728,
"learning_rate": 4.538653956814647e-06,
"loss": 1.0784,
"step": 677
},
{
"epoch": 0.716039603960396,
"grad_norm": 0.6018622537506033,
"learning_rate": 4.507766398695691e-06,
"loss": 1.0803,
"step": 678
},
{
"epoch": 0.7170957095709571,
"grad_norm": 0.6139029093379007,
"learning_rate": 4.476953690116245e-06,
"loss": 1.0998,
"step": 679
},
{
"epoch": 0.7181518151815182,
"grad_norm": 0.6159883281320775,
"learning_rate": 4.446216250999641e-06,
"loss": 1.0509,
"step": 680
},
{
"epoch": 0.7192079207920792,
"grad_norm": 0.5923867864910479,
"learning_rate": 4.4155545002434206e-06,
"loss": 1.072,
"step": 681
},
{
"epoch": 0.7202640264026403,
"grad_norm": 0.6238896074841173,
"learning_rate": 4.3849688557136385e-06,
"loss": 1.0665,
"step": 682
},
{
"epoch": 0.7213201320132013,
"grad_norm": 0.5919918842107508,
"learning_rate": 4.354459734239128e-06,
"loss": 1.0381,
"step": 683
},
{
"epoch": 0.7223762376237624,
"grad_norm": 0.591363543205708,
"learning_rate": 4.3240275516058735e-06,
"loss": 1.0071,
"step": 684
},
{
"epoch": 0.7234323432343235,
"grad_norm": 0.6076015714210775,
"learning_rate": 4.293672722551303e-06,
"loss": 1.0316,
"step": 685
},
{
"epoch": 0.7244884488448845,
"grad_norm": 0.6151963236298081,
"learning_rate": 4.263395660758653e-06,
"loss": 1.0811,
"step": 686
},
{
"epoch": 0.7255445544554455,
"grad_norm": 0.6111759089018257,
"learning_rate": 4.2331967788513295e-06,
"loss": 1.0572,
"step": 687
},
{
"epoch": 0.7266006600660067,
"grad_norm": 0.615165230198708,
"learning_rate": 4.20307648838728e-06,
"loss": 1.0886,
"step": 688
},
{
"epoch": 0.7276567656765677,
"grad_norm": 0.6047097053325939,
"learning_rate": 4.173035199853401e-06,
"loss": 1.0568,
"step": 689
},
{
"epoch": 0.7287128712871287,
"grad_norm": 0.6351096674096501,
"learning_rate": 4.143073322659912e-06,
"loss": 1.1052,
"step": 690
},
{
"epoch": 0.7297689768976897,
"grad_norm": 0.5957490816545931,
"learning_rate": 4.113191265134801e-06,
"loss": 1.035,
"step": 691
},
{
"epoch": 0.7308250825082508,
"grad_norm": 0.5903977649680111,
"learning_rate": 4.083389434518268e-06,
"loss": 1.0567,
"step": 692
},
{
"epoch": 0.7318811881188119,
"grad_norm": 0.6152706663632109,
"learning_rate": 4.053668236957135e-06,
"loss": 1.0337,
"step": 693
},
{
"epoch": 0.7329372937293729,
"grad_norm": 0.6314325827672514,
"learning_rate": 4.024028077499359e-06,
"loss": 1.0443,
"step": 694
},
{
"epoch": 0.733993399339934,
"grad_norm": 0.6060155110249291,
"learning_rate": 3.994469360088479e-06,
"loss": 1.083,
"step": 695
},
{
"epoch": 0.735049504950495,
"grad_norm": 0.5891538433027292,
"learning_rate": 3.964992487558122e-06,
"loss": 1.0007,
"step": 696
},
{
"epoch": 0.7361056105610561,
"grad_norm": 0.5803287392110636,
"learning_rate": 3.9355978616265145e-06,
"loss": 1.0309,
"step": 697
},
{
"epoch": 0.7371617161716172,
"grad_norm": 0.5832902011164153,
"learning_rate": 3.906285882890999e-06,
"loss": 1.0647,
"step": 698
},
{
"epoch": 0.7382178217821782,
"grad_norm": 0.5926749443120413,
"learning_rate": 3.877056950822598e-06,
"loss": 1.0445,
"step": 699
},
{
"epoch": 0.7392739273927392,
"grad_norm": 0.6045206511356436,
"learning_rate": 3.847911463760529e-06,
"loss": 1.0667,
"step": 700
},
{
"epoch": 0.7403300330033004,
"grad_norm": 0.5867571705607466,
"learning_rate": 3.8188498189068215e-06,
"loss": 1.0673,
"step": 701
},
{
"epoch": 0.7413861386138614,
"grad_norm": 0.6634763933031027,
"learning_rate": 3.7898724123208754e-06,
"loss": 1.0527,
"step": 702
},
{
"epoch": 0.7424422442244224,
"grad_norm": 0.5725149569564073,
"learning_rate": 3.7609796389140562e-06,
"loss": 1.0754,
"step": 703
},
{
"epoch": 0.7434983498349835,
"grad_norm": 0.6185726548354541,
"learning_rate": 3.7321718924443507e-06,
"loss": 1.0208,
"step": 704
},
{
"epoch": 0.7445544554455445,
"grad_norm": 0.5892594403032279,
"learning_rate": 3.703449565510964e-06,
"loss": 1.0506,
"step": 705
},
{
"epoch": 0.7456105610561056,
"grad_norm": 0.6023158477476279,
"learning_rate": 3.674813049548982e-06,
"loss": 1.0558,
"step": 706
},
{
"epoch": 0.7466666666666667,
"grad_norm": 0.584136523450935,
"learning_rate": 3.646262734824042e-06,
"loss": 1.0765,
"step": 707
},
{
"epoch": 0.7477227722772277,
"grad_norm": 0.5780157896711601,
"learning_rate": 3.617799010427001e-06,
"loss": 1.0517,
"step": 708
},
{
"epoch": 0.7487788778877887,
"grad_norm": 0.6100725649589752,
"learning_rate": 3.5894222642686596e-06,
"loss": 1.056,
"step": 709
},
{
"epoch": 0.7498349834983499,
"grad_norm": 0.5872621578262758,
"learning_rate": 3.5611328830744276e-06,
"loss": 1.0837,
"step": 710
},
{
"epoch": 0.7508910891089109,
"grad_norm": 0.6145463464332913,
"learning_rate": 3.5329312523791137e-06,
"loss": 1.057,
"step": 711
},
{
"epoch": 0.7519471947194719,
"grad_norm": 0.6030205938418854,
"learning_rate": 3.5048177565216236e-06,
"loss": 1.0143,
"step": 712
},
{
"epoch": 0.753003300330033,
"grad_norm": 0.5768502910899453,
"learning_rate": 3.4767927786397358e-06,
"loss": 1.0533,
"step": 713
},
{
"epoch": 0.7540594059405941,
"grad_norm": 3.7575040186745525,
"learning_rate": 3.4488567006648986e-06,
"loss": 1.0595,
"step": 714
},
{
"epoch": 0.7551155115511551,
"grad_norm": 0.609044952801109,
"learning_rate": 3.4210099033169987e-06,
"loss": 1.0633,
"step": 715
},
{
"epoch": 0.7561716171617162,
"grad_norm": 0.5748622010795755,
"learning_rate": 3.3932527660991877e-06,
"loss": 1.0486,
"step": 716
},
{
"epoch": 0.7572277227722772,
"grad_norm": 0.5964982728687123,
"learning_rate": 3.365585667292702e-06,
"loss": 1.0554,
"step": 717
},
{
"epoch": 0.7582838283828383,
"grad_norm": 0.6136617198344335,
"learning_rate": 3.338008983951724e-06,
"loss": 1.0521,
"step": 718
},
{
"epoch": 0.7593399339933994,
"grad_norm": 0.58637405304574,
"learning_rate": 3.310523091898221e-06,
"loss": 1.0514,
"step": 719
},
{
"epoch": 0.7603960396039604,
"grad_norm": 0.5892089407154528,
"learning_rate": 3.2831283657168277e-06,
"loss": 1.018,
"step": 720
},
{
"epoch": 0.7614521452145214,
"grad_norm": 0.5869150442861135,
"learning_rate": 3.2558251787497663e-06,
"loss": 1.0645,
"step": 721
},
{
"epoch": 0.7625082508250826,
"grad_norm": 0.5938985456413898,
"learning_rate": 3.228613903091723e-06,
"loss": 1.0439,
"step": 722
},
{
"epoch": 0.7635643564356436,
"grad_norm": 0.6147593406961209,
"learning_rate": 3.2014949095848024e-06,
"loss": 1.0611,
"step": 723
},
{
"epoch": 0.7646204620462046,
"grad_norm": 0.6009442772412664,
"learning_rate": 3.174468567813461e-06,
"loss": 1.0619,
"step": 724
},
{
"epoch": 0.7656765676567657,
"grad_norm": 0.6189646665909346,
"learning_rate": 3.1475352460994744e-06,
"loss": 1.0457,
"step": 725
},
{
"epoch": 0.7667326732673267,
"grad_norm": 0.5855531587740079,
"learning_rate": 3.1206953114969196e-06,
"loss": 1.0845,
"step": 726
},
{
"epoch": 0.7677887788778878,
"grad_norm": 0.6082406460765348,
"learning_rate": 3.093949129787165e-06,
"loss": 1.0656,
"step": 727
},
{
"epoch": 0.7688448844884489,
"grad_norm": 0.5942907527831861,
"learning_rate": 3.067297065473902e-06,
"loss": 1.0799,
"step": 728
},
{
"epoch": 0.7699009900990099,
"grad_norm": 0.6054554888092444,
"learning_rate": 3.0407394817781598e-06,
"loss": 1.0432,
"step": 729
},
{
"epoch": 0.7709570957095709,
"grad_norm": 0.607575682373182,
"learning_rate": 3.014276740633352e-06,
"loss": 1.0651,
"step": 730
},
{
"epoch": 0.772013201320132,
"grad_norm": 0.6120427051057853,
"learning_rate": 2.9879092026803736e-06,
"loss": 1.0821,
"step": 731
},
{
"epoch": 0.7730693069306931,
"grad_norm": 0.5965105297898173,
"learning_rate": 2.961637227262655e-06,
"loss": 1.0638,
"step": 732
},
{
"epoch": 0.7741254125412541,
"grad_norm": 0.5894087721196123,
"learning_rate": 2.9354611724212768e-06,
"loss": 1.0735,
"step": 733
},
{
"epoch": 0.7751815181518151,
"grad_norm": 0.5990070602676681,
"learning_rate": 2.9093813948900886e-06,
"loss": 1.0643,
"step": 734
},
{
"epoch": 0.7762376237623763,
"grad_norm": 0.6181626705963309,
"learning_rate": 2.883398250090861e-06,
"loss": 1.0554,
"step": 735
},
{
"epoch": 0.7772937293729373,
"grad_norm": 0.5967245281971223,
"learning_rate": 2.8575120921284115e-06,
"loss": 1.0602,
"step": 736
},
{
"epoch": 0.7783498349834983,
"grad_norm": 0.5825057676710558,
"learning_rate": 2.8317232737858034e-06,
"loss": 1.0095,
"step": 737
},
{
"epoch": 0.7794059405940594,
"grad_norm": 0.6142662225207307,
"learning_rate": 2.8060321465195406e-06,
"loss": 1.0704,
"step": 738
},
{
"epoch": 0.7804620462046205,
"grad_norm": 0.6038406282040624,
"learning_rate": 2.780439060454756e-06,
"loss": 1.1077,
"step": 739
},
{
"epoch": 0.7815181518151815,
"grad_norm": 0.6265967960214729,
"learning_rate": 2.7549443643804585e-06,
"loss": 1.0542,
"step": 740
},
{
"epoch": 0.7825742574257426,
"grad_norm": 0.6393551269033466,
"learning_rate": 2.7295484057447707e-06,
"loss": 1.0924,
"step": 741
},
{
"epoch": 0.7836303630363036,
"grad_norm": 0.6172915306093409,
"learning_rate": 2.7042515306501992e-06,
"loss": 1.0714,
"step": 742
},
{
"epoch": 0.7846864686468646,
"grad_norm": 0.6258825721645193,
"learning_rate": 2.6790540838489132e-06,
"loss": 1.0762,
"step": 743
},
{
"epoch": 0.7857425742574258,
"grad_norm": 0.6256389234763432,
"learning_rate": 2.6539564087380454e-06,
"loss": 1.092,
"step": 744
},
{
"epoch": 0.7867986798679868,
"grad_norm": 0.5946339465084223,
"learning_rate": 2.628958847355029e-06,
"loss": 1.0209,
"step": 745
},
{
"epoch": 0.7878547854785478,
"grad_norm": 0.6099146580993546,
"learning_rate": 2.6040617403728985e-06,
"loss": 1.0477,
"step": 746
},
{
"epoch": 0.788910891089109,
"grad_norm": 0.6074085846935977,
"learning_rate": 2.579265427095692e-06,
"loss": 1.0572,
"step": 747
},
{
"epoch": 0.78996699669967,
"grad_norm": 0.6060174166337768,
"learning_rate": 2.5545702454537945e-06,
"loss": 1.0476,
"step": 748
},
{
"epoch": 0.791023102310231,
"grad_norm": 0.5972539346186229,
"learning_rate": 2.5299765319993464e-06,
"loss": 1.0213,
"step": 749
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.5758672219558948,
"learning_rate": 2.5054846219016547e-06,
"loss": 1.0178,
"step": 750
},
{
"epoch": 0.7931353135313531,
"grad_norm": 0.5963706750048153,
"learning_rate": 2.4810948489426213e-06,
"loss": 1.0266,
"step": 751
},
{
"epoch": 0.7941914191419142,
"grad_norm": 0.6320858153055192,
"learning_rate": 2.4568075455122077e-06,
"loss": 1.0576,
"step": 752
},
{
"epoch": 0.7952475247524753,
"grad_norm": 0.6045772318662876,
"learning_rate": 2.43262304260388e-06,
"loss": 1.0497,
"step": 753
},
{
"epoch": 0.7963036303630363,
"grad_norm": 0.596669747738968,
"learning_rate": 2.4085416698101207e-06,
"loss": 1.0564,
"step": 754
},
{
"epoch": 0.7973597359735973,
"grad_norm": 0.6785702253944232,
"learning_rate": 2.3845637553179333e-06,
"loss": 1.0539,
"step": 755
},
{
"epoch": 0.7984158415841585,
"grad_norm": 0.6103995803541596,
"learning_rate": 2.3606896259043598e-06,
"loss": 1.0629,
"step": 756
},
{
"epoch": 0.7994719471947195,
"grad_norm": 0.587679479762791,
"learning_rate": 2.3369196069320333e-06,
"loss": 1.0172,
"step": 757
},
{
"epoch": 0.8005280528052805,
"grad_norm": 0.5850629688105288,
"learning_rate": 2.3132540223447465e-06,
"loss": 1.0409,
"step": 758
},
{
"epoch": 0.8015841584158416,
"grad_norm": 0.5951725306004091,
"learning_rate": 2.2896931946630307e-06,
"loss": 1.0686,
"step": 759
},
{
"epoch": 0.8026402640264027,
"grad_norm": 0.5774680808441511,
"learning_rate": 2.2662374449797663e-06,
"loss": 1.0466,
"step": 760
},
{
"epoch": 0.8036963696369637,
"grad_norm": 0.6159712892718747,
"learning_rate": 2.2428870929558012e-06,
"loss": 1.0452,
"step": 761
},
{
"epoch": 0.8047524752475248,
"grad_norm": 0.6033027227166724,
"learning_rate": 2.2196424568156073e-06,
"loss": 1.018,
"step": 762
},
{
"epoch": 0.8058085808580858,
"grad_norm": 0.6138490858392686,
"learning_rate": 2.196503853342915e-06,
"loss": 1.0923,
"step": 763
},
{
"epoch": 0.8068646864686468,
"grad_norm": 0.5800997169014955,
"learning_rate": 2.1734715978764367e-06,
"loss": 1.0467,
"step": 764
},
{
"epoch": 0.807920792079208,
"grad_norm": 0.6120586454380811,
"learning_rate": 2.1505460043055316e-06,
"loss": 1.059,
"step": 765
},
{
"epoch": 0.808976897689769,
"grad_norm": 0.6180803405363252,
"learning_rate": 2.127727385065951e-06,
"loss": 1.0209,
"step": 766
},
{
"epoch": 0.81003300330033,
"grad_norm": 0.5794833086104146,
"learning_rate": 2.105016051135571e-06,
"loss": 1.0562,
"step": 767
},
{
"epoch": 0.8110891089108911,
"grad_norm": 0.6002727933704824,
"learning_rate": 2.0824123120301564e-06,
"loss": 1.073,
"step": 768
},
{
"epoch": 0.8121452145214522,
"grad_norm": 0.5989393274976571,
"learning_rate": 2.059916475799143e-06,
"loss": 1.0514,
"step": 769
},
{
"epoch": 0.8132013201320132,
"grad_norm": 0.55448736093409,
"learning_rate": 2.037528849021441e-06,
"loss": 1.022,
"step": 770
},
{
"epoch": 0.8142574257425742,
"grad_norm": 0.5958512473422111,
"learning_rate": 2.015249736801248e-06,
"loss": 1.0637,
"step": 771
},
{
"epoch": 0.8153135313531353,
"grad_norm": 0.5905190448172049,
"learning_rate": 1.9930794427639134e-06,
"loss": 1.0326,
"step": 772
},
{
"epoch": 0.8163696369636964,
"grad_norm": 0.6005763627937065,
"learning_rate": 1.9710182690517634e-06,
"loss": 1.0265,
"step": 773
},
{
"epoch": 0.8174257425742574,
"grad_norm": 0.6008463970654853,
"learning_rate": 1.949066516320025e-06,
"loss": 1.0657,
"step": 774
},
{
"epoch": 0.8184818481848185,
"grad_norm": 0.6039412808999973,
"learning_rate": 1.9272244837326957e-06,
"loss": 1.0851,
"step": 775
},
{
"epoch": 0.8195379537953795,
"grad_norm": 0.5664172927346423,
"learning_rate": 1.9054924689584864e-06,
"loss": 1.0612,
"step": 776
},
{
"epoch": 0.8205940594059405,
"grad_norm": 0.5934387454854436,
"learning_rate": 1.8838707681667534e-06,
"loss": 1.066,
"step": 777
},
{
"epoch": 0.8216501650165017,
"grad_norm": 0.5825269703324212,
"learning_rate": 1.8623596760234664e-06,
"loss": 1.0341,
"step": 778
},
{
"epoch": 0.8227062706270627,
"grad_norm": 0.5988856806221552,
"learning_rate": 1.8409594856871971e-06,
"loss": 1.0788,
"step": 779
},
{
"epoch": 0.8237623762376237,
"grad_norm": 0.5636198790698115,
"learning_rate": 1.819670488805111e-06,
"loss": 1.0515,
"step": 780
},
{
"epoch": 0.8248184818481848,
"grad_norm": 0.6059696359980392,
"learning_rate": 1.7984929755090141e-06,
"loss": 0.9939,
"step": 781
},
{
"epoch": 0.8258745874587459,
"grad_norm": 0.5996108339791207,
"learning_rate": 1.7774272344113775e-06,
"loss": 1.048,
"step": 782
},
{
"epoch": 0.8269306930693069,
"grad_norm": 0.5617418378398924,
"learning_rate": 1.7564735526014065e-06,
"loss": 1.0098,
"step": 783
},
{
"epoch": 0.827986798679868,
"grad_norm": 0.6240551085598972,
"learning_rate": 1.7356322156411487e-06,
"loss": 1.0495,
"step": 784
},
{
"epoch": 0.829042904290429,
"grad_norm": 0.5971218564097233,
"learning_rate": 1.7149035075615795e-06,
"loss": 1.0956,
"step": 785
},
{
"epoch": 0.8300990099009901,
"grad_norm": 0.5818275856642195,
"learning_rate": 1.6942877108587397e-06,
"loss": 1.0754,
"step": 786
},
{
"epoch": 0.8311551155115512,
"grad_norm": 0.589554399528218,
"learning_rate": 1.673785106489888e-06,
"loss": 1.0686,
"step": 787
},
{
"epoch": 0.8322112211221122,
"grad_norm": 0.5748110193581939,
"learning_rate": 1.653395973869668e-06,
"loss": 1.0065,
"step": 788
},
{
"epoch": 0.8332673267326732,
"grad_norm": 0.6110297751771375,
"learning_rate": 1.6331205908663006e-06,
"loss": 1.0525,
"step": 789
},
{
"epoch": 0.8343234323432344,
"grad_norm": 0.6071048714059832,
"learning_rate": 1.6129592337977995e-06,
"loss": 1.037,
"step": 790
},
{
"epoch": 0.8353795379537954,
"grad_norm": 0.5852916840836574,
"learning_rate": 1.5929121774282087e-06,
"loss": 1.0368,
"step": 791
},
{
"epoch": 0.8364356435643564,
"grad_norm": 0.594017366955068,
"learning_rate": 1.5729796949638475e-06,
"loss": 1.0356,
"step": 792
},
{
"epoch": 0.8374917491749175,
"grad_norm": 0.6079339116988817,
"learning_rate": 1.553162058049591e-06,
"loss": 1.0418,
"step": 793
},
{
"epoch": 0.8385478547854786,
"grad_norm": 0.5916568697519751,
"learning_rate": 1.5334595367651805e-06,
"loss": 1.0661,
"step": 794
},
{
"epoch": 0.8396039603960396,
"grad_norm": 0.6188545558154127,
"learning_rate": 1.5138723996215233e-06,
"loss": 1.0353,
"step": 795
},
{
"epoch": 0.8406600660066007,
"grad_norm": 0.6079802303096038,
"learning_rate": 1.494400913557047e-06,
"loss": 1.0616,
"step": 796
},
{
"epoch": 0.8417161716171617,
"grad_norm": 0.6232413631834959,
"learning_rate": 1.475045343934054e-06,
"loss": 1.038,
"step": 797
},
{
"epoch": 0.8427722772277227,
"grad_norm": 0.6076988812462554,
"learning_rate": 1.4558059545351144e-06,
"loss": 1.0575,
"step": 798
},
{
"epoch": 0.8438283828382839,
"grad_norm": 0.5790431263489528,
"learning_rate": 1.4366830075594606e-06,
"loss": 1.0137,
"step": 799
},
{
"epoch": 0.8448844884488449,
"grad_norm": 0.5811220021292428,
"learning_rate": 1.4176767636194122e-06,
"loss": 1.0566,
"step": 800
},
{
"epoch": 0.8459405940594059,
"grad_norm": 0.5683458137394329,
"learning_rate": 1.3987874817368453e-06,
"loss": 1.04,
"step": 801
},
{
"epoch": 0.846996699669967,
"grad_norm": 0.5841723596888482,
"learning_rate": 1.3800154193396365e-06,
"loss": 1.0616,
"step": 802
},
{
"epoch": 0.8480528052805281,
"grad_norm": 0.5882563186654681,
"learning_rate": 1.3613608322581685e-06,
"loss": 1.0724,
"step": 803
},
{
"epoch": 0.8491089108910891,
"grad_norm": 0.5753139184853604,
"learning_rate": 1.3428239747218409e-06,
"loss": 1.0426,
"step": 804
},
{
"epoch": 0.8501650165016502,
"grad_norm": 0.6033035701147642,
"learning_rate": 1.3244050993556068e-06,
"loss": 1.0766,
"step": 805
},
{
"epoch": 0.8512211221122112,
"grad_norm": 0.5893668381784217,
"learning_rate": 1.3061044571765268e-06,
"loss": 1.0464,
"step": 806
},
{
"epoch": 0.8522772277227723,
"grad_norm": 0.5883144427217948,
"learning_rate": 1.2879222975903494e-06,
"loss": 1.048,
"step": 807
},
{
"epoch": 0.8533333333333334,
"grad_norm": 0.5947412409095595,
"learning_rate": 1.2698588683881185e-06,
"loss": 1.0564,
"step": 808
},
{
"epoch": 0.8543894389438944,
"grad_norm": 0.5768985399318071,
"learning_rate": 1.2519144157427843e-06,
"loss": 1.0561,
"step": 809
},
{
"epoch": 0.8554455445544554,
"grad_norm": 0.5617804398437672,
"learning_rate": 1.2340891842058511e-06,
"loss": 1.0348,
"step": 810
},
{
"epoch": 0.8565016501650164,
"grad_norm": 0.5976993261723598,
"learning_rate": 1.21638341670406e-06,
"loss": 1.0647,
"step": 811
},
{
"epoch": 0.8575577557755776,
"grad_norm": 0.5773614653446701,
"learning_rate": 1.1987973545360554e-06,
"loss": 1.0494,
"step": 812
},
{
"epoch": 0.8586138613861386,
"grad_norm": 0.58482037543191,
"learning_rate": 1.1813312373691121e-06,
"loss": 1.0578,
"step": 813
},
{
"epoch": 0.8596699669966996,
"grad_norm": 0.604151251393403,
"learning_rate": 1.1639853032358618e-06,
"loss": 1.0358,
"step": 814
},
{
"epoch": 0.8607260726072608,
"grad_norm": 0.5766953480952333,
"learning_rate": 1.14675978853106e-06,
"loss": 1.0682,
"step": 815
},
{
"epoch": 0.8617821782178218,
"grad_norm": 0.6021395874875991,
"learning_rate": 1.1296549280083413e-06,
"loss": 1.0684,
"step": 816
},
{
"epoch": 0.8628382838283828,
"grad_norm": 0.598988130201394,
"learning_rate": 1.112670954777043e-06,
"loss": 1.0753,
"step": 817
},
{
"epoch": 0.8638943894389439,
"grad_norm": 0.5801986154544232,
"learning_rate": 1.095808100299025e-06,
"loss": 1.0676,
"step": 818
},
{
"epoch": 0.8649504950495049,
"grad_norm": 0.6088636291430268,
"learning_rate": 1.0790665943855028e-06,
"loss": 1.0461,
"step": 819
},
{
"epoch": 0.866006600660066,
"grad_norm": 0.569529206680046,
"learning_rate": 1.0624466651939248e-06,
"loss": 1.0307,
"step": 820
},
{
"epoch": 0.8670627062706271,
"grad_norm": 0.5960218225538954,
"learning_rate": 1.0459485392248625e-06,
"loss": 1.1047,
"step": 821
},
{
"epoch": 0.8681188118811881,
"grad_norm": 0.6127632851978548,
"learning_rate": 1.0295724413189212e-06,
"loss": 1.0532,
"step": 822
},
{
"epoch": 0.8691749174917491,
"grad_norm": 0.6050867635576841,
"learning_rate": 1.0133185946536784e-06,
"loss": 1.069,
"step": 823
},
{
"epoch": 0.8702310231023103,
"grad_norm": 0.5642402519336399,
"learning_rate": 9.971872207406397e-07,
"loss": 1.0403,
"step": 824
},
{
"epoch": 0.8712871287128713,
"grad_norm": 0.5665005551609948,
"learning_rate": 9.811785394222252e-07,
"loss": 1.0248,
"step": 825
},
{
"epoch": 0.8723432343234323,
"grad_norm": 0.6331616070832992,
"learning_rate": 9.652927688687607e-07,
"loss": 1.0472,
"step": 826
},
{
"epoch": 0.8733993399339934,
"grad_norm": 0.586614327428352,
"learning_rate": 9.49530125575524e-07,
"loss": 1.0411,
"step": 827
},
{
"epoch": 0.8744554455445545,
"grad_norm": 0.6028813459619752,
"learning_rate": 9.338908243597766e-07,
"loss": 1.0609,
"step": 828
},
{
"epoch": 0.8755115511551155,
"grad_norm": 0.5642349081740724,
"learning_rate": 9.183750783578438e-07,
"loss": 1.0338,
"step": 829
},
{
"epoch": 0.8765676567656766,
"grad_norm": 0.5770890935316254,
"learning_rate": 9.029830990222133e-07,
"loss": 1.0431,
"step": 830
},
{
"epoch": 0.8776237623762376,
"grad_norm": 0.5626202644046979,
"learning_rate": 8.87715096118642e-07,
"loss": 1.0512,
"step": 831
},
{
"epoch": 0.8786798679867986,
"grad_norm": 0.5756568895323909,
"learning_rate": 8.725712777233175e-07,
"loss": 1.0724,
"step": 832
},
{
"epoch": 0.8797359735973598,
"grad_norm": 0.5957060297085304,
"learning_rate": 8.575518502199953e-07,
"loss": 1.0698,
"step": 833
},
{
"epoch": 0.8807920792079208,
"grad_norm": 0.5719849071167402,
"learning_rate": 8.426570182972072e-07,
"loss": 1.0596,
"step": 834
},
{
"epoch": 0.8818481848184818,
"grad_norm": 0.6145001290684519,
"learning_rate": 8.278869849454718e-07,
"loss": 1.0307,
"step": 835
},
{
"epoch": 0.882904290429043,
"grad_norm": 0.5701473488901982,
"learning_rate": 8.132419514545065e-07,
"loss": 1.0635,
"step": 836
},
{
"epoch": 0.883960396039604,
"grad_norm": 0.5710581252275827,
"learning_rate": 7.98722117410512e-07,
"loss": 1.0163,
"step": 837
},
{
"epoch": 0.885016501650165,
"grad_norm": 0.5885953782078319,
"learning_rate": 7.843276806934353e-07,
"loss": 1.0751,
"step": 838
},
{
"epoch": 0.8860726072607261,
"grad_norm": 0.5555035042952674,
"learning_rate": 7.700588374742757e-07,
"loss": 1.0299,
"step": 839
},
{
"epoch": 0.8871287128712871,
"grad_norm": 0.5945327481715702,
"learning_rate": 7.55915782212413e-07,
"loss": 1.0257,
"step": 840
},
{
"epoch": 0.8881848184818482,
"grad_norm": 0.588931699327697,
"learning_rate": 7.418987076529582e-07,
"loss": 1.0247,
"step": 841
},
{
"epoch": 0.8892409240924093,
"grad_norm": 0.5654370099244245,
"learning_rate": 7.280078048241279e-07,
"loss": 1.0556,
"step": 842
},
{
"epoch": 0.8902970297029703,
"grad_norm": 0.5822900618088553,
"learning_rate": 7.142432630346319e-07,
"loss": 1.0427,
"step": 843
},
{
"epoch": 0.8913531353135313,
"grad_norm": 0.6259228515062768,
"learning_rate": 7.006052698711075e-07,
"loss": 1.0133,
"step": 844
},
{
"epoch": 0.8924092409240925,
"grad_norm": 0.5953368638397918,
"learning_rate": 6.870940111955526e-07,
"loss": 1.0798,
"step": 845
},
{
"epoch": 0.8934653465346535,
"grad_norm": 0.5947934320947895,
"learning_rate": 6.737096711427915e-07,
"loss": 1.0732,
"step": 846
},
{
"epoch": 0.8945214521452145,
"grad_norm": 0.5758676590416022,
"learning_rate": 6.604524321179761e-07,
"loss": 1.0338,
"step": 847
},
{
"epoch": 0.8955775577557756,
"grad_norm": 0.5663518130149425,
"learning_rate": 6.47322474794091e-07,
"loss": 1.0751,
"step": 848
},
{
"epoch": 0.8966336633663367,
"grad_norm": 0.6050719308146318,
"learning_rate": 6.343199781094933e-07,
"loss": 1.0855,
"step": 849
},
{
"epoch": 0.8976897689768977,
"grad_norm": 0.5994569043877743,
"learning_rate": 6.214451192654747e-07,
"loss": 1.0345,
"step": 850
},
{
"epoch": 0.8987458745874587,
"grad_norm": 0.5611678489674179,
"learning_rate": 6.086980737238458e-07,
"loss": 1.0726,
"step": 851
},
{
"epoch": 0.8998019801980198,
"grad_norm": 0.572829744270503,
"learning_rate": 5.960790152045482e-07,
"loss": 1.0316,
"step": 852
},
{
"epoch": 0.9008580858085808,
"grad_norm": 0.5918967161168934,
"learning_rate": 5.83588115683279e-07,
"loss": 1.0425,
"step": 853
},
{
"epoch": 0.9019141914191419,
"grad_norm": 0.5887708305901134,
"learning_rate": 5.71225545389158e-07,
"loss": 1.0578,
"step": 854
},
{
"epoch": 0.902970297029703,
"grad_norm": 0.6026922759066556,
"learning_rate": 5.589914728024004e-07,
"loss": 1.0492,
"step": 855
},
{
"epoch": 0.904026402640264,
"grad_norm": 0.5703964057004133,
"learning_rate": 5.468860646520169e-07,
"loss": 1.0495,
"step": 856
},
{
"epoch": 0.905082508250825,
"grad_norm": 0.5875016190885093,
"learning_rate": 5.34909485913554e-07,
"loss": 1.0922,
"step": 857
},
{
"epoch": 0.9061386138613862,
"grad_norm": 0.6174787527190042,
"learning_rate": 5.230618998068371e-07,
"loss": 1.075,
"step": 858
},
{
"epoch": 0.9071947194719472,
"grad_norm": 0.6038593526944847,
"learning_rate": 5.113434677937457e-07,
"loss": 1.0538,
"step": 859
},
{
"epoch": 0.9082508250825082,
"grad_norm": 0.5794879170451946,
"learning_rate": 4.997543495760126e-07,
"loss": 1.0818,
"step": 860
},
{
"epoch": 0.9093069306930693,
"grad_norm": 0.5772338214620376,
"learning_rate": 4.882947030930585e-07,
"loss": 1.0484,
"step": 861
},
{
"epoch": 0.9103630363036304,
"grad_norm": 0.5882987897143979,
"learning_rate": 4.769646845198217e-07,
"loss": 1.0316,
"step": 862
},
{
"epoch": 0.9114191419141914,
"grad_norm": 0.6105319362168437,
"learning_rate": 4.6576444826464173e-07,
"loss": 1.0607,
"step": 863
},
{
"epoch": 0.9124752475247525,
"grad_norm": 0.5842216299811218,
"learning_rate": 4.5469414696715287e-07,
"loss": 1.0934,
"step": 864
},
{
"epoch": 0.9135313531353135,
"grad_norm": 0.5673860562914085,
"learning_rate": 4.437539314962047e-07,
"loss": 1.0265,
"step": 865
},
{
"epoch": 0.9145874587458745,
"grad_norm": 0.5768128222157233,
"learning_rate": 4.329439509477995e-07,
"loss": 1.0349,
"step": 866
},
{
"epoch": 0.9156435643564357,
"grad_norm": 0.5681597606215066,
"learning_rate": 4.222643526430703e-07,
"loss": 1.0709,
"step": 867
},
{
"epoch": 0.9166996699669967,
"grad_norm": 0.5859378415432109,
"learning_rate": 4.11715282126266e-07,
"loss": 1.0191,
"step": 868
},
{
"epoch": 0.9177557755775577,
"grad_norm": 0.5852875151583498,
"learning_rate": 4.012968831627695e-07,
"loss": 1.076,
"step": 869
},
{
"epoch": 0.9188118811881189,
"grad_norm": 0.5703088817217045,
"learning_rate": 3.9100929773713937e-07,
"loss": 1.075,
"step": 870
},
{
"epoch": 0.9198679867986799,
"grad_norm": 0.5732823631437448,
"learning_rate": 3.808526660511758e-07,
"loss": 1.0429,
"step": 871
},
{
"epoch": 0.9209240924092409,
"grad_norm": 0.5757739482942087,
"learning_rate": 3.708271265220087e-07,
"loss": 1.0429,
"step": 872
},
{
"epoch": 0.921980198019802,
"grad_norm": 0.5949158833904653,
"learning_rate": 3.60932815780205e-07,
"loss": 1.1035,
"step": 873
},
{
"epoch": 0.923036303630363,
"grad_norm": 0.5949198317311929,
"learning_rate": 3.5116986866792104e-07,
"loss": 1.0694,
"step": 874
},
{
"epoch": 0.9240924092409241,
"grad_norm": 0.5648923103284136,
"learning_rate": 3.4153841823705403e-07,
"loss": 1.0191,
"step": 875
},
{
"epoch": 0.9251485148514852,
"grad_norm": 0.5662607923956534,
"learning_rate": 3.3203859574742816e-07,
"loss": 1.0317,
"step": 876
},
{
"epoch": 0.9262046204620462,
"grad_norm": 0.5738212298136834,
"learning_rate": 3.226705306650113e-07,
"loss": 1.0425,
"step": 877
},
{
"epoch": 0.9272607260726072,
"grad_norm": 0.57032333558157,
"learning_rate": 3.1343435066015114e-07,
"loss": 1.058,
"step": 878
},
{
"epoch": 0.9283168316831684,
"grad_norm": 0.5818612305824606,
"learning_rate": 3.043301816058264e-07,
"loss": 1.0666,
"step": 879
},
{
"epoch": 0.9293729372937294,
"grad_norm": 0.5824817361211397,
"learning_rate": 2.953581475759404e-07,
"loss": 1.0552,
"step": 880
},
{
"epoch": 0.9304290429042904,
"grad_norm": 0.6003461729829411,
"learning_rate": 2.865183708436292e-07,
"loss": 1.0561,
"step": 881
},
{
"epoch": 0.9314851485148515,
"grad_norm": 0.5940485337695551,
"learning_rate": 2.778109718795907e-07,
"loss": 1.0424,
"step": 882
},
{
"epoch": 0.9325412541254126,
"grad_norm": 0.5685852740717898,
"learning_rate": 2.6923606935044477e-07,
"loss": 1.0372,
"step": 883
},
{
"epoch": 0.9335973597359736,
"grad_norm": 0.5703044178370517,
"learning_rate": 2.60793780117119e-07,
"loss": 1.0613,
"step": 884
},
{
"epoch": 0.9346534653465347,
"grad_norm": 0.6379028737154024,
"learning_rate": 2.524842192332522e-07,
"loss": 1.1179,
"step": 885
},
{
"epoch": 0.9357095709570957,
"grad_norm": 0.5689655419954607,
"learning_rate": 2.443074999436257e-07,
"loss": 1.0461,
"step": 886
},
{
"epoch": 0.9367656765676567,
"grad_norm": 0.58099598189365,
"learning_rate": 2.3626373368262678e-07,
"loss": 1.0807,
"step": 887
},
{
"epoch": 0.9378217821782178,
"grad_norm": 0.6058927155869095,
"learning_rate": 2.2835303007272324e-07,
"loss": 1.0793,
"step": 888
},
{
"epoch": 0.9388778877887789,
"grad_norm": 0.5555908677656496,
"learning_rate": 2.205754969229701e-07,
"loss": 1.0525,
"step": 889
},
{
"epoch": 0.9399339933993399,
"grad_norm": 0.575960952917683,
"learning_rate": 2.1293124022754409e-07,
"loss": 1.0261,
"step": 890
},
{
"epoch": 0.9409900990099009,
"grad_norm": 0.6729057613250063,
"learning_rate": 2.05420364164296e-07,
"loss": 1.0379,
"step": 891
},
{
"epoch": 0.9420462046204621,
"grad_norm": 0.5728146699912483,
"learning_rate": 1.9804297109333292e-07,
"loss": 1.0172,
"step": 892
},
{
"epoch": 0.9431023102310231,
"grad_norm": 0.5669759764838711,
"learning_rate": 1.9079916155562038e-07,
"loss": 1.0813,
"step": 893
},
{
"epoch": 0.9441584158415841,
"grad_norm": 0.5619885865846266,
"learning_rate": 1.8368903427161354e-07,
"loss": 1.047,
"step": 894
},
{
"epoch": 0.9452145214521452,
"grad_norm": 0.5551488603046879,
"learning_rate": 1.767126861399171e-07,
"loss": 1.0545,
"step": 895
},
{
"epoch": 0.9462706270627063,
"grad_norm": 0.579430806419476,
"learning_rate": 1.6987021223595302e-07,
"loss": 1.0445,
"step": 896
},
{
"epoch": 0.9473267326732673,
"grad_norm": 0.564498963914545,
"learning_rate": 1.631617058106749e-07,
"loss": 1.0041,
"step": 897
},
{
"epoch": 0.9483828382838284,
"grad_norm": 0.5915389443156301,
"learning_rate": 1.5658725828929688e-07,
"loss": 1.0737,
"step": 898
},
{
"epoch": 0.9494389438943894,
"grad_norm": 0.6036903420265376,
"learning_rate": 1.5014695927003885e-07,
"loss": 1.0519,
"step": 899
},
{
"epoch": 0.9504950495049505,
"grad_norm": 0.5763790857478481,
"learning_rate": 1.4384089652291544e-07,
"loss": 1.0505,
"step": 900
},
{
"epoch": 0.9515511551155116,
"grad_norm": 0.5952616518855375,
"learning_rate": 1.3766915598853347e-07,
"loss": 1.0821,
"step": 901
},
{
"epoch": 0.9526072607260726,
"grad_norm": 0.5659966512157708,
"learning_rate": 1.3163182177692523e-07,
"loss": 1.0385,
"step": 902
},
{
"epoch": 0.9536633663366336,
"grad_norm": 0.6167853550960234,
"learning_rate": 1.2572897616639602e-07,
"loss": 1.048,
"step": 903
},
{
"epoch": 0.9547194719471948,
"grad_norm": 0.6022648661609009,
"learning_rate": 1.1996069960240943e-07,
"loss": 1.0092,
"step": 904
},
{
"epoch": 0.9557755775577558,
"grad_norm": 0.5808476193276615,
"learning_rate": 1.1432707069649051e-07,
"loss": 1.0327,
"step": 905
},
{
"epoch": 0.9568316831683168,
"grad_norm": 0.5706256269568292,
"learning_rate": 1.0882816622514559e-07,
"loss": 1.0396,
"step": 906
},
{
"epoch": 0.9578877887788779,
"grad_norm": 0.5836958255562936,
"learning_rate": 1.0346406112882956e-07,
"loss": 1.0014,
"step": 907
},
{
"epoch": 0.9589438943894389,
"grad_norm": 0.5788115787030745,
"learning_rate": 9.823482851091359e-08,
"loss": 1.0556,
"step": 908
},
{
"epoch": 0.96,
"grad_norm": 0.5686103172784417,
"learning_rate": 9.314053963669245e-08,
"loss": 1.0525,
"step": 909
},
{
"epoch": 0.9610561056105611,
"grad_norm": 0.5578581983933412,
"learning_rate": 8.818126393241644e-08,
"loss": 1.0088,
"step": 910
},
{
"epoch": 0.9621122112211221,
"grad_norm": 0.6166688741901469,
"learning_rate": 8.335706898433993e-08,
"loss": 1.0865,
"step": 911
},
{
"epoch": 0.9631683168316831,
"grad_norm": 0.5837239864692064,
"learning_rate": 7.86680205378043e-08,
"loss": 1.0688,
"step": 912
},
{
"epoch": 0.9642244224422443,
"grad_norm": 0.5781956490665567,
"learning_rate": 7.411418249634095e-08,
"loss": 1.0328,
"step": 913
},
{
"epoch": 0.9652805280528053,
"grad_norm": 0.5892439235057432,
"learning_rate": 6.96956169207963e-08,
"loss": 1.0214,
"step": 914
},
{
"epoch": 0.9663366336633663,
"grad_norm": 1.0033785466144793,
"learning_rate": 6.541238402849482e-08,
"loss": 1.1066,
"step": 915
},
{
"epoch": 0.9673927392739274,
"grad_norm": 0.5546954470911944,
"learning_rate": 6.126454219240852e-08,
"loss": 1.0296,
"step": 916
},
{
"epoch": 0.9684488448844885,
"grad_norm": 0.5812657672102908,
"learning_rate": 5.7252147940369816e-08,
"loss": 1.031,
"step": 917
},
{
"epoch": 0.9695049504950495,
"grad_norm": 0.5852439085030394,
"learning_rate": 5.3375255954295494e-08,
"loss": 1.0732,
"step": 918
},
{
"epoch": 0.9705610561056106,
"grad_norm": 0.5780038308683062,
"learning_rate": 4.9633919069442815e-08,
"loss": 1.0317,
"step": 919
},
{
"epoch": 0.9716171617161716,
"grad_norm": 0.5824012540598311,
"learning_rate": 4.602818827369127e-08,
"loss": 1.0067,
"step": 920
},
{
"epoch": 0.9726732673267326,
"grad_norm": 0.570236890097286,
"learning_rate": 4.25581127068464e-08,
"loss": 1.0524,
"step": 921
},
{
"epoch": 0.9737293729372938,
"grad_norm": 0.5700268160260389,
"learning_rate": 3.9223739659970393e-08,
"loss": 1.0637,
"step": 922
},
{
"epoch": 0.9747854785478548,
"grad_norm": 0.5687465968215338,
"learning_rate": 3.602511457473479e-08,
"loss": 1.0383,
"step": 923
},
{
"epoch": 0.9758415841584158,
"grad_norm": 0.57077168138311,
"learning_rate": 3.296228104280874e-08,
"loss": 1.0413,
"step": 924
},
{
"epoch": 0.976897689768977,
"grad_norm": 0.5669093579030313,
"learning_rate": 3.0035280805255086e-08,
"loss": 1.065,
"step": 925
},
{
"epoch": 0.977953795379538,
"grad_norm": 0.5909682996160041,
"learning_rate": 2.724415375196965e-08,
"loss": 1.0861,
"step": 926
},
{
"epoch": 0.979009900990099,
"grad_norm": 0.5837378098976076,
"learning_rate": 2.4588937921132815e-08,
"loss": 1.0826,
"step": 927
},
{
"epoch": 0.98006600660066,
"grad_norm": 0.6027089711944906,
"learning_rate": 2.206966949869327e-08,
"loss": 1.0271,
"step": 928
},
{
"epoch": 0.9811221122112211,
"grad_norm": 0.5698703377592116,
"learning_rate": 1.968638281787394e-08,
"loss": 1.0446,
"step": 929
},
{
"epoch": 0.9821782178217822,
"grad_norm": 0.5789924962313012,
"learning_rate": 1.7439110358704603e-08,
"loss": 1.0663,
"step": 930
},
{
"epoch": 0.9832343234323432,
"grad_norm": 0.572670690017874,
"learning_rate": 1.5327882747578903e-08,
"loss": 1.0307,
"step": 931
},
{
"epoch": 0.9842904290429043,
"grad_norm": 0.5949797848212109,
"learning_rate": 1.3352728756836908e-08,
"loss": 1.0563,
"step": 932
},
{
"epoch": 0.9853465346534653,
"grad_norm": 0.5748038161815299,
"learning_rate": 1.1513675304373195e-08,
"loss": 1.1098,
"step": 933
},
{
"epoch": 0.9864026402640264,
"grad_norm": 0.5705410226589203,
"learning_rate": 9.810747453271596e-09,
"loss": 1.0925,
"step": 934
},
{
"epoch": 0.9874587458745875,
"grad_norm": 0.59136841164916,
"learning_rate": 8.243968411461023e-09,
"loss": 1.0379,
"step": 935
},
{
"epoch": 0.9885148514851485,
"grad_norm": 0.5824165070706044,
"learning_rate": 6.813359531397945e-09,
"loss": 1.0537,
"step": 936
},
{
"epoch": 0.9895709570957095,
"grad_norm": 0.5697273674916493,
"learning_rate": 5.518940309779952e-09,
"loss": 1.0625,
"step": 937
},
{
"epoch": 0.9906270627062707,
"grad_norm": 0.6056959917226936,
"learning_rate": 4.3607283872793e-09,
"loss": 1.0346,
"step": 938
},
{
"epoch": 0.9916831683168317,
"grad_norm": 0.5986814847000105,
"learning_rate": 3.3387395482975538e-09,
"loss": 1.0879,
"step": 939
},
{
"epoch": 0.9927392739273927,
"grad_norm": 0.5605642551944755,
"learning_rate": 2.4529877207557507e-09,
"loss": 1.0337,
"step": 940
},
{
"epoch": 0.9937953795379538,
"grad_norm": 0.5902554878958108,
"learning_rate": 1.7034849759023365e-09,
"loss": 1.0707,
"step": 941
},
{
"epoch": 0.9948514851485148,
"grad_norm": 0.6907958764376239,
"learning_rate": 1.09024152814885e-09,
"loss": 1.0483,
"step": 942
},
{
"epoch": 0.9959075907590759,
"grad_norm": 0.5857032910039901,
"learning_rate": 6.132657349322557e-10,
"loss": 1.0447,
"step": 943
},
{
"epoch": 0.996963696369637,
"grad_norm": 0.5623583822542519,
"learning_rate": 2.725640965961507e-10,
"loss": 1.0523,
"step": 944
},
{
"epoch": 0.998019801980198,
"grad_norm": 0.5811852816377518,
"learning_rate": 6.814125631082746e-11,
"loss": 1.0667,
"step": 945
},
{
"epoch": 0.999075907590759,
"grad_norm": 0.5830035456168559,
"learning_rate": 0.0,
"loss": 1.0889,
"step": 946
},
{
"epoch": 0.999075907590759,
"step": 946,
"total_flos": 396093963632640.0,
"train_loss": 1.0916298654316343,
"train_runtime": 4492.7948,
"train_samples_per_second": 26.975,
"train_steps_per_second": 0.211
}
],
"logging_steps": 1,
"max_steps": 946,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 396093963632640.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}