{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9965217391304346,
"eval_steps": 500,
"global_step": 574,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034782608695652175,
"grad_norm": 0.1545655359230834,
"learning_rate": 3.448275862068966e-06,
"loss": 0.1804,
"step": 1
},
{
"epoch": 0.006956521739130435,
"grad_norm": 0.15798307731453395,
"learning_rate": 6.896551724137932e-06,
"loss": 0.1443,
"step": 2
},
{
"epoch": 0.010434782608695653,
"grad_norm": 0.14372383095748037,
"learning_rate": 1.0344827586206897e-05,
"loss": 0.1414,
"step": 3
},
{
"epoch": 0.01391304347826087,
"grad_norm": 0.2420744995778043,
"learning_rate": 1.3793103448275863e-05,
"loss": 0.1926,
"step": 4
},
{
"epoch": 0.017391304347826087,
"grad_norm": 0.1463366912249852,
"learning_rate": 1.7241379310344828e-05,
"loss": 0.1598,
"step": 5
},
{
"epoch": 0.020869565217391306,
"grad_norm": 0.2742107559459329,
"learning_rate": 2.0689655172413793e-05,
"loss": 0.2451,
"step": 6
},
{
"epoch": 0.02434782608695652,
"grad_norm": 0.1545956455873269,
"learning_rate": 2.413793103448276e-05,
"loss": 0.1467,
"step": 7
},
{
"epoch": 0.02782608695652174,
"grad_norm": 0.11833712816221738,
"learning_rate": 2.7586206896551727e-05,
"loss": 0.1146,
"step": 8
},
{
"epoch": 0.03130434782608696,
"grad_norm": 0.1636683202816951,
"learning_rate": 3.103448275862069e-05,
"loss": 0.1511,
"step": 9
},
{
"epoch": 0.034782608695652174,
"grad_norm": 0.12096851431359755,
"learning_rate": 3.4482758620689657e-05,
"loss": 0.1392,
"step": 10
},
{
"epoch": 0.03826086956521739,
"grad_norm": 0.20113450226273455,
"learning_rate": 3.793103448275862e-05,
"loss": 0.1829,
"step": 11
},
{
"epoch": 0.04173913043478261,
"grad_norm": 0.1724183342324261,
"learning_rate": 4.1379310344827587e-05,
"loss": 0.1393,
"step": 12
},
{
"epoch": 0.04521739130434783,
"grad_norm": 0.16317141755627293,
"learning_rate": 4.482758620689655e-05,
"loss": 0.1569,
"step": 13
},
{
"epoch": 0.04869565217391304,
"grad_norm": 0.20158026184467487,
"learning_rate": 4.827586206896552e-05,
"loss": 0.1719,
"step": 14
},
{
"epoch": 0.05217391304347826,
"grad_norm": 0.19268086804807166,
"learning_rate": 5.172413793103449e-05,
"loss": 0.1783,
"step": 15
},
{
"epoch": 0.05565217391304348,
"grad_norm": 0.15367624679456215,
"learning_rate": 5.517241379310345e-05,
"loss": 0.1457,
"step": 16
},
{
"epoch": 0.059130434782608696,
"grad_norm": 0.16131163703415627,
"learning_rate": 5.862068965517241e-05,
"loss": 0.1741,
"step": 17
},
{
"epoch": 0.06260869565217392,
"grad_norm": 0.1513439967052575,
"learning_rate": 6.206896551724138e-05,
"loss": 0.1567,
"step": 18
},
{
"epoch": 0.06608695652173913,
"grad_norm": 0.11397034244477378,
"learning_rate": 6.551724137931034e-05,
"loss": 0.1448,
"step": 19
},
{
"epoch": 0.06956521739130435,
"grad_norm": 0.18890710907597627,
"learning_rate": 6.896551724137931e-05,
"loss": 0.1576,
"step": 20
},
{
"epoch": 0.07304347826086957,
"grad_norm": 0.17148715059837027,
"learning_rate": 7.241379310344828e-05,
"loss": 0.1531,
"step": 21
},
{
"epoch": 0.07652173913043478,
"grad_norm": 0.15845773761518642,
"learning_rate": 7.586206896551724e-05,
"loss": 0.1795,
"step": 22
},
{
"epoch": 0.08,
"grad_norm": 0.16432205778499775,
"learning_rate": 7.931034482758621e-05,
"loss": 0.1455,
"step": 23
},
{
"epoch": 0.08347826086956522,
"grad_norm": 0.18507516537799124,
"learning_rate": 8.275862068965517e-05,
"loss": 0.1792,
"step": 24
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.1489906198108428,
"learning_rate": 8.620689655172413e-05,
"loss": 0.1575,
"step": 25
},
{
"epoch": 0.09043478260869565,
"grad_norm": 0.19257597111889158,
"learning_rate": 8.96551724137931e-05,
"loss": 0.1977,
"step": 26
},
{
"epoch": 0.09391304347826086,
"grad_norm": 0.15869513580726594,
"learning_rate": 9.310344827586207e-05,
"loss": 0.1491,
"step": 27
},
{
"epoch": 0.09739130434782609,
"grad_norm": 0.23763138206897608,
"learning_rate": 9.655172413793105e-05,
"loss": 0.2305,
"step": 28
},
{
"epoch": 0.10086956521739131,
"grad_norm": 0.19313130092481448,
"learning_rate": 0.0001,
"loss": 0.1991,
"step": 29
},
{
"epoch": 0.10434782608695652,
"grad_norm": 0.15957163254805692,
"learning_rate": 0.00010344827586206898,
"loss": 0.1494,
"step": 30
},
{
"epoch": 0.10782608695652174,
"grad_norm": 0.15175494387195537,
"learning_rate": 0.00010689655172413792,
"loss": 0.1539,
"step": 31
},
{
"epoch": 0.11130434782608696,
"grad_norm": 0.1577067484050021,
"learning_rate": 0.0001103448275862069,
"loss": 0.1481,
"step": 32
},
{
"epoch": 0.11478260869565217,
"grad_norm": 0.09295501667856695,
"learning_rate": 0.00011379310344827588,
"loss": 0.1018,
"step": 33
},
{
"epoch": 0.11826086956521739,
"grad_norm": 0.13149067291539926,
"learning_rate": 0.00011724137931034482,
"loss": 0.1176,
"step": 34
},
{
"epoch": 0.12173913043478261,
"grad_norm": 0.15815867098069847,
"learning_rate": 0.0001206896551724138,
"loss": 0.1315,
"step": 35
},
{
"epoch": 0.12521739130434784,
"grad_norm": 0.1228801998135233,
"learning_rate": 0.00012413793103448277,
"loss": 0.1226,
"step": 36
},
{
"epoch": 0.12869565217391304,
"grad_norm": 0.14615808183921733,
"learning_rate": 0.00012758620689655174,
"loss": 0.1351,
"step": 37
},
{
"epoch": 0.13217391304347825,
"grad_norm": 0.13959696283916806,
"learning_rate": 0.00013103448275862068,
"loss": 0.1265,
"step": 38
},
{
"epoch": 0.1356521739130435,
"grad_norm": 0.1674438071444559,
"learning_rate": 0.00013448275862068965,
"loss": 0.1763,
"step": 39
},
{
"epoch": 0.1391304347826087,
"grad_norm": 0.14248711889055726,
"learning_rate": 0.00013793103448275863,
"loss": 0.1273,
"step": 40
},
{
"epoch": 0.1426086956521739,
"grad_norm": 0.12483278168498144,
"learning_rate": 0.0001413793103448276,
"loss": 0.1158,
"step": 41
},
{
"epoch": 0.14608695652173914,
"grad_norm": 0.12252417486446492,
"learning_rate": 0.00014482758620689657,
"loss": 0.0978,
"step": 42
},
{
"epoch": 0.14956521739130435,
"grad_norm": 0.1379518468653693,
"learning_rate": 0.00014827586206896554,
"loss": 0.1265,
"step": 43
},
{
"epoch": 0.15304347826086956,
"grad_norm": 0.1523565561366162,
"learning_rate": 0.00015172413793103449,
"loss": 0.1823,
"step": 44
},
{
"epoch": 0.1565217391304348,
"grad_norm": 0.1801898533175253,
"learning_rate": 0.00015517241379310346,
"loss": 0.1999,
"step": 45
},
{
"epoch": 0.16,
"grad_norm": 0.13012748020707876,
"learning_rate": 0.00015862068965517243,
"loss": 0.1409,
"step": 46
},
{
"epoch": 0.1634782608695652,
"grad_norm": 0.1413893808116691,
"learning_rate": 0.00016206896551724137,
"loss": 0.1262,
"step": 47
},
{
"epoch": 0.16695652173913045,
"grad_norm": 0.16233434268275468,
"learning_rate": 0.00016551724137931035,
"loss": 0.1467,
"step": 48
},
{
"epoch": 0.17043478260869566,
"grad_norm": 0.15079503853002107,
"learning_rate": 0.00016896551724137932,
"loss": 0.1058,
"step": 49
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.15412291289995766,
"learning_rate": 0.00017241379310344826,
"loss": 0.168,
"step": 50
},
{
"epoch": 0.17739130434782607,
"grad_norm": 0.1722020517750421,
"learning_rate": 0.00017586206896551723,
"loss": 0.1183,
"step": 51
},
{
"epoch": 0.1808695652173913,
"grad_norm": 0.10905711916480021,
"learning_rate": 0.0001793103448275862,
"loss": 0.1093,
"step": 52
},
{
"epoch": 0.18434782608695652,
"grad_norm": 0.16963364557672264,
"learning_rate": 0.00018275862068965518,
"loss": 0.1557,
"step": 53
},
{
"epoch": 0.18782608695652173,
"grad_norm": 0.15154120729033607,
"learning_rate": 0.00018620689655172415,
"loss": 0.1594,
"step": 54
},
{
"epoch": 0.19130434782608696,
"grad_norm": 0.13757866713331232,
"learning_rate": 0.00018965517241379312,
"loss": 0.1407,
"step": 55
},
{
"epoch": 0.19478260869565217,
"grad_norm": 0.08797746875562075,
"learning_rate": 0.0001931034482758621,
"loss": 0.0941,
"step": 56
},
{
"epoch": 0.19826086956521738,
"grad_norm": 0.18086221573643768,
"learning_rate": 0.00019655172413793104,
"loss": 0.1781,
"step": 57
},
{
"epoch": 0.20173913043478262,
"grad_norm": 0.17700454857957337,
"learning_rate": 0.0002,
"loss": 0.1879,
"step": 58
},
{
"epoch": 0.20521739130434782,
"grad_norm": 0.1558083475840659,
"learning_rate": 0.00019999814660065618,
"loss": 0.1831,
"step": 59
},
{
"epoch": 0.20869565217391303,
"grad_norm": 0.1032213761254349,
"learning_rate": 0.00019999258647132646,
"loss": 0.1188,
"step": 60
},
{
"epoch": 0.21217391304347827,
"grad_norm": 0.14893393244118194,
"learning_rate": 0.00019998331981811366,
"loss": 0.1554,
"step": 61
},
{
"epoch": 0.21565217391304348,
"grad_norm": 0.14353596472572114,
"learning_rate": 0.00019997034698451395,
"loss": 0.1807,
"step": 62
},
{
"epoch": 0.21913043478260869,
"grad_norm": 0.1051492618618541,
"learning_rate": 0.00019995366845140415,
"loss": 0.1278,
"step": 63
},
{
"epoch": 0.22260869565217392,
"grad_norm": 0.15519178380797527,
"learning_rate": 0.00019993328483702393,
"loss": 0.1718,
"step": 64
},
{
"epoch": 0.22608695652173913,
"grad_norm": 0.16979535445201727,
"learning_rate": 0.00019990919689695286,
"loss": 0.1759,
"step": 65
},
{
"epoch": 0.22956521739130434,
"grad_norm": 0.19955078650794816,
"learning_rate": 0.0001998814055240823,
"loss": 0.1659,
"step": 66
},
{
"epoch": 0.23304347826086957,
"grad_norm": 0.21069141049146595,
"learning_rate": 0.00019984991174858257,
"loss": 0.1591,
"step": 67
},
{
"epoch": 0.23652173913043478,
"grad_norm": 0.10858740428706376,
"learning_rate": 0.00019981471673786452,
"loss": 0.1143,
"step": 68
},
{
"epoch": 0.24,
"grad_norm": 0.12877038648097636,
"learning_rate": 0.00019977582179653633,
"loss": 0.113,
"step": 69
},
{
"epoch": 0.24347826086956523,
"grad_norm": 0.15092333453545853,
"learning_rate": 0.00019973322836635518,
"loss": 0.183,
"step": 70
},
{
"epoch": 0.24695652173913044,
"grad_norm": 0.12997966260226232,
"learning_rate": 0.00019968693802617374,
"loss": 0.144,
"step": 71
},
{
"epoch": 0.25043478260869567,
"grad_norm": 0.12761141406209162,
"learning_rate": 0.00019963695249188183,
"loss": 0.1292,
"step": 72
},
{
"epoch": 0.2539130434782609,
"grad_norm": 0.16597376098252953,
"learning_rate": 0.00019958327361634248,
"loss": 0.1645,
"step": 73
},
{
"epoch": 0.2573913043478261,
"grad_norm": 0.10098015772720864,
"learning_rate": 0.00019952590338932356,
"loss": 0.1067,
"step": 74
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.15925018221087978,
"learning_rate": 0.00019946484393742399,
"loss": 0.1554,
"step": 75
},
{
"epoch": 0.2643478260869565,
"grad_norm": 0.1532302933171606,
"learning_rate": 0.0001994000975239946,
"loss": 0.1817,
"step": 76
},
{
"epoch": 0.2678260869565217,
"grad_norm": 0.15154786378403498,
"learning_rate": 0.00019933166654905466,
"loss": 0.1467,
"step": 77
},
{
"epoch": 0.271304347826087,
"grad_norm": 0.15690138906152937,
"learning_rate": 0.00019925955354920265,
"loss": 0.1373,
"step": 78
},
{
"epoch": 0.2747826086956522,
"grad_norm": 0.1859438689490505,
"learning_rate": 0.0001991837611975223,
"loss": 0.1932,
"step": 79
},
{
"epoch": 0.2782608695652174,
"grad_norm": 0.14861843675913228,
"learning_rate": 0.00019910429230348347,
"loss": 0.1675,
"step": 80
},
{
"epoch": 0.2817391304347826,
"grad_norm": 0.14218774514095903,
"learning_rate": 0.00019902114981283812,
"loss": 0.1283,
"step": 81
},
{
"epoch": 0.2852173913043478,
"grad_norm": 0.15988803314683084,
"learning_rate": 0.00019893433680751103,
"loss": 0.1336,
"step": 82
},
{
"epoch": 0.288695652173913,
"grad_norm": 0.15975061567872123,
"learning_rate": 0.0001988438565054855,
"loss": 0.1676,
"step": 83
},
{
"epoch": 0.2921739130434783,
"grad_norm": 0.0903484060539206,
"learning_rate": 0.00019874971226068415,
"loss": 0.0909,
"step": 84
},
{
"epoch": 0.2956521739130435,
"grad_norm": 0.12570120193815287,
"learning_rate": 0.00019865190756284467,
"loss": 0.1333,
"step": 85
},
{
"epoch": 0.2991304347826087,
"grad_norm": 0.12595056424947598,
"learning_rate": 0.0001985504460373903,
"loss": 0.1092,
"step": 86
},
{
"epoch": 0.3026086956521739,
"grad_norm": 0.13479356357232541,
"learning_rate": 0.0001984453314452955,
"loss": 0.1478,
"step": 87
},
{
"epoch": 0.3060869565217391,
"grad_norm": 0.13307683198992498,
"learning_rate": 0.00019833656768294662,
"loss": 0.146,
"step": 88
},
{
"epoch": 0.3095652173913043,
"grad_norm": 0.14686125301552883,
"learning_rate": 0.0001982241587819974,
"loss": 0.1285,
"step": 89
},
{
"epoch": 0.3130434782608696,
"grad_norm": 0.12720833595582368,
"learning_rate": 0.00019810810890921943,
"loss": 0.1437,
"step": 90
},
{
"epoch": 0.3165217391304348,
"grad_norm": 0.13968930311918126,
"learning_rate": 0.00019798842236634797,
"loss": 0.1291,
"step": 91
},
{
"epoch": 0.32,
"grad_norm": 0.16133982393912974,
"learning_rate": 0.00019786510358992213,
"loss": 0.2008,
"step": 92
},
{
"epoch": 0.3234782608695652,
"grad_norm": 0.1266301495042648,
"learning_rate": 0.00019773815715112074,
"loss": 0.1372,
"step": 93
},
{
"epoch": 0.3269565217391304,
"grad_norm": 0.12427333520991247,
"learning_rate": 0.00019760758775559274,
"loss": 0.1432,
"step": 94
},
{
"epoch": 0.33043478260869563,
"grad_norm": 0.13028439018276217,
"learning_rate": 0.0001974734002432827,
"loss": 0.1354,
"step": 95
},
{
"epoch": 0.3339130434782609,
"grad_norm": 0.13268075146491365,
"learning_rate": 0.00019733559958825167,
"loss": 0.1189,
"step": 96
},
{
"epoch": 0.3373913043478261,
"grad_norm": 0.2048660606818272,
"learning_rate": 0.00019719419089849247,
"loss": 0.1566,
"step": 97
},
{
"epoch": 0.3408695652173913,
"grad_norm": 0.11124284248033606,
"learning_rate": 0.00019704917941574051,
"loss": 0.1299,
"step": 98
},
{
"epoch": 0.3443478260869565,
"grad_norm": 0.1415128364022893,
"learning_rate": 0.00019690057051527965,
"loss": 0.1396,
"step": 99
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.10665529705089029,
"learning_rate": 0.00019674836970574254,
"loss": 0.1314,
"step": 100
},
{
"epoch": 0.35130434782608694,
"grad_norm": 0.14169554362167064,
"learning_rate": 0.00019659258262890683,
"loss": 0.1281,
"step": 101
},
{
"epoch": 0.35478260869565215,
"grad_norm": 0.16648182361835823,
"learning_rate": 0.00019643321505948585,
"loss": 0.1511,
"step": 102
},
{
"epoch": 0.3582608695652174,
"grad_norm": 0.15512935363008726,
"learning_rate": 0.00019627027290491458,
"loss": 0.1362,
"step": 103
},
{
"epoch": 0.3617391304347826,
"grad_norm": 0.14829391492240007,
"learning_rate": 0.00019610376220513068,
"loss": 0.16,
"step": 104
},
{
"epoch": 0.3652173913043478,
"grad_norm": 0.1721382097621375,
"learning_rate": 0.00019593368913235052,
"loss": 0.1927,
"step": 105
},
{
"epoch": 0.36869565217391304,
"grad_norm": 0.1073039991014123,
"learning_rate": 0.0001957600599908406,
"loss": 0.1077,
"step": 106
},
{
"epoch": 0.37217391304347824,
"grad_norm": 0.1765959958499992,
"learning_rate": 0.00019558288121668363,
"loss": 0.1679,
"step": 107
},
{
"epoch": 0.37565217391304345,
"grad_norm": 0.13247232361226763,
"learning_rate": 0.00019540215937754007,
"loss": 0.1201,
"step": 108
},
{
"epoch": 0.3791304347826087,
"grad_norm": 0.13402863250728775,
"learning_rate": 0.0001952179011724047,
"loss": 0.1331,
"step": 109
},
{
"epoch": 0.3826086956521739,
"grad_norm": 0.15379139900705738,
"learning_rate": 0.00019503011343135825,
"loss": 0.1507,
"step": 110
},
{
"epoch": 0.38608695652173913,
"grad_norm": 0.12569941197730944,
"learning_rate": 0.00019483880311531424,
"loss": 0.1245,
"step": 111
},
{
"epoch": 0.38956521739130434,
"grad_norm": 0.13176534371798201,
"learning_rate": 0.00019464397731576094,
"loss": 0.1346,
"step": 112
},
{
"epoch": 0.39304347826086955,
"grad_norm": 0.1308496741778078,
"learning_rate": 0.00019444564325449853,
"loss": 0.1528,
"step": 113
},
{
"epoch": 0.39652173913043476,
"grad_norm": 0.11662685828907265,
"learning_rate": 0.00019424380828337144,
"loss": 0.1042,
"step": 114
},
{
"epoch": 0.4,
"grad_norm": 0.15311025163121064,
"learning_rate": 0.0001940384798839957,
"loss": 0.124,
"step": 115
},
{
"epoch": 0.40347826086956523,
"grad_norm": 0.14271720010282954,
"learning_rate": 0.00019382966566748168,
"loss": 0.1385,
"step": 116
},
{
"epoch": 0.40695652173913044,
"grad_norm": 0.21076081706460564,
"learning_rate": 0.00019361737337415206,
"loss": 0.2177,
"step": 117
},
{
"epoch": 0.41043478260869565,
"grad_norm": 0.1326954013355056,
"learning_rate": 0.0001934016108732548,
"loss": 0.1491,
"step": 118
},
{
"epoch": 0.41391304347826086,
"grad_norm": 0.10972822431140547,
"learning_rate": 0.00019318238616267141,
"loss": 0.1135,
"step": 119
},
{
"epoch": 0.41739130434782606,
"grad_norm": 0.11664553001228962,
"learning_rate": 0.00019295970736862064,
"loss": 0.1335,
"step": 120
},
{
"epoch": 0.42086956521739133,
"grad_norm": 0.12037673410124465,
"learning_rate": 0.00019273358274535704,
"loss": 0.0989,
"step": 121
},
{
"epoch": 0.42434782608695654,
"grad_norm": 0.13278062849114713,
"learning_rate": 0.00019250402067486522,
"loss": 0.1328,
"step": 122
},
{
"epoch": 0.42782608695652175,
"grad_norm": 0.13381559738712595,
"learning_rate": 0.00019227102966654896,
"loss": 0.1296,
"step": 123
},
{
"epoch": 0.43130434782608695,
"grad_norm": 0.1646662488521753,
"learning_rate": 0.00019203461835691594,
"loss": 0.1581,
"step": 124
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.15934887298251812,
"learning_rate": 0.00019179479550925747,
"loss": 0.1627,
"step": 125
},
{
"epoch": 0.43826086956521737,
"grad_norm": 0.1410826901549644,
"learning_rate": 0.00019155157001332374,
"loss": 0.1789,
"step": 126
},
{
"epoch": 0.44173913043478263,
"grad_norm": 0.16699816673214457,
"learning_rate": 0.0001913049508849942,
"loss": 0.1608,
"step": 127
},
{
"epoch": 0.44521739130434784,
"grad_norm": 0.11736817608666682,
"learning_rate": 0.00019105494726594344,
"loss": 0.1387,
"step": 128
},
{
"epoch": 0.44869565217391305,
"grad_norm": 0.13490354839004873,
"learning_rate": 0.00019080156842330242,
"loss": 0.1355,
"step": 129
},
{
"epoch": 0.45217391304347826,
"grad_norm": 0.166052611822799,
"learning_rate": 0.00019054482374931467,
"loss": 0.1628,
"step": 130
},
{
"epoch": 0.45565217391304347,
"grad_norm": 0.10962794054522577,
"learning_rate": 0.00019028472276098844,
"loss": 0.1109,
"step": 131
},
{
"epoch": 0.4591304347826087,
"grad_norm": 0.10757925577294936,
"learning_rate": 0.00019002127509974376,
"loss": 0.1124,
"step": 132
},
{
"epoch": 0.46260869565217394,
"grad_norm": 0.14061789137211347,
"learning_rate": 0.00018975449053105505,
"loss": 0.1445,
"step": 133
},
{
"epoch": 0.46608695652173915,
"grad_norm": 0.1096963245848753,
"learning_rate": 0.00018948437894408918,
"loss": 0.1265,
"step": 134
},
{
"epoch": 0.46956521739130436,
"grad_norm": 0.12314690150275322,
"learning_rate": 0.00018921095035133898,
"loss": 0.1202,
"step": 135
},
{
"epoch": 0.47304347826086957,
"grad_norm": 0.1779920573282376,
"learning_rate": 0.0001889342148882519,
"loss": 0.1997,
"step": 136
},
{
"epoch": 0.4765217391304348,
"grad_norm": 0.13319522745287313,
"learning_rate": 0.00018865418281285444,
"loss": 0.1402,
"step": 137
},
{
"epoch": 0.48,
"grad_norm": 0.12083080356885761,
"learning_rate": 0.00018837086450537193,
"loss": 0.1238,
"step": 138
},
{
"epoch": 0.4834782608695652,
"grad_norm": 0.1582932839712108,
"learning_rate": 0.00018808427046784366,
"loss": 0.1499,
"step": 139
},
{
"epoch": 0.48695652173913045,
"grad_norm": 0.14876994205070418,
"learning_rate": 0.00018779441132373362,
"loss": 0.1557,
"step": 140
},
{
"epoch": 0.49043478260869566,
"grad_norm": 0.17699025587530975,
"learning_rate": 0.0001875012978175368,
"loss": 0.1967,
"step": 141
},
{
"epoch": 0.49391304347826087,
"grad_norm": 0.14037478538346934,
"learning_rate": 0.00018720494081438078,
"loss": 0.1596,
"step": 142
},
{
"epoch": 0.4973913043478261,
"grad_norm": 0.11128336848068965,
"learning_rate": 0.00018690535129962306,
"loss": 0.1013,
"step": 143
},
{
"epoch": 0.5008695652173913,
"grad_norm": 0.15354451724868373,
"learning_rate": 0.00018660254037844388,
"loss": 0.1812,
"step": 144
},
{
"epoch": 0.5043478260869565,
"grad_norm": 0.17621002427736646,
"learning_rate": 0.00018629651927543447,
"loss": 0.22,
"step": 145
},
{
"epoch": 0.5078260869565218,
"grad_norm": 0.11412894846283952,
"learning_rate": 0.000185987299334181,
"loss": 0.1277,
"step": 146
},
{
"epoch": 0.5113043478260869,
"grad_norm": 0.10330685267150483,
"learning_rate": 0.0001856748920168443,
"loss": 0.1149,
"step": 147
},
{
"epoch": 0.5147826086956522,
"grad_norm": 0.16038774046228474,
"learning_rate": 0.00018535930890373466,
"loss": 0.1614,
"step": 148
},
{
"epoch": 0.5182608695652174,
"grad_norm": 0.12341631086149,
"learning_rate": 0.00018504056169288275,
"loss": 0.1243,
"step": 149
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.14222035267405325,
"learning_rate": 0.00018471866219960602,
"loss": 0.1591,
"step": 150
},
{
"epoch": 0.5252173913043479,
"grad_norm": 0.15381954436682013,
"learning_rate": 0.0001843936223560707,
"loss": 0.1411,
"step": 151
},
{
"epoch": 0.528695652173913,
"grad_norm": 0.16749949682456056,
"learning_rate": 0.0001840654542108494,
"loss": 0.173,
"step": 152
},
{
"epoch": 0.5321739130434783,
"grad_norm": 0.16138212597769477,
"learning_rate": 0.0001837341699284746,
"loss": 0.1378,
"step": 153
},
{
"epoch": 0.5356521739130434,
"grad_norm": 0.11820972909841256,
"learning_rate": 0.0001833997817889878,
"loss": 0.1415,
"step": 154
},
{
"epoch": 0.5391304347826087,
"grad_norm": 0.1732254350869074,
"learning_rate": 0.00018306230218748413,
"loss": 0.1565,
"step": 155
},
{
"epoch": 0.542608695652174,
"grad_norm": 0.12134029048709205,
"learning_rate": 0.000182721743633653,
"loss": 0.1354,
"step": 156
},
{
"epoch": 0.5460869565217391,
"grad_norm": 0.15757519533817987,
"learning_rate": 0.00018237811875131444,
"loss": 0.1783,
"step": 157
},
{
"epoch": 0.5495652173913044,
"grad_norm": 0.1389328342147638,
"learning_rate": 0.0001820314402779511,
"loss": 0.1373,
"step": 158
},
{
"epoch": 0.5530434782608695,
"grad_norm": 0.13113073991864377,
"learning_rate": 0.00018168172106423607,
"loss": 0.1272,
"step": 159
},
{
"epoch": 0.5565217391304348,
"grad_norm": 0.14093537485863689,
"learning_rate": 0.00018132897407355657,
"loss": 0.1364,
"step": 160
},
{
"epoch": 0.56,
"grad_norm": 0.1407116914405213,
"learning_rate": 0.00018097321238153338,
"loss": 0.1329,
"step": 161
},
{
"epoch": 0.5634782608695652,
"grad_norm": 0.14535376492750982,
"learning_rate": 0.00018061444917553629,
"loss": 0.1692,
"step": 162
},
{
"epoch": 0.5669565217391305,
"grad_norm": 0.14031883322639,
"learning_rate": 0.00018025269775419507,
"loss": 0.1356,
"step": 163
},
{
"epoch": 0.5704347826086956,
"grad_norm": 0.1551541472991319,
"learning_rate": 0.00017988797152690671,
"loss": 0.148,
"step": 164
},
{
"epoch": 0.5739130434782609,
"grad_norm": 0.16740550198996068,
"learning_rate": 0.00017952028401333817,
"loss": 0.1643,
"step": 165
},
{
"epoch": 0.577391304347826,
"grad_norm": 0.11979937989365573,
"learning_rate": 0.00017914964884292544,
"loss": 0.1282,
"step": 166
},
{
"epoch": 0.5808695652173913,
"grad_norm": 0.11342656946095574,
"learning_rate": 0.00017877607975436805,
"loss": 0.1192,
"step": 167
},
{
"epoch": 0.5843478260869566,
"grad_norm": 0.12812233079916055,
"learning_rate": 0.00017839959059512016,
"loss": 0.1513,
"step": 168
},
{
"epoch": 0.5878260869565217,
"grad_norm": 0.12442713946144991,
"learning_rate": 0.00017802019532087694,
"loss": 0.1456,
"step": 169
},
{
"epoch": 0.591304347826087,
"grad_norm": 0.13585627394105457,
"learning_rate": 0.00017763790799505747,
"loss": 0.155,
"step": 170
},
{
"epoch": 0.5947826086956521,
"grad_norm": 0.10995274239294903,
"learning_rate": 0.00017725274278828325,
"loss": 0.1008,
"step": 171
},
{
"epoch": 0.5982608695652174,
"grad_norm": 0.13574783390341455,
"learning_rate": 0.0001768647139778532,
"loss": 0.1766,
"step": 172
},
{
"epoch": 0.6017391304347826,
"grad_norm": 0.12560446559496083,
"learning_rate": 0.00017647383594721416,
"loss": 0.1378,
"step": 173
},
{
"epoch": 0.6052173913043478,
"grad_norm": 0.24726328454376442,
"learning_rate": 0.0001760801231854278,
"loss": 0.2,
"step": 174
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.1300492912908485,
"learning_rate": 0.00017568359028663364,
"loss": 0.1353,
"step": 175
},
{
"epoch": 0.6121739130434782,
"grad_norm": 0.12024702168048951,
"learning_rate": 0.00017528425194950794,
"loss": 0.1346,
"step": 176
},
{
"epoch": 0.6156521739130435,
"grad_norm": 0.13400618019089086,
"learning_rate": 0.000174882122976719,
"loss": 0.147,
"step": 177
},
{
"epoch": 0.6191304347826087,
"grad_norm": 0.10665251622268654,
"learning_rate": 0.0001744772182743782,
"loss": 0.1269,
"step": 178
},
{
"epoch": 0.6226086956521739,
"grad_norm": 0.12190300959390951,
"learning_rate": 0.00017406955285148782,
"loss": 0.1263,
"step": 179
},
{
"epoch": 0.6260869565217392,
"grad_norm": 0.08623960123094311,
"learning_rate": 0.0001736591418193844,
"loss": 0.1075,
"step": 180
},
{
"epoch": 0.6295652173913043,
"grad_norm": 0.15899695178173323,
"learning_rate": 0.00017324600039117863,
"loss": 0.1335,
"step": 181
},
{
"epoch": 0.6330434782608696,
"grad_norm": 0.12405567103892874,
"learning_rate": 0.00017283014388119159,
"loss": 0.1261,
"step": 182
},
{
"epoch": 0.6365217391304347,
"grad_norm": 0.12227415658908525,
"learning_rate": 0.000172411587704387,
"loss": 0.1394,
"step": 183
},
{
"epoch": 0.64,
"grad_norm": 0.10299259784769293,
"learning_rate": 0.0001719903473757996,
"loss": 0.1179,
"step": 184
},
{
"epoch": 0.6434782608695652,
"grad_norm": 0.18072288336432377,
"learning_rate": 0.00017156643850996047,
"loss": 0.1678,
"step": 185
},
{
"epoch": 0.6469565217391304,
"grad_norm": 0.13931470098249313,
"learning_rate": 0.0001711398768203178,
"loss": 0.1468,
"step": 186
},
{
"epoch": 0.6504347826086957,
"grad_norm": 0.142891653601056,
"learning_rate": 0.00017071067811865476,
"loss": 0.1699,
"step": 187
},
{
"epoch": 0.6539130434782608,
"grad_norm": 0.1543203031358245,
"learning_rate": 0.00017027885831450318,
"loss": 0.163,
"step": 188
},
{
"epoch": 0.6573913043478261,
"grad_norm": 0.08881257657108957,
"learning_rate": 0.0001698444334145539,
"loss": 0.0956,
"step": 189
},
{
"epoch": 0.6608695652173913,
"grad_norm": 0.1437015724786564,
"learning_rate": 0.0001694074195220634,
"loss": 0.1531,
"step": 190
},
{
"epoch": 0.6643478260869565,
"grad_norm": 0.15239548568770145,
"learning_rate": 0.0001689678328362569,
"loss": 0.1583,
"step": 191
},
{
"epoch": 0.6678260869565218,
"grad_norm": 0.12999990256807817,
"learning_rate": 0.00016852568965172791,
"loss": 0.1241,
"step": 192
},
{
"epoch": 0.671304347826087,
"grad_norm": 0.16058602233359284,
"learning_rate": 0.00016808100635783423,
"loss": 0.1901,
"step": 193
},
{
"epoch": 0.6747826086956522,
"grad_norm": 0.09752013699351626,
"learning_rate": 0.00016763379943809028,
"loss": 0.1104,
"step": 194
},
{
"epoch": 0.6782608695652174,
"grad_norm": 0.1171558354901818,
"learning_rate": 0.00016718408546955636,
"loss": 0.1393,
"step": 195
},
{
"epoch": 0.6817391304347826,
"grad_norm": 0.12541030208785753,
"learning_rate": 0.00016673188112222394,
"loss": 0.1339,
"step": 196
},
{
"epoch": 0.6852173913043478,
"grad_norm": 0.16378504667963803,
"learning_rate": 0.00016627720315839784,
"loss": 0.1896,
"step": 197
},
{
"epoch": 0.688695652173913,
"grad_norm": 0.1254436356043883,
"learning_rate": 0.0001658200684320748,
"loss": 0.155,
"step": 198
},
{
"epoch": 0.6921739130434783,
"grad_norm": 0.10926424609512125,
"learning_rate": 0.00016536049388831894,
"loss": 0.1333,
"step": 199
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.12166335086653808,
"learning_rate": 0.00016489849656263337,
"loss": 0.1307,
"step": 200
},
{
"epoch": 0.6991304347826087,
"grad_norm": 0.09726778569787221,
"learning_rate": 0.00016443409358032887,
"loss": 0.1093,
"step": 201
},
{
"epoch": 0.7026086956521739,
"grad_norm": 0.18623972301385774,
"learning_rate": 0.00016396730215588915,
"loss": 0.1329,
"step": 202
},
{
"epoch": 0.7060869565217391,
"grad_norm": 0.1036420764487769,
"learning_rate": 0.00016349813959233255,
"loss": 0.1066,
"step": 203
},
{
"epoch": 0.7095652173913043,
"grad_norm": 0.15859483282291995,
"learning_rate": 0.00016302662328057088,
"loss": 0.1236,
"step": 204
},
{
"epoch": 0.7130434782608696,
"grad_norm": 0.1352010399451213,
"learning_rate": 0.00016255277069876454,
"loss": 0.1556,
"step": 205
},
{
"epoch": 0.7165217391304348,
"grad_norm": 0.0847816136200446,
"learning_rate": 0.00016207659941167485,
"loss": 0.1033,
"step": 206
},
{
"epoch": 0.72,
"grad_norm": 0.13868944339810388,
"learning_rate": 0.00016159812707001282,
"loss": 0.1583,
"step": 207
},
{
"epoch": 0.7234782608695652,
"grad_norm": 0.11403894766591344,
"learning_rate": 0.00016111737140978494,
"loss": 0.1193,
"step": 208
},
{
"epoch": 0.7269565217391304,
"grad_norm": 0.11921529189670015,
"learning_rate": 0.00016063435025163569,
"loss": 0.1272,
"step": 209
},
{
"epoch": 0.7304347826086957,
"grad_norm": 0.16113792796352755,
"learning_rate": 0.00016014908150018703,
"loss": 0.1972,
"step": 210
},
{
"epoch": 0.7339130434782609,
"grad_norm": 0.12349845734675136,
"learning_rate": 0.00015966158314337472,
"loss": 0.1462,
"step": 211
},
{
"epoch": 0.7373913043478261,
"grad_norm": 0.1502644739489071,
"learning_rate": 0.00015917187325178138,
"loss": 0.1626,
"step": 212
},
{
"epoch": 0.7408695652173913,
"grad_norm": 0.14447398546355603,
"learning_rate": 0.00015867996997796685,
"loss": 0.1653,
"step": 213
},
{
"epoch": 0.7443478260869565,
"grad_norm": 0.13747896173823398,
"learning_rate": 0.0001581858915557953,
"loss": 0.1436,
"step": 214
},
{
"epoch": 0.7478260869565218,
"grad_norm": 0.14978167508747187,
"learning_rate": 0.00015768965629975914,
"loss": 0.146,
"step": 215
},
{
"epoch": 0.7513043478260869,
"grad_norm": 0.10530370902507546,
"learning_rate": 0.0001571912826043003,
"loss": 0.1067,
"step": 216
},
{
"epoch": 0.7547826086956522,
"grad_norm": 0.15065236331393017,
"learning_rate": 0.00015669078894312848,
"loss": 0.1278,
"step": 217
},
{
"epoch": 0.7582608695652174,
"grad_norm": 0.13038147931466645,
"learning_rate": 0.00015618819386853606,
"loss": 0.1363,
"step": 218
},
{
"epoch": 0.7617391304347826,
"grad_norm": 0.12241560985671367,
"learning_rate": 0.0001556835160107107,
"loss": 0.1381,
"step": 219
},
{
"epoch": 0.7652173913043478,
"grad_norm": 0.1032079433563102,
"learning_rate": 0.0001551767740770446,
"loss": 0.1329,
"step": 220
},
{
"epoch": 0.768695652173913,
"grad_norm": 0.10420850780658172,
"learning_rate": 0.00015466798685144113,
"loss": 0.108,
"step": 221
},
{
"epoch": 0.7721739130434783,
"grad_norm": 0.12440213702363168,
"learning_rate": 0.00015415717319361847,
"loss": 0.1378,
"step": 222
},
{
"epoch": 0.7756521739130435,
"grad_norm": 0.1441063665454779,
"learning_rate": 0.00015364435203841058,
"loss": 0.1546,
"step": 223
},
{
"epoch": 0.7791304347826087,
"grad_norm": 0.10283016985275265,
"learning_rate": 0.00015312954239506533,
"loss": 0.1398,
"step": 224
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.11879627421875508,
"learning_rate": 0.0001526127633465398,
"loss": 0.1394,
"step": 225
},
{
"epoch": 0.7860869565217391,
"grad_norm": 0.1340444040194527,
"learning_rate": 0.00015209403404879303,
"loss": 0.1371,
"step": 226
},
{
"epoch": 0.7895652173913044,
"grad_norm": 0.15078724481486633,
"learning_rate": 0.00015157337373007578,
"loss": 0.1626,
"step": 227
},
{
"epoch": 0.7930434782608695,
"grad_norm": 0.14991040307874806,
"learning_rate": 0.0001510508016902179,
"loss": 0.1563,
"step": 228
},
{
"epoch": 0.7965217391304348,
"grad_norm": 0.11713195212511589,
"learning_rate": 0.00015052633729991294,
"loss": 0.1372,
"step": 229
},
{
"epoch": 0.8,
"grad_norm": 0.10665559275288661,
"learning_rate": 0.00015000000000000001,
"loss": 0.1174,
"step": 230
},
{
"epoch": 0.8034782608695652,
"grad_norm": 0.15701030356110557,
"learning_rate": 0.00014947180930074326,
"loss": 0.1575,
"step": 231
},
{
"epoch": 0.8069565217391305,
"grad_norm": 0.11847918443040721,
"learning_rate": 0.00014894178478110857,
"loss": 0.1203,
"step": 232
},
{
"epoch": 0.8104347826086956,
"grad_norm": 0.1285162400608025,
"learning_rate": 0.0001484099460880379,
"loss": 0.133,
"step": 233
},
{
"epoch": 0.8139130434782609,
"grad_norm": 0.1512166257756219,
"learning_rate": 0.00014787631293572092,
"loss": 0.1584,
"step": 234
},
{
"epoch": 0.8173913043478261,
"grad_norm": 0.1584657384276377,
"learning_rate": 0.00014734090510486433,
"loss": 0.176,
"step": 235
},
{
"epoch": 0.8208695652173913,
"grad_norm": 0.10354148249587801,
"learning_rate": 0.0001468037424419586,
"loss": 0.1288,
"step": 236
},
{
"epoch": 0.8243478260869566,
"grad_norm": 0.11214117311491091,
"learning_rate": 0.0001462648448585423,
"loss": 0.1221,
"step": 237
},
{
"epoch": 0.8278260869565217,
"grad_norm": 0.14772445459512365,
"learning_rate": 0.00014572423233046386,
"loss": 0.1329,
"step": 238
},
{
"epoch": 0.831304347826087,
"grad_norm": 0.14615479240284515,
"learning_rate": 0.0001451819248971415,
"loss": 0.1643,
"step": 239
},
{
"epoch": 0.8347826086956521,
"grad_norm": 0.12753795686628652,
"learning_rate": 0.00014463794266081993,
"loss": 0.1557,
"step": 240
},
{
"epoch": 0.8382608695652174,
"grad_norm": 0.13887522594093168,
"learning_rate": 0.00014409230578582566,
"loss": 0.1639,
"step": 241
},
{
"epoch": 0.8417391304347827,
"grad_norm": 0.16912324583465613,
"learning_rate": 0.00014354503449781912,
"loss": 0.1688,
"step": 242
},
{
"epoch": 0.8452173913043478,
"grad_norm": 0.09449246440948272,
"learning_rate": 0.0001429961490830453,
"loss": 0.0993,
"step": 243
},
{
"epoch": 0.8486956521739131,
"grad_norm": 0.10550648117339549,
"learning_rate": 0.00014244566988758152,
"loss": 0.1356,
"step": 244
},
{
"epoch": 0.8521739130434782,
"grad_norm": 0.10969662638776663,
"learning_rate": 0.00014189361731658338,
"loss": 0.1239,
"step": 245
},
{
"epoch": 0.8556521739130435,
"grad_norm": 0.14808204518572862,
"learning_rate": 0.00014134001183352832,
"loss": 0.1579,
"step": 246
},
{
"epoch": 0.8591304347826086,
"grad_norm": 0.13859857433183218,
"learning_rate": 0.00014078487395945713,
"loss": 0.1747,
"step": 247
},
{
"epoch": 0.8626086956521739,
"grad_norm": 0.13502318508676295,
"learning_rate": 0.00014022822427221324,
"loss": 0.1558,
"step": 248
},
{
"epoch": 0.8660869565217392,
"grad_norm": 0.11993193249652914,
"learning_rate": 0.00013967008340567998,
"loss": 0.1318,
"step": 249
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.14432862128479182,
"learning_rate": 0.0001391104720490156,
"loss": 0.1718,
"step": 250
},
{
"epoch": 0.8730434782608696,
"grad_norm": 0.10960589296514184,
"learning_rate": 0.0001385494109458866,
"loss": 0.1216,
"step": 251
},
{
"epoch": 0.8765217391304347,
"grad_norm": 0.1444495982064661,
"learning_rate": 0.00013798692089369855,
"loss": 0.1511,
"step": 252
},
{
"epoch": 0.88,
"grad_norm": 0.14195714442676055,
"learning_rate": 0.00013742302274282533,
"loss": 0.164,
"step": 253
},
{
"epoch": 0.8834782608695653,
"grad_norm": 0.15939971031248268,
"learning_rate": 0.00013685773739583617,
"loss": 0.1589,
"step": 254
},
{
"epoch": 0.8869565217391304,
"grad_norm": 0.10567415705517683,
"learning_rate": 0.00013629108580672094,
"loss": 0.1006,
"step": 255
},
{
"epoch": 0.8904347826086957,
"grad_norm": 0.12878257656430525,
"learning_rate": 0.0001357230889801133,
"loss": 0.1267,
"step": 256
},
{
"epoch": 0.8939130434782608,
"grad_norm": 0.11395046485825466,
"learning_rate": 0.0001351537679705121,
"loss": 0.134,
"step": 257
},
{
"epoch": 0.8973913043478261,
"grad_norm": 0.13632342342499126,
"learning_rate": 0.00013458314388150114,
"loss": 0.1598,
"step": 258
},
{
"epoch": 0.9008695652173913,
"grad_norm": 0.16308025278021065,
"learning_rate": 0.00013401123786496664,
"loss": 0.2041,
"step": 259
},
{
"epoch": 0.9043478260869565,
"grad_norm": 0.10241355755764081,
"learning_rate": 0.00013343807112031327,
"loss": 0.1081,
"step": 260
},
{
"epoch": 0.9078260869565218,
"grad_norm": 0.1310395387251736,
"learning_rate": 0.00013286366489367846,
"loss": 0.158,
"step": 261
},
{
"epoch": 0.9113043478260869,
"grad_norm": 0.13100096116141785,
"learning_rate": 0.00013228804047714463,
"loss": 0.1607,
"step": 262
},
{
"epoch": 0.9147826086956522,
"grad_norm": 0.11969415969012737,
"learning_rate": 0.00013171121920795014,
"loss": 0.1308,
"step": 263
},
{
"epoch": 0.9182608695652174,
"grad_norm": 0.1295097570140744,
"learning_rate": 0.00013113322246769817,
"loss": 0.1502,
"step": 264
},
{
"epoch": 0.9217391304347826,
"grad_norm": 0.11814028103328439,
"learning_rate": 0.00013055407168156437,
"loss": 0.1241,
"step": 265
},
{
"epoch": 0.9252173913043479,
"grad_norm": 0.11218111509954955,
"learning_rate": 0.00012997378831750242,
"loss": 0.1381,
"step": 266
},
{
"epoch": 0.928695652173913,
"grad_norm": 0.12021997514568723,
"learning_rate": 0.00012939239388544852,
"loss": 0.1395,
"step": 267
},
{
"epoch": 0.9321739130434783,
"grad_norm": 0.12114779793419364,
"learning_rate": 0.00012880990993652377,
"loss": 0.117,
"step": 268
},
{
"epoch": 0.9356521739130435,
"grad_norm": 0.1690185626815269,
"learning_rate": 0.00012822635806223557,
"loss": 0.2055,
"step": 269
},
{
"epoch": 0.9391304347826087,
"grad_norm": 0.10540099318141671,
"learning_rate": 0.00012764175989367718,
"loss": 0.1292,
"step": 270
},
{
"epoch": 0.9426086956521739,
"grad_norm": 0.1123676795677547,
"learning_rate": 0.00012705613710072575,
"loss": 0.1401,
"step": 271
},
{
"epoch": 0.9460869565217391,
"grad_norm": 0.12163076229024251,
"learning_rate": 0.00012646951139123934,
"loss": 0.1393,
"step": 272
},
{
"epoch": 0.9495652173913044,
"grad_norm": 0.10635388207764115,
"learning_rate": 0.00012588190451025207,
"loss": 0.1192,
"step": 273
},
{
"epoch": 0.9530434782608695,
"grad_norm": 0.1324746367162532,
"learning_rate": 0.00012529333823916807,
"loss": 0.1674,
"step": 274
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.12690900530317173,
"learning_rate": 0.00012470383439495416,
"loss": 0.164,
"step": 275
},
{
"epoch": 0.96,
"grad_norm": 0.12178811089584775,
"learning_rate": 0.0001241134148293311,
"loss": 0.1472,
"step": 276
},
{
"epoch": 0.9634782608695652,
"grad_norm": 0.09558226725121408,
"learning_rate": 0.0001235221014279636,
"loss": 0.1107,
"step": 277
},
{
"epoch": 0.9669565217391304,
"grad_norm": 0.11947361537383715,
"learning_rate": 0.00012292991610964903,
"loss": 0.1454,
"step": 278
},
{
"epoch": 0.9704347826086956,
"grad_norm": 0.09245448807939725,
"learning_rate": 0.000122336880825505,
"loss": 0.1063,
"step": 279
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.12313564570662155,
"learning_rate": 0.00012174301755815571,
"loss": 0.1482,
"step": 280
},
{
"epoch": 0.9773913043478261,
"grad_norm": 0.14222809451041388,
"learning_rate": 0.00012114834832091691,
"loss": 0.1905,
"step": 281
},
{
"epoch": 0.9808695652173913,
"grad_norm": 0.10079732072591296,
"learning_rate": 0.00012055289515698007,
"loss": 0.1114,
"step": 282
},
{
"epoch": 0.9843478260869565,
"grad_norm": 0.0893949612581931,
"learning_rate": 0.00011995668013859529,
"loss": 0.1057,
"step": 283
},
{
"epoch": 0.9878260869565217,
"grad_norm": 0.0986410641315097,
"learning_rate": 0.00011935972536625302,
"loss": 0.111,
"step": 284
},
{
"epoch": 0.991304347826087,
"grad_norm": 0.10054024829355615,
"learning_rate": 0.00011876205296786493,
"loss": 0.0972,
"step": 285
},
{
"epoch": 0.9947826086956522,
"grad_norm": 0.12467802363495945,
"learning_rate": 0.00011816368509794364,
"loss": 0.147,
"step": 286
},
{
"epoch": 0.9982608695652174,
"grad_norm": 0.08424816142149656,
"learning_rate": 0.00011756464393678153,
"loss": 0.103,
"step": 287
},
{
"epoch": 0.9982608695652174,
"eval_loss": 0.1444740742444992,
"eval_runtime": 52.3252,
"eval_samples_per_second": 4.568,
"eval_steps_per_second": 0.573,
"step": 287
},
{
"epoch": 1.0017391304347827,
"grad_norm": 0.11878547881930412,
"learning_rate": 0.00011696495168962847,
"loss": 0.1385,
"step": 288
},
{
"epoch": 1.0052173913043478,
"grad_norm": 0.09391887138015648,
"learning_rate": 0.00011636463058586881,
"loss": 0.0826,
"step": 289
},
{
"epoch": 1.008695652173913,
"grad_norm": 0.1221171087699073,
"learning_rate": 0.00011576370287819736,
"loss": 0.1305,
"step": 290
},
{
"epoch": 1.0121739130434784,
"grad_norm": 0.08852002687146088,
"learning_rate": 0.0001151621908417945,
"loss": 0.0893,
"step": 291
},
{
"epoch": 1.0156521739130435,
"grad_norm": 0.11159916956566551,
"learning_rate": 0.00011456011677350051,
"loss": 0.1112,
"step": 292
},
{
"epoch": 1.0191304347826087,
"grad_norm": 0.10003818148322566,
"learning_rate": 0.000113957502990989,
"loss": 0.091,
"step": 293
},
{
"epoch": 1.0226086956521738,
"grad_norm": 0.16412668815167833,
"learning_rate": 0.0001133543718319398,
"loss": 0.0684,
"step": 294
},
{
"epoch": 1.0260869565217392,
"grad_norm": 0.12591860799015855,
"learning_rate": 0.0001127507456532108,
"loss": 0.1155,
"step": 295
},
{
"epoch": 1.0295652173913044,
"grad_norm": 0.09691052326677896,
"learning_rate": 0.00011214664683000927,
"loss": 0.0655,
"step": 296
},
{
"epoch": 1.0330434782608695,
"grad_norm": 0.11401647857375072,
"learning_rate": 0.00011154209775506241,
"loss": 0.0819,
"step": 297
},
{
"epoch": 1.0365217391304349,
"grad_norm": 0.12069848422212905,
"learning_rate": 0.00011093712083778746,
"loss": 0.0827,
"step": 298
},
{
"epoch": 1.04,
"grad_norm": 0.11216573920077354,
"learning_rate": 0.00011033173850346082,
"loss": 0.0754,
"step": 299
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.14906810717855873,
"learning_rate": 0.0001097259731923869,
"loss": 0.0888,
"step": 300
},
{
"epoch": 1.0469565217391303,
"grad_norm": 0.17640102936065463,
"learning_rate": 0.00010911984735906635,
"loss": 0.0987,
"step": 301
},
{
"epoch": 1.0504347826086957,
"grad_norm": 0.10731016230700624,
"learning_rate": 0.00010851338347136357,
"loss": 0.0654,
"step": 302
},
{
"epoch": 1.0539130434782609,
"grad_norm": 0.13955232812110846,
"learning_rate": 0.000107906604009674,
"loss": 0.0766,
"step": 303
},
{
"epoch": 1.057391304347826,
"grad_norm": 0.13869916502517549,
"learning_rate": 0.00010729953146609076,
"loss": 0.0905,
"step": 304
},
{
"epoch": 1.0608695652173914,
"grad_norm": 0.16180614723177286,
"learning_rate": 0.00010669218834357091,
"loss": 0.1025,
"step": 305
},
{
"epoch": 1.0643478260869565,
"grad_norm": 0.09389888673848854,
"learning_rate": 0.00010608459715510139,
"loss": 0.0613,
"step": 306
},
{
"epoch": 1.0678260869565217,
"grad_norm": 0.11083339472481404,
"learning_rate": 0.00010547678042286436,
"loss": 0.0705,
"step": 307
},
{
"epoch": 1.0713043478260869,
"grad_norm": 0.15345557779758465,
"learning_rate": 0.00010486876067740252,
"loss": 0.0878,
"step": 308
},
{
"epoch": 1.0747826086956522,
"grad_norm": 0.12649607806775048,
"learning_rate": 0.00010426056045678376,
"loss": 0.0879,
"step": 309
},
{
"epoch": 1.0782608695652174,
"grad_norm": 0.14680466140336335,
"learning_rate": 0.0001036522023057659,
"loss": 0.0958,
"step": 310
},
{
"epoch": 1.0817391304347825,
"grad_norm": 0.11612953696390602,
"learning_rate": 0.0001030437087749609,
"loss": 0.0736,
"step": 311
},
{
"epoch": 1.085217391304348,
"grad_norm": 0.11879942840457153,
"learning_rate": 0.00010243510241999899,
"loss": 0.0723,
"step": 312
},
{
"epoch": 1.088695652173913,
"grad_norm": 0.13060110667263794,
"learning_rate": 0.0001018264058006925,
"loss": 0.0935,
"step": 313
},
{
"epoch": 1.0921739130434782,
"grad_norm": 0.14907408553806142,
"learning_rate": 0.00010121764148019976,
"loss": 0.1067,
"step": 314
},
{
"epoch": 1.0956521739130434,
"grad_norm": 0.09945695753413593,
"learning_rate": 0.00010060883202418862,
"loss": 0.0717,
"step": 315
},
{
"epoch": 1.0991304347826087,
"grad_norm": 0.14172732221333895,
"learning_rate": 0.0001,
"loss": 0.0965,
"step": 316
},
{
"epoch": 1.102608695652174,
"grad_norm": 0.1308399790176956,
"learning_rate": 9.93911679758114e-05,
"loss": 0.1035,
"step": 317
},
{
"epoch": 1.106086956521739,
"grad_norm": 0.11697945837103665,
"learning_rate": 9.878235851980025e-05,
"loss": 0.0904,
"step": 318
},
{
"epoch": 1.1095652173913044,
"grad_norm": 0.12653991847887303,
"learning_rate": 9.817359419930751e-05,
"loss": 0.0856,
"step": 319
},
{
"epoch": 1.1130434782608696,
"grad_norm": 0.1217289403364997,
"learning_rate": 9.756489758000105e-05,
"loss": 0.0868,
"step": 320
},
{
"epoch": 1.1165217391304347,
"grad_norm": 0.11310356101526439,
"learning_rate": 9.69562912250391e-05,
"loss": 0.0866,
"step": 321
},
{
"epoch": 1.12,
"grad_norm": 0.10719359269477195,
"learning_rate": 9.63477976942341e-05,
"loss": 0.0716,
"step": 322
},
{
"epoch": 1.1234782608695653,
"grad_norm": 0.1512816323423573,
"learning_rate": 9.573943954321626e-05,
"loss": 0.104,
"step": 323
},
{
"epoch": 1.1269565217391304,
"grad_norm": 0.09749679838740939,
"learning_rate": 9.513123932259751e-05,
"loss": 0.0767,
"step": 324
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.12636925131896773,
"learning_rate": 9.452321957713564e-05,
"loss": 0.0874,
"step": 325
},
{
"epoch": 1.133913043478261,
"grad_norm": 0.08724868085956655,
"learning_rate": 9.391540284489862e-05,
"loss": 0.0675,
"step": 326
},
{
"epoch": 1.137391304347826,
"grad_norm": 0.09917562166921519,
"learning_rate": 9.330781165642907e-05,
"loss": 0.0835,
"step": 327
},
{
"epoch": 1.1408695652173912,
"grad_norm": 0.11005238071063954,
"learning_rate": 9.270046853390925e-05,
"loss": 0.0926,
"step": 328
},
{
"epoch": 1.1443478260869564,
"grad_norm": 0.13592915315342272,
"learning_rate": 9.209339599032601e-05,
"loss": 0.0921,
"step": 329
},
{
"epoch": 1.1478260869565218,
"grad_norm": 0.09959026553962852,
"learning_rate": 9.148661652863642e-05,
"loss": 0.0669,
"step": 330
},
{
"epoch": 1.151304347826087,
"grad_norm": 0.12926733392574546,
"learning_rate": 9.088015264093365e-05,
"loss": 0.0882,
"step": 331
},
{
"epoch": 1.154782608695652,
"grad_norm": 0.12554624045521445,
"learning_rate": 9.027402680761309e-05,
"loss": 0.0988,
"step": 332
},
{
"epoch": 1.1582608695652175,
"grad_norm": 0.1672440454873292,
"learning_rate": 8.966826149653923e-05,
"loss": 0.1213,
"step": 333
},
{
"epoch": 1.1617391304347826,
"grad_norm": 0.11985957465820539,
"learning_rate": 8.906287916221259e-05,
"loss": 0.0868,
"step": 334
},
{
"epoch": 1.1652173913043478,
"grad_norm": 0.1272151243776101,
"learning_rate": 8.845790224493763e-05,
"loss": 0.0936,
"step": 335
},
{
"epoch": 1.1686956521739131,
"grad_norm": 0.1328045736153317,
"learning_rate": 8.785335316999078e-05,
"loss": 0.1051,
"step": 336
},
{
"epoch": 1.1721739130434783,
"grad_norm": 0.09448312790900673,
"learning_rate": 8.724925434678923e-05,
"loss": 0.0735,
"step": 337
},
{
"epoch": 1.1756521739130434,
"grad_norm": 0.13775516158820159,
"learning_rate": 8.664562816806022e-05,
"loss": 0.0826,
"step": 338
},
{
"epoch": 1.1791304347826088,
"grad_norm": 0.095050504784669,
"learning_rate": 8.604249700901101e-05,
"loss": 0.0606,
"step": 339
},
{
"epoch": 1.182608695652174,
"grad_norm": 0.10883208791380891,
"learning_rate": 8.543988322649954e-05,
"loss": 0.0776,
"step": 340
},
{
"epoch": 1.1860869565217391,
"grad_norm": 0.1432959854298642,
"learning_rate": 8.483780915820553e-05,
"loss": 0.105,
"step": 341
},
{
"epoch": 1.1895652173913043,
"grad_norm": 0.1934560716364753,
"learning_rate": 8.423629712180265e-05,
"loss": 0.1167,
"step": 342
},
{
"epoch": 1.1930434782608696,
"grad_norm": 0.14737287305329302,
"learning_rate": 8.363536941413121e-05,
"loss": 0.0952,
"step": 343
},
{
"epoch": 1.1965217391304348,
"grad_norm": 0.1535547643880873,
"learning_rate": 8.303504831037154e-05,
"loss": 0.1146,
"step": 344
},
{
"epoch": 1.2,
"grad_norm": 0.15481576726903015,
"learning_rate": 8.243535606321848e-05,
"loss": 0.1088,
"step": 345
},
{
"epoch": 1.203478260869565,
"grad_norm": 0.1589929120048658,
"learning_rate": 8.183631490205637e-05,
"loss": 0.1288,
"step": 346
},
{
"epoch": 1.2069565217391305,
"grad_norm": 0.12926833828040588,
"learning_rate": 8.12379470321351e-05,
"loss": 0.0779,
"step": 347
},
{
"epoch": 1.2104347826086956,
"grad_norm": 0.10432967192535712,
"learning_rate": 8.064027463374702e-05,
"loss": 0.0733,
"step": 348
},
{
"epoch": 1.2139130434782608,
"grad_norm": 0.1423904166119135,
"learning_rate": 8.004331986140474e-05,
"loss": 0.097,
"step": 349
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.16415634432026194,
"learning_rate": 7.944710484301995e-05,
"loss": 0.1044,
"step": 350
},
{
"epoch": 1.2208695652173913,
"grad_norm": 0.14367056293640723,
"learning_rate": 7.88516516790831e-05,
"loss": 0.108,
"step": 351
},
{
"epoch": 1.2243478260869565,
"grad_norm": 0.09627642646890802,
"learning_rate": 7.825698244184431e-05,
"loss": 0.0716,
"step": 352
},
{
"epoch": 1.2278260869565218,
"grad_norm": 0.12349504031653168,
"learning_rate": 7.766311917449501e-05,
"loss": 0.0846,
"step": 353
},
{
"epoch": 1.231304347826087,
"grad_norm": 0.11917707968673376,
"learning_rate": 7.707008389035101e-05,
"loss": 0.0893,
"step": 354
},
{
"epoch": 1.2347826086956522,
"grad_norm": 0.14958731827081473,
"learning_rate": 7.647789857203645e-05,
"loss": 0.1005,
"step": 355
},
{
"epoch": 1.2382608695652173,
"grad_norm": 0.09807418540274827,
"learning_rate": 7.588658517066892e-05,
"loss": 0.0777,
"step": 356
},
{
"epoch": 1.2417391304347827,
"grad_norm": 0.13031128610452009,
"learning_rate": 7.529616560504585e-05,
"loss": 0.0877,
"step": 357
},
{
"epoch": 1.2452173913043478,
"grad_norm": 0.15458552977098033,
"learning_rate": 7.470666176083192e-05,
"loss": 0.1006,
"step": 358
},
{
"epoch": 1.248695652173913,
"grad_norm": 0.10086297540969145,
"learning_rate": 7.411809548974792e-05,
"loss": 0.0771,
"step": 359
},
{
"epoch": 1.2521739130434781,
"grad_norm": 0.10503599360725659,
"learning_rate": 7.353048860876064e-05,
"loss": 0.0699,
"step": 360
},
{
"epoch": 1.2556521739130435,
"grad_norm": 0.11445411107296893,
"learning_rate": 7.294386289927425e-05,
"loss": 0.0878,
"step": 361
},
{
"epoch": 1.2591304347826087,
"grad_norm": 0.09163778675554561,
"learning_rate": 7.235824010632283e-05,
"loss": 0.0774,
"step": 362
},
{
"epoch": 1.2626086956521738,
"grad_norm": 0.12753545759992949,
"learning_rate": 7.177364193776441e-05,
"loss": 0.0891,
"step": 363
},
{
"epoch": 1.2660869565217392,
"grad_norm": 0.10783034916975004,
"learning_rate": 7.119009006347625e-05,
"loss": 0.0727,
"step": 364
},
{
"epoch": 1.2695652173913043,
"grad_norm": 0.12242485363979573,
"learning_rate": 7.060760611455152e-05,
"loss": 0.0628,
"step": 365
},
{
"epoch": 1.2730434782608695,
"grad_norm": 0.0974356463850898,
"learning_rate": 7.002621168249759e-05,
"loss": 0.0791,
"step": 366
},
{
"epoch": 1.2765217391304349,
"grad_norm": 0.11983018538507342,
"learning_rate": 6.944592831843566e-05,
"loss": 0.067,
"step": 367
},
{
"epoch": 1.28,
"grad_norm": 0.1364747598273945,
"learning_rate": 6.886677753230184e-05,
"loss": 0.0905,
"step": 368
},
{
"epoch": 1.2834782608695652,
"grad_norm": 0.13965549240604952,
"learning_rate": 6.82887807920499e-05,
"loss": 0.0965,
"step": 369
},
{
"epoch": 1.2869565217391306,
"grad_norm": 0.1361838338173524,
"learning_rate": 6.77119595228554e-05,
"loss": 0.0884,
"step": 370
},
{
"epoch": 1.2904347826086957,
"grad_norm": 0.1554086553741736,
"learning_rate": 6.713633510632157e-05,
"loss": 0.1058,
"step": 371
},
{
"epoch": 1.2939130434782609,
"grad_norm": 0.13154153458769796,
"learning_rate": 6.656192887968675e-05,
"loss": 0.1069,
"step": 372
},
{
"epoch": 1.297391304347826,
"grad_norm": 0.12317336873376321,
"learning_rate": 6.598876213503339e-05,
"loss": 0.0855,
"step": 373
},
{
"epoch": 1.3008695652173912,
"grad_norm": 0.12111523304638382,
"learning_rate": 6.541685611849887e-05,
"loss": 0.0796,
"step": 374
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.11822393281008113,
"learning_rate": 6.484623202948789e-05,
"loss": 0.0678,
"step": 375
},
{
"epoch": 1.3078260869565217,
"grad_norm": 0.14902345594338023,
"learning_rate": 6.427691101988673e-05,
"loss": 0.095,
"step": 376
},
{
"epoch": 1.3113043478260868,
"grad_norm": 0.1804018948634972,
"learning_rate": 6.370891419327907e-05,
"loss": 0.1282,
"step": 377
},
{
"epoch": 1.3147826086956522,
"grad_norm": 0.11547994985396455,
"learning_rate": 6.314226260416382e-05,
"loss": 0.0794,
"step": 378
},
{
"epoch": 1.3182608695652174,
"grad_norm": 0.13442398839445116,
"learning_rate": 6.257697725717468e-05,
"loss": 0.0828,
"step": 379
},
{
"epoch": 1.3217391304347825,
"grad_norm": 0.16157920308299395,
"learning_rate": 6.201307910630146e-05,
"loss": 0.0862,
"step": 380
},
{
"epoch": 1.325217391304348,
"grad_norm": 0.09483163105782791,
"learning_rate": 6.145058905411343e-05,
"loss": 0.0602,
"step": 381
},
{
"epoch": 1.328695652173913,
"grad_norm": 0.1326696358587778,
"learning_rate": 6.0889527950984416e-05,
"loss": 0.081,
"step": 382
},
{
"epoch": 1.3321739130434782,
"grad_norm": 0.09578653192083227,
"learning_rate": 6.0329916594320054e-05,
"loss": 0.0632,
"step": 383
},
{
"epoch": 1.3356521739130436,
"grad_norm": 0.1445496359915367,
"learning_rate": 5.977177572778678e-05,
"loss": 0.1043,
"step": 384
},
{
"epoch": 1.3391304347826087,
"grad_norm": 0.11696872605657838,
"learning_rate": 5.921512604054289e-05,
"loss": 0.075,
"step": 385
},
{
"epoch": 1.342608695652174,
"grad_norm": 0.10474941138685831,
"learning_rate": 5.865998816647171e-05,
"loss": 0.0808,
"step": 386
},
{
"epoch": 1.3460869565217393,
"grad_norm": 0.12195030923899196,
"learning_rate": 5.8106382683416635e-05,
"loss": 0.0906,
"step": 387
},
{
"epoch": 1.3495652173913044,
"grad_norm": 0.1247261310171403,
"learning_rate": 5.755433011241851e-05,
"loss": 0.0799,
"step": 388
},
{
"epoch": 1.3530434782608696,
"grad_norm": 0.12001527150963033,
"learning_rate": 5.7003850916954705e-05,
"loss": 0.0737,
"step": 389
},
{
"epoch": 1.3565217391304347,
"grad_norm": 0.12921970865724472,
"learning_rate": 5.645496550218089e-05,
"loss": 0.0802,
"step": 390
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.14148810186262428,
"learning_rate": 5.5907694214174344e-05,
"loss": 0.0998,
"step": 391
},
{
"epoch": 1.3634782608695653,
"grad_norm": 0.1822115264684952,
"learning_rate": 5.536205733918007e-05,
"loss": 0.1139,
"step": 392
},
{
"epoch": 1.3669565217391304,
"grad_norm": 0.11275316954836014,
"learning_rate": 5.4818075102858526e-05,
"loss": 0.0839,
"step": 393
},
{
"epoch": 1.3704347826086956,
"grad_norm": 0.1049274592340904,
"learning_rate": 5.4275767669536146e-05,
"loss": 0.078,
"step": 394
},
{
"epoch": 1.373913043478261,
"grad_norm": 0.1275403647919897,
"learning_rate": 5.373515514145772e-05,
"loss": 0.0882,
"step": 395
},
{
"epoch": 1.377391304347826,
"grad_norm": 0.1414442736987841,
"learning_rate": 5.3196257558041386e-05,
"loss": 0.0905,
"step": 396
},
{
"epoch": 1.3808695652173912,
"grad_norm": 0.1647573834843455,
"learning_rate": 5.265909489513567e-05,
"loss": 0.0868,
"step": 397
},
{
"epoch": 1.3843478260869566,
"grad_norm": 0.14978728162298646,
"learning_rate": 5.212368706427912e-05,
"loss": 0.0967,
"step": 398
},
{
"epoch": 1.3878260869565218,
"grad_norm": 0.13582863247078658,
"learning_rate": 5.159005391196213e-05,
"loss": 0.0888,
"step": 399
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.11281045642311609,
"learning_rate": 5.105821521889147e-05,
"loss": 0.0899,
"step": 400
},
{
"epoch": 1.3947826086956523,
"grad_norm": 0.1525391794429011,
"learning_rate": 5.052819069925676e-05,
"loss": 0.1121,
"step": 401
},
{
"epoch": 1.3982608695652174,
"grad_norm": 0.10553540876961562,
"learning_rate": 5.000000000000002e-05,
"loss": 0.0667,
"step": 402
},
{
"epoch": 1.4017391304347826,
"grad_norm": 0.14272542918507544,
"learning_rate": 4.947366270008707e-05,
"loss": 0.1049,
"step": 403
},
{
"epoch": 1.4052173913043478,
"grad_norm": 0.11523131534313182,
"learning_rate": 4.894919830978212e-05,
"loss": 0.083,
"step": 404
},
{
"epoch": 1.4086956521739131,
"grad_norm": 0.11250758245733375,
"learning_rate": 4.8426626269924266e-05,
"loss": 0.0822,
"step": 405
},
{
"epoch": 1.4121739130434783,
"grad_norm": 0.13451779717959741,
"learning_rate": 4.790596595120699e-05,
"loss": 0.0967,
"step": 406
},
{
"epoch": 1.4156521739130434,
"grad_norm": 0.17014026695649226,
"learning_rate": 4.738723665346021e-05,
"loss": 0.0952,
"step": 407
},
{
"epoch": 1.4191304347826086,
"grad_norm": 0.11335400231382785,
"learning_rate": 4.687045760493468e-05,
"loss": 0.0765,
"step": 408
},
{
"epoch": 1.422608695652174,
"grad_norm": 0.13153029025610707,
"learning_rate": 4.635564796158945e-05,
"loss": 0.0942,
"step": 409
},
{
"epoch": 1.4260869565217391,
"grad_norm": 0.14072727769903307,
"learning_rate": 4.5842826806381544e-05,
"loss": 0.1033,
"step": 410
},
{
"epoch": 1.4295652173913043,
"grad_norm": 0.19021079673592267,
"learning_rate": 4.533201314855891e-05,
"loss": 0.0908,
"step": 411
},
{
"epoch": 1.4330434782608696,
"grad_norm": 0.1282315437032552,
"learning_rate": 4.48232259229554e-05,
"loss": 0.0923,
"step": 412
},
{
"epoch": 1.4365217391304348,
"grad_norm": 0.10482566251391306,
"learning_rate": 4.431648398928933e-05,
"loss": 0.0769,
"step": 413
},
{
"epoch": 1.44,
"grad_norm": 0.0989285401022153,
"learning_rate": 4.381180613146395e-05,
"loss": 0.0627,
"step": 414
},
{
"epoch": 1.4434782608695653,
"grad_norm": 0.15004726013623923,
"learning_rate": 4.3309211056871546e-05,
"loss": 0.107,
"step": 415
},
{
"epoch": 1.4469565217391305,
"grad_norm": 0.10917064763259954,
"learning_rate": 4.280871739569972e-05,
"loss": 0.0723,
"step": 416
},
{
"epoch": 1.4504347826086956,
"grad_norm": 0.14217337210991582,
"learning_rate": 4.231034370024088e-05,
"loss": 0.0876,
"step": 417
},
{
"epoch": 1.453913043478261,
"grad_norm": 0.12259499737310682,
"learning_rate": 4.181410844420474e-05,
"loss": 0.072,
"step": 418
},
{
"epoch": 1.4573913043478262,
"grad_norm": 0.1383064965783125,
"learning_rate": 4.132003002203314e-05,
"loss": 0.1001,
"step": 419
},
{
"epoch": 1.4608695652173913,
"grad_norm": 0.15628614353703477,
"learning_rate": 4.0828126748218654e-05,
"loss": 0.1024,
"step": 420
},
{
"epoch": 1.4643478260869565,
"grad_norm": 0.15540806197515133,
"learning_rate": 4.0338416856625294e-05,
"loss": 0.1064,
"step": 421
},
{
"epoch": 1.4678260869565216,
"grad_norm": 0.12867401972303838,
"learning_rate": 3.985091849981297e-05,
"loss": 0.0814,
"step": 422
},
{
"epoch": 1.471304347826087,
"grad_norm": 0.10461015345788115,
"learning_rate": 3.936564974836431e-05,
"loss": 0.0551,
"step": 423
},
{
"epoch": 1.4747826086956521,
"grad_norm": 0.17422707198524348,
"learning_rate": 3.8882628590215074e-05,
"loss": 0.1068,
"step": 424
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.11823762504382565,
"learning_rate": 3.840187292998717e-05,
"loss": 0.0847,
"step": 425
},
{
"epoch": 1.4817391304347827,
"grad_norm": 0.14190454091036495,
"learning_rate": 3.7923400588325155e-05,
"loss": 0.0985,
"step": 426
},
{
"epoch": 1.4852173913043478,
"grad_norm": 0.1487917306625744,
"learning_rate": 3.7447229301235445e-05,
"loss": 0.0972,
"step": 427
},
{
"epoch": 1.488695652173913,
"grad_norm": 0.11307811508469943,
"learning_rate": 3.697337671942913e-05,
"loss": 0.0769,
"step": 428
},
{
"epoch": 1.4921739130434784,
"grad_norm": 0.12456291954504964,
"learning_rate": 3.6501860407667465e-05,
"loss": 0.0757,
"step": 429
},
{
"epoch": 1.4956521739130435,
"grad_norm": 0.14812964550659216,
"learning_rate": 3.60326978441109e-05,
"loss": 0.1029,
"step": 430
},
{
"epoch": 1.4991304347826087,
"grad_norm": 0.1681784734853534,
"learning_rate": 3.556590641967115e-05,
"loss": 0.1252,
"step": 431
},
{
"epoch": 1.502608695652174,
"grad_norm": 0.14613030602008723,
"learning_rate": 3.510150343736668e-05,
"loss": 0.0912,
"step": 432
},
{
"epoch": 1.5060869565217392,
"grad_norm": 0.15179818766879094,
"learning_rate": 3.463950611168111e-05,
"loss": 0.0858,
"step": 433
},
{
"epoch": 1.5095652173913043,
"grad_norm": 0.12461414121764455,
"learning_rate": 3.4179931567925216e-05,
"loss": 0.0824,
"step": 434
},
{
"epoch": 1.5130434782608697,
"grad_norm": 0.11765068168074926,
"learning_rate": 3.372279684160221e-05,
"loss": 0.0862,
"step": 435
},
{
"epoch": 1.5165217391304346,
"grad_norm": 0.14280556708472175,
"learning_rate": 3.3268118877776066e-05,
"loss": 0.0954,
"step": 436
},
{
"epoch": 1.52,
"grad_norm": 0.11285620318100742,
"learning_rate": 3.281591453044366e-05,
"loss": 0.0735,
"step": 437
},
{
"epoch": 1.5234782608695652,
"grad_norm": 0.10694921241597416,
"learning_rate": 3.236620056190972e-05,
"loss": 0.069,
"step": 438
},
{
"epoch": 1.5269565217391303,
"grad_norm": 0.12484188708941266,
"learning_rate": 3.191899364216581e-05,
"loss": 0.083,
"step": 439
},
{
"epoch": 1.5304347826086957,
"grad_norm": 0.15429288005492145,
"learning_rate": 3.147431034827208e-05,
"loss": 0.1033,
"step": 440
},
{
"epoch": 1.5339130434782609,
"grad_norm": 0.1253058317602747,
"learning_rate": 3.103216716374312e-05,
"loss": 0.0751,
"step": 441
},
{
"epoch": 1.537391304347826,
"grad_norm": 0.11203979862187523,
"learning_rate": 3.059258047793661e-05,
"loss": 0.0804,
"step": 442
},
{
"epoch": 1.5408695652173914,
"grad_norm": 0.13184136276253297,
"learning_rate": 3.0155566585446117e-05,
"loss": 0.0892,
"step": 443
},
{
"epoch": 1.5443478260869565,
"grad_norm": 0.10496670695439927,
"learning_rate": 2.9721141685496823e-05,
"loss": 0.08,
"step": 444
},
{
"epoch": 1.5478260869565217,
"grad_norm": 0.11136343180704414,
"learning_rate": 2.9289321881345254e-05,
"loss": 0.0764,
"step": 445
},
{
"epoch": 1.551304347826087,
"grad_norm": 0.14576709922104164,
"learning_rate": 2.8860123179682242e-05,
"loss": 0.1061,
"step": 446
},
{
"epoch": 1.5547826086956522,
"grad_norm": 0.09499364976886815,
"learning_rate": 2.8433561490039573e-05,
"loss": 0.0745,
"step": 447
},
{
"epoch": 1.5582608695652174,
"grad_norm": 0.12469651410155881,
"learning_rate": 2.800965262420043e-05,
"loss": 0.086,
"step": 448
},
{
"epoch": 1.5617391304347827,
"grad_norm": 0.0950193427692519,
"learning_rate": 2.7588412295613043e-05,
"loss": 0.0548,
"step": 449
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.1436085195291988,
"learning_rate": 2.716985611880841e-05,
"loss": 0.0923,
"step": 450
},
{
"epoch": 1.568695652173913,
"grad_norm": 0.1220012073528301,
"learning_rate": 2.675399960882138e-05,
"loss": 0.0835,
"step": 451
},
{
"epoch": 1.5721739130434784,
"grad_norm": 0.14250023280956398,
"learning_rate": 2.6340858180615646e-05,
"loss": 0.0817,
"step": 452
},
{
"epoch": 1.5756521739130434,
"grad_norm": 0.14016261789642684,
"learning_rate": 2.593044714851218e-05,
"loss": 0.1009,
"step": 453
},
{
"epoch": 1.5791304347826087,
"grad_norm": 0.1519687009324273,
"learning_rate": 2.5522781725621813e-05,
"loss": 0.0936,
"step": 454
},
{
"epoch": 1.5826086956521739,
"grad_norm": 0.10018240850657148,
"learning_rate": 2.511787702328102e-05,
"loss": 0.0695,
"step": 455
},
{
"epoch": 1.586086956521739,
"grad_norm": 0.15832897678113741,
"learning_rate": 2.471574805049206e-05,
"loss": 0.103,
"step": 456
},
{
"epoch": 1.5895652173913044,
"grad_norm": 0.09635042116603919,
"learning_rate": 2.4316409713366352e-05,
"loss": 0.0713,
"step": 457
},
{
"epoch": 1.5930434782608696,
"grad_norm": 0.16551038949811617,
"learning_rate": 2.3919876814572194e-05,
"loss": 0.1165,
"step": 458
},
{
"epoch": 1.5965217391304347,
"grad_norm": 0.1591761285439053,
"learning_rate": 2.352616405278586e-05,
"loss": 0.1065,
"step": 459
},
{
"epoch": 1.6,
"grad_norm": 0.1257794232379624,
"learning_rate": 2.3135286022146785e-05,
"loss": 0.0878,
"step": 460
},
{
"epoch": 1.6034782608695652,
"grad_norm": 0.13064370809940834,
"learning_rate": 2.2747257211716757e-05,
"loss": 0.0878,
"step": 461
},
{
"epoch": 1.6069565217391304,
"grad_norm": 0.1373673611302553,
"learning_rate": 2.236209200494258e-05,
"loss": 0.08,
"step": 462
},
{
"epoch": 1.6104347826086958,
"grad_norm": 0.15683223957755238,
"learning_rate": 2.1979804679123106e-05,
"loss": 0.097,
"step": 463
},
{
"epoch": 1.613913043478261,
"grad_norm": 0.11215372603755155,
"learning_rate": 2.1600409404879874e-05,
"loss": 0.0759,
"step": 464
},
{
"epoch": 1.617391304347826,
"grad_norm": 0.12472859826284394,
"learning_rate": 2.122392024563199e-05,
"loss": 0.0798,
"step": 465
},
{
"epoch": 1.6208695652173915,
"grad_norm": 0.14167323311602448,
"learning_rate": 2.0850351157074598e-05,
"loss": 0.1025,
"step": 466
},
{
"epoch": 1.6243478260869564,
"grad_norm": 0.13106838058233283,
"learning_rate": 2.047971598666184e-05,
"loss": 0.0966,
"step": 467
},
{
"epoch": 1.6278260869565218,
"grad_norm": 0.12245656492036927,
"learning_rate": 2.011202847309329e-05,
"loss": 0.0858,
"step": 468
},
{
"epoch": 1.631304347826087,
"grad_norm": 0.15076412437271922,
"learning_rate": 1.9747302245804945e-05,
"loss": 0.0988,
"step": 469
},
{
"epoch": 1.634782608695652,
"grad_norm": 0.1890224571658569,
"learning_rate": 1.9385550824463727e-05,
"loss": 0.141,
"step": 470
},
{
"epoch": 1.6382608695652174,
"grad_norm": 0.12643818292640252,
"learning_rate": 1.9026787618466646e-05,
"loss": 0.0821,
"step": 471
},
{
"epoch": 1.6417391304347826,
"grad_norm": 0.11974342973177961,
"learning_rate": 1.8671025926443465e-05,
"loss": 0.0852,
"step": 472
},
{
"epoch": 1.6452173913043477,
"grad_norm": 0.11053773314022491,
"learning_rate": 1.8318278935763955e-05,
"loss": 0.0693,
"step": 473
},
{
"epoch": 1.6486956521739131,
"grad_norm": 0.12718860708539992,
"learning_rate": 1.7968559722048906e-05,
"loss": 0.0759,
"step": 474
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.11472304774066805,
"learning_rate": 1.762188124868557e-05,
"loss": 0.0822,
"step": 475
},
{
"epoch": 1.6556521739130434,
"grad_norm": 0.1586172339858714,
"learning_rate": 1.7278256366347035e-05,
"loss": 0.1156,
"step": 476
},
{
"epoch": 1.6591304347826088,
"grad_norm": 0.16408772559550205,
"learning_rate": 1.6937697812515894e-05,
"loss": 0.0918,
"step": 477
},
{
"epoch": 1.662608695652174,
"grad_norm": 0.12800527362364758,
"learning_rate": 1.660021821101222e-05,
"loss": 0.0789,
"step": 478
},
{
"epoch": 1.666086956521739,
"grad_norm": 0.15521778399290198,
"learning_rate": 1.626583007152539e-05,
"loss": 0.0987,
"step": 479
},
{
"epoch": 1.6695652173913045,
"grad_norm": 0.14944005207844402,
"learning_rate": 1.5934545789150623e-05,
"loss": 0.1133,
"step": 480
},
{
"epoch": 1.6730434782608694,
"grad_norm": 0.12173810785220801,
"learning_rate": 1.5606377643929304e-05,
"loss": 0.0794,
"step": 481
},
{
"epoch": 1.6765217391304348,
"grad_norm": 0.12290655885053603,
"learning_rate": 1.5281337800393968e-05,
"loss": 0.0717,
"step": 482
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.13763340851307898,
"learning_rate": 1.4959438307117247e-05,
"loss": 0.095,
"step": 483
},
{
"epoch": 1.683478260869565,
"grad_norm": 0.10678789082393463,
"learning_rate": 1.4640691096265358e-05,
"loss": 0.0838,
"step": 484
},
{
"epoch": 1.6869565217391305,
"grad_norm": 0.12694424997511286,
"learning_rate": 1.4325107983155694e-05,
"loss": 0.0884,
"step": 485
},
{
"epoch": 1.6904347826086956,
"grad_norm": 0.13805939087384794,
"learning_rate": 1.401270066581899e-05,
"loss": 0.0884,
"step": 486
},
{
"epoch": 1.6939130434782608,
"grad_norm": 0.1116542985760522,
"learning_rate": 1.3703480724565577e-05,
"loss": 0.0819,
"step": 487
},
{
"epoch": 1.6973913043478261,
"grad_norm": 0.130701148914566,
"learning_rate": 1.339745962155613e-05,
"loss": 0.0942,
"step": 488
},
{
"epoch": 1.7008695652173913,
"grad_norm": 0.12303229923584438,
"learning_rate": 1.3094648700376954e-05,
"loss": 0.0968,
"step": 489
},
{
"epoch": 1.7043478260869565,
"grad_norm": 0.10050903994662669,
"learning_rate": 1.2795059185619229e-05,
"loss": 0.064,
"step": 490
},
{
"epoch": 1.7078260869565218,
"grad_norm": 0.13529518412698788,
"learning_rate": 1.249870218246323e-05,
"loss": 0.0891,
"step": 491
},
{
"epoch": 1.711304347826087,
"grad_norm": 0.11568064512791533,
"learning_rate": 1.2205588676266388e-05,
"loss": 0.0841,
"step": 492
},
{
"epoch": 1.7147826086956521,
"grad_norm": 0.11324213029173631,
"learning_rate": 1.1915729532156372e-05,
"loss": 0.0693,
"step": 493
},
{
"epoch": 1.7182608695652175,
"grad_norm": 0.12078490458473878,
"learning_rate": 1.1629135494628096e-05,
"loss": 0.0809,
"step": 494
},
{
"epoch": 1.7217391304347827,
"grad_norm": 0.15619885447728415,
"learning_rate": 1.134581718714558e-05,
"loss": 0.0982,
"step": 495
},
{
"epoch": 1.7252173913043478,
"grad_norm": 0.13958396553029748,
"learning_rate": 1.1065785111748117e-05,
"loss": 0.1006,
"step": 496
},
{
"epoch": 1.7286956521739132,
"grad_norm": 0.11936287781907709,
"learning_rate": 1.0789049648661043e-05,
"loss": 0.0778,
"step": 497
},
{
"epoch": 1.7321739130434781,
"grad_norm": 0.13994107260501892,
"learning_rate": 1.0515621055910817e-05,
"loss": 0.0994,
"step": 498
},
{
"epoch": 1.7356521739130435,
"grad_norm": 0.10069177741815626,
"learning_rate": 1.0245509468944992e-05,
"loss": 0.0798,
"step": 499
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.1520239032704441,
"learning_rate": 9.978724900256265e-06,
"loss": 0.0936,
"step": 500
},
{
"epoch": 1.7426086956521738,
"grad_norm": 0.12537489299552443,
"learning_rate": 9.715277239011578e-06,
"loss": 0.0759,
"step": 501
},
{
"epoch": 1.7460869565217392,
"grad_norm": 0.16914167358101417,
"learning_rate": 9.455176250685338e-06,
"loss": 0.1159,
"step": 502
},
{
"epoch": 1.7495652173913043,
"grad_norm": 0.12340433382499669,
"learning_rate": 9.198431576697608e-06,
"loss": 0.0809,
"step": 503
},
{
"epoch": 1.7530434782608695,
"grad_norm": 0.16038700994407892,
"learning_rate": 8.945052734056581e-06,
"loss": 0.0927,
"step": 504
},
{
"epoch": 1.7565217391304349,
"grad_norm": 0.18736397280927972,
"learning_rate": 8.695049115005837e-06,
"loss": 0.1138,
"step": 505
},
{
"epoch": 1.76,
"grad_norm": 0.11455094890434803,
"learning_rate": 8.448429986676298e-06,
"loss": 0.0876,
"step": 506
},
{
"epoch": 1.7634782608695652,
"grad_norm": 0.13381829396413253,
"learning_rate": 8.205204490742536e-06,
"loss": 0.0932,
"step": 507
},
{
"epoch": 1.7669565217391305,
"grad_norm": 0.10231732967595585,
"learning_rate": 7.96538164308407e-06,
"loss": 0.0702,
"step": 508
},
{
"epoch": 1.7704347826086957,
"grad_norm": 0.0947188798552471,
"learning_rate": 7.728970333451035e-06,
"loss": 0.0706,
"step": 509
},
{
"epoch": 1.7739130434782608,
"grad_norm": 0.09733737409054823,
"learning_rate": 7.4959793251348055e-06,
"loss": 0.0644,
"step": 510
},
{
"epoch": 1.7773913043478262,
"grad_norm": 0.11169634637379897,
"learning_rate": 7.2664172546429655e-06,
"loss": 0.0709,
"step": 511
},
{
"epoch": 1.7808695652173911,
"grad_norm": 0.12974806998277916,
"learning_rate": 7.040292631379386e-06,
"loss": 0.0856,
"step": 512
},
{
"epoch": 1.7843478260869565,
"grad_norm": 0.13011819014873824,
"learning_rate": 6.817613837328573e-06,
"loss": 0.0924,
"step": 513
},
{
"epoch": 1.787826086956522,
"grad_norm": 0.1508887480796253,
"learning_rate": 6.598389126745208e-06,
"loss": 0.1101,
"step": 514
},
{
"epoch": 1.7913043478260868,
"grad_norm": 0.1528558553271661,
"learning_rate": 6.382626625847921e-06,
"loss": 0.1014,
"step": 515
},
{
"epoch": 1.7947826086956522,
"grad_norm": 0.13295695013628608,
"learning_rate": 6.170334332518324e-06,
"loss": 0.0866,
"step": 516
},
{
"epoch": 1.7982608695652174,
"grad_norm": 0.16036744040311404,
"learning_rate": 5.961520116004327e-06,
"loss": 0.1076,
"step": 517
},
{
"epoch": 1.8017391304347825,
"grad_norm": 0.11717096876409042,
"learning_rate": 5.756191716628556e-06,
"loss": 0.0688,
"step": 518
},
{
"epoch": 1.8052173913043479,
"grad_norm": 0.11484830279438352,
"learning_rate": 5.554356745501454e-06,
"loss": 0.0694,
"step": 519
},
{
"epoch": 1.808695652173913,
"grad_norm": 0.17176181086966022,
"learning_rate": 5.3560226842390596e-06,
"loss": 0.1032,
"step": 520
},
{
"epoch": 1.8121739130434782,
"grad_norm": 0.11739088349195866,
"learning_rate": 5.1611968846857815e-06,
"loss": 0.0732,
"step": 521
},
{
"epoch": 1.8156521739130436,
"grad_norm": 0.13709017479262753,
"learning_rate": 4.969886568641757e-06,
"loss": 0.0918,
"step": 522
},
{
"epoch": 1.8191304347826087,
"grad_norm": 0.1280476174629274,
"learning_rate": 4.7820988275953045e-06,
"loss": 0.0938,
"step": 523
},
{
"epoch": 1.8226086956521739,
"grad_norm": 0.11201422652339658,
"learning_rate": 4.597840622459937e-06,
"loss": 0.0814,
"step": 524
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.09871056879272744,
"learning_rate": 4.417118783316388e-06,
"loss": 0.072,
"step": 525
},
{
"epoch": 1.8295652173913044,
"grad_norm": 0.10542472286239411,
"learning_rate": 4.2399400091594154e-06,
"loss": 0.068,
"step": 526
},
{
"epoch": 1.8330434782608696,
"grad_norm": 0.14017893040374907,
"learning_rate": 4.066310867649481e-06,
"loss": 0.1032,
"step": 527
},
{
"epoch": 1.836521739130435,
"grad_norm": 0.11855048113345314,
"learning_rate": 3.896237794869339e-06,
"loss": 0.0783,
"step": 528
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.1244102175680237,
"learning_rate": 3.729727095085422e-06,
"loss": 0.0922,
"step": 529
},
{
"epoch": 1.8434782608695652,
"grad_norm": 0.12180644294551433,
"learning_rate": 3.566784940514145e-06,
"loss": 0.0807,
"step": 530
},
{
"epoch": 1.8469565217391304,
"grad_norm": 0.09761026100653182,
"learning_rate": 3.40741737109318e-06,
"loss": 0.0641,
"step": 531
},
{
"epoch": 1.8504347826086955,
"grad_norm": 0.09710029722289329,
"learning_rate": 3.2516302942574793e-06,
"loss": 0.067,
"step": 532
},
{
"epoch": 1.853913043478261,
"grad_norm": 0.10724535703528021,
"learning_rate": 3.0994294847203733e-06,
"loss": 0.0743,
"step": 533
},
{
"epoch": 1.857391304347826,
"grad_norm": 0.13083100814230067,
"learning_rate": 2.9508205842594728e-06,
"loss": 0.0754,
"step": 534
},
{
"epoch": 1.8608695652173912,
"grad_norm": 0.12672158607204304,
"learning_rate": 2.8058091015075394e-06,
"loss": 0.078,
"step": 535
},
{
"epoch": 1.8643478260869566,
"grad_norm": 0.17103224377006737,
"learning_rate": 2.6644004117483356e-06,
"loss": 0.0922,
"step": 536
},
{
"epoch": 1.8678260869565217,
"grad_norm": 0.134150142101436,
"learning_rate": 2.526599756717285e-06,
"loss": 0.1002,
"step": 537
},
{
"epoch": 1.871304347826087,
"grad_norm": 0.129521169878982,
"learning_rate": 2.392412244407294e-06,
"loss": 0.0836,
"step": 538
},
{
"epoch": 1.8747826086956523,
"grad_norm": 0.10885289790789841,
"learning_rate": 2.26184284887927e-06,
"loss": 0.0774,
"step": 539
},
{
"epoch": 1.8782608695652174,
"grad_norm": 0.10488094490283079,
"learning_rate": 2.134896410077891e-06,
"loss": 0.0789,
"step": 540
},
{
"epoch": 1.8817391304347826,
"grad_norm": 0.11889491296378912,
"learning_rate": 2.011577633652062e-06,
"loss": 0.0782,
"step": 541
},
{
"epoch": 1.885217391304348,
"grad_norm": 0.12096235669049085,
"learning_rate": 1.8918910907805732e-06,
"loss": 0.0881,
"step": 542
},
{
"epoch": 1.8886956521739129,
"grad_norm": 0.1106479394276716,
"learning_rate": 1.7758412180026273e-06,
"loss": 0.0802,
"step": 543
},
{
"epoch": 1.8921739130434783,
"grad_norm": 0.12821924742613686,
"learning_rate": 1.6634323170533928e-06,
"loss": 0.0911,
"step": 544
},
{
"epoch": 1.8956521739130436,
"grad_norm": 0.15604807612172736,
"learning_rate": 1.5546685547045192e-06,
"loss": 0.1,
"step": 545
},
{
"epoch": 1.8991304347826086,
"grad_norm": 0.1478681396223387,
"learning_rate": 1.4495539626097288e-06,
"loss": 0.0804,
"step": 546
},
{
"epoch": 1.902608695652174,
"grad_norm": 0.13421748048136942,
"learning_rate": 1.348092437155346e-06,
"loss": 0.089,
"step": 547
},
{
"epoch": 1.906086956521739,
"grad_norm": 0.11687932254739727,
"learning_rate": 1.2502877393158586e-06,
"loss": 0.0871,
"step": 548
},
{
"epoch": 1.9095652173913042,
"grad_norm": 0.15643926713744022,
"learning_rate": 1.1561434945145277e-06,
"loss": 0.104,
"step": 549
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.10696169647909613,
"learning_rate": 1.0656631924889749e-06,
"loss": 0.0716,
"step": 550
},
{
"epoch": 1.9165217391304348,
"grad_norm": 0.14019705935951768,
"learning_rate": 9.788501871618728e-07,
"loss": 0.0898,
"step": 551
},
{
"epoch": 1.92,
"grad_norm": 0.15767772433554056,
"learning_rate": 8.957076965165235e-07,
"loss": 0.1015,
"step": 552
},
{
"epoch": 1.9234782608695653,
"grad_norm": 0.12202925229447881,
"learning_rate": 8.162388024777201e-07,
"loss": 0.0889,
"step": 553
},
{
"epoch": 1.9269565217391305,
"grad_norm": 0.14213284579860058,
"learning_rate": 7.404464507973608e-07,
"loss": 0.1061,
"step": 554
},
{
"epoch": 1.9304347826086956,
"grad_norm": 0.11946138428666646,
"learning_rate": 6.683334509453465e-07,
"loss": 0.0756,
"step": 555
},
{
"epoch": 1.933913043478261,
"grad_norm": 0.1776730484619494,
"learning_rate": 5.999024760054095e-07,
"loss": 0.1156,
"step": 556
},
{
"epoch": 1.9373913043478261,
"grad_norm": 0.15552558119011417,
"learning_rate": 5.351560625760254e-07,
"loss": 0.1111,
"step": 557
},
{
"epoch": 1.9408695652173913,
"grad_norm": 0.1269110866764246,
"learning_rate": 4.7409661067642217e-07,
"loss": 0.0929,
"step": 558
},
{
"epoch": 1.9443478260869567,
"grad_norm": 0.10309350272790443,
"learning_rate": 4.167263836575286e-07,
"loss": 0.0547,
"step": 559
},
{
"epoch": 1.9478260869565216,
"grad_norm": 0.12377918248036159,
"learning_rate": 3.630475081181861e-07,
"loss": 0.0808,
"step": 560
},
{
"epoch": 1.951304347826087,
"grad_norm": 0.12729430798666608,
"learning_rate": 3.1306197382624526e-07,
"loss": 0.077,
"step": 561
},
{
"epoch": 1.9547826086956521,
"grad_norm": 0.11766868772742071,
"learning_rate": 2.667716336448356e-07,
"loss": 0.0871,
"step": 562
},
{
"epoch": 1.9582608695652173,
"grad_norm": 0.12138412723458143,
"learning_rate": 2.2417820346367635e-07,
"loss": 0.0983,
"step": 563
},
{
"epoch": 1.9617391304347827,
"grad_norm": 0.12163696179721654,
"learning_rate": 1.8528326213548274e-07,
"loss": 0.0855,
"step": 564
},
{
"epoch": 1.9652173913043478,
"grad_norm": 0.1569270166290431,
"learning_rate": 1.50088251417424e-07,
"loss": 0.1015,
"step": 565
},
{
"epoch": 1.968695652173913,
"grad_norm": 0.12730784199491677,
"learning_rate": 1.1859447591769934e-07,
"loss": 0.0878,
"step": 566
},
{
"epoch": 1.9721739130434783,
"grad_norm": 0.12648022636737355,
"learning_rate": 9.080310304716567e-08,
"loss": 0.0842,
"step": 567
},
{
"epoch": 1.9756521739130435,
"grad_norm": 0.11283992913356376,
"learning_rate": 6.671516297606095e-08,
"loss": 0.0834,
"step": 568
},
{
"epoch": 1.9791304347826086,
"grad_norm": 0.10119868305303333,
"learning_rate": 4.6331548595845984e-08,
"loss": 0.0667,
"step": 569
},
{
"epoch": 1.982608695652174,
"grad_norm": 0.1227080883131745,
"learning_rate": 2.965301548606414e-08,
"loss": 0.0873,
"step": 570
},
{
"epoch": 1.9860869565217392,
"grad_norm": 0.158380237566967,
"learning_rate": 1.6680181886352676e-08,
"loss": 0.1049,
"step": 571
},
{
"epoch": 1.9895652173913043,
"grad_norm": 0.17246726825049064,
"learning_rate": 7.413528673549941e-09,
"loss": 0.0969,
"step": 572
},
{
"epoch": 1.9930434782608697,
"grad_norm": 0.15178078485673158,
"learning_rate": 1.8533993438318852e-09,
"loss": 0.0884,
"step": 573
},
{
"epoch": 1.9965217391304346,
"grad_norm": 0.1411963796704214,
"learning_rate": 0.0,
"loss": 0.0874,
"step": 574
},
{
"epoch": 1.9965217391304346,
"eval_loss": 0.14970487356185913,
"eval_runtime": 49.8439,
"eval_samples_per_second": 4.795,
"eval_steps_per_second": 0.602,
"step": 574
},
{
"epoch": 1.9965217391304346,
"step": 574,
"total_flos": 465841769250816.0,
"train_loss": 0.11642231966144947,
"train_runtime": 5186.3709,
"train_samples_per_second": 1.772,
"train_steps_per_second": 0.111
}
],
"logging_steps": 1,
"max_steps": 574,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 465841769250816.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}