{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982466393921683,
"eval_steps": 500,
"global_step": 427,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023378141437755697,
"grad_norm": 27.288526825189052,
"learning_rate": 0.0,
"loss": 2.2488,
"step": 1
},
{
"epoch": 0.004675628287551139,
"grad_norm": 18.402597061645437,
"learning_rate": 1.3511907721365987e-06,
"loss": 1.7216,
"step": 2
},
{
"epoch": 0.0070134424313267095,
"grad_norm": 20.827793127279097,
"learning_rate": 2.1415867051569737e-06,
"loss": 1.8096,
"step": 3
},
{
"epoch": 0.009351256575102279,
"grad_norm": 17.887138033549412,
"learning_rate": 2.7023815442731975e-06,
"loss": 1.7273,
"step": 4
},
{
"epoch": 0.011689070718877849,
"grad_norm": 13.059331628307975,
"learning_rate": 3.137367815376517e-06,
"loss": 1.3944,
"step": 5
},
{
"epoch": 0.014026884862653419,
"grad_norm": 12.915220380638836,
"learning_rate": 3.4927774772935725e-06,
"loss": 1.3309,
"step": 6
},
{
"epoch": 0.01636469900642899,
"grad_norm": 9.438170554483921,
"learning_rate": 3.7932720647964956e-06,
"loss": 1.1377,
"step": 7
},
{
"epoch": 0.018702513150204558,
"grad_norm": 22.34498398581149,
"learning_rate": 4.053572316409796e-06,
"loss": 1.276,
"step": 8
},
{
"epoch": 0.02104032729398013,
"grad_norm": 11.688684740044469,
"learning_rate": 4.2831734103139475e-06,
"loss": 1.1626,
"step": 9
},
{
"epoch": 0.023378141437755698,
"grad_norm": 7.399035066119634,
"learning_rate": 4.488558587513117e-06,
"loss": 1.0172,
"step": 10
},
{
"epoch": 0.02571595558153127,
"grad_norm": 6.844826809549996,
"learning_rate": 4.674352079940294e-06,
"loss": 1.0159,
"step": 11
},
{
"epoch": 0.028053769725306838,
"grad_norm": 7.435042287761434,
"learning_rate": 4.843968249430172e-06,
"loss": 1.0191,
"step": 12
},
{
"epoch": 0.030391583869082407,
"grad_norm": 8.142142185574825,
"learning_rate": 5e-06,
"loss": 1.0605,
"step": 13
},
{
"epoch": 0.03272939801285798,
"grad_norm": 6.515223941306322,
"learning_rate": 5e-06,
"loss": 0.9758,
"step": 14
},
{
"epoch": 0.03506721215663355,
"grad_norm": 6.0290403123052965,
"learning_rate": 4.98792270531401e-06,
"loss": 0.995,
"step": 15
},
{
"epoch": 0.037405026300409115,
"grad_norm": 5.319952909218166,
"learning_rate": 4.97584541062802e-06,
"loss": 0.9515,
"step": 16
},
{
"epoch": 0.03974284044418469,
"grad_norm": 5.748688589731486,
"learning_rate": 4.963768115942029e-06,
"loss": 1.0102,
"step": 17
},
{
"epoch": 0.04208065458796026,
"grad_norm": 5.855208805312462,
"learning_rate": 4.951690821256039e-06,
"loss": 0.9919,
"step": 18
},
{
"epoch": 0.04441846873173583,
"grad_norm": 5.223174635069425,
"learning_rate": 4.939613526570048e-06,
"loss": 0.9656,
"step": 19
},
{
"epoch": 0.046756282875511396,
"grad_norm": 6.065299105647371,
"learning_rate": 4.927536231884059e-06,
"loss": 0.9285,
"step": 20
},
{
"epoch": 0.049094097019286964,
"grad_norm": 6.07241545277926,
"learning_rate": 4.915458937198068e-06,
"loss": 1.0017,
"step": 21
},
{
"epoch": 0.05143191116306254,
"grad_norm": 5.246067572533348,
"learning_rate": 4.903381642512078e-06,
"loss": 1.0095,
"step": 22
},
{
"epoch": 0.05376972530683811,
"grad_norm": 5.990327031501364,
"learning_rate": 4.891304347826087e-06,
"loss": 0.9395,
"step": 23
},
{
"epoch": 0.056107539450613676,
"grad_norm": 5.165210682799403,
"learning_rate": 4.879227053140097e-06,
"loss": 0.9407,
"step": 24
},
{
"epoch": 0.058445353594389245,
"grad_norm": 5.126974516324422,
"learning_rate": 4.867149758454107e-06,
"loss": 0.949,
"step": 25
},
{
"epoch": 0.06078316773816481,
"grad_norm": 5.363449994215859,
"learning_rate": 4.855072463768117e-06,
"loss": 0.9574,
"step": 26
},
{
"epoch": 0.06312098188194039,
"grad_norm": 5.071572733466881,
"learning_rate": 4.8429951690821256e-06,
"loss": 0.9008,
"step": 27
},
{
"epoch": 0.06545879602571596,
"grad_norm": 5.410969697138812,
"learning_rate": 4.830917874396135e-06,
"loss": 0.9422,
"step": 28
},
{
"epoch": 0.06779661016949153,
"grad_norm": 4.602908185422313,
"learning_rate": 4.818840579710145e-06,
"loss": 0.8725,
"step": 29
},
{
"epoch": 0.0701344243132671,
"grad_norm": 4.8619656541333836,
"learning_rate": 4.806763285024155e-06,
"loss": 0.9414,
"step": 30
},
{
"epoch": 0.07247223845704266,
"grad_norm": 5.730566605120532,
"learning_rate": 4.794685990338165e-06,
"loss": 0.9039,
"step": 31
},
{
"epoch": 0.07481005260081823,
"grad_norm": 5.038586687201418,
"learning_rate": 4.782608695652174e-06,
"loss": 0.9208,
"step": 32
},
{
"epoch": 0.0771478667445938,
"grad_norm": 5.0552868730926335,
"learning_rate": 4.770531400966184e-06,
"loss": 0.8567,
"step": 33
},
{
"epoch": 0.07948568088836938,
"grad_norm": 4.718130108871858,
"learning_rate": 4.758454106280194e-06,
"loss": 0.9145,
"step": 34
},
{
"epoch": 0.08182349503214495,
"grad_norm": 5.492137838168964,
"learning_rate": 4.746376811594204e-06,
"loss": 0.8846,
"step": 35
},
{
"epoch": 0.08416130917592052,
"grad_norm": 4.796280317690393,
"learning_rate": 4.7342995169082125e-06,
"loss": 0.8973,
"step": 36
},
{
"epoch": 0.08649912331969609,
"grad_norm": 5.097877561946411,
"learning_rate": 4.722222222222222e-06,
"loss": 0.9225,
"step": 37
},
{
"epoch": 0.08883693746347165,
"grad_norm": 5.149693059570453,
"learning_rate": 4.710144927536232e-06,
"loss": 0.9087,
"step": 38
},
{
"epoch": 0.09117475160724722,
"grad_norm": 4.769756789814799,
"learning_rate": 4.698067632850242e-06,
"loss": 0.8372,
"step": 39
},
{
"epoch": 0.09351256575102279,
"grad_norm": 4.303036243240873,
"learning_rate": 4.6859903381642516e-06,
"loss": 0.899,
"step": 40
},
{
"epoch": 0.09585037989479836,
"grad_norm": 5.053977102743315,
"learning_rate": 4.673913043478261e-06,
"loss": 0.8242,
"step": 41
},
{
"epoch": 0.09818819403857393,
"grad_norm": 4.448509206619331,
"learning_rate": 4.661835748792271e-06,
"loss": 0.8513,
"step": 42
},
{
"epoch": 0.1005260081823495,
"grad_norm": 4.8321447335981595,
"learning_rate": 4.649758454106281e-06,
"loss": 0.8224,
"step": 43
},
{
"epoch": 0.10286382232612508,
"grad_norm": 5.416510574830531,
"learning_rate": 4.637681159420291e-06,
"loss": 0.9078,
"step": 44
},
{
"epoch": 0.10520163646990065,
"grad_norm": 5.548877279459332,
"learning_rate": 4.6256038647342995e-06,
"loss": 0.9292,
"step": 45
},
{
"epoch": 0.10753945061367622,
"grad_norm": 5.023304416916682,
"learning_rate": 4.613526570048309e-06,
"loss": 0.8678,
"step": 46
},
{
"epoch": 0.10987726475745178,
"grad_norm": 5.3492127097713995,
"learning_rate": 4.601449275362319e-06,
"loss": 0.8999,
"step": 47
},
{
"epoch": 0.11221507890122735,
"grad_norm": 4.59060914495858,
"learning_rate": 4.589371980676329e-06,
"loss": 0.8611,
"step": 48
},
{
"epoch": 0.11455289304500292,
"grad_norm": 4.659978410728117,
"learning_rate": 4.5772946859903385e-06,
"loss": 0.8553,
"step": 49
},
{
"epoch": 0.11689070718877849,
"grad_norm": 4.869606947497931,
"learning_rate": 4.565217391304348e-06,
"loss": 0.8353,
"step": 50
},
{
"epoch": 0.11922852133255406,
"grad_norm": 4.595092369703616,
"learning_rate": 4.553140096618358e-06,
"loss": 0.8852,
"step": 51
},
{
"epoch": 0.12156633547632963,
"grad_norm": 4.846379704504629,
"learning_rate": 4.541062801932368e-06,
"loss": 0.8516,
"step": 52
},
{
"epoch": 0.12390414962010521,
"grad_norm": 4.762019560202168,
"learning_rate": 4.5289855072463775e-06,
"loss": 0.8621,
"step": 53
},
{
"epoch": 0.12624196376388078,
"grad_norm": 4.823419642191392,
"learning_rate": 4.516908212560387e-06,
"loss": 0.849,
"step": 54
},
{
"epoch": 0.12857977790765635,
"grad_norm": 4.678982332878243,
"learning_rate": 4.504830917874396e-06,
"loss": 0.8595,
"step": 55
},
{
"epoch": 0.1309175920514319,
"grad_norm": 4.695208099270892,
"learning_rate": 4.492753623188406e-06,
"loss": 0.918,
"step": 56
},
{
"epoch": 0.13325540619520748,
"grad_norm": 4.692012801651267,
"learning_rate": 4.480676328502416e-06,
"loss": 0.8879,
"step": 57
},
{
"epoch": 0.13559322033898305,
"grad_norm": 4.459981999724462,
"learning_rate": 4.4685990338164255e-06,
"loss": 0.8878,
"step": 58
},
{
"epoch": 0.13793103448275862,
"grad_norm": 4.2801494436823,
"learning_rate": 4.456521739130435e-06,
"loss": 0.8695,
"step": 59
},
{
"epoch": 0.1402688486265342,
"grad_norm": 4.86123317504702,
"learning_rate": 4.444444444444444e-06,
"loss": 0.839,
"step": 60
},
{
"epoch": 0.14260666277030976,
"grad_norm": 4.621713656381368,
"learning_rate": 4.432367149758455e-06,
"loss": 0.8264,
"step": 61
},
{
"epoch": 0.14494447691408532,
"grad_norm": 4.437318825045428,
"learning_rate": 4.4202898550724645e-06,
"loss": 0.8575,
"step": 62
},
{
"epoch": 0.1472822910578609,
"grad_norm": 4.191896550350781,
"learning_rate": 4.408212560386474e-06,
"loss": 0.8231,
"step": 63
},
{
"epoch": 0.14962010520163646,
"grad_norm": 4.934485372283743,
"learning_rate": 4.396135265700483e-06,
"loss": 0.908,
"step": 64
},
{
"epoch": 0.15195791934541203,
"grad_norm": 5.164473939972992,
"learning_rate": 4.384057971014493e-06,
"loss": 0.8157,
"step": 65
},
{
"epoch": 0.1542957334891876,
"grad_norm": 4.6359554134854655,
"learning_rate": 4.371980676328503e-06,
"loss": 0.8553,
"step": 66
},
{
"epoch": 0.15663354763296317,
"grad_norm": 4.586287061115779,
"learning_rate": 4.3599033816425124e-06,
"loss": 0.8434,
"step": 67
},
{
"epoch": 0.15897136177673876,
"grad_norm": 4.8424129486531,
"learning_rate": 4.347826086956522e-06,
"loss": 0.8788,
"step": 68
},
{
"epoch": 0.16130917592051433,
"grad_norm": 6.155570830239365,
"learning_rate": 4.335748792270532e-06,
"loss": 0.9687,
"step": 69
},
{
"epoch": 0.1636469900642899,
"grad_norm": 4.812494847678857,
"learning_rate": 4.323671497584541e-06,
"loss": 0.9001,
"step": 70
},
{
"epoch": 0.16598480420806547,
"grad_norm": 4.5207315366098255,
"learning_rate": 4.3115942028985515e-06,
"loss": 0.8164,
"step": 71
},
{
"epoch": 0.16832261835184104,
"grad_norm": 4.468118689699742,
"learning_rate": 4.299516908212561e-06,
"loss": 0.828,
"step": 72
},
{
"epoch": 0.1706604324956166,
"grad_norm": 4.957803820804726,
"learning_rate": 4.28743961352657e-06,
"loss": 0.8509,
"step": 73
},
{
"epoch": 0.17299824663939217,
"grad_norm": 4.994668979616406,
"learning_rate": 4.27536231884058e-06,
"loss": 0.8264,
"step": 74
},
{
"epoch": 0.17533606078316774,
"grad_norm": 5.051317651149785,
"learning_rate": 4.26328502415459e-06,
"loss": 0.8575,
"step": 75
},
{
"epoch": 0.1776738749269433,
"grad_norm": 4.982871471593161,
"learning_rate": 4.251207729468599e-06,
"loss": 0.7766,
"step": 76
},
{
"epoch": 0.18001168907071888,
"grad_norm": 4.812654963388801,
"learning_rate": 4.239130434782609e-06,
"loss": 0.842,
"step": 77
},
{
"epoch": 0.18234950321449445,
"grad_norm": 4.849638872368005,
"learning_rate": 4.227053140096619e-06,
"loss": 0.8493,
"step": 78
},
{
"epoch": 0.18468731735827001,
"grad_norm": 4.941754403496056,
"learning_rate": 4.214975845410628e-06,
"loss": 0.8705,
"step": 79
},
{
"epoch": 0.18702513150204558,
"grad_norm": 4.118521255369774,
"learning_rate": 4.202898550724638e-06,
"loss": 0.9022,
"step": 80
},
{
"epoch": 0.18936294564582115,
"grad_norm": 5.048580106033392,
"learning_rate": 4.190821256038647e-06,
"loss": 0.8431,
"step": 81
},
{
"epoch": 0.19170075978959672,
"grad_norm": 5.383766123063546,
"learning_rate": 4.178743961352658e-06,
"loss": 0.8892,
"step": 82
},
{
"epoch": 0.1940385739333723,
"grad_norm": 4.850111002487489,
"learning_rate": 4.166666666666667e-06,
"loss": 0.9147,
"step": 83
},
{
"epoch": 0.19637638807714786,
"grad_norm": 4.703827358788699,
"learning_rate": 4.154589371980677e-06,
"loss": 0.8407,
"step": 84
},
{
"epoch": 0.19871420222092342,
"grad_norm": 4.5132494951253275,
"learning_rate": 4.142512077294686e-06,
"loss": 0.859,
"step": 85
},
{
"epoch": 0.201052016364699,
"grad_norm": 4.425801289741148,
"learning_rate": 4.130434782608696e-06,
"loss": 0.8643,
"step": 86
},
{
"epoch": 0.2033898305084746,
"grad_norm": 4.6519866473202285,
"learning_rate": 4.118357487922706e-06,
"loss": 0.8559,
"step": 87
},
{
"epoch": 0.20572764465225016,
"grad_norm": 4.271767242791549,
"learning_rate": 4.106280193236716e-06,
"loss": 0.8115,
"step": 88
},
{
"epoch": 0.20806545879602573,
"grad_norm": 5.056579518750136,
"learning_rate": 4.0942028985507246e-06,
"loss": 0.8447,
"step": 89
},
{
"epoch": 0.2104032729398013,
"grad_norm": 4.075100416572746,
"learning_rate": 4.082125603864734e-06,
"loss": 0.7837,
"step": 90
},
{
"epoch": 0.21274108708357686,
"grad_norm": 4.393779666632264,
"learning_rate": 4.070048309178744e-06,
"loss": 0.8368,
"step": 91
},
{
"epoch": 0.21507890122735243,
"grad_norm": 4.322824034406939,
"learning_rate": 4.057971014492754e-06,
"loss": 0.7942,
"step": 92
},
{
"epoch": 0.217416715371128,
"grad_norm": 4.691982719838354,
"learning_rate": 4.045893719806764e-06,
"loss": 0.8384,
"step": 93
},
{
"epoch": 0.21975452951490357,
"grad_norm": 4.749714290659545,
"learning_rate": 4.033816425120773e-06,
"loss": 0.86,
"step": 94
},
{
"epoch": 0.22209234365867914,
"grad_norm": 4.49073749526097,
"learning_rate": 4.021739130434783e-06,
"loss": 0.8796,
"step": 95
},
{
"epoch": 0.2244301578024547,
"grad_norm": 4.612026680332374,
"learning_rate": 4.009661835748793e-06,
"loss": 0.7836,
"step": 96
},
{
"epoch": 0.22676797194623027,
"grad_norm": 4.5466671401291165,
"learning_rate": 3.997584541062803e-06,
"loss": 0.8213,
"step": 97
},
{
"epoch": 0.22910578609000584,
"grad_norm": 4.578959418279228,
"learning_rate": 3.9855072463768115e-06,
"loss": 0.8302,
"step": 98
},
{
"epoch": 0.2314436002337814,
"grad_norm": 4.471310182272502,
"learning_rate": 3.973429951690821e-06,
"loss": 0.8386,
"step": 99
},
{
"epoch": 0.23378141437755698,
"grad_norm": 4.444066950873127,
"learning_rate": 3.961352657004831e-06,
"loss": 0.8672,
"step": 100
},
{
"epoch": 0.23611922852133255,
"grad_norm": 4.08994098536812,
"learning_rate": 3.949275362318841e-06,
"loss": 0.7914,
"step": 101
},
{
"epoch": 0.23845704266510812,
"grad_norm": 5.867972858556011,
"learning_rate": 3.9371980676328506e-06,
"loss": 0.834,
"step": 102
},
{
"epoch": 0.24079485680888368,
"grad_norm": 4.33178424044995,
"learning_rate": 3.92512077294686e-06,
"loss": 0.8312,
"step": 103
},
{
"epoch": 0.24313267095265925,
"grad_norm": 4.422360019571021,
"learning_rate": 3.91304347826087e-06,
"loss": 0.8054,
"step": 104
},
{
"epoch": 0.24547048509643482,
"grad_norm": 4.540760031449362,
"learning_rate": 3.90096618357488e-06,
"loss": 0.8011,
"step": 105
},
{
"epoch": 0.24780829924021042,
"grad_norm": 4.577644817701169,
"learning_rate": 3.88888888888889e-06,
"loss": 0.7851,
"step": 106
},
{
"epoch": 0.25014611338398596,
"grad_norm": 4.750903595759052,
"learning_rate": 3.8768115942028985e-06,
"loss": 0.8496,
"step": 107
},
{
"epoch": 0.25248392752776155,
"grad_norm": 4.744977001781623,
"learning_rate": 3.864734299516908e-06,
"loss": 0.8218,
"step": 108
},
{
"epoch": 0.2548217416715371,
"grad_norm": 4.548950141262851,
"learning_rate": 3.852657004830918e-06,
"loss": 0.8053,
"step": 109
},
{
"epoch": 0.2571595558153127,
"grad_norm": 4.44828603075951,
"learning_rate": 3.840579710144928e-06,
"loss": 0.8231,
"step": 110
},
{
"epoch": 0.25949736995908823,
"grad_norm": 4.672161591073822,
"learning_rate": 3.8285024154589375e-06,
"loss": 0.8389,
"step": 111
},
{
"epoch": 0.2618351841028638,
"grad_norm": 4.526274586937092,
"learning_rate": 3.816425120772947e-06,
"loss": 0.8683,
"step": 112
},
{
"epoch": 0.26417299824663937,
"grad_norm": 4.603415978914653,
"learning_rate": 3.804347826086957e-06,
"loss": 0.8206,
"step": 113
},
{
"epoch": 0.26651081239041496,
"grad_norm": 4.343843088593362,
"learning_rate": 3.792270531400967e-06,
"loss": 0.823,
"step": 114
},
{
"epoch": 0.2688486265341905,
"grad_norm": 4.131180727748698,
"learning_rate": 3.780193236714976e-06,
"loss": 0.7964,
"step": 115
},
{
"epoch": 0.2711864406779661,
"grad_norm": 5.611563677944062,
"learning_rate": 3.768115942028986e-06,
"loss": 0.8529,
"step": 116
},
{
"epoch": 0.2735242548217417,
"grad_norm": 4.315382063517201,
"learning_rate": 3.7560386473429956e-06,
"loss": 0.7849,
"step": 117
},
{
"epoch": 0.27586206896551724,
"grad_norm": 4.3301657812789776,
"learning_rate": 3.743961352657005e-06,
"loss": 0.8392,
"step": 118
},
{
"epoch": 0.27819988310929283,
"grad_norm": 4.763659062354643,
"learning_rate": 3.7318840579710147e-06,
"loss": 0.7846,
"step": 119
},
{
"epoch": 0.2805376972530684,
"grad_norm": 4.531318611414816,
"learning_rate": 3.7198067632850245e-06,
"loss": 0.8335,
"step": 120
},
{
"epoch": 0.28287551139684397,
"grad_norm": 4.4418077648050485,
"learning_rate": 3.707729468599034e-06,
"loss": 0.7858,
"step": 121
},
{
"epoch": 0.2852133255406195,
"grad_norm": 4.39068842397474,
"learning_rate": 3.6956521739130436e-06,
"loss": 0.8408,
"step": 122
},
{
"epoch": 0.2875511396843951,
"grad_norm": 4.585137838540199,
"learning_rate": 3.6835748792270538e-06,
"loss": 0.8316,
"step": 123
},
{
"epoch": 0.28988895382817065,
"grad_norm": 4.319672080062613,
"learning_rate": 3.6714975845410635e-06,
"loss": 0.8241,
"step": 124
},
{
"epoch": 0.29222676797194624,
"grad_norm": 4.131090234388279,
"learning_rate": 3.659420289855073e-06,
"loss": 0.7416,
"step": 125
},
{
"epoch": 0.2945645821157218,
"grad_norm": 4.081456252490184,
"learning_rate": 3.6473429951690826e-06,
"loss": 0.7958,
"step": 126
},
{
"epoch": 0.2969023962594974,
"grad_norm": 4.090503599319394,
"learning_rate": 3.635265700483092e-06,
"loss": 0.8096,
"step": 127
},
{
"epoch": 0.2992402104032729,
"grad_norm": 4.129285724564573,
"learning_rate": 3.6231884057971017e-06,
"loss": 0.7918,
"step": 128
},
{
"epoch": 0.3015780245470485,
"grad_norm": 4.506022555765926,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.8333,
"step": 129
},
{
"epoch": 0.30391583869082406,
"grad_norm": 4.151575198600969,
"learning_rate": 3.5990338164251208e-06,
"loss": 0.7713,
"step": 130
},
{
"epoch": 0.30625365283459965,
"grad_norm": 4.614683656771631,
"learning_rate": 3.5869565217391305e-06,
"loss": 0.8298,
"step": 131
},
{
"epoch": 0.3085914669783752,
"grad_norm": 4.6094981031628075,
"learning_rate": 3.5748792270531403e-06,
"loss": 0.8217,
"step": 132
},
{
"epoch": 0.3109292811221508,
"grad_norm": 4.2999582776551675,
"learning_rate": 3.5628019323671496e-06,
"loss": 0.7968,
"step": 133
},
{
"epoch": 0.31326709526592633,
"grad_norm": 4.864198700798981,
"learning_rate": 3.55072463768116e-06,
"loss": 0.8141,
"step": 134
},
{
"epoch": 0.31560490940970193,
"grad_norm": 4.601546334463328,
"learning_rate": 3.5386473429951696e-06,
"loss": 0.7925,
"step": 135
},
{
"epoch": 0.3179427235534775,
"grad_norm": 4.089485101723296,
"learning_rate": 3.5265700483091793e-06,
"loss": 0.7873,
"step": 136
},
{
"epoch": 0.32028053769725306,
"grad_norm": 4.0777367885745806,
"learning_rate": 3.5144927536231887e-06,
"loss": 0.7985,
"step": 137
},
{
"epoch": 0.32261835184102866,
"grad_norm": 4.832689220436005,
"learning_rate": 3.5024154589371984e-06,
"loss": 0.8306,
"step": 138
},
{
"epoch": 0.3249561659848042,
"grad_norm": 4.888417681228503,
"learning_rate": 3.490338164251208e-06,
"loss": 0.8353,
"step": 139
},
{
"epoch": 0.3272939801285798,
"grad_norm": 4.28948650105686,
"learning_rate": 3.4782608695652175e-06,
"loss": 0.8057,
"step": 140
},
{
"epoch": 0.32963179427235534,
"grad_norm": 4.203178774124529,
"learning_rate": 3.4661835748792273e-06,
"loss": 0.7788,
"step": 141
},
{
"epoch": 0.33196960841613093,
"grad_norm": 4.637106026831514,
"learning_rate": 3.4541062801932366e-06,
"loss": 0.8521,
"step": 142
},
{
"epoch": 0.3343074225599065,
"grad_norm": 4.350395114537057,
"learning_rate": 3.4420289855072464e-06,
"loss": 0.7968,
"step": 143
},
{
"epoch": 0.33664523670368207,
"grad_norm": 4.474607456827939,
"learning_rate": 3.4299516908212565e-06,
"loss": 0.8208,
"step": 144
},
{
"epoch": 0.3389830508474576,
"grad_norm": 4.100288353060924,
"learning_rate": 3.4178743961352663e-06,
"loss": 0.8165,
"step": 145
},
{
"epoch": 0.3413208649912332,
"grad_norm": 4.6247997756094845,
"learning_rate": 3.4057971014492756e-06,
"loss": 0.8294,
"step": 146
},
{
"epoch": 0.34365867913500875,
"grad_norm": 4.525169765596723,
"learning_rate": 3.3937198067632854e-06,
"loss": 0.7713,
"step": 147
},
{
"epoch": 0.34599649327878435,
"grad_norm": 4.442206881786442,
"learning_rate": 3.381642512077295e-06,
"loss": 0.82,
"step": 148
},
{
"epoch": 0.3483343074225599,
"grad_norm": 4.225556484795958,
"learning_rate": 3.3695652173913045e-06,
"loss": 0.7886,
"step": 149
},
{
"epoch": 0.3506721215663355,
"grad_norm": 4.268743583707888,
"learning_rate": 3.3574879227053142e-06,
"loss": 0.7762,
"step": 150
},
{
"epoch": 0.353009935710111,
"grad_norm": 4.338428118785664,
"learning_rate": 3.345410628019324e-06,
"loss": 0.7719,
"step": 151
},
{
"epoch": 0.3553477498538866,
"grad_norm": 4.188696391446484,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7745,
"step": 152
},
{
"epoch": 0.35768556399766216,
"grad_norm": 4.310914121193176,
"learning_rate": 3.321256038647343e-06,
"loss": 0.8188,
"step": 153
},
{
"epoch": 0.36002337814143776,
"grad_norm": 4.391656829031555,
"learning_rate": 3.3091787439613533e-06,
"loss": 0.8148,
"step": 154
},
{
"epoch": 0.36236119228521335,
"grad_norm": 4.104259018738402,
"learning_rate": 3.2971014492753626e-06,
"loss": 0.8519,
"step": 155
},
{
"epoch": 0.3646990064289889,
"grad_norm": 4.25629990334181,
"learning_rate": 3.2850241545893724e-06,
"loss": 0.7983,
"step": 156
},
{
"epoch": 0.3670368205727645,
"grad_norm": 4.134990269789036,
"learning_rate": 3.272946859903382e-06,
"loss": 0.7852,
"step": 157
},
{
"epoch": 0.36937463471654003,
"grad_norm": 4.0420970622040135,
"learning_rate": 3.2608695652173914e-06,
"loss": 0.7992,
"step": 158
},
{
"epoch": 0.3717124488603156,
"grad_norm": 4.341222672754024,
"learning_rate": 3.248792270531401e-06,
"loss": 0.7704,
"step": 159
},
{
"epoch": 0.37405026300409117,
"grad_norm": 4.115523347634753,
"learning_rate": 3.236714975845411e-06,
"loss": 0.791,
"step": 160
},
{
"epoch": 0.37638807714786676,
"grad_norm": 4.136587110231359,
"learning_rate": 3.2246376811594203e-06,
"loss": 0.7752,
"step": 161
},
{
"epoch": 0.3787258912916423,
"grad_norm": 4.504460772929252,
"learning_rate": 3.21256038647343e-06,
"loss": 0.8176,
"step": 162
},
{
"epoch": 0.3810637054354179,
"grad_norm": 4.629377407126395,
"learning_rate": 3.20048309178744e-06,
"loss": 0.8275,
"step": 163
},
{
"epoch": 0.38340151957919344,
"grad_norm": 4.422477761962599,
"learning_rate": 3.188405797101449e-06,
"loss": 0.7847,
"step": 164
},
{
"epoch": 0.38573933372296904,
"grad_norm": 3.9888038106102153,
"learning_rate": 3.1763285024154593e-06,
"loss": 0.7939,
"step": 165
},
{
"epoch": 0.3880771478667446,
"grad_norm": 4.125918892903183,
"learning_rate": 3.164251207729469e-06,
"loss": 0.7717,
"step": 166
},
{
"epoch": 0.3904149620105202,
"grad_norm": 6.885413034719951,
"learning_rate": 3.152173913043479e-06,
"loss": 0.8514,
"step": 167
},
{
"epoch": 0.3927527761542957,
"grad_norm": 4.446340003037039,
"learning_rate": 3.140096618357488e-06,
"loss": 0.813,
"step": 168
},
{
"epoch": 0.3950905902980713,
"grad_norm": 3.9959566422822346,
"learning_rate": 3.128019323671498e-06,
"loss": 0.7776,
"step": 169
},
{
"epoch": 0.39742840444184685,
"grad_norm": 4.627421389612255,
"learning_rate": 3.1159420289855073e-06,
"loss": 0.8395,
"step": 170
},
{
"epoch": 0.39976621858562245,
"grad_norm": 4.118715295949323,
"learning_rate": 3.103864734299517e-06,
"loss": 0.7824,
"step": 171
},
{
"epoch": 0.402104032729398,
"grad_norm": 4.109354113391549,
"learning_rate": 3.0917874396135268e-06,
"loss": 0.7961,
"step": 172
},
{
"epoch": 0.4044418468731736,
"grad_norm": 4.439845150489727,
"learning_rate": 3.079710144927536e-06,
"loss": 0.8035,
"step": 173
},
{
"epoch": 0.4067796610169492,
"grad_norm": 4.358250626799815,
"learning_rate": 3.067632850241546e-06,
"loss": 0.7829,
"step": 174
},
{
"epoch": 0.4091174751607247,
"grad_norm": 4.43053050152037,
"learning_rate": 3.055555555555556e-06,
"loss": 0.7554,
"step": 175
},
{
"epoch": 0.4114552893045003,
"grad_norm": 4.324105830729812,
"learning_rate": 3.043478260869566e-06,
"loss": 0.7763,
"step": 176
},
{
"epoch": 0.41379310344827586,
"grad_norm": 4.505708676229393,
"learning_rate": 3.031400966183575e-06,
"loss": 0.8052,
"step": 177
},
{
"epoch": 0.41613091759205145,
"grad_norm": 4.198009455233572,
"learning_rate": 3.019323671497585e-06,
"loss": 0.8036,
"step": 178
},
{
"epoch": 0.418468731735827,
"grad_norm": 4.255888057785401,
"learning_rate": 3.0072463768115946e-06,
"loss": 0.8675,
"step": 179
},
{
"epoch": 0.4208065458796026,
"grad_norm": 4.166498155365259,
"learning_rate": 2.995169082125604e-06,
"loss": 0.8099,
"step": 180
},
{
"epoch": 0.42314436002337813,
"grad_norm": 4.471408293419965,
"learning_rate": 2.9830917874396137e-06,
"loss": 0.8025,
"step": 181
},
{
"epoch": 0.4254821741671537,
"grad_norm": 4.910816764257679,
"learning_rate": 2.9710144927536235e-06,
"loss": 0.7702,
"step": 182
},
{
"epoch": 0.42781998831092927,
"grad_norm": 4.071039233797094,
"learning_rate": 2.958937198067633e-06,
"loss": 0.8143,
"step": 183
},
{
"epoch": 0.43015780245470486,
"grad_norm": 4.738565032335615,
"learning_rate": 2.9468599033816426e-06,
"loss": 0.8158,
"step": 184
},
{
"epoch": 0.4324956165984804,
"grad_norm": 4.2936029356268195,
"learning_rate": 2.9347826086956528e-06,
"loss": 0.7874,
"step": 185
},
{
"epoch": 0.434833430742256,
"grad_norm": 4.206590096270031,
"learning_rate": 2.922705314009662e-06,
"loss": 0.7997,
"step": 186
},
{
"epoch": 0.43717124488603154,
"grad_norm": 4.2051171328892085,
"learning_rate": 2.910628019323672e-06,
"loss": 0.787,
"step": 187
},
{
"epoch": 0.43950905902980714,
"grad_norm": 4.245918333471198,
"learning_rate": 2.8985507246376816e-06,
"loss": 0.7997,
"step": 188
},
{
"epoch": 0.4418468731735827,
"grad_norm": 4.179370789694772,
"learning_rate": 2.886473429951691e-06,
"loss": 0.7759,
"step": 189
},
{
"epoch": 0.4441846873173583,
"grad_norm": 4.41515451612343,
"learning_rate": 2.8743961352657007e-06,
"loss": 0.7886,
"step": 190
},
{
"epoch": 0.4465225014611338,
"grad_norm": 4.227222440463386,
"learning_rate": 2.8623188405797105e-06,
"loss": 0.8294,
"step": 191
},
{
"epoch": 0.4488603156049094,
"grad_norm": 4.095256731977333,
"learning_rate": 2.85024154589372e-06,
"loss": 0.7604,
"step": 192
},
{
"epoch": 0.451198129748685,
"grad_norm": 4.286339845869899,
"learning_rate": 2.8381642512077295e-06,
"loss": 0.8237,
"step": 193
},
{
"epoch": 0.45353594389246055,
"grad_norm": 4.141328341649525,
"learning_rate": 2.8260869565217393e-06,
"loss": 0.7961,
"step": 194
},
{
"epoch": 0.45587375803623614,
"grad_norm": 4.522982085235291,
"learning_rate": 2.8140096618357486e-06,
"loss": 0.7918,
"step": 195
},
{
"epoch": 0.4582115721800117,
"grad_norm": 4.9933547683547745,
"learning_rate": 2.801932367149759e-06,
"loss": 0.8151,
"step": 196
},
{
"epoch": 0.4605493863237873,
"grad_norm": 3.8642864404581463,
"learning_rate": 2.7898550724637686e-06,
"loss": 0.7411,
"step": 197
},
{
"epoch": 0.4628872004675628,
"grad_norm": 4.304180579272247,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.7975,
"step": 198
},
{
"epoch": 0.4652250146113384,
"grad_norm": 4.246581554029021,
"learning_rate": 2.7657004830917877e-06,
"loss": 0.829,
"step": 199
},
{
"epoch": 0.46756282875511396,
"grad_norm": 4.257923593734172,
"learning_rate": 2.7536231884057974e-06,
"loss": 0.7475,
"step": 200
},
{
"epoch": 0.46990064289888955,
"grad_norm": 4.001585884428085,
"learning_rate": 2.7415458937198068e-06,
"loss": 0.7866,
"step": 201
},
{
"epoch": 0.4722384570426651,
"grad_norm": 4.064057741085377,
"learning_rate": 2.7294685990338165e-06,
"loss": 0.7861,
"step": 202
},
{
"epoch": 0.4745762711864407,
"grad_norm": 4.10748108128691,
"learning_rate": 2.7173913043478263e-06,
"loss": 0.7735,
"step": 203
},
{
"epoch": 0.47691408533021623,
"grad_norm": 3.9433247912828455,
"learning_rate": 2.7053140096618356e-06,
"loss": 0.7494,
"step": 204
},
{
"epoch": 0.4792518994739918,
"grad_norm": 4.368990885761068,
"learning_rate": 2.6932367149758454e-06,
"loss": 0.7961,
"step": 205
},
{
"epoch": 0.48158971361776737,
"grad_norm": 4.323297445539955,
"learning_rate": 2.6811594202898555e-06,
"loss": 0.7498,
"step": 206
},
{
"epoch": 0.48392752776154296,
"grad_norm": 4.276797241413841,
"learning_rate": 2.6690821256038653e-06,
"loss": 0.79,
"step": 207
},
{
"epoch": 0.4862653419053185,
"grad_norm": 4.29615738858519,
"learning_rate": 2.6570048309178746e-06,
"loss": 0.7762,
"step": 208
},
{
"epoch": 0.4886031560490941,
"grad_norm": 4.24658335062537,
"learning_rate": 2.6449275362318844e-06,
"loss": 0.7547,
"step": 209
},
{
"epoch": 0.49094097019286964,
"grad_norm": 4.140652638469078,
"learning_rate": 2.632850241545894e-06,
"loss": 0.7568,
"step": 210
},
{
"epoch": 0.49327878433664524,
"grad_norm": 4.355835930781116,
"learning_rate": 2.6207729468599035e-06,
"loss": 0.8005,
"step": 211
},
{
"epoch": 0.49561659848042083,
"grad_norm": 4.1002906789316045,
"learning_rate": 2.6086956521739132e-06,
"loss": 0.7791,
"step": 212
},
{
"epoch": 0.4979544126241964,
"grad_norm": 4.210038749172179,
"learning_rate": 2.596618357487923e-06,
"loss": 0.7777,
"step": 213
},
{
"epoch": 0.5002922267679719,
"grad_norm": 4.1435757469488985,
"learning_rate": 2.5845410628019323e-06,
"loss": 0.7824,
"step": 214
},
{
"epoch": 0.5026300409117476,
"grad_norm": 4.309944612009968,
"learning_rate": 2.572463768115942e-06,
"loss": 0.7625,
"step": 215
},
{
"epoch": 0.5049678550555231,
"grad_norm": 4.662526042139382,
"learning_rate": 2.5603864734299523e-06,
"loss": 0.7873,
"step": 216
},
{
"epoch": 0.5073056691992986,
"grad_norm": 4.473614799031895,
"learning_rate": 2.5483091787439616e-06,
"loss": 0.7737,
"step": 217
},
{
"epoch": 0.5096434833430742,
"grad_norm": 4.54082051832202,
"learning_rate": 2.5362318840579714e-06,
"loss": 0.782,
"step": 218
},
{
"epoch": 0.5119812974868498,
"grad_norm": 3.9808775866846817,
"learning_rate": 2.524154589371981e-06,
"loss": 0.7592,
"step": 219
},
{
"epoch": 0.5143191116306254,
"grad_norm": 4.233088111283031,
"learning_rate": 2.5120772946859904e-06,
"loss": 0.774,
"step": 220
},
{
"epoch": 0.5166569257744009,
"grad_norm": 4.179314655537464,
"learning_rate": 2.5e-06,
"loss": 0.7936,
"step": 221
},
{
"epoch": 0.5189947399181765,
"grad_norm": 4.808766886416466,
"learning_rate": 2.48792270531401e-06,
"loss": 0.7961,
"step": 222
},
{
"epoch": 0.5213325540619521,
"grad_norm": 4.088801764052967,
"learning_rate": 2.4758454106280193e-06,
"loss": 0.7693,
"step": 223
},
{
"epoch": 0.5236703682057277,
"grad_norm": 4.1844548782576005,
"learning_rate": 2.4637681159420295e-06,
"loss": 0.7961,
"step": 224
},
{
"epoch": 0.5260081823495032,
"grad_norm": 3.909844659514703,
"learning_rate": 2.451690821256039e-06,
"loss": 0.7304,
"step": 225
},
{
"epoch": 0.5283459964932787,
"grad_norm": 3.7096435860994346,
"learning_rate": 2.4396135265700486e-06,
"loss": 0.7712,
"step": 226
},
{
"epoch": 0.5306838106370544,
"grad_norm": 4.0389484559123305,
"learning_rate": 2.4275362318840583e-06,
"loss": 0.7711,
"step": 227
},
{
"epoch": 0.5330216247808299,
"grad_norm": 4.171802534409844,
"learning_rate": 2.4154589371980677e-06,
"loss": 0.7768,
"step": 228
},
{
"epoch": 0.5353594389246055,
"grad_norm": 4.636520882862149,
"learning_rate": 2.4033816425120774e-06,
"loss": 0.7832,
"step": 229
},
{
"epoch": 0.537697253068381,
"grad_norm": 4.2073440647978675,
"learning_rate": 2.391304347826087e-06,
"loss": 0.7816,
"step": 230
},
{
"epoch": 0.5400350672121567,
"grad_norm": 4.115009346971059,
"learning_rate": 2.379227053140097e-06,
"loss": 0.7152,
"step": 231
},
{
"epoch": 0.5423728813559322,
"grad_norm": 4.47134068227285,
"learning_rate": 2.3671497584541063e-06,
"loss": 0.7898,
"step": 232
},
{
"epoch": 0.5447106954997077,
"grad_norm": 4.78251740854767,
"learning_rate": 2.355072463768116e-06,
"loss": 0.8101,
"step": 233
},
{
"epoch": 0.5470485096434834,
"grad_norm": 4.735288223469208,
"learning_rate": 2.3429951690821258e-06,
"loss": 0.7864,
"step": 234
},
{
"epoch": 0.5493863237872589,
"grad_norm": 4.445520808429391,
"learning_rate": 2.3309178743961355e-06,
"loss": 0.7986,
"step": 235
},
{
"epoch": 0.5517241379310345,
"grad_norm": 4.83504723163877,
"learning_rate": 2.3188405797101453e-06,
"loss": 0.8231,
"step": 236
},
{
"epoch": 0.55406195207481,
"grad_norm": 3.9498177063802897,
"learning_rate": 2.3067632850241546e-06,
"loss": 0.7834,
"step": 237
},
{
"epoch": 0.5563997662185857,
"grad_norm": 4.190234074575243,
"learning_rate": 2.2946859903381644e-06,
"loss": 0.7839,
"step": 238
},
{
"epoch": 0.5587375803623612,
"grad_norm": 4.76462271734834,
"learning_rate": 2.282608695652174e-06,
"loss": 0.8258,
"step": 239
},
{
"epoch": 0.5610753945061367,
"grad_norm": 4.369965626736373,
"learning_rate": 2.270531400966184e-06,
"loss": 0.7927,
"step": 240
},
{
"epoch": 0.5634132086499123,
"grad_norm": 4.423067504974851,
"learning_rate": 2.2584541062801937e-06,
"loss": 0.8181,
"step": 241
},
{
"epoch": 0.5657510227936879,
"grad_norm": 4.117514088831818,
"learning_rate": 2.246376811594203e-06,
"loss": 0.7471,
"step": 242
},
{
"epoch": 0.5680888369374635,
"grad_norm": 4.208191494707427,
"learning_rate": 2.2342995169082127e-06,
"loss": 0.7936,
"step": 243
},
{
"epoch": 0.570426651081239,
"grad_norm": 4.30348767627021,
"learning_rate": 2.222222222222222e-06,
"loss": 0.8087,
"step": 244
},
{
"epoch": 0.5727644652250146,
"grad_norm": 4.08781387103947,
"learning_rate": 2.2101449275362323e-06,
"loss": 0.7712,
"step": 245
},
{
"epoch": 0.5751022793687902,
"grad_norm": 4.255214633571236,
"learning_rate": 2.1980676328502416e-06,
"loss": 0.7327,
"step": 246
},
{
"epoch": 0.5774400935125658,
"grad_norm": 4.249395888532918,
"learning_rate": 2.1859903381642513e-06,
"loss": 0.8115,
"step": 247
},
{
"epoch": 0.5797779076563413,
"grad_norm": 4.048350886158577,
"learning_rate": 2.173913043478261e-06,
"loss": 0.7629,
"step": 248
},
{
"epoch": 0.5821157218001168,
"grad_norm": 4.286991029118236,
"learning_rate": 2.1618357487922704e-06,
"loss": 0.7748,
"step": 249
},
{
"epoch": 0.5844535359438925,
"grad_norm": 4.473519294462659,
"learning_rate": 2.1497584541062806e-06,
"loss": 0.7786,
"step": 250
},
{
"epoch": 0.586791350087668,
"grad_norm": 4.511510327301669,
"learning_rate": 2.13768115942029e-06,
"loss": 0.8125,
"step": 251
},
{
"epoch": 0.5891291642314436,
"grad_norm": 4.198745204040387,
"learning_rate": 2.1256038647342997e-06,
"loss": 0.7843,
"step": 252
},
{
"epoch": 0.5914669783752192,
"grad_norm": 4.3568648354588655,
"learning_rate": 2.1135265700483095e-06,
"loss": 0.7346,
"step": 253
},
{
"epoch": 0.5938047925189948,
"grad_norm": 3.8942460823301412,
"learning_rate": 2.101449275362319e-06,
"loss": 0.7879,
"step": 254
},
{
"epoch": 0.5961426066627703,
"grad_norm": 4.221148903821956,
"learning_rate": 2.089371980676329e-06,
"loss": 0.799,
"step": 255
},
{
"epoch": 0.5984804208065458,
"grad_norm": 4.041691704636457,
"learning_rate": 2.0772946859903383e-06,
"loss": 0.767,
"step": 256
},
{
"epoch": 0.6008182349503215,
"grad_norm": 4.03197715174544,
"learning_rate": 2.065217391304348e-06,
"loss": 0.7487,
"step": 257
},
{
"epoch": 0.603156049094097,
"grad_norm": 4.082902353599498,
"learning_rate": 2.053140096618358e-06,
"loss": 0.7874,
"step": 258
},
{
"epoch": 0.6054938632378726,
"grad_norm": 3.7781639431570557,
"learning_rate": 2.041062801932367e-06,
"loss": 0.7721,
"step": 259
},
{
"epoch": 0.6078316773816481,
"grad_norm": 4.280421267303715,
"learning_rate": 2.028985507246377e-06,
"loss": 0.783,
"step": 260
},
{
"epoch": 0.6101694915254238,
"grad_norm": 4.073869260462684,
"learning_rate": 2.0169082125603867e-06,
"loss": 0.7759,
"step": 261
},
{
"epoch": 0.6125073056691993,
"grad_norm": 3.935130784068012,
"learning_rate": 2.0048309178743964e-06,
"loss": 0.7669,
"step": 262
},
{
"epoch": 0.6148451198129748,
"grad_norm": 4.40643829592683,
"learning_rate": 1.9927536231884058e-06,
"loss": 0.7572,
"step": 263
},
{
"epoch": 0.6171829339567504,
"grad_norm": 4.337844456783807,
"learning_rate": 1.9806763285024155e-06,
"loss": 0.7605,
"step": 264
},
{
"epoch": 0.619520748100526,
"grad_norm": 4.281102087431204,
"learning_rate": 1.9685990338164253e-06,
"loss": 0.7393,
"step": 265
},
{
"epoch": 0.6218585622443016,
"grad_norm": 4.23207914041172,
"learning_rate": 1.956521739130435e-06,
"loss": 0.7794,
"step": 266
},
{
"epoch": 0.6241963763880771,
"grad_norm": 3.9282868393703896,
"learning_rate": 1.944444444444445e-06,
"loss": 0.7782,
"step": 267
},
{
"epoch": 0.6265341905318527,
"grad_norm": 4.098138917146235,
"learning_rate": 1.932367149758454e-06,
"loss": 0.7725,
"step": 268
},
{
"epoch": 0.6288720046756283,
"grad_norm": 4.141313603560724,
"learning_rate": 1.920289855072464e-06,
"loss": 0.7785,
"step": 269
},
{
"epoch": 0.6312098188194039,
"grad_norm": 4.611198038918517,
"learning_rate": 1.9082125603864736e-06,
"loss": 0.8185,
"step": 270
},
{
"epoch": 0.6335476329631794,
"grad_norm": 4.452172749748544,
"learning_rate": 1.8961352657004834e-06,
"loss": 0.7703,
"step": 271
},
{
"epoch": 0.635885447106955,
"grad_norm": 4.454099100217199,
"learning_rate": 1.884057971014493e-06,
"loss": 0.7756,
"step": 272
},
{
"epoch": 0.6382232612507306,
"grad_norm": 4.159216947583455,
"learning_rate": 1.8719806763285025e-06,
"loss": 0.7358,
"step": 273
},
{
"epoch": 0.6405610753945061,
"grad_norm": 4.0088196320012885,
"learning_rate": 1.8599033816425122e-06,
"loss": 0.8002,
"step": 274
},
{
"epoch": 0.6428988895382817,
"grad_norm": 4.197686175636046,
"learning_rate": 1.8478260869565218e-06,
"loss": 0.7998,
"step": 275
},
{
"epoch": 0.6452367036820573,
"grad_norm": 4.373828840174765,
"learning_rate": 1.8357487922705318e-06,
"loss": 0.742,
"step": 276
},
{
"epoch": 0.6475745178258329,
"grad_norm": 4.212073348085054,
"learning_rate": 1.8236714975845413e-06,
"loss": 0.7678,
"step": 277
},
{
"epoch": 0.6499123319696084,
"grad_norm": 3.972532257275605,
"learning_rate": 1.8115942028985508e-06,
"loss": 0.7757,
"step": 278
},
{
"epoch": 0.6522501461133839,
"grad_norm": 4.141324887414669,
"learning_rate": 1.7995169082125604e-06,
"loss": 0.7447,
"step": 279
},
{
"epoch": 0.6545879602571596,
"grad_norm": 4.319306461683,
"learning_rate": 1.7874396135265702e-06,
"loss": 0.7669,
"step": 280
},
{
"epoch": 0.6569257744009351,
"grad_norm": 4.13159761798667,
"learning_rate": 1.77536231884058e-06,
"loss": 0.753,
"step": 281
},
{
"epoch": 0.6592635885447107,
"grad_norm": 4.261205598617194,
"learning_rate": 1.7632850241545897e-06,
"loss": 0.7867,
"step": 282
},
{
"epoch": 0.6616014026884862,
"grad_norm": 4.043224440888056,
"learning_rate": 1.7512077294685992e-06,
"loss": 0.7634,
"step": 283
},
{
"epoch": 0.6639392168322619,
"grad_norm": 4.221366014724788,
"learning_rate": 1.7391304347826088e-06,
"loss": 0.8032,
"step": 284
},
{
"epoch": 0.6662770309760374,
"grad_norm": 4.2120362159497935,
"learning_rate": 1.7270531400966183e-06,
"loss": 0.7449,
"step": 285
},
{
"epoch": 0.668614845119813,
"grad_norm": 4.330019099169185,
"learning_rate": 1.7149758454106283e-06,
"loss": 0.7641,
"step": 286
},
{
"epoch": 0.6709526592635885,
"grad_norm": 4.234551345137344,
"learning_rate": 1.7028985507246378e-06,
"loss": 0.7785,
"step": 287
},
{
"epoch": 0.6732904734073641,
"grad_norm": 4.789360597178873,
"learning_rate": 1.6908212560386476e-06,
"loss": 0.7517,
"step": 288
},
{
"epoch": 0.6756282875511397,
"grad_norm": 4.087545337483895,
"learning_rate": 1.6787439613526571e-06,
"loss": 0.7398,
"step": 289
},
{
"epoch": 0.6779661016949152,
"grad_norm": 4.048928229662754,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.7759,
"step": 290
},
{
"epoch": 0.6803039158386909,
"grad_norm": 4.258228190717208,
"learning_rate": 1.6545893719806766e-06,
"loss": 0.7816,
"step": 291
},
{
"epoch": 0.6826417299824664,
"grad_norm": 4.207730290983508,
"learning_rate": 1.6425120772946862e-06,
"loss": 0.7492,
"step": 292
},
{
"epoch": 0.684979544126242,
"grad_norm": 4.211632269620855,
"learning_rate": 1.6304347826086957e-06,
"loss": 0.8045,
"step": 293
},
{
"epoch": 0.6873173582700175,
"grad_norm": 4.2791266083196575,
"learning_rate": 1.6183574879227055e-06,
"loss": 0.7686,
"step": 294
},
{
"epoch": 0.6896551724137931,
"grad_norm": 4.400251918863611,
"learning_rate": 1.606280193236715e-06,
"loss": 0.7346,
"step": 295
},
{
"epoch": 0.6919929865575687,
"grad_norm": 3.8930859729711,
"learning_rate": 1.5942028985507246e-06,
"loss": 0.7476,
"step": 296
},
{
"epoch": 0.6943308007013442,
"grad_norm": 4.179140087181349,
"learning_rate": 1.5821256038647345e-06,
"loss": 0.7758,
"step": 297
},
{
"epoch": 0.6966686148451198,
"grad_norm": 4.1025982230247005,
"learning_rate": 1.570048309178744e-06,
"loss": 0.764,
"step": 298
},
{
"epoch": 0.6990064289888954,
"grad_norm": 4.54359763623282,
"learning_rate": 1.5579710144927536e-06,
"loss": 0.813,
"step": 299
},
{
"epoch": 0.701344243132671,
"grad_norm": 3.8868646182191333,
"learning_rate": 1.5458937198067634e-06,
"loss": 0.7809,
"step": 300
},
{
"epoch": 0.7036820572764465,
"grad_norm": 4.027087287618028,
"learning_rate": 1.533816425120773e-06,
"loss": 0.7421,
"step": 301
},
{
"epoch": 0.706019871420222,
"grad_norm": 4.221180533576584,
"learning_rate": 1.521739130434783e-06,
"loss": 0.7437,
"step": 302
},
{
"epoch": 0.7083576855639977,
"grad_norm": 4.025585601097397,
"learning_rate": 1.5096618357487924e-06,
"loss": 0.7587,
"step": 303
},
{
"epoch": 0.7106954997077732,
"grad_norm": 4.082415548970675,
"learning_rate": 1.497584541062802e-06,
"loss": 0.7437,
"step": 304
},
{
"epoch": 0.7130333138515488,
"grad_norm": 3.9885030268207764,
"learning_rate": 1.4855072463768117e-06,
"loss": 0.7342,
"step": 305
},
{
"epoch": 0.7153711279953243,
"grad_norm": 4.110847006439374,
"learning_rate": 1.4734299516908213e-06,
"loss": 0.7643,
"step": 306
},
{
"epoch": 0.7177089421391,
"grad_norm": 4.018479338411149,
"learning_rate": 1.461352657004831e-06,
"loss": 0.7524,
"step": 307
},
{
"epoch": 0.7200467562828755,
"grad_norm": 3.8679633701250835,
"learning_rate": 1.4492753623188408e-06,
"loss": 0.7854,
"step": 308
},
{
"epoch": 0.722384570426651,
"grad_norm": 4.308222321507237,
"learning_rate": 1.4371980676328504e-06,
"loss": 0.7805,
"step": 309
},
{
"epoch": 0.7247223845704267,
"grad_norm": 3.8916559653506018,
"learning_rate": 1.42512077294686e-06,
"loss": 0.6789,
"step": 310
},
{
"epoch": 0.7270601987142022,
"grad_norm": 4.208472724847014,
"learning_rate": 1.4130434782608697e-06,
"loss": 0.7624,
"step": 311
},
{
"epoch": 0.7293980128579778,
"grad_norm": 4.541098999570629,
"learning_rate": 1.4009661835748794e-06,
"loss": 0.7754,
"step": 312
},
{
"epoch": 0.7317358270017533,
"grad_norm": 3.894542881557041,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.7327,
"step": 313
},
{
"epoch": 0.734073641145529,
"grad_norm": 4.316419064602019,
"learning_rate": 1.3768115942028987e-06,
"loss": 0.7785,
"step": 314
},
{
"epoch": 0.7364114552893045,
"grad_norm": 3.840444616763943,
"learning_rate": 1.3647342995169083e-06,
"loss": 0.7296,
"step": 315
},
{
"epoch": 0.7387492694330801,
"grad_norm": 4.0101608921412835,
"learning_rate": 1.3526570048309178e-06,
"loss": 0.7199,
"step": 316
},
{
"epoch": 0.7410870835768556,
"grad_norm": 4.02178577481216,
"learning_rate": 1.3405797101449278e-06,
"loss": 0.7662,
"step": 317
},
{
"epoch": 0.7434248977206313,
"grad_norm": 3.955088131738884,
"learning_rate": 1.3285024154589373e-06,
"loss": 0.7196,
"step": 318
},
{
"epoch": 0.7457627118644068,
"grad_norm": 4.130879922008592,
"learning_rate": 1.316425120772947e-06,
"loss": 0.787,
"step": 319
},
{
"epoch": 0.7481005260081823,
"grad_norm": 4.0739088224040705,
"learning_rate": 1.3043478260869566e-06,
"loss": 0.7509,
"step": 320
},
{
"epoch": 0.7504383401519579,
"grad_norm": 4.2499948389358595,
"learning_rate": 1.2922705314009662e-06,
"loss": 0.7373,
"step": 321
},
{
"epoch": 0.7527761542957335,
"grad_norm": 4.048557241149405,
"learning_rate": 1.2801932367149761e-06,
"loss": 0.781,
"step": 322
},
{
"epoch": 0.7551139684395091,
"grad_norm": 4.2499198906024205,
"learning_rate": 1.2681159420289857e-06,
"loss": 0.7674,
"step": 323
},
{
"epoch": 0.7574517825832846,
"grad_norm": 4.1878094914635255,
"learning_rate": 1.2560386473429952e-06,
"loss": 0.726,
"step": 324
},
{
"epoch": 0.7597895967270601,
"grad_norm": 4.531895242987001,
"learning_rate": 1.243961352657005e-06,
"loss": 0.7849,
"step": 325
},
{
"epoch": 0.7621274108708358,
"grad_norm": 4.042779330179229,
"learning_rate": 1.2318840579710147e-06,
"loss": 0.7532,
"step": 326
},
{
"epoch": 0.7644652250146113,
"grad_norm": 3.9930786810311254,
"learning_rate": 1.2198067632850243e-06,
"loss": 0.7286,
"step": 327
},
{
"epoch": 0.7668030391583869,
"grad_norm": 5.948998810978814,
"learning_rate": 1.2077294685990338e-06,
"loss": 0.8127,
"step": 328
},
{
"epoch": 0.7691408533021625,
"grad_norm": 4.144487299852383,
"learning_rate": 1.1956521739130436e-06,
"loss": 0.7691,
"step": 329
},
{
"epoch": 0.7714786674459381,
"grad_norm": 4.128733708034505,
"learning_rate": 1.1835748792270531e-06,
"loss": 0.768,
"step": 330
},
{
"epoch": 0.7738164815897136,
"grad_norm": 4.530767375631303,
"learning_rate": 1.1714975845410629e-06,
"loss": 0.7798,
"step": 331
},
{
"epoch": 0.7761542957334892,
"grad_norm": 3.9238729668993835,
"learning_rate": 1.1594202898550726e-06,
"loss": 0.7642,
"step": 332
},
{
"epoch": 0.7784921098772648,
"grad_norm": 4.111531783109019,
"learning_rate": 1.1473429951690822e-06,
"loss": 0.7616,
"step": 333
},
{
"epoch": 0.7808299240210403,
"grad_norm": 4.140234356572554,
"learning_rate": 1.135265700483092e-06,
"loss": 0.8308,
"step": 334
},
{
"epoch": 0.7831677381648159,
"grad_norm": 4.5225578335616845,
"learning_rate": 1.1231884057971015e-06,
"loss": 0.7688,
"step": 335
},
{
"epoch": 0.7855055523085914,
"grad_norm": 4.253055596048113,
"learning_rate": 1.111111111111111e-06,
"loss": 0.7823,
"step": 336
},
{
"epoch": 0.7878433664523671,
"grad_norm": 4.214973850734774,
"learning_rate": 1.0990338164251208e-06,
"loss": 0.7015,
"step": 337
},
{
"epoch": 0.7901811805961426,
"grad_norm": 4.242093529547378,
"learning_rate": 1.0869565217391306e-06,
"loss": 0.7902,
"step": 338
},
{
"epoch": 0.7925189947399182,
"grad_norm": 4.27860016507252,
"learning_rate": 1.0748792270531403e-06,
"loss": 0.7893,
"step": 339
},
{
"epoch": 0.7948568088836937,
"grad_norm": 4.193517659739712,
"learning_rate": 1.0628019323671499e-06,
"loss": 0.7932,
"step": 340
},
{
"epoch": 0.7971946230274694,
"grad_norm": 3.861888360541971,
"learning_rate": 1.0507246376811594e-06,
"loss": 0.7271,
"step": 341
},
{
"epoch": 0.7995324371712449,
"grad_norm": 4.044324859369637,
"learning_rate": 1.0386473429951692e-06,
"loss": 0.7651,
"step": 342
},
{
"epoch": 0.8018702513150204,
"grad_norm": 4.143848474405527,
"learning_rate": 1.026570048309179e-06,
"loss": 0.7991,
"step": 343
},
{
"epoch": 0.804208065458796,
"grad_norm": 4.543740361976109,
"learning_rate": 1.0144927536231885e-06,
"loss": 0.7871,
"step": 344
},
{
"epoch": 0.8065458796025716,
"grad_norm": 4.053324740509495,
"learning_rate": 1.0024154589371982e-06,
"loss": 0.7181,
"step": 345
},
{
"epoch": 0.8088836937463472,
"grad_norm": 3.91170761323185,
"learning_rate": 9.903381642512078e-07,
"loss": 0.7167,
"step": 346
},
{
"epoch": 0.8112215078901227,
"grad_norm": 3.9769619064751174,
"learning_rate": 9.782608695652175e-07,
"loss": 0.7152,
"step": 347
},
{
"epoch": 0.8135593220338984,
"grad_norm": 4.141477101296879,
"learning_rate": 9.66183574879227e-07,
"loss": 0.806,
"step": 348
},
{
"epoch": 0.8158971361776739,
"grad_norm": 3.9266793661338566,
"learning_rate": 9.541062801932368e-07,
"loss": 0.74,
"step": 349
},
{
"epoch": 0.8182349503214494,
"grad_norm": 3.905819434278297,
"learning_rate": 9.420289855072465e-07,
"loss": 0.7621,
"step": 350
},
{
"epoch": 0.820572764465225,
"grad_norm": 4.271457136544383,
"learning_rate": 9.299516908212561e-07,
"loss": 0.7108,
"step": 351
},
{
"epoch": 0.8229105786090006,
"grad_norm": 3.9018935668444907,
"learning_rate": 9.178743961352659e-07,
"loss": 0.7326,
"step": 352
},
{
"epoch": 0.8252483927527762,
"grad_norm": 3.842764627332658,
"learning_rate": 9.057971014492754e-07,
"loss": 0.769,
"step": 353
},
{
"epoch": 0.8275862068965517,
"grad_norm": 4.12270406926976,
"learning_rate": 8.937198067632851e-07,
"loss": 0.7462,
"step": 354
},
{
"epoch": 0.8299240210403273,
"grad_norm": 4.25238665717318,
"learning_rate": 8.816425120772948e-07,
"loss": 0.7417,
"step": 355
},
{
"epoch": 0.8322618351841029,
"grad_norm": 4.10405871770544,
"learning_rate": 8.695652173913044e-07,
"loss": 0.7769,
"step": 356
},
{
"epoch": 0.8345996493278784,
"grad_norm": 4.132898802117579,
"learning_rate": 8.574879227053141e-07,
"loss": 0.7334,
"step": 357
},
{
"epoch": 0.836937463471654,
"grad_norm": 3.9812833871444573,
"learning_rate": 8.454106280193238e-07,
"loss": 0.7437,
"step": 358
},
{
"epoch": 0.8392752776154295,
"grad_norm": 4.819360178352156,
"learning_rate": 8.333333333333333e-07,
"loss": 0.7594,
"step": 359
},
{
"epoch": 0.8416130917592052,
"grad_norm": 4.27077723520544,
"learning_rate": 8.212560386473431e-07,
"loss": 0.7282,
"step": 360
},
{
"epoch": 0.8439509059029807,
"grad_norm": 4.234704486935872,
"learning_rate": 8.091787439613527e-07,
"loss": 0.7844,
"step": 361
},
{
"epoch": 0.8462887200467563,
"grad_norm": 3.660518143878683,
"learning_rate": 7.971014492753623e-07,
"loss": 0.6846,
"step": 362
},
{
"epoch": 0.8486265341905318,
"grad_norm": 4.382898231252646,
"learning_rate": 7.85024154589372e-07,
"loss": 0.7378,
"step": 363
},
{
"epoch": 0.8509643483343075,
"grad_norm": 4.03693007471031,
"learning_rate": 7.729468599033817e-07,
"loss": 0.7321,
"step": 364
},
{
"epoch": 0.853302162478083,
"grad_norm": 4.061417655548705,
"learning_rate": 7.608695652173914e-07,
"loss": 0.7427,
"step": 365
},
{
"epoch": 0.8556399766218585,
"grad_norm": 4.033537459659518,
"learning_rate": 7.48792270531401e-07,
"loss": 0.7631,
"step": 366
},
{
"epoch": 0.8579777907656342,
"grad_norm": 3.8672964986217377,
"learning_rate": 7.367149758454106e-07,
"loss": 0.7277,
"step": 367
},
{
"epoch": 0.8603156049094097,
"grad_norm": 4.1614750880483795,
"learning_rate": 7.246376811594204e-07,
"loss": 0.7821,
"step": 368
},
{
"epoch": 0.8626534190531853,
"grad_norm": 4.0347237221296846,
"learning_rate": 7.1256038647343e-07,
"loss": 0.7229,
"step": 369
},
{
"epoch": 0.8649912331969608,
"grad_norm": 4.419235250329394,
"learning_rate": 7.004830917874397e-07,
"loss": 0.7912,
"step": 370
},
{
"epoch": 0.8673290473407365,
"grad_norm": 4.0395927745176925,
"learning_rate": 6.884057971014494e-07,
"loss": 0.7781,
"step": 371
},
{
"epoch": 0.869666861484512,
"grad_norm": 4.323154501136669,
"learning_rate": 6.763285024154589e-07,
"loss": 0.7489,
"step": 372
},
{
"epoch": 0.8720046756282875,
"grad_norm": 4.0036925914792,
"learning_rate": 6.642512077294687e-07,
"loss": 0.7488,
"step": 373
},
{
"epoch": 0.8743424897720631,
"grad_norm": 4.081792943103691,
"learning_rate": 6.521739130434783e-07,
"loss": 0.7506,
"step": 374
},
{
"epoch": 0.8766803039158387,
"grad_norm": 3.961593904598705,
"learning_rate": 6.400966183574881e-07,
"loss": 0.7365,
"step": 375
},
{
"epoch": 0.8790181180596143,
"grad_norm": 5.343637922572841,
"learning_rate": 6.280193236714976e-07,
"loss": 0.8142,
"step": 376
},
{
"epoch": 0.8813559322033898,
"grad_norm": 4.234613953777181,
"learning_rate": 6.159420289855074e-07,
"loss": 0.7685,
"step": 377
},
{
"epoch": 0.8836937463471654,
"grad_norm": 3.914888154011919,
"learning_rate": 6.038647342995169e-07,
"loss": 0.7442,
"step": 378
},
{
"epoch": 0.886031560490941,
"grad_norm": 3.998960956090034,
"learning_rate": 5.917874396135266e-07,
"loss": 0.7724,
"step": 379
},
{
"epoch": 0.8883693746347165,
"grad_norm": 3.7467228875291885,
"learning_rate": 5.797101449275363e-07,
"loss": 0.7157,
"step": 380
},
{
"epoch": 0.8907071887784921,
"grad_norm": 3.921411494491602,
"learning_rate": 5.67632850241546e-07,
"loss": 0.7604,
"step": 381
},
{
"epoch": 0.8930450029222676,
"grad_norm": 4.171395377831423,
"learning_rate": 5.555555555555555e-07,
"loss": 0.7498,
"step": 382
},
{
"epoch": 0.8953828170660433,
"grad_norm": 4.1347642411133725,
"learning_rate": 5.434782608695653e-07,
"loss": 0.7472,
"step": 383
},
{
"epoch": 0.8977206312098188,
"grad_norm": 4.092973708302494,
"learning_rate": 5.314009661835749e-07,
"loss": 0.7237,
"step": 384
},
{
"epoch": 0.9000584453535944,
"grad_norm": 3.9933326706118875,
"learning_rate": 5.193236714975846e-07,
"loss": 0.7389,
"step": 385
},
{
"epoch": 0.90239625949737,
"grad_norm": 3.8068860103615174,
"learning_rate": 5.072463768115942e-07,
"loss": 0.7177,
"step": 386
},
{
"epoch": 0.9047340736411456,
"grad_norm": 4.25980749026596,
"learning_rate": 4.951690821256039e-07,
"loss": 0.758,
"step": 387
},
{
"epoch": 0.9070718877849211,
"grad_norm": 3.8688206778681278,
"learning_rate": 4.830917874396135e-07,
"loss": 0.7577,
"step": 388
},
{
"epoch": 0.9094097019286966,
"grad_norm": 4.072604714599362,
"learning_rate": 4.7101449275362324e-07,
"loss": 0.7655,
"step": 389
},
{
"epoch": 0.9117475160724723,
"grad_norm": 4.216731514011164,
"learning_rate": 4.5893719806763294e-07,
"loss": 0.7572,
"step": 390
},
{
"epoch": 0.9140853302162478,
"grad_norm": 4.204400645393741,
"learning_rate": 4.4685990338164254e-07,
"loss": 0.7595,
"step": 391
},
{
"epoch": 0.9164231443600234,
"grad_norm": 4.327014987328045,
"learning_rate": 4.347826086956522e-07,
"loss": 0.7347,
"step": 392
},
{
"epoch": 0.9187609585037989,
"grad_norm": 4.381847799007514,
"learning_rate": 4.227053140096619e-07,
"loss": 0.7505,
"step": 393
},
{
"epoch": 0.9210987726475746,
"grad_norm": 4.019350453750999,
"learning_rate": 4.1062801932367154e-07,
"loss": 0.7488,
"step": 394
},
{
"epoch": 0.9234365867913501,
"grad_norm": 3.958102022071496,
"learning_rate": 3.9855072463768114e-07,
"loss": 0.7436,
"step": 395
},
{
"epoch": 0.9257744009351256,
"grad_norm": 4.3569068621437745,
"learning_rate": 3.8647342995169085e-07,
"loss": 0.7323,
"step": 396
},
{
"epoch": 0.9281122150789012,
"grad_norm": 3.9242746982918777,
"learning_rate": 3.743961352657005e-07,
"loss": 0.7255,
"step": 397
},
{
"epoch": 0.9304500292226768,
"grad_norm": 3.91121815410949,
"learning_rate": 3.623188405797102e-07,
"loss": 0.7471,
"step": 398
},
{
"epoch": 0.9327878433664524,
"grad_norm": 3.973005041304068,
"learning_rate": 3.5024154589371985e-07,
"loss": 0.6823,
"step": 399
},
{
"epoch": 0.9351256575102279,
"grad_norm": 3.988161090830406,
"learning_rate": 3.3816425120772945e-07,
"loss": 0.6871,
"step": 400
},
{
"epoch": 0.9374634716540035,
"grad_norm": 4.296337191130102,
"learning_rate": 3.2608695652173915e-07,
"loss": 0.7236,
"step": 401
},
{
"epoch": 0.9398012857977791,
"grad_norm": 4.3179225277967515,
"learning_rate": 3.140096618357488e-07,
"loss": 0.7582,
"step": 402
},
{
"epoch": 0.9421390999415546,
"grad_norm": 4.191674727829652,
"learning_rate": 3.0193236714975846e-07,
"loss": 0.7238,
"step": 403
},
{
"epoch": 0.9444769140853302,
"grad_norm": 3.8257966103380765,
"learning_rate": 2.8985507246376816e-07,
"loss": 0.7475,
"step": 404
},
{
"epoch": 0.9468147282291058,
"grad_norm": 4.06630469936539,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.7109,
"step": 405
},
{
"epoch": 0.9491525423728814,
"grad_norm": 4.583718694034358,
"learning_rate": 2.6570048309178746e-07,
"loss": 0.7623,
"step": 406
},
{
"epoch": 0.9514903565166569,
"grad_norm": 3.9553370864295694,
"learning_rate": 2.536231884057971e-07,
"loss": 0.7911,
"step": 407
},
{
"epoch": 0.9538281706604325,
"grad_norm": 4.221184826167876,
"learning_rate": 2.4154589371980677e-07,
"loss": 0.7322,
"step": 408
},
{
"epoch": 0.9561659848042081,
"grad_norm": 4.196761181297048,
"learning_rate": 2.2946859903381647e-07,
"loss": 0.7476,
"step": 409
},
{
"epoch": 0.9585037989479837,
"grad_norm": 4.185489684411542,
"learning_rate": 2.173913043478261e-07,
"loss": 0.7548,
"step": 410
},
{
"epoch": 0.9608416130917592,
"grad_norm": 4.371686498083367,
"learning_rate": 2.0531400966183577e-07,
"loss": 0.7328,
"step": 411
},
{
"epoch": 0.9631794272355347,
"grad_norm": 4.314986686818614,
"learning_rate": 1.9323671497584542e-07,
"loss": 0.7304,
"step": 412
},
{
"epoch": 0.9655172413793104,
"grad_norm": 3.9822912414587806,
"learning_rate": 1.811594202898551e-07,
"loss": 0.7395,
"step": 413
},
{
"epoch": 0.9678550555230859,
"grad_norm": 4.218523033535868,
"learning_rate": 1.6908212560386473e-07,
"loss": 0.7302,
"step": 414
},
{
"epoch": 0.9701928696668615,
"grad_norm": 4.092187481356195,
"learning_rate": 1.570048309178744e-07,
"loss": 0.7351,
"step": 415
},
{
"epoch": 0.972530683810637,
"grad_norm": 4.184125537002853,
"learning_rate": 1.4492753623188408e-07,
"loss": 0.7413,
"step": 416
},
{
"epoch": 0.9748684979544127,
"grad_norm": 3.889649663413063,
"learning_rate": 1.3285024154589373e-07,
"loss": 0.7365,
"step": 417
},
{
"epoch": 0.9772063120981882,
"grad_norm": 4.139378543594781,
"learning_rate": 1.2077294685990338e-07,
"loss": 0.7626,
"step": 418
},
{
"epoch": 0.9795441262419637,
"grad_norm": 4.016007817051792,
"learning_rate": 1.0869565217391305e-07,
"loss": 0.7428,
"step": 419
},
{
"epoch": 0.9818819403857393,
"grad_norm": 4.31935746465498,
"learning_rate": 9.661835748792271e-08,
"loss": 0.7886,
"step": 420
},
{
"epoch": 0.9842197545295149,
"grad_norm": 4.305755648868578,
"learning_rate": 8.454106280193236e-08,
"loss": 0.7552,
"step": 421
},
{
"epoch": 0.9865575686732905,
"grad_norm": 4.324910095691635,
"learning_rate": 7.246376811594204e-08,
"loss": 0.7465,
"step": 422
},
{
"epoch": 0.988895382817066,
"grad_norm": 3.694300442393254,
"learning_rate": 6.038647342995169e-08,
"loss": 0.7093,
"step": 423
},
{
"epoch": 0.9912331969608417,
"grad_norm": 3.829444377626212,
"learning_rate": 4.8309178743961356e-08,
"loss": 0.7241,
"step": 424
},
{
"epoch": 0.9935710111046172,
"grad_norm": 4.179549227414933,
"learning_rate": 3.623188405797102e-08,
"loss": 0.7663,
"step": 425
},
{
"epoch": 0.9959088252483927,
"grad_norm": 3.832323286806212,
"learning_rate": 2.4154589371980678e-08,
"loss": 0.7859,
"step": 426
},
{
"epoch": 0.9982466393921683,
"grad_norm": 4.123264294362188,
"learning_rate": 1.2077294685990339e-08,
"loss": 0.7678,
"step": 427
},
{
"epoch": 0.9982466393921683,
"step": 427,
"total_flos": 77746305761280.0,
"train_loss": 0.8145518043281323,
"train_runtime": 5184.5045,
"train_samples_per_second": 10.56,
"train_steps_per_second": 0.082
}
],
"logging_steps": 1.0,
"max_steps": 427,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 77746305761280.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}