beast33's picture
Training in progress, step 500, checkpoint
01c415a verified
{
"best_metric": 1.454202651977539,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.06560818790185015,
"eval_steps": 100,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001312163758037003,
"grad_norm": 4.502290725708008,
"learning_rate": 5e-06,
"loss": 2.7959,
"step": 1
},
{
"epoch": 0.0001312163758037003,
"eval_loss": 3.9797542095184326,
"eval_runtime": 1367.0112,
"eval_samples_per_second": 9.39,
"eval_steps_per_second": 2.347,
"step": 1
},
{
"epoch": 0.0002624327516074006,
"grad_norm": 4.730992794036865,
"learning_rate": 1e-05,
"loss": 3.0455,
"step": 2
},
{
"epoch": 0.0003936491274111009,
"grad_norm": 4.7559990882873535,
"learning_rate": 1.5e-05,
"loss": 3.0463,
"step": 3
},
{
"epoch": 0.0005248655032148012,
"grad_norm": 4.74609899520874,
"learning_rate": 2e-05,
"loss": 3.0803,
"step": 4
},
{
"epoch": 0.0006560818790185015,
"grad_norm": 4.335945129394531,
"learning_rate": 2.5e-05,
"loss": 2.885,
"step": 5
},
{
"epoch": 0.0007872982548222018,
"grad_norm": 4.600176811218262,
"learning_rate": 3e-05,
"loss": 3.0,
"step": 6
},
{
"epoch": 0.0009185146306259021,
"grad_norm": 4.308350563049316,
"learning_rate": 3.5e-05,
"loss": 2.9882,
"step": 7
},
{
"epoch": 0.0010497310064296024,
"grad_norm": 4.452569484710693,
"learning_rate": 4e-05,
"loss": 2.8844,
"step": 8
},
{
"epoch": 0.0011809473822333026,
"grad_norm": 5.113404750823975,
"learning_rate": 4.5e-05,
"loss": 2.8185,
"step": 9
},
{
"epoch": 0.001312163758037003,
"grad_norm": 4.601175785064697,
"learning_rate": 5e-05,
"loss": 2.7809,
"step": 10
},
{
"epoch": 0.0014433801338407032,
"grad_norm": 4.8205156326293945,
"learning_rate": 5.500000000000001e-05,
"loss": 2.514,
"step": 11
},
{
"epoch": 0.0015745965096444037,
"grad_norm": 4.931856155395508,
"learning_rate": 6e-05,
"loss": 2.6665,
"step": 12
},
{
"epoch": 0.0017058128854481039,
"grad_norm": 6.019423007965088,
"learning_rate": 6.500000000000001e-05,
"loss": 2.6067,
"step": 13
},
{
"epoch": 0.0018370292612518043,
"grad_norm": 4.860208034515381,
"learning_rate": 7e-05,
"loss": 2.7381,
"step": 14
},
{
"epoch": 0.0019682456370555047,
"grad_norm": 5.671669960021973,
"learning_rate": 7.500000000000001e-05,
"loss": 2.4756,
"step": 15
},
{
"epoch": 0.002099462012859205,
"grad_norm": 4.821615219116211,
"learning_rate": 8e-05,
"loss": 2.5178,
"step": 16
},
{
"epoch": 0.002230678388662905,
"grad_norm": 5.072446823120117,
"learning_rate": 8.5e-05,
"loss": 2.4209,
"step": 17
},
{
"epoch": 0.0023618947644666053,
"grad_norm": 5.528189659118652,
"learning_rate": 9e-05,
"loss": 2.4476,
"step": 18
},
{
"epoch": 0.002493111140270306,
"grad_norm": 4.594228267669678,
"learning_rate": 9.5e-05,
"loss": 2.2791,
"step": 19
},
{
"epoch": 0.002624327516074006,
"grad_norm": 5.120175838470459,
"learning_rate": 0.0001,
"loss": 2.2744,
"step": 20
},
{
"epoch": 0.0027555438918777063,
"grad_norm": 4.523880958557129,
"learning_rate": 9.999892908320647e-05,
"loss": 2.0823,
"step": 21
},
{
"epoch": 0.0028867602676814065,
"grad_norm": 5.103494167327881,
"learning_rate": 9.999571637870036e-05,
"loss": 2.2586,
"step": 22
},
{
"epoch": 0.003017976643485107,
"grad_norm": 4.991058826446533,
"learning_rate": 9.999036202410325e-05,
"loss": 2.1277,
"step": 23
},
{
"epoch": 0.0031491930192888073,
"grad_norm": 4.875056743621826,
"learning_rate": 9.998286624877786e-05,
"loss": 2.1464,
"step": 24
},
{
"epoch": 0.0032804093950925075,
"grad_norm": 4.341136932373047,
"learning_rate": 9.997322937381829e-05,
"loss": 2.1301,
"step": 25
},
{
"epoch": 0.0034116257708962077,
"grad_norm": 4.211912155151367,
"learning_rate": 9.996145181203615e-05,
"loss": 2.0398,
"step": 26
},
{
"epoch": 0.0035428421466999083,
"grad_norm": 4.3173651695251465,
"learning_rate": 9.994753406794301e-05,
"loss": 2.063,
"step": 27
},
{
"epoch": 0.0036740585225036085,
"grad_norm": 4.31871223449707,
"learning_rate": 9.99314767377287e-05,
"loss": 1.9695,
"step": 28
},
{
"epoch": 0.0038052748983073087,
"grad_norm": 5.267412185668945,
"learning_rate": 9.991328050923581e-05,
"loss": 2.017,
"step": 29
},
{
"epoch": 0.003936491274111009,
"grad_norm": 4.116550922393799,
"learning_rate": 9.989294616193017e-05,
"loss": 1.674,
"step": 30
},
{
"epoch": 0.004067707649914709,
"grad_norm": 4.253255367279053,
"learning_rate": 9.98704745668676e-05,
"loss": 1.8298,
"step": 31
},
{
"epoch": 0.00419892402571841,
"grad_norm": 7.945616245269775,
"learning_rate": 9.98458666866564e-05,
"loss": 1.8191,
"step": 32
},
{
"epoch": 0.00433014040152211,
"grad_norm": 4.653039455413818,
"learning_rate": 9.981912357541627e-05,
"loss": 1.786,
"step": 33
},
{
"epoch": 0.00446135677732581,
"grad_norm": 5.207880020141602,
"learning_rate": 9.97902463787331e-05,
"loss": 1.9124,
"step": 34
},
{
"epoch": 0.004592573153129511,
"grad_norm": 4.892001152038574,
"learning_rate": 9.975923633360985e-05,
"loss": 1.5737,
"step": 35
},
{
"epoch": 0.0047237895289332105,
"grad_norm": 4.775864124298096,
"learning_rate": 9.972609476841367e-05,
"loss": 1.7467,
"step": 36
},
{
"epoch": 0.004855005904736911,
"grad_norm": 5.307229518890381,
"learning_rate": 9.969082310281891e-05,
"loss": 1.5144,
"step": 37
},
{
"epoch": 0.004986222280540612,
"grad_norm": 5.3355560302734375,
"learning_rate": 9.965342284774632e-05,
"loss": 1.576,
"step": 38
},
{
"epoch": 0.0051174386563443116,
"grad_norm": 4.783757209777832,
"learning_rate": 9.961389560529836e-05,
"loss": 1.3935,
"step": 39
},
{
"epoch": 0.005248655032148012,
"grad_norm": 4.547131061553955,
"learning_rate": 9.957224306869053e-05,
"loss": 1.4015,
"step": 40
},
{
"epoch": 0.005379871407951712,
"grad_norm": 6.1063055992126465,
"learning_rate": 9.952846702217886e-05,
"loss": 1.402,
"step": 41
},
{
"epoch": 0.005511087783755413,
"grad_norm": 4.746460437774658,
"learning_rate": 9.948256934098352e-05,
"loss": 1.1739,
"step": 42
},
{
"epoch": 0.005642304159559113,
"grad_norm": 4.250523567199707,
"learning_rate": 9.943455199120837e-05,
"loss": 1.2131,
"step": 43
},
{
"epoch": 0.005773520535362813,
"grad_norm": 5.288604736328125,
"learning_rate": 9.938441702975689e-05,
"loss": 1.1556,
"step": 44
},
{
"epoch": 0.005904736911166514,
"grad_norm": 5.479332447052002,
"learning_rate": 9.933216660424395e-05,
"loss": 1.2704,
"step": 45
},
{
"epoch": 0.006035953286970214,
"grad_norm": 4.803893566131592,
"learning_rate": 9.927780295290389e-05,
"loss": 1.0887,
"step": 46
},
{
"epoch": 0.006167169662773914,
"grad_norm": 4.377706050872803,
"learning_rate": 9.922132840449459e-05,
"loss": 0.8161,
"step": 47
},
{
"epoch": 0.006298386038577615,
"grad_norm": 4.871409893035889,
"learning_rate": 9.916274537819775e-05,
"loss": 0.8478,
"step": 48
},
{
"epoch": 0.006429602414381314,
"grad_norm": 4.593444347381592,
"learning_rate": 9.91020563835152e-05,
"loss": 0.9841,
"step": 49
},
{
"epoch": 0.006560818790185015,
"grad_norm": 6.342800617218018,
"learning_rate": 9.903926402016153e-05,
"loss": 0.8137,
"step": 50
},
{
"epoch": 0.006692035165988716,
"grad_norm": 3.515503406524658,
"learning_rate": 9.897437097795257e-05,
"loss": 2.5555,
"step": 51
},
{
"epoch": 0.006823251541792415,
"grad_norm": 2.8081209659576416,
"learning_rate": 9.890738003669029e-05,
"loss": 2.4244,
"step": 52
},
{
"epoch": 0.006954467917596116,
"grad_norm": 2.0723488330841064,
"learning_rate": 9.883829406604363e-05,
"loss": 2.255,
"step": 53
},
{
"epoch": 0.007085684293399817,
"grad_norm": 1.63498055934906,
"learning_rate": 9.876711602542563e-05,
"loss": 2.219,
"step": 54
},
{
"epoch": 0.0072169006692035164,
"grad_norm": 1.5121456384658813,
"learning_rate": 9.869384896386668e-05,
"loss": 2.2263,
"step": 55
},
{
"epoch": 0.007348117045007217,
"grad_norm": 1.7543376684188843,
"learning_rate": 9.861849601988383e-05,
"loss": 2.2319,
"step": 56
},
{
"epoch": 0.007479333420810917,
"grad_norm": 2.030773639678955,
"learning_rate": 9.854106042134641e-05,
"loss": 2.299,
"step": 57
},
{
"epoch": 0.0076105497966146175,
"grad_norm": 2.207674264907837,
"learning_rate": 9.846154548533773e-05,
"loss": 2.3749,
"step": 58
},
{
"epoch": 0.007741766172418318,
"grad_norm": 2.266341209411621,
"learning_rate": 9.837995461801299e-05,
"loss": 2.3051,
"step": 59
},
{
"epoch": 0.007872982548222019,
"grad_norm": 2.5948050022125244,
"learning_rate": 9.829629131445342e-05,
"loss": 2.2048,
"step": 60
},
{
"epoch": 0.008004198924025718,
"grad_norm": 3.1021060943603516,
"learning_rate": 9.821055915851647e-05,
"loss": 2.3619,
"step": 61
},
{
"epoch": 0.008135415299829418,
"grad_norm": 3.0172550678253174,
"learning_rate": 9.812276182268236e-05,
"loss": 2.198,
"step": 62
},
{
"epoch": 0.008266631675633119,
"grad_norm": 4.317417144775391,
"learning_rate": 9.803290306789676e-05,
"loss": 2.1099,
"step": 63
},
{
"epoch": 0.00839784805143682,
"grad_norm": 4.255865097045898,
"learning_rate": 9.794098674340965e-05,
"loss": 2.2599,
"step": 64
},
{
"epoch": 0.00852906442724052,
"grad_norm": 3.712496280670166,
"learning_rate": 9.784701678661045e-05,
"loss": 2.0834,
"step": 65
},
{
"epoch": 0.00866028080304422,
"grad_norm": 5.127752304077148,
"learning_rate": 9.775099722285935e-05,
"loss": 2.082,
"step": 66
},
{
"epoch": 0.00879149717884792,
"grad_norm": 3.70094633102417,
"learning_rate": 9.765293216531486e-05,
"loss": 2.2118,
"step": 67
},
{
"epoch": 0.00892271355465162,
"grad_norm": 3.54309344291687,
"learning_rate": 9.755282581475769e-05,
"loss": 1.8701,
"step": 68
},
{
"epoch": 0.009053929930455321,
"grad_norm": 4.72603702545166,
"learning_rate": 9.74506824594107e-05,
"loss": 2.1741,
"step": 69
},
{
"epoch": 0.009185146306259022,
"grad_norm": 4.20818567276001,
"learning_rate": 9.73465064747553e-05,
"loss": 1.9426,
"step": 70
},
{
"epoch": 0.009316362682062722,
"grad_norm": 3.797800064086914,
"learning_rate": 9.724030232334391e-05,
"loss": 1.8893,
"step": 71
},
{
"epoch": 0.009447579057866421,
"grad_norm": 4.630662441253662,
"learning_rate": 9.713207455460894e-05,
"loss": 1.91,
"step": 72
},
{
"epoch": 0.009578795433670122,
"grad_norm": 4.435169696807861,
"learning_rate": 9.702182780466775e-05,
"loss": 1.8245,
"step": 73
},
{
"epoch": 0.009710011809473822,
"grad_norm": 4.108457088470459,
"learning_rate": 9.690956679612421e-05,
"loss": 1.8773,
"step": 74
},
{
"epoch": 0.009841228185277523,
"grad_norm": 3.7498910427093506,
"learning_rate": 9.67952963378663e-05,
"loss": 1.5919,
"step": 75
},
{
"epoch": 0.009972444561081224,
"grad_norm": 3.4361634254455566,
"learning_rate": 9.667902132486009e-05,
"loss": 1.5667,
"step": 76
},
{
"epoch": 0.010103660936884922,
"grad_norm": 3.993279218673706,
"learning_rate": 9.656074673794018e-05,
"loss": 1.8395,
"step": 77
},
{
"epoch": 0.010234877312688623,
"grad_norm": 3.958552598953247,
"learning_rate": 9.644047764359622e-05,
"loss": 1.7246,
"step": 78
},
{
"epoch": 0.010366093688492324,
"grad_norm": 4.14636754989624,
"learning_rate": 9.631821919375591e-05,
"loss": 1.6861,
"step": 79
},
{
"epoch": 0.010497310064296024,
"grad_norm": 4.084466934204102,
"learning_rate": 9.619397662556435e-05,
"loss": 1.4632,
"step": 80
},
{
"epoch": 0.010628526440099725,
"grad_norm": 3.8091354370117188,
"learning_rate": 9.606775526115963e-05,
"loss": 1.6115,
"step": 81
},
{
"epoch": 0.010759742815903424,
"grad_norm": 3.6534502506256104,
"learning_rate": 9.593956050744492e-05,
"loss": 1.4976,
"step": 82
},
{
"epoch": 0.010890959191707125,
"grad_norm": 4.479729652404785,
"learning_rate": 9.580939785585681e-05,
"loss": 1.6682,
"step": 83
},
{
"epoch": 0.011022175567510825,
"grad_norm": 4.333984375,
"learning_rate": 9.567727288213005e-05,
"loss": 1.7758,
"step": 84
},
{
"epoch": 0.011153391943314526,
"grad_norm": 5.193741321563721,
"learning_rate": 9.554319124605879e-05,
"loss": 1.382,
"step": 85
},
{
"epoch": 0.011284608319118226,
"grad_norm": 4.566789627075195,
"learning_rate": 9.540715869125407e-05,
"loss": 1.5179,
"step": 86
},
{
"epoch": 0.011415824694921927,
"grad_norm": 4.185393810272217,
"learning_rate": 9.526918104489777e-05,
"loss": 1.3469,
"step": 87
},
{
"epoch": 0.011547041070725626,
"grad_norm": 4.056511878967285,
"learning_rate": 9.512926421749304e-05,
"loss": 1.2551,
"step": 88
},
{
"epoch": 0.011678257446529327,
"grad_norm": 4.229364395141602,
"learning_rate": 9.498741420261108e-05,
"loss": 1.4315,
"step": 89
},
{
"epoch": 0.011809473822333027,
"grad_norm": 3.8965346813201904,
"learning_rate": 9.484363707663442e-05,
"loss": 1.4387,
"step": 90
},
{
"epoch": 0.011940690198136728,
"grad_norm": 4.694686412811279,
"learning_rate": 9.469793899849661e-05,
"loss": 1.5771,
"step": 91
},
{
"epoch": 0.012071906573940428,
"grad_norm": 5.034073352813721,
"learning_rate": 9.45503262094184e-05,
"loss": 1.6219,
"step": 92
},
{
"epoch": 0.012203122949744127,
"grad_norm": 4.071391582489014,
"learning_rate": 9.440080503264037e-05,
"loss": 1.0868,
"step": 93
},
{
"epoch": 0.012334339325547828,
"grad_norm": 4.7768168449401855,
"learning_rate": 9.42493818731521e-05,
"loss": 1.2184,
"step": 94
},
{
"epoch": 0.012465555701351529,
"grad_norm": 4.308196067810059,
"learning_rate": 9.409606321741775e-05,
"loss": 0.852,
"step": 95
},
{
"epoch": 0.01259677207715523,
"grad_norm": 3.9685397148132324,
"learning_rate": 9.394085563309827e-05,
"loss": 1.0385,
"step": 96
},
{
"epoch": 0.01272798845295893,
"grad_norm": 3.9740753173828125,
"learning_rate": 9.378376576876999e-05,
"loss": 0.9448,
"step": 97
},
{
"epoch": 0.012859204828762629,
"grad_norm": 3.3262381553649902,
"learning_rate": 9.362480035363986e-05,
"loss": 0.6915,
"step": 98
},
{
"epoch": 0.01299042120456633,
"grad_norm": 5.052366256713867,
"learning_rate": 9.34639661972572e-05,
"loss": 0.8459,
"step": 99
},
{
"epoch": 0.01312163758037003,
"grad_norm": 4.026389122009277,
"learning_rate": 9.330127018922194e-05,
"loss": 0.6136,
"step": 100
},
{
"epoch": 0.01312163758037003,
"eval_loss": 1.995261549949646,
"eval_runtime": 1352.3203,
"eval_samples_per_second": 9.492,
"eval_steps_per_second": 2.373,
"step": 100
},
{
"epoch": 0.01325285395617373,
"grad_norm": 3.1780271530151367,
"learning_rate": 9.31367192988896e-05,
"loss": 2.3667,
"step": 101
},
{
"epoch": 0.013384070331977431,
"grad_norm": 2.605407238006592,
"learning_rate": 9.297032057507264e-05,
"loss": 2.2015,
"step": 102
},
{
"epoch": 0.013515286707781132,
"grad_norm": 1.9591269493103027,
"learning_rate": 9.280208114573859e-05,
"loss": 2.0853,
"step": 103
},
{
"epoch": 0.01364650308358483,
"grad_norm": 1.4972039461135864,
"learning_rate": 9.263200821770461e-05,
"loss": 2.0733,
"step": 104
},
{
"epoch": 0.013777719459388531,
"grad_norm": 1.6665318012237549,
"learning_rate": 9.246010907632895e-05,
"loss": 2.0467,
"step": 105
},
{
"epoch": 0.013908935835192232,
"grad_norm": 1.6789878606796265,
"learning_rate": 9.228639108519868e-05,
"loss": 2.1175,
"step": 106
},
{
"epoch": 0.014040152210995933,
"grad_norm": 2.0789976119995117,
"learning_rate": 9.211086168581433e-05,
"loss": 2.2926,
"step": 107
},
{
"epoch": 0.014171368586799633,
"grad_norm": 2.8981029987335205,
"learning_rate": 9.193352839727121e-05,
"loss": 2.1756,
"step": 108
},
{
"epoch": 0.014302584962603332,
"grad_norm": 2.743936538696289,
"learning_rate": 9.175439881593716e-05,
"loss": 2.1466,
"step": 109
},
{
"epoch": 0.014433801338407033,
"grad_norm": 3.987334728240967,
"learning_rate": 9.157348061512727e-05,
"loss": 2.2559,
"step": 110
},
{
"epoch": 0.014565017714210734,
"grad_norm": 3.1965720653533936,
"learning_rate": 9.139078154477512e-05,
"loss": 2.2562,
"step": 111
},
{
"epoch": 0.014696234090014434,
"grad_norm": 3.892089605331421,
"learning_rate": 9.120630943110077e-05,
"loss": 2.1416,
"step": 112
},
{
"epoch": 0.014827450465818135,
"grad_norm": 3.2245702743530273,
"learning_rate": 9.102007217627568e-05,
"loss": 2.0573,
"step": 113
},
{
"epoch": 0.014958666841621834,
"grad_norm": 3.0087993144989014,
"learning_rate": 9.083207775808396e-05,
"loss": 1.9645,
"step": 114
},
{
"epoch": 0.015089883217425534,
"grad_norm": 2.868622303009033,
"learning_rate": 9.064233422958077e-05,
"loss": 2.0366,
"step": 115
},
{
"epoch": 0.015221099593229235,
"grad_norm": 3.084916830062866,
"learning_rate": 9.045084971874738e-05,
"loss": 2.0537,
"step": 116
},
{
"epoch": 0.015352315969032936,
"grad_norm": 3.2997586727142334,
"learning_rate": 9.025763242814291e-05,
"loss": 2.0362,
"step": 117
},
{
"epoch": 0.015483532344836636,
"grad_norm": 4.677249431610107,
"learning_rate": 9.006269063455304e-05,
"loss": 1.8742,
"step": 118
},
{
"epoch": 0.015614748720640335,
"grad_norm": 4.065469741821289,
"learning_rate": 8.986603268863536e-05,
"loss": 1.7975,
"step": 119
},
{
"epoch": 0.015745965096444037,
"grad_norm": 3.6069369316101074,
"learning_rate": 8.966766701456177e-05,
"loss": 1.7775,
"step": 120
},
{
"epoch": 0.015877181472247738,
"grad_norm": 4.557692527770996,
"learning_rate": 8.94676021096575e-05,
"loss": 1.8745,
"step": 121
},
{
"epoch": 0.016008397848051435,
"grad_norm": 3.5455398559570312,
"learning_rate": 8.926584654403724e-05,
"loss": 1.7982,
"step": 122
},
{
"epoch": 0.016139614223855136,
"grad_norm": 3.4866509437561035,
"learning_rate": 8.906240896023794e-05,
"loss": 1.7942,
"step": 123
},
{
"epoch": 0.016270830599658836,
"grad_norm": 3.9673101902008057,
"learning_rate": 8.885729807284856e-05,
"loss": 1.6027,
"step": 124
},
{
"epoch": 0.016402046975462537,
"grad_norm": 4.192202568054199,
"learning_rate": 8.865052266813685e-05,
"loss": 1.7069,
"step": 125
},
{
"epoch": 0.016533263351266238,
"grad_norm": 3.4886741638183594,
"learning_rate": 8.844209160367299e-05,
"loss": 1.6127,
"step": 126
},
{
"epoch": 0.01666447972706994,
"grad_norm": 4.3129448890686035,
"learning_rate": 8.823201380795001e-05,
"loss": 1.8012,
"step": 127
},
{
"epoch": 0.01679569610287364,
"grad_norm": 4.046361923217773,
"learning_rate": 8.802029828000156e-05,
"loss": 1.6391,
"step": 128
},
{
"epoch": 0.01692691247867734,
"grad_norm": 3.780560255050659,
"learning_rate": 8.780695408901613e-05,
"loss": 1.8057,
"step": 129
},
{
"epoch": 0.01705812885448104,
"grad_norm": 4.130734443664551,
"learning_rate": 8.759199037394887e-05,
"loss": 1.7628,
"step": 130
},
{
"epoch": 0.01718934523028474,
"grad_norm": 4.335687160491943,
"learning_rate": 8.737541634312985e-05,
"loss": 1.7281,
"step": 131
},
{
"epoch": 0.01732056160608844,
"grad_norm": 3.8232834339141846,
"learning_rate": 8.715724127386972e-05,
"loss": 1.6856,
"step": 132
},
{
"epoch": 0.01745177798189214,
"grad_norm": 4.297300338745117,
"learning_rate": 8.693747451206232e-05,
"loss": 1.4618,
"step": 133
},
{
"epoch": 0.01758299435769584,
"grad_norm": 3.4930808544158936,
"learning_rate": 8.671612547178428e-05,
"loss": 1.3184,
"step": 134
},
{
"epoch": 0.01771421073349954,
"grad_norm": 3.8762755393981934,
"learning_rate": 8.649320363489179e-05,
"loss": 1.4333,
"step": 135
},
{
"epoch": 0.01784542710930324,
"grad_norm": 4.0097126960754395,
"learning_rate": 8.626871855061438e-05,
"loss": 1.4535,
"step": 136
},
{
"epoch": 0.01797664348510694,
"grad_norm": 4.139339923858643,
"learning_rate": 8.604267983514594e-05,
"loss": 1.4856,
"step": 137
},
{
"epoch": 0.018107859860910642,
"grad_norm": 3.6388514041900635,
"learning_rate": 8.581509717123273e-05,
"loss": 1.3928,
"step": 138
},
{
"epoch": 0.018239076236714342,
"grad_norm": 3.7924964427948,
"learning_rate": 8.558598030775857e-05,
"loss": 1.2458,
"step": 139
},
{
"epoch": 0.018370292612518043,
"grad_norm": 4.4245758056640625,
"learning_rate": 8.535533905932738e-05,
"loss": 1.4169,
"step": 140
},
{
"epoch": 0.018501508988321744,
"grad_norm": 3.6946239471435547,
"learning_rate": 8.51231833058426e-05,
"loss": 1.1482,
"step": 141
},
{
"epoch": 0.018632725364125444,
"grad_norm": 3.711535692214966,
"learning_rate": 8.488952299208401e-05,
"loss": 1.065,
"step": 142
},
{
"epoch": 0.01876394173992914,
"grad_norm": 3.8962907791137695,
"learning_rate": 8.46543681272818e-05,
"loss": 1.2842,
"step": 143
},
{
"epoch": 0.018895158115732842,
"grad_norm": 3.2645106315612793,
"learning_rate": 8.44177287846877e-05,
"loss": 0.8512,
"step": 144
},
{
"epoch": 0.019026374491536543,
"grad_norm": 3.1213479042053223,
"learning_rate": 8.417961510114356e-05,
"loss": 0.7018,
"step": 145
},
{
"epoch": 0.019157590867340243,
"grad_norm": 4.530689716339111,
"learning_rate": 8.39400372766471e-05,
"loss": 0.9977,
"step": 146
},
{
"epoch": 0.019288807243143944,
"grad_norm": 3.922245740890503,
"learning_rate": 8.36990055739149e-05,
"loss": 0.6566,
"step": 147
},
{
"epoch": 0.019420023618947645,
"grad_norm": 4.297173500061035,
"learning_rate": 8.345653031794292e-05,
"loss": 0.7287,
"step": 148
},
{
"epoch": 0.019551239994751345,
"grad_norm": 4.700723648071289,
"learning_rate": 8.321262189556409e-05,
"loss": 0.724,
"step": 149
},
{
"epoch": 0.019682456370555046,
"grad_norm": 4.133970737457275,
"learning_rate": 8.296729075500344e-05,
"loss": 0.5965,
"step": 150
},
{
"epoch": 0.019813672746358747,
"grad_norm": 2.415865898132324,
"learning_rate": 8.272054740543052e-05,
"loss": 2.1392,
"step": 151
},
{
"epoch": 0.019944889122162447,
"grad_norm": 2.01712965965271,
"learning_rate": 8.247240241650918e-05,
"loss": 2.1176,
"step": 152
},
{
"epoch": 0.020076105497966148,
"grad_norm": 1.6205581426620483,
"learning_rate": 8.222286641794488e-05,
"loss": 2.1266,
"step": 153
},
{
"epoch": 0.020207321873769845,
"grad_norm": 1.8821977376937866,
"learning_rate": 8.197195009902924e-05,
"loss": 2.126,
"step": 154
},
{
"epoch": 0.020338538249573546,
"grad_norm": 1.4580323696136475,
"learning_rate": 8.171966420818228e-05,
"loss": 2.081,
"step": 155
},
{
"epoch": 0.020469754625377246,
"grad_norm": 1.5964078903198242,
"learning_rate": 8.146601955249188e-05,
"loss": 2.1827,
"step": 156
},
{
"epoch": 0.020600971001180947,
"grad_norm": 2.922008752822876,
"learning_rate": 8.121102699725089e-05,
"loss": 2.1915,
"step": 157
},
{
"epoch": 0.020732187376984648,
"grad_norm": 2.2149009704589844,
"learning_rate": 8.095469746549172e-05,
"loss": 2.1432,
"step": 158
},
{
"epoch": 0.020863403752788348,
"grad_norm": 2.5889828205108643,
"learning_rate": 8.069704193751832e-05,
"loss": 2.101,
"step": 159
},
{
"epoch": 0.02099462012859205,
"grad_norm": 3.5003163814544678,
"learning_rate": 8.043807145043604e-05,
"loss": 2.1131,
"step": 160
},
{
"epoch": 0.02112583650439575,
"grad_norm": 3.9812967777252197,
"learning_rate": 8.017779709767858e-05,
"loss": 2.1469,
"step": 161
},
{
"epoch": 0.02125705288019945,
"grad_norm": 3.2259273529052734,
"learning_rate": 7.991623002853296e-05,
"loss": 2.0289,
"step": 162
},
{
"epoch": 0.02138826925600315,
"grad_norm": 4.384939670562744,
"learning_rate": 7.965338144766186e-05,
"loss": 2.2525,
"step": 163
},
{
"epoch": 0.021519485631806848,
"grad_norm": 4.085958957672119,
"learning_rate": 7.938926261462366e-05,
"loss": 2.0116,
"step": 164
},
{
"epoch": 0.02165070200761055,
"grad_norm": 2.951061964035034,
"learning_rate": 7.912388484339012e-05,
"loss": 2.0182,
"step": 165
},
{
"epoch": 0.02178191838341425,
"grad_norm": 5.261083126068115,
"learning_rate": 7.88572595018617e-05,
"loss": 1.8649,
"step": 166
},
{
"epoch": 0.02191313475921795,
"grad_norm": 5.297954082489014,
"learning_rate": 7.858939801138061e-05,
"loss": 1.9873,
"step": 167
},
{
"epoch": 0.02204435113502165,
"grad_norm": 3.1123671531677246,
"learning_rate": 7.832031184624164e-05,
"loss": 1.9712,
"step": 168
},
{
"epoch": 0.02217556751082535,
"grad_norm": 3.0200247764587402,
"learning_rate": 7.80500125332005e-05,
"loss": 1.7175,
"step": 169
},
{
"epoch": 0.02230678388662905,
"grad_norm": 4.045231819152832,
"learning_rate": 7.777851165098012e-05,
"loss": 1.6917,
"step": 170
},
{
"epoch": 0.022438000262432752,
"grad_norm": 3.3924217224121094,
"learning_rate": 7.750582082977467e-05,
"loss": 1.915,
"step": 171
},
{
"epoch": 0.022569216638236453,
"grad_norm": 3.338836669921875,
"learning_rate": 7.723195175075136e-05,
"loss": 1.7668,
"step": 172
},
{
"epoch": 0.022700433014040153,
"grad_norm": 3.4897708892822266,
"learning_rate": 7.695691614555003e-05,
"loss": 1.7118,
"step": 173
},
{
"epoch": 0.022831649389843854,
"grad_norm": 3.5899102687835693,
"learning_rate": 7.668072579578058e-05,
"loss": 1.6634,
"step": 174
},
{
"epoch": 0.02296286576564755,
"grad_norm": 3.5203874111175537,
"learning_rate": 7.64033925325184e-05,
"loss": 1.8026,
"step": 175
},
{
"epoch": 0.023094082141451252,
"grad_norm": 3.14801287651062,
"learning_rate": 7.612492823579745e-05,
"loss": 1.6852,
"step": 176
},
{
"epoch": 0.023225298517254953,
"grad_norm": 3.505002021789551,
"learning_rate": 7.584534483410137e-05,
"loss": 1.6794,
"step": 177
},
{
"epoch": 0.023356514893058653,
"grad_norm": 3.6999619007110596,
"learning_rate": 7.55646543038526e-05,
"loss": 1.6416,
"step": 178
},
{
"epoch": 0.023487731268862354,
"grad_norm": 3.438927412033081,
"learning_rate": 7.528286866889924e-05,
"loss": 1.6118,
"step": 179
},
{
"epoch": 0.023618947644666054,
"grad_norm": 3.4257144927978516,
"learning_rate": 7.500000000000001e-05,
"loss": 1.4316,
"step": 180
},
{
"epoch": 0.023750164020469755,
"grad_norm": 3.3115530014038086,
"learning_rate": 7.471606041430723e-05,
"loss": 1.4048,
"step": 181
},
{
"epoch": 0.023881380396273456,
"grad_norm": 3.7471344470977783,
"learning_rate": 7.443106207484776e-05,
"loss": 1.4955,
"step": 182
},
{
"epoch": 0.024012596772077156,
"grad_norm": 3.900952100753784,
"learning_rate": 7.414501719000187e-05,
"loss": 1.5621,
"step": 183
},
{
"epoch": 0.024143813147880857,
"grad_norm": 3.704540491104126,
"learning_rate": 7.385793801298042e-05,
"loss": 1.4024,
"step": 184
},
{
"epoch": 0.024275029523684554,
"grad_norm": 3.8118085861206055,
"learning_rate": 7.35698368412999e-05,
"loss": 1.4171,
"step": 185
},
{
"epoch": 0.024406245899488255,
"grad_norm": 4.9681196212768555,
"learning_rate": 7.328072601625557e-05,
"loss": 1.4693,
"step": 186
},
{
"epoch": 0.024537462275291955,
"grad_norm": 3.6511571407318115,
"learning_rate": 7.2990617922393e-05,
"loss": 1.2867,
"step": 187
},
{
"epoch": 0.024668678651095656,
"grad_norm": 3.6810758113861084,
"learning_rate": 7.269952498697734e-05,
"loss": 1.1199,
"step": 188
},
{
"epoch": 0.024799895026899357,
"grad_norm": 3.7081968784332275,
"learning_rate": 7.240745967946113e-05,
"loss": 1.0948,
"step": 189
},
{
"epoch": 0.024931111402703057,
"grad_norm": 3.9994025230407715,
"learning_rate": 7.211443451095007e-05,
"loss": 1.3183,
"step": 190
},
{
"epoch": 0.025062327778506758,
"grad_norm": 3.6044254302978516,
"learning_rate": 7.18204620336671e-05,
"loss": 1.2563,
"step": 191
},
{
"epoch": 0.02519354415431046,
"grad_norm": 3.95975923538208,
"learning_rate": 7.152555484041476e-05,
"loss": 0.9602,
"step": 192
},
{
"epoch": 0.02532476053011416,
"grad_norm": 3.7815332412719727,
"learning_rate": 7.122972556403567e-05,
"loss": 1.0867,
"step": 193
},
{
"epoch": 0.02545597690591786,
"grad_norm": 3.6170129776000977,
"learning_rate": 7.09329868768714e-05,
"loss": 0.8663,
"step": 194
},
{
"epoch": 0.02558719328172156,
"grad_norm": 3.3122055530548096,
"learning_rate": 7.063535149021973e-05,
"loss": 0.8313,
"step": 195
},
{
"epoch": 0.025718409657525258,
"grad_norm": 4.299932479858398,
"learning_rate": 7.033683215379002e-05,
"loss": 1.1031,
"step": 196
},
{
"epoch": 0.025849626033328958,
"grad_norm": 3.6584527492523193,
"learning_rate": 7.003744165515705e-05,
"loss": 0.6661,
"step": 197
},
{
"epoch": 0.02598084240913266,
"grad_norm": 3.595172166824341,
"learning_rate": 6.973719281921335e-05,
"loss": 0.6774,
"step": 198
},
{
"epoch": 0.02611205878493636,
"grad_norm": 3.2948033809661865,
"learning_rate": 6.943609850761979e-05,
"loss": 0.6726,
"step": 199
},
{
"epoch": 0.02624327516074006,
"grad_norm": 4.285879135131836,
"learning_rate": 6.91341716182545e-05,
"loss": 0.648,
"step": 200
},
{
"epoch": 0.02624327516074006,
"eval_loss": 1.8295915126800537,
"eval_runtime": 1356.6996,
"eval_samples_per_second": 9.461,
"eval_steps_per_second": 2.365,
"step": 200
},
{
"epoch": 0.02637449153654376,
"grad_norm": 2.5337905883789062,
"learning_rate": 6.883142508466054e-05,
"loss": 1.9931,
"step": 201
},
{
"epoch": 0.02650570791234746,
"grad_norm": 1.4764660596847534,
"learning_rate": 6.852787187549182e-05,
"loss": 2.0433,
"step": 202
},
{
"epoch": 0.026636924288151162,
"grad_norm": 2.1547956466674805,
"learning_rate": 6.82235249939575e-05,
"loss": 1.9185,
"step": 203
},
{
"epoch": 0.026768140663954863,
"grad_norm": 2.4460480213165283,
"learning_rate": 6.7918397477265e-05,
"loss": 1.9992,
"step": 204
},
{
"epoch": 0.026899357039758563,
"grad_norm": 1.4299720525741577,
"learning_rate": 6.761250239606169e-05,
"loss": 2.0654,
"step": 205
},
{
"epoch": 0.027030573415562264,
"grad_norm": 1.9943609237670898,
"learning_rate": 6.730585285387465e-05,
"loss": 2.0979,
"step": 206
},
{
"epoch": 0.02716178979136596,
"grad_norm": 2.2427730560302734,
"learning_rate": 6.699846198654971e-05,
"loss": 2.2645,
"step": 207
},
{
"epoch": 0.02729300616716966,
"grad_norm": 2.437592029571533,
"learning_rate": 6.669034296168855e-05,
"loss": 2.2168,
"step": 208
},
{
"epoch": 0.027424222542973362,
"grad_norm": 2.5972635746002197,
"learning_rate": 6.638150897808468e-05,
"loss": 2.2821,
"step": 209
},
{
"epoch": 0.027555438918777063,
"grad_norm": 2.6581530570983887,
"learning_rate": 6.607197326515808e-05,
"loss": 2.1277,
"step": 210
},
{
"epoch": 0.027686655294580764,
"grad_norm": 3.457130193710327,
"learning_rate": 6.57617490823885e-05,
"loss": 2.0846,
"step": 211
},
{
"epoch": 0.027817871670384464,
"grad_norm": 2.862334728240967,
"learning_rate": 6.545084971874738e-05,
"loss": 2.0455,
"step": 212
},
{
"epoch": 0.027949088046188165,
"grad_norm": 2.7949256896972656,
"learning_rate": 6.513928849212873e-05,
"loss": 1.8195,
"step": 213
},
{
"epoch": 0.028080304421991865,
"grad_norm": 3.603222608566284,
"learning_rate": 6.482707874877854e-05,
"loss": 2.1141,
"step": 214
},
{
"epoch": 0.028211520797795566,
"grad_norm": 3.7938568592071533,
"learning_rate": 6.451423386272312e-05,
"loss": 2.0341,
"step": 215
},
{
"epoch": 0.028342737173599267,
"grad_norm": 3.7415919303894043,
"learning_rate": 6.420076723519614e-05,
"loss": 1.9174,
"step": 216
},
{
"epoch": 0.028473953549402964,
"grad_norm": 3.2029411792755127,
"learning_rate": 6.388669229406462e-05,
"loss": 1.9153,
"step": 217
},
{
"epoch": 0.028605169925206664,
"grad_norm": 4.757354259490967,
"learning_rate": 6.357202249325371e-05,
"loss": 2.0522,
"step": 218
},
{
"epoch": 0.028736386301010365,
"grad_norm": 4.18367862701416,
"learning_rate": 6.32567713121704e-05,
"loss": 1.7395,
"step": 219
},
{
"epoch": 0.028867602676814066,
"grad_norm": 3.411064863204956,
"learning_rate": 6.294095225512603e-05,
"loss": 1.8052,
"step": 220
},
{
"epoch": 0.028998819052617766,
"grad_norm": 3.278550624847412,
"learning_rate": 6.26245788507579e-05,
"loss": 1.9089,
"step": 221
},
{
"epoch": 0.029130035428421467,
"grad_norm": 3.312464714050293,
"learning_rate": 6.230766465144967e-05,
"loss": 1.7548,
"step": 222
},
{
"epoch": 0.029261251804225168,
"grad_norm": 3.0991432666778564,
"learning_rate": 6.199022323275083e-05,
"loss": 1.6692,
"step": 223
},
{
"epoch": 0.02939246818002887,
"grad_norm": 3.4286837577819824,
"learning_rate": 6.167226819279528e-05,
"loss": 1.79,
"step": 224
},
{
"epoch": 0.02952368455583257,
"grad_norm": 3.0509002208709717,
"learning_rate": 6.135381315171867e-05,
"loss": 1.5622,
"step": 225
},
{
"epoch": 0.02965490093163627,
"grad_norm": 3.1480135917663574,
"learning_rate": 6.103487175107507e-05,
"loss": 1.622,
"step": 226
},
{
"epoch": 0.02978611730743997,
"grad_norm": 3.6017837524414062,
"learning_rate": 6.071545765325254e-05,
"loss": 1.6357,
"step": 227
},
{
"epoch": 0.029917333683243667,
"grad_norm": 3.5763702392578125,
"learning_rate": 6.0395584540887963e-05,
"loss": 1.647,
"step": 228
},
{
"epoch": 0.030048550059047368,
"grad_norm": 3.3560070991516113,
"learning_rate": 6.007526611628086e-05,
"loss": 1.4219,
"step": 229
},
{
"epoch": 0.03017976643485107,
"grad_norm": 3.4218170642852783,
"learning_rate": 5.9754516100806423e-05,
"loss": 1.5417,
"step": 230
},
{
"epoch": 0.03031098281065477,
"grad_norm": 3.3482179641723633,
"learning_rate": 5.9433348234327765e-05,
"loss": 1.4549,
"step": 231
},
{
"epoch": 0.03044219918645847,
"grad_norm": 3.298544406890869,
"learning_rate": 5.911177627460739e-05,
"loss": 1.4727,
"step": 232
},
{
"epoch": 0.03057341556226217,
"grad_norm": 3.360682487487793,
"learning_rate": 5.8789813996717736e-05,
"loss": 1.4171,
"step": 233
},
{
"epoch": 0.03070463193806587,
"grad_norm": 3.3437387943267822,
"learning_rate": 5.8467475192451226e-05,
"loss": 1.2873,
"step": 234
},
{
"epoch": 0.03083584831386957,
"grad_norm": 3.6862969398498535,
"learning_rate": 5.814477366972945e-05,
"loss": 1.5642,
"step": 235
},
{
"epoch": 0.030967064689673272,
"grad_norm": 3.4449985027313232,
"learning_rate": 5.782172325201155e-05,
"loss": 1.2568,
"step": 236
},
{
"epoch": 0.031098281065476973,
"grad_norm": 4.007747173309326,
"learning_rate": 5.749833777770225e-05,
"loss": 1.1918,
"step": 237
},
{
"epoch": 0.03122949744128067,
"grad_norm": 3.507702589035034,
"learning_rate": 5.717463109955896e-05,
"loss": 1.2402,
"step": 238
},
{
"epoch": 0.03136071381708437,
"grad_norm": 3.6946218013763428,
"learning_rate": 5.685061708409841e-05,
"loss": 1.4288,
"step": 239
},
{
"epoch": 0.031491930192888075,
"grad_norm": 4.057668209075928,
"learning_rate": 5.6526309611002594e-05,
"loss": 1.1473,
"step": 240
},
{
"epoch": 0.03162314656869177,
"grad_norm": 3.8214073181152344,
"learning_rate": 5.6201722572524275e-05,
"loss": 1.1556,
"step": 241
},
{
"epoch": 0.031754362944495476,
"grad_norm": 3.187711000442505,
"learning_rate": 5.587686987289189e-05,
"loss": 0.8827,
"step": 242
},
{
"epoch": 0.03188557932029917,
"grad_norm": 7.194174289703369,
"learning_rate": 5.5551765427713884e-05,
"loss": 1.1186,
"step": 243
},
{
"epoch": 0.03201679569610287,
"grad_norm": 3.3873300552368164,
"learning_rate": 5.522642316338268e-05,
"loss": 0.9614,
"step": 244
},
{
"epoch": 0.032148012071906575,
"grad_norm": 3.6875414848327637,
"learning_rate": 5.490085701647805e-05,
"loss": 0.9514,
"step": 245
},
{
"epoch": 0.03227922844771027,
"grad_norm": 2.922820806503296,
"learning_rate": 5.457508093317013e-05,
"loss": 0.6782,
"step": 246
},
{
"epoch": 0.032410444823513976,
"grad_norm": 2.960690975189209,
"learning_rate": 5.4249108868622086e-05,
"loss": 0.5822,
"step": 247
},
{
"epoch": 0.03254166119931767,
"grad_norm": 2.816145658493042,
"learning_rate": 5.392295478639225e-05,
"loss": 0.5743,
"step": 248
},
{
"epoch": 0.03267287757512138,
"grad_norm": 3.569626569747925,
"learning_rate": 5.359663265783598e-05,
"loss": 0.6907,
"step": 249
},
{
"epoch": 0.032804093950925074,
"grad_norm": 4.098865032196045,
"learning_rate": 5.327015646150716e-05,
"loss": 0.7811,
"step": 250
},
{
"epoch": 0.03293531032672878,
"grad_norm": 1.4596612453460693,
"learning_rate": 5.294354018255945e-05,
"loss": 1.9946,
"step": 251
},
{
"epoch": 0.033066526702532476,
"grad_norm": 1.3144938945770264,
"learning_rate": 5.26167978121472e-05,
"loss": 1.9186,
"step": 252
},
{
"epoch": 0.03319774307833618,
"grad_norm": 1.3099392652511597,
"learning_rate": 5.228994334682604e-05,
"loss": 1.9639,
"step": 253
},
{
"epoch": 0.03332895945413988,
"grad_norm": 1.3149683475494385,
"learning_rate": 5.196299078795344e-05,
"loss": 1.9173,
"step": 254
},
{
"epoch": 0.033460175829943574,
"grad_norm": 1.3609029054641724,
"learning_rate": 5.1635954141088813e-05,
"loss": 2.0152,
"step": 255
},
{
"epoch": 0.03359139220574728,
"grad_norm": 1.6170399188995361,
"learning_rate": 5.1308847415393666e-05,
"loss": 2.0613,
"step": 256
},
{
"epoch": 0.033722608581550975,
"grad_norm": 2.1593005657196045,
"learning_rate": 5.0981684623031415e-05,
"loss": 2.0864,
"step": 257
},
{
"epoch": 0.03385382495735468,
"grad_norm": 2.1808066368103027,
"learning_rate": 5.0654479778567223e-05,
"loss": 2.0672,
"step": 258
},
{
"epoch": 0.033985041333158376,
"grad_norm": 2.3150367736816406,
"learning_rate": 5.0327246898367597e-05,
"loss": 2.0188,
"step": 259
},
{
"epoch": 0.03411625770896208,
"grad_norm": 2.590977430343628,
"learning_rate": 5e-05,
"loss": 2.1361,
"step": 260
},
{
"epoch": 0.03424747408476578,
"grad_norm": 2.5015792846679688,
"learning_rate": 4.9672753101632415e-05,
"loss": 1.9808,
"step": 261
},
{
"epoch": 0.03437869046056948,
"grad_norm": 3.2308530807495117,
"learning_rate": 4.934552022143279e-05,
"loss": 2.0869,
"step": 262
},
{
"epoch": 0.03450990683637318,
"grad_norm": 3.2150726318359375,
"learning_rate": 4.901831537696859e-05,
"loss": 1.9293,
"step": 263
},
{
"epoch": 0.03464112321217688,
"grad_norm": 2.861090898513794,
"learning_rate": 4.869115258460635e-05,
"loss": 2.0099,
"step": 264
},
{
"epoch": 0.03477233958798058,
"grad_norm": 2.8258917331695557,
"learning_rate": 4.83640458589112e-05,
"loss": 1.8759,
"step": 265
},
{
"epoch": 0.03490355596378428,
"grad_norm": 2.8827760219573975,
"learning_rate": 4.8037009212046586e-05,
"loss": 1.8166,
"step": 266
},
{
"epoch": 0.03503477233958798,
"grad_norm": 2.917901039123535,
"learning_rate": 4.7710056653173976e-05,
"loss": 1.8738,
"step": 267
},
{
"epoch": 0.03516598871539168,
"grad_norm": 3.3789291381835938,
"learning_rate": 4.738320218785281e-05,
"loss": 1.8185,
"step": 268
},
{
"epoch": 0.03529720509119538,
"grad_norm": 3.0679268836975098,
"learning_rate": 4.7056459817440544e-05,
"loss": 1.7584,
"step": 269
},
{
"epoch": 0.03542842146699908,
"grad_norm": 3.146171808242798,
"learning_rate": 4.6729843538492847e-05,
"loss": 1.7729,
"step": 270
},
{
"epoch": 0.035559637842802784,
"grad_norm": 3.02197265625,
"learning_rate": 4.640336734216403e-05,
"loss": 1.7293,
"step": 271
},
{
"epoch": 0.03569085421860648,
"grad_norm": 3.030005931854248,
"learning_rate": 4.607704521360776e-05,
"loss": 1.7331,
"step": 272
},
{
"epoch": 0.035822070594410185,
"grad_norm": 2.7871899604797363,
"learning_rate": 4.575089113137792e-05,
"loss": 1.5662,
"step": 273
},
{
"epoch": 0.03595328697021388,
"grad_norm": 3.5039308071136475,
"learning_rate": 4.542491906682989e-05,
"loss": 1.7353,
"step": 274
},
{
"epoch": 0.03608450334601758,
"grad_norm": 3.1538591384887695,
"learning_rate": 4.509914298352197e-05,
"loss": 1.5676,
"step": 275
},
{
"epoch": 0.036215719721821284,
"grad_norm": 2.9758079051971436,
"learning_rate": 4.477357683661734e-05,
"loss": 1.6357,
"step": 276
},
{
"epoch": 0.03634693609762498,
"grad_norm": 3.1648526191711426,
"learning_rate": 4.444823457228612e-05,
"loss": 1.5309,
"step": 277
},
{
"epoch": 0.036478152473428685,
"grad_norm": 3.3066608905792236,
"learning_rate": 4.412313012710813e-05,
"loss": 1.572,
"step": 278
},
{
"epoch": 0.03660936884923238,
"grad_norm": 3.489302635192871,
"learning_rate": 4.379827742747575e-05,
"loss": 1.6067,
"step": 279
},
{
"epoch": 0.036740585225036086,
"grad_norm": 3.4767446517944336,
"learning_rate": 4.347369038899744e-05,
"loss": 1.6319,
"step": 280
},
{
"epoch": 0.03687180160083978,
"grad_norm": 3.145461320877075,
"learning_rate": 4.3149382915901606e-05,
"loss": 1.4674,
"step": 281
},
{
"epoch": 0.03700301797664349,
"grad_norm": 3.5818727016448975,
"learning_rate": 4.282536890044104e-05,
"loss": 1.566,
"step": 282
},
{
"epoch": 0.037134234352447185,
"grad_norm": 3.861572027206421,
"learning_rate": 4.250166222229774e-05,
"loss": 1.5454,
"step": 283
},
{
"epoch": 0.03726545072825089,
"grad_norm": 3.507399320602417,
"learning_rate": 4.2178276747988446e-05,
"loss": 1.3924,
"step": 284
},
{
"epoch": 0.037396667104054586,
"grad_norm": 3.6404268741607666,
"learning_rate": 4.185522633027057e-05,
"loss": 1.4609,
"step": 285
},
{
"epoch": 0.03752788347985828,
"grad_norm": 3.455463171005249,
"learning_rate": 4.153252480754877e-05,
"loss": 1.293,
"step": 286
},
{
"epoch": 0.03765909985566199,
"grad_norm": 3.82511305809021,
"learning_rate": 4.1210186003282275e-05,
"loss": 1.3777,
"step": 287
},
{
"epoch": 0.037790316231465684,
"grad_norm": 4.107599258422852,
"learning_rate": 4.088822372539263e-05,
"loss": 1.4122,
"step": 288
},
{
"epoch": 0.03792153260726939,
"grad_norm": 3.8363401889801025,
"learning_rate": 4.0566651765672246e-05,
"loss": 1.1753,
"step": 289
},
{
"epoch": 0.038052748983073086,
"grad_norm": 3.759277105331421,
"learning_rate": 4.0245483899193595e-05,
"loss": 1.2129,
"step": 290
},
{
"epoch": 0.03818396535887679,
"grad_norm": 3.4467883110046387,
"learning_rate": 3.992473388371915e-05,
"loss": 1.1519,
"step": 291
},
{
"epoch": 0.03831518173468049,
"grad_norm": 3.933537483215332,
"learning_rate": 3.960441545911204e-05,
"loss": 1.148,
"step": 292
},
{
"epoch": 0.03844639811048419,
"grad_norm": 3.5908944606781006,
"learning_rate": 3.928454234674747e-05,
"loss": 1.0006,
"step": 293
},
{
"epoch": 0.03857761448628789,
"grad_norm": 3.9158682823181152,
"learning_rate": 3.896512824892495e-05,
"loss": 1.1491,
"step": 294
},
{
"epoch": 0.03870883086209159,
"grad_norm": 3.2645325660705566,
"learning_rate": 3.864618684828134e-05,
"loss": 0.6771,
"step": 295
},
{
"epoch": 0.03884004723789529,
"grad_norm": 3.201977252960205,
"learning_rate": 3.832773180720475e-05,
"loss": 0.6873,
"step": 296
},
{
"epoch": 0.038971263613698987,
"grad_norm": 3.315368175506592,
"learning_rate": 3.800977676724919e-05,
"loss": 0.6829,
"step": 297
},
{
"epoch": 0.03910247998950269,
"grad_norm": 3.72304368019104,
"learning_rate": 3.769233534855035e-05,
"loss": 0.8417,
"step": 298
},
{
"epoch": 0.03923369636530639,
"grad_norm": 3.146021842956543,
"learning_rate": 3.73754211492421e-05,
"loss": 0.5809,
"step": 299
},
{
"epoch": 0.03936491274111009,
"grad_norm": 3.441509485244751,
"learning_rate": 3.705904774487396e-05,
"loss": 0.5907,
"step": 300
},
{
"epoch": 0.03936491274111009,
"eval_loss": 1.6083000898361206,
"eval_runtime": 1364.8957,
"eval_samples_per_second": 9.404,
"eval_steps_per_second": 2.351,
"step": 300
},
{
"epoch": 0.03949612911691379,
"grad_norm": 1.1431409120559692,
"learning_rate": 3.6743228687829595e-05,
"loss": 1.9111,
"step": 301
},
{
"epoch": 0.03962734549271749,
"grad_norm": 1.1873606443405151,
"learning_rate": 3.642797750674629e-05,
"loss": 1.8401,
"step": 302
},
{
"epoch": 0.03975856186852119,
"grad_norm": 1.1036310195922852,
"learning_rate": 3.6113307705935396e-05,
"loss": 1.9566,
"step": 303
},
{
"epoch": 0.039889778244324894,
"grad_norm": 1.2092435359954834,
"learning_rate": 3.579923276480387e-05,
"loss": 1.9171,
"step": 304
},
{
"epoch": 0.04002099462012859,
"grad_norm": 1.284285545349121,
"learning_rate": 3.5485766137276894e-05,
"loss": 1.8699,
"step": 305
},
{
"epoch": 0.040152210995932296,
"grad_norm": 1.4310280084609985,
"learning_rate": 3.5172921251221455e-05,
"loss": 1.9898,
"step": 306
},
{
"epoch": 0.04028342737173599,
"grad_norm": 1.8922241926193237,
"learning_rate": 3.486071150787128e-05,
"loss": 2.0229,
"step": 307
},
{
"epoch": 0.04041464374753969,
"grad_norm": 2.3455190658569336,
"learning_rate": 3.4549150281252636e-05,
"loss": 2.1236,
"step": 308
},
{
"epoch": 0.040545860123343394,
"grad_norm": 2.310145616531372,
"learning_rate": 3.423825091761153e-05,
"loss": 2.0076,
"step": 309
},
{
"epoch": 0.04067707649914709,
"grad_norm": 2.770493507385254,
"learning_rate": 3.392802673484193e-05,
"loss": 2.1042,
"step": 310
},
{
"epoch": 0.040808292874950795,
"grad_norm": 2.8514256477355957,
"learning_rate": 3.361849102191533e-05,
"loss": 2.1725,
"step": 311
},
{
"epoch": 0.04093950925075449,
"grad_norm": 2.9664652347564697,
"learning_rate": 3.330965703831146e-05,
"loss": 1.9951,
"step": 312
},
{
"epoch": 0.0410707256265582,
"grad_norm": 3.0636472702026367,
"learning_rate": 3.300153801345028e-05,
"loss": 2.0478,
"step": 313
},
{
"epoch": 0.041201942002361894,
"grad_norm": 3.0987637042999268,
"learning_rate": 3.2694147146125345e-05,
"loss": 1.8762,
"step": 314
},
{
"epoch": 0.0413331583781656,
"grad_norm": 3.3828492164611816,
"learning_rate": 3.2387497603938326e-05,
"loss": 2.0024,
"step": 315
},
{
"epoch": 0.041464374753969295,
"grad_norm": 3.3837296962738037,
"learning_rate": 3.2081602522734986e-05,
"loss": 1.8804,
"step": 316
},
{
"epoch": 0.041595591129773,
"grad_norm": 3.1520280838012695,
"learning_rate": 3.177647500604252e-05,
"loss": 1.7544,
"step": 317
},
{
"epoch": 0.041726807505576696,
"grad_norm": 3.064986228942871,
"learning_rate": 3.147212812450819e-05,
"loss": 1.7702,
"step": 318
},
{
"epoch": 0.04185802388138039,
"grad_norm": 2.9096457958221436,
"learning_rate": 3.116857491533947e-05,
"loss": 1.7361,
"step": 319
},
{
"epoch": 0.0419892402571841,
"grad_norm": 3.204127073287964,
"learning_rate": 3.086582838174551e-05,
"loss": 2.0178,
"step": 320
},
{
"epoch": 0.042120456632987795,
"grad_norm": 3.014491081237793,
"learning_rate": 3.056390149238022e-05,
"loss": 1.6361,
"step": 321
},
{
"epoch": 0.0422516730087915,
"grad_norm": 2.9719576835632324,
"learning_rate": 3.0262807180786647e-05,
"loss": 1.5053,
"step": 322
},
{
"epoch": 0.042382889384595196,
"grad_norm": 2.9963722229003906,
"learning_rate": 2.996255834484296e-05,
"loss": 1.6507,
"step": 323
},
{
"epoch": 0.0425141057603989,
"grad_norm": 3.126011848449707,
"learning_rate": 2.9663167846209998e-05,
"loss": 1.6596,
"step": 324
},
{
"epoch": 0.0426453221362026,
"grad_norm": 3.2865161895751953,
"learning_rate": 2.936464850978027e-05,
"loss": 1.742,
"step": 325
},
{
"epoch": 0.0427765385120063,
"grad_norm": 3.1985416412353516,
"learning_rate": 2.9067013123128613e-05,
"loss": 1.592,
"step": 326
},
{
"epoch": 0.04290775488781,
"grad_norm": 3.0526158809661865,
"learning_rate": 2.8770274435964355e-05,
"loss": 1.6197,
"step": 327
},
{
"epoch": 0.043038971263613696,
"grad_norm": 3.0706164836883545,
"learning_rate": 2.8474445159585235e-05,
"loss": 1.6091,
"step": 328
},
{
"epoch": 0.0431701876394174,
"grad_norm": 3.1661648750305176,
"learning_rate": 2.8179537966332887e-05,
"loss": 1.4687,
"step": 329
},
{
"epoch": 0.0433014040152211,
"grad_norm": 3.272674798965454,
"learning_rate": 2.7885565489049946e-05,
"loss": 1.5756,
"step": 330
},
{
"epoch": 0.0434326203910248,
"grad_norm": 3.0554146766662598,
"learning_rate": 2.759254032053888e-05,
"loss": 1.3903,
"step": 331
},
{
"epoch": 0.0435638367668285,
"grad_norm": 3.462747097015381,
"learning_rate": 2.7300475013022663e-05,
"loss": 1.433,
"step": 332
},
{
"epoch": 0.0436950531426322,
"grad_norm": 3.4860434532165527,
"learning_rate": 2.700938207760701e-05,
"loss": 1.4024,
"step": 333
},
{
"epoch": 0.0438262695184359,
"grad_norm": 3.317476987838745,
"learning_rate": 2.671927398374443e-05,
"loss": 1.3322,
"step": 334
},
{
"epoch": 0.043957485894239604,
"grad_norm": 3.5378825664520264,
"learning_rate": 2.6430163158700115e-05,
"loss": 1.3707,
"step": 335
},
{
"epoch": 0.0440887022700433,
"grad_norm": 3.7148430347442627,
"learning_rate": 2.6142061987019577e-05,
"loss": 1.4425,
"step": 336
},
{
"epoch": 0.044219918645847005,
"grad_norm": 3.060731887817383,
"learning_rate": 2.5854982809998153e-05,
"loss": 1.1957,
"step": 337
},
{
"epoch": 0.0443511350216507,
"grad_norm": 3.4139750003814697,
"learning_rate": 2.556893792515227e-05,
"loss": 1.3196,
"step": 338
},
{
"epoch": 0.0444823513974544,
"grad_norm": 3.1842236518859863,
"learning_rate": 2.5283939585692783e-05,
"loss": 1.182,
"step": 339
},
{
"epoch": 0.0446135677732581,
"grad_norm": 3.5313189029693604,
"learning_rate": 2.500000000000001e-05,
"loss": 1.1865,
"step": 340
},
{
"epoch": 0.0447447841490618,
"grad_norm": 3.486128091812134,
"learning_rate": 2.471713133110078e-05,
"loss": 1.1062,
"step": 341
},
{
"epoch": 0.044876000524865504,
"grad_norm": 3.0605080127716064,
"learning_rate": 2.4435345696147403e-05,
"loss": 0.8957,
"step": 342
},
{
"epoch": 0.0450072169006692,
"grad_norm": 3.2468960285186768,
"learning_rate": 2.4154655165898627e-05,
"loss": 0.9158,
"step": 343
},
{
"epoch": 0.045138433276472906,
"grad_norm": 3.440025568008423,
"learning_rate": 2.3875071764202563e-05,
"loss": 0.8078,
"step": 344
},
{
"epoch": 0.0452696496522766,
"grad_norm": 2.9829273223876953,
"learning_rate": 2.3596607467481603e-05,
"loss": 0.7363,
"step": 345
},
{
"epoch": 0.04540086602808031,
"grad_norm": 3.14467453956604,
"learning_rate": 2.3319274204219428e-05,
"loss": 0.5925,
"step": 346
},
{
"epoch": 0.045532082403884004,
"grad_norm": 3.064061403274536,
"learning_rate": 2.3043083854449988e-05,
"loss": 0.6579,
"step": 347
},
{
"epoch": 0.04566329877968771,
"grad_norm": 2.8710074424743652,
"learning_rate": 2.2768048249248648e-05,
"loss": 0.5748,
"step": 348
},
{
"epoch": 0.045794515155491405,
"grad_norm": 3.819432020187378,
"learning_rate": 2.2494179170225333e-05,
"loss": 0.6714,
"step": 349
},
{
"epoch": 0.0459257315312951,
"grad_norm": 4.986090183258057,
"learning_rate": 2.2221488349019903e-05,
"loss": 0.7477,
"step": 350
},
{
"epoch": 0.04605694790709881,
"grad_norm": 2.6902003288269043,
"learning_rate": 2.194998746679952e-05,
"loss": 1.8673,
"step": 351
},
{
"epoch": 0.046188164282902504,
"grad_norm": 2.166841506958008,
"learning_rate": 2.167968815375837e-05,
"loss": 1.8827,
"step": 352
},
{
"epoch": 0.04631938065870621,
"grad_norm": 1.7183693647384644,
"learning_rate": 2.1410601988619394e-05,
"loss": 1.8546,
"step": 353
},
{
"epoch": 0.046450597034509905,
"grad_norm": 1.265217900276184,
"learning_rate": 2.1142740498138324e-05,
"loss": 1.9024,
"step": 354
},
{
"epoch": 0.04658181341031361,
"grad_norm": 1.721006155014038,
"learning_rate": 2.08761151566099e-05,
"loss": 1.955,
"step": 355
},
{
"epoch": 0.046713029786117306,
"grad_norm": 2.1906604766845703,
"learning_rate": 2.061073738537635e-05,
"loss": 2.01,
"step": 356
},
{
"epoch": 0.04684424616192101,
"grad_norm": 2.1465506553649902,
"learning_rate": 2.034661855233815e-05,
"loss": 2.0038,
"step": 357
},
{
"epoch": 0.04697546253772471,
"grad_norm": 2.263490915298462,
"learning_rate": 2.008376997146705e-05,
"loss": 2.068,
"step": 358
},
{
"epoch": 0.04710667891352841,
"grad_norm": 2.7283573150634766,
"learning_rate": 1.982220290232143e-05,
"loss": 2.0493,
"step": 359
},
{
"epoch": 0.04723789528933211,
"grad_norm": 2.6310789585113525,
"learning_rate": 1.9561928549563968e-05,
"loss": 2.069,
"step": 360
},
{
"epoch": 0.047369111665135806,
"grad_norm": 2.767486333847046,
"learning_rate": 1.9302958062481673e-05,
"loss": 2.028,
"step": 361
},
{
"epoch": 0.04750032804093951,
"grad_norm": 2.8701977729797363,
"learning_rate": 1.9045302534508297e-05,
"loss": 1.9971,
"step": 362
},
{
"epoch": 0.04763154441674321,
"grad_norm": 3.023191213607788,
"learning_rate": 1.8788973002749112e-05,
"loss": 1.9896,
"step": 363
},
{
"epoch": 0.04776276079254691,
"grad_norm": 3.0075771808624268,
"learning_rate": 1.8533980447508137e-05,
"loss": 1.9956,
"step": 364
},
{
"epoch": 0.04789397716835061,
"grad_norm": 3.155802011489868,
"learning_rate": 1.8280335791817733e-05,
"loss": 1.7729,
"step": 365
},
{
"epoch": 0.04802519354415431,
"grad_norm": 3.0290050506591797,
"learning_rate": 1.8028049900970767e-05,
"loss": 1.7798,
"step": 366
},
{
"epoch": 0.04815640991995801,
"grad_norm": 2.8980050086975098,
"learning_rate": 1.777713358205514e-05,
"loss": 1.7175,
"step": 367
},
{
"epoch": 0.048287626295761714,
"grad_norm": 3.2710909843444824,
"learning_rate": 1.7527597583490822e-05,
"loss": 1.829,
"step": 368
},
{
"epoch": 0.04841884267156541,
"grad_norm": 3.313262462615967,
"learning_rate": 1.7279452594569483e-05,
"loss": 1.8382,
"step": 369
},
{
"epoch": 0.04855005904736911,
"grad_norm": 3.1630051136016846,
"learning_rate": 1.703270924499656e-05,
"loss": 1.7057,
"step": 370
},
{
"epoch": 0.04868127542317281,
"grad_norm": 3.111182928085327,
"learning_rate": 1.678737810443593e-05,
"loss": 1.6428,
"step": 371
},
{
"epoch": 0.04881249179897651,
"grad_norm": 3.932502508163452,
"learning_rate": 1.6543469682057106e-05,
"loss": 1.7103,
"step": 372
},
{
"epoch": 0.048943708174780214,
"grad_norm": 3.1841988563537598,
"learning_rate": 1.6300994426085103e-05,
"loss": 1.5631,
"step": 373
},
{
"epoch": 0.04907492455058391,
"grad_norm": 3.1388206481933594,
"learning_rate": 1.605996272335291e-05,
"loss": 1.6667,
"step": 374
},
{
"epoch": 0.049206140926387615,
"grad_norm": 3.2520394325256348,
"learning_rate": 1.5820384898856434e-05,
"loss": 1.5319,
"step": 375
},
{
"epoch": 0.04933735730219131,
"grad_norm": 3.448615074157715,
"learning_rate": 1.5582271215312294e-05,
"loss": 1.3435,
"step": 376
},
{
"epoch": 0.049468573677995016,
"grad_norm": 3.1773369312286377,
"learning_rate": 1.5345631872718214e-05,
"loss": 1.4977,
"step": 377
},
{
"epoch": 0.04959979005379871,
"grad_norm": 3.773745536804199,
"learning_rate": 1.5110477007916001e-05,
"loss": 1.6206,
"step": 378
},
{
"epoch": 0.04973100642960242,
"grad_norm": 3.2866322994232178,
"learning_rate": 1.4876816694157419e-05,
"loss": 1.4102,
"step": 379
},
{
"epoch": 0.049862222805406115,
"grad_norm": 3.218993902206421,
"learning_rate": 1.4644660940672627e-05,
"loss": 1.4885,
"step": 380
},
{
"epoch": 0.04999343918120981,
"grad_norm": 3.413767099380493,
"learning_rate": 1.4414019692241437e-05,
"loss": 1.4064,
"step": 381
},
{
"epoch": 0.050124655557013516,
"grad_norm": 3.1845431327819824,
"learning_rate": 1.4184902828767287e-05,
"loss": 1.3509,
"step": 382
},
{
"epoch": 0.05025587193281721,
"grad_norm": 3.4849841594696045,
"learning_rate": 1.3957320164854059e-05,
"loss": 1.4778,
"step": 383
},
{
"epoch": 0.05038708830862092,
"grad_norm": 3.433154344558716,
"learning_rate": 1.373128144938563e-05,
"loss": 1.2556,
"step": 384
},
{
"epoch": 0.050518304684424614,
"grad_norm": 3.4509999752044678,
"learning_rate": 1.3506796365108232e-05,
"loss": 1.3327,
"step": 385
},
{
"epoch": 0.05064952106022832,
"grad_norm": 3.3174984455108643,
"learning_rate": 1.3283874528215733e-05,
"loss": 1.2881,
"step": 386
},
{
"epoch": 0.050780737436032015,
"grad_norm": 3.912639856338501,
"learning_rate": 1.3062525487937699e-05,
"loss": 1.2161,
"step": 387
},
{
"epoch": 0.05091195381183572,
"grad_norm": 3.7738358974456787,
"learning_rate": 1.2842758726130283e-05,
"loss": 1.4717,
"step": 388
},
{
"epoch": 0.05104317018763942,
"grad_norm": 3.3311028480529785,
"learning_rate": 1.2624583656870154e-05,
"loss": 0.9475,
"step": 389
},
{
"epoch": 0.05117438656344312,
"grad_norm": 3.2263801097869873,
"learning_rate": 1.2408009626051137e-05,
"loss": 1.0097,
"step": 390
},
{
"epoch": 0.05130560293924682,
"grad_norm": 3.5462255477905273,
"learning_rate": 1.2193045910983863e-05,
"loss": 1.0301,
"step": 391
},
{
"epoch": 0.051436819315050515,
"grad_norm": 3.075239419937134,
"learning_rate": 1.1979701719998453e-05,
"loss": 0.8822,
"step": 392
},
{
"epoch": 0.05156803569085422,
"grad_norm": 3.8291819095611572,
"learning_rate": 1.1767986192049984e-05,
"loss": 1.0269,
"step": 393
},
{
"epoch": 0.051699252066657916,
"grad_norm": 3.1441256999969482,
"learning_rate": 1.1557908396327028e-05,
"loss": 0.8817,
"step": 394
},
{
"epoch": 0.05183046844246162,
"grad_norm": 3.654750347137451,
"learning_rate": 1.134947733186315e-05,
"loss": 0.8977,
"step": 395
},
{
"epoch": 0.05196168481826532,
"grad_norm": 3.509984254837036,
"learning_rate": 1.1142701927151456e-05,
"loss": 0.7434,
"step": 396
},
{
"epoch": 0.05209290119406902,
"grad_norm": 3.4480764865875244,
"learning_rate": 1.0937591039762085e-05,
"loss": 0.6737,
"step": 397
},
{
"epoch": 0.05222411756987272,
"grad_norm": 3.189035415649414,
"learning_rate": 1.0734153455962765e-05,
"loss": 0.5615,
"step": 398
},
{
"epoch": 0.05235533394567642,
"grad_norm": 2.6568944454193115,
"learning_rate": 1.0532397890342505e-05,
"loss": 0.4878,
"step": 399
},
{
"epoch": 0.05248655032148012,
"grad_norm": 4.114481449127197,
"learning_rate": 1.0332332985438248e-05,
"loss": 0.6082,
"step": 400
},
{
"epoch": 0.05248655032148012,
"eval_loss": 1.4705314636230469,
"eval_runtime": 1366.8691,
"eval_samples_per_second": 9.391,
"eval_steps_per_second": 2.348,
"step": 400
},
{
"epoch": 0.052617766697283824,
"grad_norm": 1.5547120571136475,
"learning_rate": 1.013396731136465e-05,
"loss": 1.8584,
"step": 401
},
{
"epoch": 0.05274898307308752,
"grad_norm": 1.4148300886154175,
"learning_rate": 9.937309365446973e-06,
"loss": 1.8086,
"step": 402
},
{
"epoch": 0.05288019944889122,
"grad_norm": 1.1794555187225342,
"learning_rate": 9.742367571857091e-06,
"loss": 1.7795,
"step": 403
},
{
"epoch": 0.05301141582469492,
"grad_norm": 1.2620103359222412,
"learning_rate": 9.549150281252633e-06,
"loss": 1.8577,
"step": 404
},
{
"epoch": 0.05314263220049862,
"grad_norm": 1.2999380826950073,
"learning_rate": 9.357665770419244e-06,
"loss": 1.9158,
"step": 405
},
{
"epoch": 0.053273848576302324,
"grad_norm": 1.7266931533813477,
"learning_rate": 9.167922241916055e-06,
"loss": 1.9962,
"step": 406
},
{
"epoch": 0.05340506495210602,
"grad_norm": 1.747637152671814,
"learning_rate": 8.97992782372432e-06,
"loss": 1.9808,
"step": 407
},
{
"epoch": 0.053536281327909725,
"grad_norm": 2.2035903930664062,
"learning_rate": 8.793690568899216e-06,
"loss": 1.975,
"step": 408
},
{
"epoch": 0.05366749770371342,
"grad_norm": 2.1186153888702393,
"learning_rate": 8.609218455224893e-06,
"loss": 1.9926,
"step": 409
},
{
"epoch": 0.053798714079517126,
"grad_norm": 2.213442087173462,
"learning_rate": 8.426519384872733e-06,
"loss": 1.9039,
"step": 410
},
{
"epoch": 0.053929930455320824,
"grad_norm": 2.3685803413391113,
"learning_rate": 8.245601184062852e-06,
"loss": 1.958,
"step": 411
},
{
"epoch": 0.05406114683112453,
"grad_norm": 2.664763927459717,
"learning_rate": 8.066471602728803e-06,
"loss": 1.9546,
"step": 412
},
{
"epoch": 0.054192363206928225,
"grad_norm": 2.719207286834717,
"learning_rate": 7.889138314185678e-06,
"loss": 1.9461,
"step": 413
},
{
"epoch": 0.05432357958273192,
"grad_norm": 2.6173901557922363,
"learning_rate": 7.71360891480134e-06,
"loss": 1.8008,
"step": 414
},
{
"epoch": 0.054454795958535626,
"grad_norm": 3.068253755569458,
"learning_rate": 7.539890923671062e-06,
"loss": 1.9327,
"step": 415
},
{
"epoch": 0.05458601233433932,
"grad_norm": 2.894899606704712,
"learning_rate": 7.367991782295391e-06,
"loss": 1.9416,
"step": 416
},
{
"epoch": 0.05471722871014303,
"grad_norm": 3.028637170791626,
"learning_rate": 7.197918854261432e-06,
"loss": 1.7818,
"step": 417
},
{
"epoch": 0.054848445085946725,
"grad_norm": 2.8751864433288574,
"learning_rate": 7.029679424927365e-06,
"loss": 1.8194,
"step": 418
},
{
"epoch": 0.05497966146175043,
"grad_norm": 3.282900810241699,
"learning_rate": 6.863280701110408e-06,
"loss": 1.8425,
"step": 419
},
{
"epoch": 0.055110877837554126,
"grad_norm": 3.1004960536956787,
"learning_rate": 6.698729810778065e-06,
"loss": 1.6386,
"step": 420
},
{
"epoch": 0.05524209421335783,
"grad_norm": 3.0315091609954834,
"learning_rate": 6.536033802742813e-06,
"loss": 1.792,
"step": 421
},
{
"epoch": 0.05537331058916153,
"grad_norm": 3.5640015602111816,
"learning_rate": 6.375199646360142e-06,
"loss": 1.8219,
"step": 422
},
{
"epoch": 0.055504526964965224,
"grad_norm": 3.106428861618042,
"learning_rate": 6.216234231230012e-06,
"loss": 1.5988,
"step": 423
},
{
"epoch": 0.05563574334076893,
"grad_norm": 3.2876532077789307,
"learning_rate": 6.059144366901736e-06,
"loss": 1.6089,
"step": 424
},
{
"epoch": 0.055766959716572626,
"grad_norm": 3.1995625495910645,
"learning_rate": 5.903936782582253e-06,
"loss": 1.597,
"step": 425
},
{
"epoch": 0.05589817609237633,
"grad_norm": 3.096595287322998,
"learning_rate": 5.750618126847912e-06,
"loss": 1.3686,
"step": 426
},
{
"epoch": 0.05602939246818003,
"grad_norm": 3.0396366119384766,
"learning_rate": 5.599194967359639e-06,
"loss": 1.5684,
"step": 427
},
{
"epoch": 0.05616060884398373,
"grad_norm": 3.076733350753784,
"learning_rate": 5.449673790581611e-06,
"loss": 1.4296,
"step": 428
},
{
"epoch": 0.05629182521978743,
"grad_norm": 3.201915740966797,
"learning_rate": 5.302061001503394e-06,
"loss": 1.6379,
"step": 429
},
{
"epoch": 0.05642304159559113,
"grad_norm": 3.393874168395996,
"learning_rate": 5.156362923365588e-06,
"loss": 1.5963,
"step": 430
},
{
"epoch": 0.05655425797139483,
"grad_norm": 3.4268441200256348,
"learning_rate": 5.012585797388936e-06,
"loss": 1.3698,
"step": 431
},
{
"epoch": 0.05668547434719853,
"grad_norm": 3.298431158065796,
"learning_rate": 4.87073578250698e-06,
"loss": 1.5274,
"step": 432
},
{
"epoch": 0.05681669072300223,
"grad_norm": 3.310758352279663,
"learning_rate": 4.730818955102234e-06,
"loss": 1.2892,
"step": 433
},
{
"epoch": 0.05694790709880593,
"grad_norm": 3.1434218883514404,
"learning_rate": 4.592841308745932e-06,
"loss": 1.2763,
"step": 434
},
{
"epoch": 0.05707912347460963,
"grad_norm": 3.1354024410247803,
"learning_rate": 4.456808753941205e-06,
"loss": 1.2465,
"step": 435
},
{
"epoch": 0.05721033985041333,
"grad_norm": 3.235828161239624,
"learning_rate": 4.322727117869951e-06,
"loss": 1.1989,
"step": 436
},
{
"epoch": 0.05734155622621703,
"grad_norm": 3.4798684120178223,
"learning_rate": 4.190602144143207e-06,
"loss": 1.3004,
"step": 437
},
{
"epoch": 0.05747277260202073,
"grad_norm": 3.5043392181396484,
"learning_rate": 4.06043949255509e-06,
"loss": 1.1935,
"step": 438
},
{
"epoch": 0.057603988977824434,
"grad_norm": 3.677044630050659,
"learning_rate": 3.932244738840379e-06,
"loss": 1.2309,
"step": 439
},
{
"epoch": 0.05773520535362813,
"grad_norm": 4.231906414031982,
"learning_rate": 3.8060233744356633e-06,
"loss": 1.2004,
"step": 440
},
{
"epoch": 0.057866421729431836,
"grad_norm": 3.8758881092071533,
"learning_rate": 3.681780806244095e-06,
"loss": 1.1777,
"step": 441
},
{
"epoch": 0.05799763810523553,
"grad_norm": 3.5995683670043945,
"learning_rate": 3.5595223564037884e-06,
"loss": 1.1675,
"step": 442
},
{
"epoch": 0.05812885448103924,
"grad_norm": 3.3509786128997803,
"learning_rate": 3.4392532620598216e-06,
"loss": 1.1342,
"step": 443
},
{
"epoch": 0.058260070856842934,
"grad_norm": 3.3423001766204834,
"learning_rate": 3.3209786751399187e-06,
"loss": 1.0987,
"step": 444
},
{
"epoch": 0.05839128723264663,
"grad_norm": 3.2946929931640625,
"learning_rate": 3.2047036621337236e-06,
"loss": 0.8206,
"step": 445
},
{
"epoch": 0.058522503608450335,
"grad_norm": 3.773805618286133,
"learning_rate": 3.0904332038757977e-06,
"loss": 0.792,
"step": 446
},
{
"epoch": 0.05865371998425403,
"grad_norm": 3.8902366161346436,
"learning_rate": 2.978172195332263e-06,
"loss": 0.9764,
"step": 447
},
{
"epoch": 0.05878493636005774,
"grad_norm": 3.055663824081421,
"learning_rate": 2.8679254453910785e-06,
"loss": 0.5963,
"step": 448
},
{
"epoch": 0.058916152735861434,
"grad_norm": 2.5872833728790283,
"learning_rate": 2.759697676656098e-06,
"loss": 0.4251,
"step": 449
},
{
"epoch": 0.05904736911166514,
"grad_norm": 2.976865530014038,
"learning_rate": 2.653493525244721e-06,
"loss": 0.5668,
"step": 450
},
{
"epoch": 0.059178585487468835,
"grad_norm": 1.0085222721099854,
"learning_rate": 2.549317540589308e-06,
"loss": 1.8009,
"step": 451
},
{
"epoch": 0.05930980186327254,
"grad_norm": 1.0669842958450317,
"learning_rate": 2.4471741852423237e-06,
"loss": 1.7289,
"step": 452
},
{
"epoch": 0.059441018239076236,
"grad_norm": 1.1122716665267944,
"learning_rate": 2.3470678346851518e-06,
"loss": 1.8283,
"step": 453
},
{
"epoch": 0.05957223461487994,
"grad_norm": 1.1549385786056519,
"learning_rate": 2.2490027771406687e-06,
"loss": 1.925,
"step": 454
},
{
"epoch": 0.05970345099068364,
"grad_norm": 1.1655536890029907,
"learning_rate": 2.152983213389559e-06,
"loss": 1.9207,
"step": 455
},
{
"epoch": 0.059834667366487335,
"grad_norm": 1.4617455005645752,
"learning_rate": 2.0590132565903476e-06,
"loss": 1.9206,
"step": 456
},
{
"epoch": 0.05996588374229104,
"grad_norm": 1.655059576034546,
"learning_rate": 1.9670969321032407e-06,
"loss": 1.9933,
"step": 457
},
{
"epoch": 0.060097100118094736,
"grad_norm": 1.801466703414917,
"learning_rate": 1.8772381773176417e-06,
"loss": 1.9565,
"step": 458
},
{
"epoch": 0.06022831649389844,
"grad_norm": 1.9467339515686035,
"learning_rate": 1.7894408414835362e-06,
"loss": 1.9179,
"step": 459
},
{
"epoch": 0.06035953286970214,
"grad_norm": 2.1917128562927246,
"learning_rate": 1.70370868554659e-06,
"loss": 1.9006,
"step": 460
},
{
"epoch": 0.06049074924550584,
"grad_norm": 2.31794810295105,
"learning_rate": 1.620045381987012e-06,
"loss": 1.8937,
"step": 461
},
{
"epoch": 0.06062196562130954,
"grad_norm": 2.556521415710449,
"learning_rate": 1.5384545146622852e-06,
"loss": 1.9225,
"step": 462
},
{
"epoch": 0.06075318199711324,
"grad_norm": 2.7441983222961426,
"learning_rate": 1.4589395786535953e-06,
"loss": 1.7219,
"step": 463
},
{
"epoch": 0.06088439837291694,
"grad_norm": 2.604498863220215,
"learning_rate": 1.3815039801161721e-06,
"loss": 1.7661,
"step": 464
},
{
"epoch": 0.061015614748720644,
"grad_norm": 2.8174169063568115,
"learning_rate": 1.3061510361333185e-06,
"loss": 1.8408,
"step": 465
},
{
"epoch": 0.06114683112452434,
"grad_norm": 2.8200817108154297,
"learning_rate": 1.232883974574367e-06,
"loss": 1.7759,
"step": 466
},
{
"epoch": 0.06127804750032804,
"grad_norm": 3.005772829055786,
"learning_rate": 1.1617059339563807e-06,
"loss": 1.7675,
"step": 467
},
{
"epoch": 0.06140926387613174,
"grad_norm": 2.790365219116211,
"learning_rate": 1.0926199633097157e-06,
"loss": 1.6673,
"step": 468
},
{
"epoch": 0.06154048025193544,
"grad_norm": 2.7628681659698486,
"learning_rate": 1.0256290220474307e-06,
"loss": 1.5659,
"step": 469
},
{
"epoch": 0.06167169662773914,
"grad_norm": 2.9881839752197266,
"learning_rate": 9.607359798384785e-07,
"loss": 1.6035,
"step": 470
},
{
"epoch": 0.06180291300354284,
"grad_norm": 3.119401693344116,
"learning_rate": 8.979436164848088e-07,
"loss": 1.7385,
"step": 471
},
{
"epoch": 0.061934129379346545,
"grad_norm": 3.145608425140381,
"learning_rate": 8.372546218022747e-07,
"loss": 1.8044,
"step": 472
},
{
"epoch": 0.06206534575515024,
"grad_norm": 3.2894794940948486,
"learning_rate": 7.786715955054203e-07,
"loss": 1.6499,
"step": 473
},
{
"epoch": 0.062196562130953946,
"grad_norm": 3.073772430419922,
"learning_rate": 7.221970470961125e-07,
"loss": 1.6461,
"step": 474
},
{
"epoch": 0.06232777850675764,
"grad_norm": 2.914522647857666,
"learning_rate": 6.678333957560512e-07,
"loss": 1.4814,
"step": 475
},
{
"epoch": 0.06245899488256134,
"grad_norm": 3.2599356174468994,
"learning_rate": 6.15582970243117e-07,
"loss": 1.5119,
"step": 476
},
{
"epoch": 0.06259021125836504,
"grad_norm": 3.0420191287994385,
"learning_rate": 5.654480087916303e-07,
"loss": 1.5962,
"step": 477
},
{
"epoch": 0.06272142763416874,
"grad_norm": 3.278502941131592,
"learning_rate": 5.174306590164879e-07,
"loss": 1.5504,
"step": 478
},
{
"epoch": 0.06285264400997244,
"grad_norm": 2.9558990001678467,
"learning_rate": 4.715329778211375e-07,
"loss": 1.4017,
"step": 479
},
{
"epoch": 0.06298386038577615,
"grad_norm": 3.2562479972839355,
"learning_rate": 4.277569313094809e-07,
"loss": 1.3569,
"step": 480
},
{
"epoch": 0.06311507676157985,
"grad_norm": 3.3667497634887695,
"learning_rate": 3.8610439470164737e-07,
"loss": 1.452,
"step": 481
},
{
"epoch": 0.06324629313738354,
"grad_norm": 3.1160595417022705,
"learning_rate": 3.465771522536854e-07,
"loss": 1.2796,
"step": 482
},
{
"epoch": 0.06337750951318724,
"grad_norm": 3.3417913913726807,
"learning_rate": 3.09176897181096e-07,
"loss": 1.4638,
"step": 483
},
{
"epoch": 0.06350872588899095,
"grad_norm": 3.3148908615112305,
"learning_rate": 2.7390523158633554e-07,
"loss": 1.3271,
"step": 484
},
{
"epoch": 0.06363994226479465,
"grad_norm": 3.5485448837280273,
"learning_rate": 2.407636663901591e-07,
"loss": 1.4274,
"step": 485
},
{
"epoch": 0.06377115864059835,
"grad_norm": 3.49263858795166,
"learning_rate": 2.0975362126691712e-07,
"loss": 1.3793,
"step": 486
},
{
"epoch": 0.06390237501640204,
"grad_norm": 3.290903091430664,
"learning_rate": 1.8087642458373134e-07,
"loss": 1.2698,
"step": 487
},
{
"epoch": 0.06403359139220574,
"grad_norm": 3.2131378650665283,
"learning_rate": 1.5413331334360182e-07,
"loss": 1.1554,
"step": 488
},
{
"epoch": 0.06416480776800945,
"grad_norm": 3.7000808715820312,
"learning_rate": 1.2952543313240472e-07,
"loss": 1.2878,
"step": 489
},
{
"epoch": 0.06429602414381315,
"grad_norm": 3.0949819087982178,
"learning_rate": 1.0705383806982606e-07,
"loss": 1.0326,
"step": 490
},
{
"epoch": 0.06442724051961685,
"grad_norm": 3.2864902019500732,
"learning_rate": 8.671949076420882e-08,
"loss": 1.041,
"step": 491
},
{
"epoch": 0.06455845689542054,
"grad_norm": 3.193171262741089,
"learning_rate": 6.852326227130834e-08,
"loss": 0.9282,
"step": 492
},
{
"epoch": 0.06468967327122425,
"grad_norm": 3.8569085597991943,
"learning_rate": 5.246593205699424e-08,
"loss": 0.8974,
"step": 493
},
{
"epoch": 0.06482088964702795,
"grad_norm": 3.4853317737579346,
"learning_rate": 3.8548187963854956e-08,
"loss": 0.797,
"step": 494
},
{
"epoch": 0.06495210602283165,
"grad_norm": 3.2885305881500244,
"learning_rate": 2.6770626181715773e-08,
"loss": 0.9086,
"step": 495
},
{
"epoch": 0.06508332239863535,
"grad_norm": 3.696621894836426,
"learning_rate": 1.7133751222137007e-08,
"loss": 0.8038,
"step": 496
},
{
"epoch": 0.06521453877443904,
"grad_norm": 3.6335575580596924,
"learning_rate": 9.637975896759077e-09,
"loss": 0.9278,
"step": 497
},
{
"epoch": 0.06534575515024275,
"grad_norm": 3.405994176864624,
"learning_rate": 4.2836212996499865e-09,
"loss": 0.7081,
"step": 498
},
{
"epoch": 0.06547697152604645,
"grad_norm": 2.7525062561035156,
"learning_rate": 1.0709167935385455e-09,
"loss": 0.5322,
"step": 499
},
{
"epoch": 0.06560818790185015,
"grad_norm": 3.2091686725616455,
"learning_rate": 0.0,
"loss": 0.491,
"step": 500
},
{
"epoch": 0.06560818790185015,
"eval_loss": 1.454202651977539,
"eval_runtime": 1356.7364,
"eval_samples_per_second": 9.461,
"eval_steps_per_second": 2.365,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.026400488001372e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}