tyzhu's picture
End of training
bb7096f verified
raw
history blame contribute delete
No virus
40.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.99770484278173,
"eval_steps": 500,
"global_step": 21780,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.09180628873077806,
"grad_norm": 0.26081690192222595,
"learning_rate": 0.0001,
"loss": 1.6526,
"step": 100
},
{
"epoch": 0.18361257746155613,
"grad_norm": 0.25705486536026,
"learning_rate": 0.0001,
"loss": 1.5908,
"step": 200
},
{
"epoch": 0.2754188661923342,
"grad_norm": 0.2516353726387024,
"learning_rate": 0.0001,
"loss": 1.5712,
"step": 300
},
{
"epoch": 0.36722515492311225,
"grad_norm": 0.22883176803588867,
"learning_rate": 0.0001,
"loss": 1.5768,
"step": 400
},
{
"epoch": 0.4590314436538903,
"grad_norm": 0.236283078789711,
"learning_rate": 0.0001,
"loss": 1.5744,
"step": 500
},
{
"epoch": 0.5508377323846684,
"grad_norm": 0.231370747089386,
"learning_rate": 0.0001,
"loss": 1.5722,
"step": 600
},
{
"epoch": 0.6426440211154464,
"grad_norm": 0.24125300347805023,
"learning_rate": 0.0001,
"loss": 1.5736,
"step": 700
},
{
"epoch": 0.7344503098462245,
"grad_norm": 0.2657057046890259,
"learning_rate": 0.0001,
"loss": 1.5831,
"step": 800
},
{
"epoch": 0.8262565985770025,
"grad_norm": 0.27335691452026367,
"learning_rate": 0.0001,
"loss": 1.5521,
"step": 900
},
{
"epoch": 0.9180628873077806,
"grad_norm": 0.2826825976371765,
"learning_rate": 0.0001,
"loss": 1.5635,
"step": 1000
},
{
"epoch": 0.999770484278173,
"eval_accuracy": 0.6795836972343523,
"eval_loss": 1.4614675045013428,
"eval_runtime": 9.1641,
"eval_samples_per_second": 54.561,
"eval_steps_per_second": 6.875,
"step": 1089
},
{
"epoch": 1.0098691760385587,
"grad_norm": 0.24073347449302673,
"learning_rate": 0.0001,
"loss": 1.5475,
"step": 1100
},
{
"epoch": 1.1016754647693368,
"grad_norm": 0.3140055239200592,
"learning_rate": 0.0001,
"loss": 1.4788,
"step": 1200
},
{
"epoch": 1.1934817535001148,
"grad_norm": 0.3724195659160614,
"learning_rate": 0.0001,
"loss": 1.4698,
"step": 1300
},
{
"epoch": 1.2852880422308928,
"grad_norm": 0.34302350878715515,
"learning_rate": 0.0001,
"loss": 1.4629,
"step": 1400
},
{
"epoch": 1.377094330961671,
"grad_norm": 0.35881391167640686,
"learning_rate": 0.0001,
"loss": 1.4596,
"step": 1500
},
{
"epoch": 1.468900619692449,
"grad_norm": 0.3676307797431946,
"learning_rate": 0.0001,
"loss": 1.4718,
"step": 1600
},
{
"epoch": 1.560706908423227,
"grad_norm": 0.3709953725337982,
"learning_rate": 0.0001,
"loss": 1.435,
"step": 1700
},
{
"epoch": 1.652513197154005,
"grad_norm": 0.38531753420829773,
"learning_rate": 0.0001,
"loss": 1.4553,
"step": 1800
},
{
"epoch": 1.744319485884783,
"grad_norm": 0.40058839321136475,
"learning_rate": 0.0001,
"loss": 1.4444,
"step": 1900
},
{
"epoch": 1.836125774615561,
"grad_norm": 0.4059107303619385,
"learning_rate": 0.0001,
"loss": 1.4381,
"step": 2000
},
{
"epoch": 1.9279320633463393,
"grad_norm": 0.4201526939868927,
"learning_rate": 0.0001,
"loss": 1.4521,
"step": 2100
},
{
"epoch": 1.999540968556346,
"eval_accuracy": 0.6873740902474527,
"eval_loss": 1.362550973892212,
"eval_runtime": 9.1565,
"eval_samples_per_second": 54.606,
"eval_steps_per_second": 6.88,
"step": 2178
},
{
"epoch": 2.0197383520771175,
"grad_norm": 0.397616446018219,
"learning_rate": 0.0001,
"loss": 1.4163,
"step": 2200
},
{
"epoch": 2.1115446408078955,
"grad_norm": 0.4666147232055664,
"learning_rate": 0.0001,
"loss": 1.3015,
"step": 2300
},
{
"epoch": 2.2033509295386735,
"grad_norm": 0.5091608762741089,
"learning_rate": 0.0001,
"loss": 1.3117,
"step": 2400
},
{
"epoch": 2.2951572182694515,
"grad_norm": 0.44425851106643677,
"learning_rate": 0.0001,
"loss": 1.3042,
"step": 2500
},
{
"epoch": 2.3869635070002295,
"grad_norm": 0.4947376251220703,
"learning_rate": 0.0001,
"loss": 1.3091,
"step": 2600
},
{
"epoch": 2.4787697957310075,
"grad_norm": 0.49756625294685364,
"learning_rate": 0.0001,
"loss": 1.2878,
"step": 2700
},
{
"epoch": 2.5705760844617855,
"grad_norm": 0.48819100856781006,
"learning_rate": 0.0001,
"loss": 1.3036,
"step": 2800
},
{
"epoch": 2.6623823731925635,
"grad_norm": 0.49992629885673523,
"learning_rate": 0.0001,
"loss": 1.29,
"step": 2900
},
{
"epoch": 2.754188661923342,
"grad_norm": 0.5537226796150208,
"learning_rate": 0.0001,
"loss": 1.3049,
"step": 3000
},
{
"epoch": 2.84599495065412,
"grad_norm": 0.5161275267601013,
"learning_rate": 0.0001,
"loss": 1.2796,
"step": 3100
},
{
"epoch": 2.937801239384898,
"grad_norm": 0.5615408420562744,
"learning_rate": 0.0001,
"loss": 1.2848,
"step": 3200
},
{
"epoch": 2.9993114528345193,
"eval_accuracy": 0.6957729257641921,
"eval_loss": 1.257521390914917,
"eval_runtime": 9.264,
"eval_samples_per_second": 53.972,
"eval_steps_per_second": 6.801,
"step": 3267
},
{
"epoch": 3.029607528115676,
"grad_norm": 0.6211069226264954,
"learning_rate": 0.0001,
"loss": 1.2349,
"step": 3300
},
{
"epoch": 3.121413816846454,
"grad_norm": 0.6274811029434204,
"learning_rate": 0.0001,
"loss": 1.1362,
"step": 3400
},
{
"epoch": 3.213220105577232,
"grad_norm": 0.7168062925338745,
"learning_rate": 0.0001,
"loss": 1.1299,
"step": 3500
},
{
"epoch": 3.30502639430801,
"grad_norm": 0.6573987603187561,
"learning_rate": 0.0001,
"loss": 1.153,
"step": 3600
},
{
"epoch": 3.396832683038788,
"grad_norm": 0.702870786190033,
"learning_rate": 0.0001,
"loss": 1.1402,
"step": 3700
},
{
"epoch": 3.488638971769566,
"grad_norm": 0.6937388181686401,
"learning_rate": 0.0001,
"loss": 1.1344,
"step": 3800
},
{
"epoch": 3.580445260500344,
"grad_norm": 0.705838680267334,
"learning_rate": 0.0001,
"loss": 1.125,
"step": 3900
},
{
"epoch": 3.672251549231122,
"grad_norm": 0.8442272543907166,
"learning_rate": 0.0001,
"loss": 1.1423,
"step": 4000
},
{
"epoch": 3.7640578379619005,
"grad_norm": 0.9211050868034363,
"learning_rate": 0.0001,
"loss": 1.1227,
"step": 4100
},
{
"epoch": 3.8558641266926785,
"grad_norm": 0.6930621862411499,
"learning_rate": 0.0001,
"loss": 1.1286,
"step": 4200
},
{
"epoch": 3.9476704154234565,
"grad_norm": 0.6763383746147156,
"learning_rate": 0.0001,
"loss": 1.1197,
"step": 4300
},
{
"epoch": 4.0,
"eval_accuracy": 0.7054410480349345,
"eval_loss": 1.1526687145233154,
"eval_runtime": 8.1013,
"eval_samples_per_second": 61.718,
"eval_steps_per_second": 7.777,
"step": 4357
},
{
"epoch": 4.039476704154235,
"grad_norm": 0.6891958713531494,
"learning_rate": 0.0001,
"loss": 1.0562,
"step": 4400
},
{
"epoch": 4.131282992885013,
"grad_norm": 0.7408663630485535,
"learning_rate": 0.0001,
"loss": 0.9585,
"step": 4500
},
{
"epoch": 4.223089281615791,
"grad_norm": 0.8520354628562927,
"learning_rate": 0.0001,
"loss": 0.9652,
"step": 4600
},
{
"epoch": 4.314895570346569,
"grad_norm": 0.8522772789001465,
"learning_rate": 0.0001,
"loss": 0.9819,
"step": 4700
},
{
"epoch": 4.406701859077347,
"grad_norm": 0.8211854696273804,
"learning_rate": 0.0001,
"loss": 0.9777,
"step": 4800
},
{
"epoch": 4.498508147808125,
"grad_norm": 0.8455579280853271,
"learning_rate": 0.0001,
"loss": 0.9748,
"step": 4900
},
{
"epoch": 4.590314436538903,
"grad_norm": 0.9336457848548889,
"learning_rate": 0.0001,
"loss": 0.9806,
"step": 5000
},
{
"epoch": 4.682120725269681,
"grad_norm": 0.8030388355255127,
"learning_rate": 0.0001,
"loss": 0.977,
"step": 5100
},
{
"epoch": 4.773927014000459,
"grad_norm": 0.8392836451530457,
"learning_rate": 0.0001,
"loss": 0.9773,
"step": 5200
},
{
"epoch": 4.865733302731237,
"grad_norm": 0.823242723941803,
"learning_rate": 0.0001,
"loss": 0.9776,
"step": 5300
},
{
"epoch": 4.957539591462015,
"grad_norm": 0.9073436260223389,
"learning_rate": 0.0001,
"loss": 0.9756,
"step": 5400
},
{
"epoch": 4.999770484278173,
"eval_accuracy": 0.7142823871906842,
"eval_loss": 1.0531576871871948,
"eval_runtime": 9.2013,
"eval_samples_per_second": 54.34,
"eval_steps_per_second": 6.847,
"step": 5446
},
{
"epoch": 5.049345880192793,
"grad_norm": 1.1068471670150757,
"learning_rate": 0.0001,
"loss": 0.887,
"step": 5500
},
{
"epoch": 5.141152168923571,
"grad_norm": 0.9577746987342834,
"learning_rate": 0.0001,
"loss": 0.8052,
"step": 5600
},
{
"epoch": 5.232958457654349,
"grad_norm": 0.9153982996940613,
"learning_rate": 0.0001,
"loss": 0.8118,
"step": 5700
},
{
"epoch": 5.324764746385127,
"grad_norm": 1.0308676958084106,
"learning_rate": 0.0001,
"loss": 0.8332,
"step": 5800
},
{
"epoch": 5.416571035115905,
"grad_norm": 0.9150503873825073,
"learning_rate": 0.0001,
"loss": 0.8242,
"step": 5900
},
{
"epoch": 5.508377323846684,
"grad_norm": 1.0191842317581177,
"learning_rate": 0.0001,
"loss": 0.8323,
"step": 6000
},
{
"epoch": 5.600183612577462,
"grad_norm": 1.1198716163635254,
"learning_rate": 0.0001,
"loss": 0.8301,
"step": 6100
},
{
"epoch": 5.69198990130824,
"grad_norm": 0.996842622756958,
"learning_rate": 0.0001,
"loss": 0.84,
"step": 6200
},
{
"epoch": 5.783796190039018,
"grad_norm": 1.086377739906311,
"learning_rate": 0.0001,
"loss": 0.8414,
"step": 6300
},
{
"epoch": 5.875602478769796,
"grad_norm": 0.9792770147323608,
"learning_rate": 0.0001,
"loss": 0.8277,
"step": 6400
},
{
"epoch": 5.967408767500574,
"grad_norm": 1.0763967037200928,
"learning_rate": 0.0001,
"loss": 0.8393,
"step": 6500
},
{
"epoch": 5.999540968556346,
"eval_accuracy": 0.7241455604075692,
"eval_loss": 0.9538469314575195,
"eval_runtime": 8.1157,
"eval_samples_per_second": 61.609,
"eval_steps_per_second": 7.763,
"step": 6535
},
{
"epoch": 6.059215056231352,
"grad_norm": 1.08652663230896,
"learning_rate": 0.0001,
"loss": 0.729,
"step": 6600
},
{
"epoch": 6.15102134496213,
"grad_norm": 0.9392278790473938,
"learning_rate": 0.0001,
"loss": 0.6772,
"step": 6700
},
{
"epoch": 6.242827633692908,
"grad_norm": 1.126567006111145,
"learning_rate": 0.0001,
"loss": 0.6904,
"step": 6800
},
{
"epoch": 6.334633922423686,
"grad_norm": 1.0995755195617676,
"learning_rate": 0.0001,
"loss": 0.69,
"step": 6900
},
{
"epoch": 6.426440211154464,
"grad_norm": 0.983116090297699,
"learning_rate": 0.0001,
"loss": 0.6962,
"step": 7000
},
{
"epoch": 6.518246499885242,
"grad_norm": 1.2054848670959473,
"learning_rate": 0.0001,
"loss": 0.7061,
"step": 7100
},
{
"epoch": 6.61005278861602,
"grad_norm": 1.2262558937072754,
"learning_rate": 0.0001,
"loss": 0.71,
"step": 7200
},
{
"epoch": 6.701859077346798,
"grad_norm": 1.069161295890808,
"learning_rate": 0.0001,
"loss": 0.7003,
"step": 7300
},
{
"epoch": 6.793665366077576,
"grad_norm": 1.2181183099746704,
"learning_rate": 0.0001,
"loss": 0.7024,
"step": 7400
},
{
"epoch": 6.885471654808354,
"grad_norm": 1.0989437103271484,
"learning_rate": 0.0001,
"loss": 0.7002,
"step": 7500
},
{
"epoch": 6.977277943539132,
"grad_norm": 1.1168180704116821,
"learning_rate": 0.0001,
"loss": 0.7125,
"step": 7600
},
{
"epoch": 6.999311452834519,
"eval_accuracy": 0.7324425036390102,
"eval_loss": 0.8674135208129883,
"eval_runtime": 9.2169,
"eval_samples_per_second": 54.248,
"eval_steps_per_second": 6.835,
"step": 7624
},
{
"epoch": 7.06908423226991,
"grad_norm": 1.0699574947357178,
"learning_rate": 0.0001,
"loss": 0.6029,
"step": 7700
},
{
"epoch": 7.160890521000688,
"grad_norm": 0.9937657713890076,
"learning_rate": 0.0001,
"loss": 0.5805,
"step": 7800
},
{
"epoch": 7.252696809731467,
"grad_norm": 1.178791880607605,
"learning_rate": 0.0001,
"loss": 0.5752,
"step": 7900
},
{
"epoch": 7.344503098462245,
"grad_norm": 1.2159409523010254,
"learning_rate": 0.0001,
"loss": 0.5912,
"step": 8000
},
{
"epoch": 7.436309387193023,
"grad_norm": 1.0133622884750366,
"learning_rate": 0.0001,
"loss": 0.5932,
"step": 8100
},
{
"epoch": 7.528115675923801,
"grad_norm": 1.0923631191253662,
"learning_rate": 0.0001,
"loss": 0.6071,
"step": 8200
},
{
"epoch": 7.619921964654579,
"grad_norm": 1.3819491863250732,
"learning_rate": 0.0001,
"loss": 0.5956,
"step": 8300
},
{
"epoch": 7.711728253385357,
"grad_norm": 1.182358980178833,
"learning_rate": 0.0001,
"loss": 0.5922,
"step": 8400
},
{
"epoch": 7.803534542116135,
"grad_norm": 1.1674267053604126,
"learning_rate": 0.0001,
"loss": 0.5912,
"step": 8500
},
{
"epoch": 7.895340830846913,
"grad_norm": 1.1732617616653442,
"learning_rate": 0.0001,
"loss": 0.5969,
"step": 8600
},
{
"epoch": 7.987147119577691,
"grad_norm": 1.2167391777038574,
"learning_rate": 0.0001,
"loss": 0.6144,
"step": 8700
},
{
"epoch": 8.0,
"eval_accuracy": 0.7404046579330422,
"eval_loss": 0.7907233834266663,
"eval_runtime": 9.1451,
"eval_samples_per_second": 54.674,
"eval_steps_per_second": 6.889,
"step": 8714
},
{
"epoch": 8.07895340830847,
"grad_norm": 1.1496325731277466,
"learning_rate": 0.0001,
"loss": 0.5043,
"step": 8800
},
{
"epoch": 8.170759697039248,
"grad_norm": 1.4953396320343018,
"learning_rate": 0.0001,
"loss": 0.4848,
"step": 8900
},
{
"epoch": 8.262565985770026,
"grad_norm": 1.2796908617019653,
"learning_rate": 0.0001,
"loss": 0.5007,
"step": 9000
},
{
"epoch": 8.354372274500804,
"grad_norm": 1.2108944654464722,
"learning_rate": 0.0001,
"loss": 0.4987,
"step": 9100
},
{
"epoch": 8.446178563231582,
"grad_norm": 0.9534372687339783,
"learning_rate": 0.0001,
"loss": 0.5068,
"step": 9200
},
{
"epoch": 8.53798485196236,
"grad_norm": 1.1545357704162598,
"learning_rate": 0.0001,
"loss": 0.5072,
"step": 9300
},
{
"epoch": 8.629791140693138,
"grad_norm": 1.2086093425750732,
"learning_rate": 0.0001,
"loss": 0.5173,
"step": 9400
},
{
"epoch": 8.721597429423916,
"grad_norm": 1.20607328414917,
"learning_rate": 0.0001,
"loss": 0.519,
"step": 9500
},
{
"epoch": 8.813403718154694,
"grad_norm": 1.2534675598144531,
"learning_rate": 0.0001,
"loss": 0.5261,
"step": 9600
},
{
"epoch": 8.905210006885472,
"grad_norm": 1.2726677656173706,
"learning_rate": 0.0001,
"loss": 0.5213,
"step": 9700
},
{
"epoch": 8.99701629561625,
"grad_norm": 1.28297758102417,
"learning_rate": 0.0001,
"loss": 0.5355,
"step": 9800
},
{
"epoch": 8.999770484278173,
"eval_accuracy": 0.7468762736535662,
"eval_loss": 0.7288308143615723,
"eval_runtime": 9.1574,
"eval_samples_per_second": 54.601,
"eval_steps_per_second": 6.88,
"step": 9803
},
{
"epoch": 9.088822584347028,
"grad_norm": 1.278200626373291,
"learning_rate": 0.0001,
"loss": 0.4247,
"step": 9900
},
{
"epoch": 9.180628873077806,
"grad_norm": 1.3318266868591309,
"learning_rate": 0.0001,
"loss": 0.4284,
"step": 10000
},
{
"epoch": 9.272435161808584,
"grad_norm": 1.209088683128357,
"learning_rate": 0.0001,
"loss": 0.4381,
"step": 10100
},
{
"epoch": 9.364241450539362,
"grad_norm": 1.0169490575790405,
"learning_rate": 0.0001,
"loss": 0.443,
"step": 10200
},
{
"epoch": 9.45604773927014,
"grad_norm": 1.4842835664749146,
"learning_rate": 0.0001,
"loss": 0.4424,
"step": 10300
},
{
"epoch": 9.547854028000918,
"grad_norm": 1.1761025190353394,
"learning_rate": 0.0001,
"loss": 0.4463,
"step": 10400
},
{
"epoch": 9.639660316731696,
"grad_norm": 1.270493984222412,
"learning_rate": 0.0001,
"loss": 0.4541,
"step": 10500
},
{
"epoch": 9.731466605462474,
"grad_norm": 1.346306562423706,
"learning_rate": 0.0001,
"loss": 0.4551,
"step": 10600
},
{
"epoch": 9.823272894193252,
"grad_norm": 1.2559789419174194,
"learning_rate": 0.0001,
"loss": 0.4611,
"step": 10700
},
{
"epoch": 9.91507918292403,
"grad_norm": 1.4359290599822998,
"learning_rate": 0.0001,
"loss": 0.4584,
"step": 10800
},
{
"epoch": 9.997704842781731,
"eval_accuracy": 0.75309461426492,
"eval_loss": 0.6794138550758362,
"eval_runtime": 8.1091,
"eval_samples_per_second": 61.659,
"eval_steps_per_second": 7.769,
"step": 10890
},
{
"epoch": 10.009180628873079,
"grad_norm": 1.3423351049423218,
"learning_rate": 0.0001,
"loss": 0.386,
"step": 10900
},
{
"epoch": 10.100986917603857,
"grad_norm": 1.132673740386963,
"learning_rate": 0.0001,
"loss": 0.3726,
"step": 11000
},
{
"epoch": 10.192793206334635,
"grad_norm": 1.614931344985962,
"learning_rate": 0.0001,
"loss": 0.3819,
"step": 11100
},
{
"epoch": 10.284599495065413,
"grad_norm": 1.3352196216583252,
"learning_rate": 0.0001,
"loss": 0.3876,
"step": 11200
},
{
"epoch": 10.37640578379619,
"grad_norm": 1.0957690477371216,
"learning_rate": 0.0001,
"loss": 0.3973,
"step": 11300
},
{
"epoch": 10.468212072526969,
"grad_norm": 1.142330527305603,
"learning_rate": 0.0001,
"loss": 0.396,
"step": 11400
},
{
"epoch": 10.560018361257747,
"grad_norm": 1.4076579809188843,
"learning_rate": 0.0001,
"loss": 0.402,
"step": 11500
},
{
"epoch": 10.651824649988525,
"grad_norm": 1.389333963394165,
"learning_rate": 0.0001,
"loss": 0.3962,
"step": 11600
},
{
"epoch": 10.743630938719303,
"grad_norm": 1.4440951347351074,
"learning_rate": 0.0001,
"loss": 0.4049,
"step": 11700
},
{
"epoch": 10.83543722745008,
"grad_norm": 1.4290118217468262,
"learning_rate": 0.0001,
"loss": 0.4093,
"step": 11800
},
{
"epoch": 10.927243516180859,
"grad_norm": 1.46366548538208,
"learning_rate": 0.0001,
"loss": 0.413,
"step": 11900
},
{
"epoch": 10.999770484278173,
"eval_accuracy": 0.7576768558951965,
"eval_loss": 0.6291825175285339,
"eval_runtime": 8.951,
"eval_samples_per_second": 55.86,
"eval_steps_per_second": 7.038,
"step": 11979
},
{
"epoch": 11.019049804911637,
"grad_norm": 1.1713697910308838,
"learning_rate": 0.0001,
"loss": 0.395,
"step": 12000
},
{
"epoch": 11.110856093642415,
"grad_norm": 1.277626395225525,
"learning_rate": 0.0001,
"loss": 0.3355,
"step": 12100
},
{
"epoch": 11.202662382373193,
"grad_norm": 1.3597822189331055,
"learning_rate": 0.0001,
"loss": 0.3412,
"step": 12200
},
{
"epoch": 11.29446867110397,
"grad_norm": 1.4017976522445679,
"learning_rate": 0.0001,
"loss": 0.3414,
"step": 12300
},
{
"epoch": 11.386274959834749,
"grad_norm": 1.409915804862976,
"learning_rate": 0.0001,
"loss": 0.3558,
"step": 12400
},
{
"epoch": 11.478081248565527,
"grad_norm": 1.400634765625,
"learning_rate": 0.0001,
"loss": 0.3577,
"step": 12500
},
{
"epoch": 11.569887537296305,
"grad_norm": 1.5898892879486084,
"learning_rate": 0.0001,
"loss": 0.354,
"step": 12600
},
{
"epoch": 11.661693826027083,
"grad_norm": 1.3252007961273193,
"learning_rate": 0.0001,
"loss": 0.3682,
"step": 12700
},
{
"epoch": 11.75350011475786,
"grad_norm": 1.302128791809082,
"learning_rate": 0.0001,
"loss": 0.3715,
"step": 12800
},
{
"epoch": 11.845306403488639,
"grad_norm": 1.3374468088150024,
"learning_rate": 0.0001,
"loss": 0.3707,
"step": 12900
},
{
"epoch": 11.937112692219417,
"grad_norm": 1.1755791902542114,
"learning_rate": 0.0001,
"loss": 0.3731,
"step": 13000
},
{
"epoch": 11.999540968556346,
"eval_accuracy": 0.76164192139738,
"eval_loss": 0.5926400423049927,
"eval_runtime": 9.0463,
"eval_samples_per_second": 55.271,
"eval_steps_per_second": 6.964,
"step": 13068
},
{
"epoch": 12.028918980950195,
"grad_norm": 1.3085649013519287,
"learning_rate": 0.0001,
"loss": 0.3482,
"step": 13100
},
{
"epoch": 12.120725269680973,
"grad_norm": 1.1860175132751465,
"learning_rate": 0.0001,
"loss": 0.2982,
"step": 13200
},
{
"epoch": 12.21253155841175,
"grad_norm": 1.1902750730514526,
"learning_rate": 0.0001,
"loss": 0.3095,
"step": 13300
},
{
"epoch": 12.304337847142529,
"grad_norm": 1.2473431825637817,
"learning_rate": 0.0001,
"loss": 0.3216,
"step": 13400
},
{
"epoch": 12.396144135873307,
"grad_norm": 1.443493366241455,
"learning_rate": 0.0001,
"loss": 0.319,
"step": 13500
},
{
"epoch": 12.487950424604085,
"grad_norm": 1.4389948844909668,
"learning_rate": 0.0001,
"loss": 0.328,
"step": 13600
},
{
"epoch": 12.579756713334863,
"grad_norm": 1.1586631536483765,
"learning_rate": 0.0001,
"loss": 0.3285,
"step": 13700
},
{
"epoch": 12.671563002065641,
"grad_norm": 1.180396318435669,
"learning_rate": 0.0001,
"loss": 0.3311,
"step": 13800
},
{
"epoch": 12.763369290796419,
"grad_norm": 1.4230598211288452,
"learning_rate": 0.0001,
"loss": 0.3346,
"step": 13900
},
{
"epoch": 12.855175579527197,
"grad_norm": 1.5782092809677124,
"learning_rate": 0.0001,
"loss": 0.3415,
"step": 14000
},
{
"epoch": 12.946981868257975,
"grad_norm": 1.418642282485962,
"learning_rate": 0.0001,
"loss": 0.3423,
"step": 14100
},
{
"epoch": 12.999311452834519,
"eval_accuracy": 0.7655866084425036,
"eval_loss": 0.5619787573814392,
"eval_runtime": 8.9635,
"eval_samples_per_second": 55.782,
"eval_steps_per_second": 7.029,
"step": 14157
},
{
"epoch": 13.038788156988753,
"grad_norm": 1.1923723220825195,
"learning_rate": 0.0001,
"loss": 0.3119,
"step": 14200
},
{
"epoch": 13.130594445719531,
"grad_norm": 1.2736058235168457,
"learning_rate": 0.0001,
"loss": 0.2762,
"step": 14300
},
{
"epoch": 13.22240073445031,
"grad_norm": 0.9496171474456787,
"learning_rate": 0.0001,
"loss": 0.2844,
"step": 14400
},
{
"epoch": 13.314207023181089,
"grad_norm": 1.22100031375885,
"learning_rate": 0.0001,
"loss": 0.2938,
"step": 14500
},
{
"epoch": 13.406013311911867,
"grad_norm": 1.381606101989746,
"learning_rate": 0.0001,
"loss": 0.2978,
"step": 14600
},
{
"epoch": 13.497819600642645,
"grad_norm": 1.43625807762146,
"learning_rate": 0.0001,
"loss": 0.3035,
"step": 14700
},
{
"epoch": 13.589625889373423,
"grad_norm": 1.4393320083618164,
"learning_rate": 0.0001,
"loss": 0.3065,
"step": 14800
},
{
"epoch": 13.6814321781042,
"grad_norm": 1.184833288192749,
"learning_rate": 0.0001,
"loss": 0.3091,
"step": 14900
},
{
"epoch": 13.773238466834979,
"grad_norm": 1.4501614570617676,
"learning_rate": 0.0001,
"loss": 0.3103,
"step": 15000
},
{
"epoch": 13.865044755565757,
"grad_norm": 1.368249535560608,
"learning_rate": 0.0001,
"loss": 0.3137,
"step": 15100
},
{
"epoch": 13.956851044296535,
"grad_norm": 1.4249024391174316,
"learning_rate": 0.0001,
"loss": 0.3185,
"step": 15200
},
{
"epoch": 14.0,
"eval_accuracy": 0.7682037845705968,
"eval_loss": 0.542601466178894,
"eval_runtime": 8.9503,
"eval_samples_per_second": 55.864,
"eval_steps_per_second": 7.039,
"step": 15247
},
{
"epoch": 14.048657333027313,
"grad_norm": 1.3438467979431152,
"learning_rate": 0.0001,
"loss": 0.2849,
"step": 15300
},
{
"epoch": 14.14046362175809,
"grad_norm": 1.4617668390274048,
"learning_rate": 0.0001,
"loss": 0.2658,
"step": 15400
},
{
"epoch": 14.232269910488869,
"grad_norm": 1.266655683517456,
"learning_rate": 0.0001,
"loss": 0.2679,
"step": 15500
},
{
"epoch": 14.324076199219647,
"grad_norm": 1.2162944078445435,
"learning_rate": 0.0001,
"loss": 0.2711,
"step": 15600
},
{
"epoch": 14.415882487950425,
"grad_norm": 1.10415518283844,
"learning_rate": 0.0001,
"loss": 0.2763,
"step": 15700
},
{
"epoch": 14.507688776681203,
"grad_norm": 1.1962913274765015,
"learning_rate": 0.0001,
"loss": 0.2827,
"step": 15800
},
{
"epoch": 14.59949506541198,
"grad_norm": 1.2264560461044312,
"learning_rate": 0.0001,
"loss": 0.2845,
"step": 15900
},
{
"epoch": 14.691301354142759,
"grad_norm": 1.3857085704803467,
"learning_rate": 0.0001,
"loss": 0.2897,
"step": 16000
},
{
"epoch": 14.783107642873537,
"grad_norm": 1.447581171989441,
"learning_rate": 0.0001,
"loss": 0.2894,
"step": 16100
},
{
"epoch": 14.874913931604315,
"grad_norm": 1.3408719301223755,
"learning_rate": 0.0001,
"loss": 0.2899,
"step": 16200
},
{
"epoch": 14.966720220335093,
"grad_norm": 1.695694088935852,
"learning_rate": 0.0001,
"loss": 0.2924,
"step": 16300
},
{
"epoch": 14.999770484278173,
"eval_accuracy": 0.7708355167394468,
"eval_loss": 0.5231938362121582,
"eval_runtime": 9.1808,
"eval_samples_per_second": 54.462,
"eval_steps_per_second": 6.862,
"step": 16336
},
{
"epoch": 15.05852650906587,
"grad_norm": 1.1147023439407349,
"learning_rate": 0.0001,
"loss": 0.2605,
"step": 16400
},
{
"epoch": 15.150332797796649,
"grad_norm": 1.518908977508545,
"learning_rate": 0.0001,
"loss": 0.245,
"step": 16500
},
{
"epoch": 15.242139086527427,
"grad_norm": 1.1342830657958984,
"learning_rate": 0.0001,
"loss": 0.2447,
"step": 16600
},
{
"epoch": 15.333945375258205,
"grad_norm": 1.2657541036605835,
"learning_rate": 0.0001,
"loss": 0.2599,
"step": 16700
},
{
"epoch": 15.425751663988983,
"grad_norm": 0.9707338809967041,
"learning_rate": 0.0001,
"loss": 0.2591,
"step": 16800
},
{
"epoch": 15.517557952719761,
"grad_norm": 1.2904791831970215,
"learning_rate": 0.0001,
"loss": 0.264,
"step": 16900
},
{
"epoch": 15.609364241450539,
"grad_norm": 1.4617804288864136,
"learning_rate": 0.0001,
"loss": 0.2665,
"step": 17000
},
{
"epoch": 15.701170530181317,
"grad_norm": 1.1893932819366455,
"learning_rate": 0.0001,
"loss": 0.2689,
"step": 17100
},
{
"epoch": 15.792976818912095,
"grad_norm": 1.3138148784637451,
"learning_rate": 0.0001,
"loss": 0.2731,
"step": 17200
},
{
"epoch": 15.884783107642873,
"grad_norm": 1.2247110605239868,
"learning_rate": 0.0001,
"loss": 0.278,
"step": 17300
},
{
"epoch": 15.976589396373651,
"grad_norm": 1.1995705366134644,
"learning_rate": 0.0001,
"loss": 0.2824,
"step": 17400
},
{
"epoch": 15.999540968556346,
"eval_accuracy": 0.7727045123726346,
"eval_loss": 0.5129293203353882,
"eval_runtime": 8.9728,
"eval_samples_per_second": 55.724,
"eval_steps_per_second": 7.021,
"step": 17425
},
{
"epoch": 16.06839568510443,
"grad_norm": 1.088183045387268,
"learning_rate": 0.0001,
"loss": 0.2408,
"step": 17500
},
{
"epoch": 16.160201973835207,
"grad_norm": 1.27170991897583,
"learning_rate": 0.0001,
"loss": 0.2339,
"step": 17600
},
{
"epoch": 16.252008262565987,
"grad_norm": 1.093220591545105,
"learning_rate": 0.0001,
"loss": 0.2381,
"step": 17700
},
{
"epoch": 16.343814551296763,
"grad_norm": 1.3761118650436401,
"learning_rate": 0.0001,
"loss": 0.2361,
"step": 17800
},
{
"epoch": 16.435620840027543,
"grad_norm": 1.3061089515686035,
"learning_rate": 0.0001,
"loss": 0.2437,
"step": 17900
},
{
"epoch": 16.52742712875832,
"grad_norm": 1.318901538848877,
"learning_rate": 0.0001,
"loss": 0.2475,
"step": 18000
},
{
"epoch": 16.6192334174891,
"grad_norm": 1.241626262664795,
"learning_rate": 0.0001,
"loss": 0.2542,
"step": 18100
},
{
"epoch": 16.711039706219875,
"grad_norm": 1.1289949417114258,
"learning_rate": 0.0001,
"loss": 0.2566,
"step": 18200
},
{
"epoch": 16.802845994950655,
"grad_norm": 1.4046275615692139,
"learning_rate": 0.0001,
"loss": 0.2594,
"step": 18300
},
{
"epoch": 16.89465228368143,
"grad_norm": 1.1862374544143677,
"learning_rate": 0.0001,
"loss": 0.2611,
"step": 18400
},
{
"epoch": 16.98645857241221,
"grad_norm": 1.3014901876449585,
"learning_rate": 0.0001,
"loss": 0.2669,
"step": 18500
},
{
"epoch": 16.99931145283452,
"eval_accuracy": 0.774806404657933,
"eval_loss": 0.49875929951667786,
"eval_runtime": 9.0234,
"eval_samples_per_second": 55.411,
"eval_steps_per_second": 6.982,
"step": 18514
},
{
"epoch": 17.078264861142987,
"grad_norm": 1.0681638717651367,
"learning_rate": 0.0001,
"loss": 0.2239,
"step": 18600
},
{
"epoch": 17.170071149873767,
"grad_norm": 1.1279337406158447,
"learning_rate": 0.0001,
"loss": 0.2223,
"step": 18700
},
{
"epoch": 17.261877438604543,
"grad_norm": 1.3798402547836304,
"learning_rate": 0.0001,
"loss": 0.2241,
"step": 18800
},
{
"epoch": 17.353683727335323,
"grad_norm": 1.1741504669189453,
"learning_rate": 0.0001,
"loss": 0.2326,
"step": 18900
},
{
"epoch": 17.4454900160661,
"grad_norm": 1.1289469003677368,
"learning_rate": 0.0001,
"loss": 0.2345,
"step": 19000
},
{
"epoch": 17.53729630479688,
"grad_norm": 1.508701205253601,
"learning_rate": 0.0001,
"loss": 0.2421,
"step": 19100
},
{
"epoch": 17.629102593527655,
"grad_norm": 1.449561357498169,
"learning_rate": 0.0001,
"loss": 0.2387,
"step": 19200
},
{
"epoch": 17.720908882258435,
"grad_norm": 1.1868849992752075,
"learning_rate": 0.0001,
"loss": 0.2402,
"step": 19300
},
{
"epoch": 17.81271517098921,
"grad_norm": 1.4335336685180664,
"learning_rate": 0.0001,
"loss": 0.249,
"step": 19400
},
{
"epoch": 17.90452145971999,
"grad_norm": 1.3802162408828735,
"learning_rate": 0.0001,
"loss": 0.2491,
"step": 19500
},
{
"epoch": 17.996327748450767,
"grad_norm": 1.3790746927261353,
"learning_rate": 0.0001,
"loss": 0.2517,
"step": 19600
},
{
"epoch": 18.0,
"eval_accuracy": 0.776174672489083,
"eval_loss": 0.4891900420188904,
"eval_runtime": 8.9325,
"eval_samples_per_second": 55.975,
"eval_steps_per_second": 7.053,
"step": 19604
},
{
"epoch": 18.088134037181547,
"grad_norm": 1.1314564943313599,
"learning_rate": 0.0001,
"loss": 0.2088,
"step": 19700
},
{
"epoch": 18.179940325912327,
"grad_norm": 1.2055948972702026,
"learning_rate": 0.0001,
"loss": 0.2128,
"step": 19800
},
{
"epoch": 18.271746614643103,
"grad_norm": 1.1677360534667969,
"learning_rate": 0.0001,
"loss": 0.2178,
"step": 19900
},
{
"epoch": 18.363552903373883,
"grad_norm": 1.2793176174163818,
"learning_rate": 0.0001,
"loss": 0.2216,
"step": 20000
},
{
"epoch": 18.45535919210466,
"grad_norm": 1.187522292137146,
"learning_rate": 0.0001,
"loss": 0.2243,
"step": 20100
},
{
"epoch": 18.54716548083544,
"grad_norm": 1.5564976930618286,
"learning_rate": 0.0001,
"loss": 0.2249,
"step": 20200
},
{
"epoch": 18.638971769566215,
"grad_norm": 1.2912520170211792,
"learning_rate": 0.0001,
"loss": 0.2319,
"step": 20300
},
{
"epoch": 18.730778058296995,
"grad_norm": 1.5046939849853516,
"learning_rate": 0.0001,
"loss": 0.2343,
"step": 20400
},
{
"epoch": 18.82258434702777,
"grad_norm": 1.4738825559616089,
"learning_rate": 0.0001,
"loss": 0.2342,
"step": 20500
},
{
"epoch": 18.91439063575855,
"grad_norm": 1.427435278892517,
"learning_rate": 0.0001,
"loss": 0.2376,
"step": 20600
},
{
"epoch": 18.999770484278173,
"eval_accuracy": 0.7773391557496361,
"eval_loss": 0.4808199405670166,
"eval_runtime": 8.9815,
"eval_samples_per_second": 55.67,
"eval_steps_per_second": 7.014,
"step": 20693
},
{
"epoch": 19.006196924489327,
"grad_norm": 1.294245719909668,
"learning_rate": 0.0001,
"loss": 0.2396,
"step": 20700
},
{
"epoch": 19.098003213220107,
"grad_norm": 0.9566488862037659,
"learning_rate": 0.0001,
"loss": 0.2006,
"step": 20800
},
{
"epoch": 19.189809501950883,
"grad_norm": 1.184180736541748,
"learning_rate": 0.0001,
"loss": 0.2049,
"step": 20900
},
{
"epoch": 19.281615790681663,
"grad_norm": 1.1258317232131958,
"learning_rate": 0.0001,
"loss": 0.2081,
"step": 21000
},
{
"epoch": 19.37342207941244,
"grad_norm": 1.2547038793563843,
"learning_rate": 0.0001,
"loss": 0.2133,
"step": 21100
},
{
"epoch": 19.46522836814322,
"grad_norm": 1.3770051002502441,
"learning_rate": 0.0001,
"loss": 0.2175,
"step": 21200
},
{
"epoch": 19.557034656873995,
"grad_norm": 1.3640483617782593,
"learning_rate": 0.0001,
"loss": 0.2178,
"step": 21300
},
{
"epoch": 19.648840945604775,
"grad_norm": 1.2219371795654297,
"learning_rate": 0.0001,
"loss": 0.2233,
"step": 21400
},
{
"epoch": 19.74064723433555,
"grad_norm": 1.3438184261322021,
"learning_rate": 0.0001,
"loss": 0.224,
"step": 21500
},
{
"epoch": 19.83245352306633,
"grad_norm": 1.2909867763519287,
"learning_rate": 0.0001,
"loss": 0.2274,
"step": 21600
},
{
"epoch": 19.924259811797107,
"grad_norm": 1.482640027999878,
"learning_rate": 0.0001,
"loss": 0.2316,
"step": 21700
},
{
"epoch": 19.99770484278173,
"eval_accuracy": 0.7780232896652111,
"eval_loss": 0.4803846478462219,
"eval_runtime": 8.977,
"eval_samples_per_second": 55.698,
"eval_steps_per_second": 7.018,
"step": 21780
},
{
"epoch": 19.99770484278173,
"step": 21780,
"total_flos": 2.2953494160657613e+18,
"train_loss": 0.0,
"train_runtime": 0.0873,
"train_samples_per_second": 7982672.15,
"train_steps_per_second": 249415.561
}
],
"logging_steps": 100,
"max_steps": 21780,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 2.2953494160657613e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}