kanishka's picture
End of training
b27daa8 verified
raw
history blame contribute delete
No virus
70.4 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 371860,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05378368203087183,
"grad_norm": 1.0489896535873413,
"learning_rate": 3.125e-06,
"loss": 7.5484,
"step": 1000
},
{
"epoch": 0.10756736406174366,
"grad_norm": 0.7405256628990173,
"learning_rate": 6.25e-06,
"loss": 5.8085,
"step": 2000
},
{
"epoch": 0.1613510460926155,
"grad_norm": 0.8219595551490784,
"learning_rate": 9.375000000000001e-06,
"loss": 5.3802,
"step": 3000
},
{
"epoch": 0.21513472812348733,
"grad_norm": 0.9044649600982666,
"learning_rate": 1.25e-05,
"loss": 5.1583,
"step": 4000
},
{
"epoch": 0.2689184101543592,
"grad_norm": 1.00751793384552,
"learning_rate": 1.5625e-05,
"loss": 5.0035,
"step": 5000
},
{
"epoch": 0.322702092185231,
"grad_norm": 1.116739273071289,
"learning_rate": 1.8750000000000002e-05,
"loss": 4.8663,
"step": 6000
},
{
"epoch": 0.37648577421610285,
"grad_norm": 1.1231814622879028,
"learning_rate": 2.1875e-05,
"loss": 4.7541,
"step": 7000
},
{
"epoch": 0.43026945624697466,
"grad_norm": 1.0907611846923828,
"learning_rate": 2.5e-05,
"loss": 4.6488,
"step": 8000
},
{
"epoch": 0.4840531382778465,
"grad_norm": 1.113344669342041,
"learning_rate": 2.8125000000000003e-05,
"loss": 4.5619,
"step": 9000
},
{
"epoch": 0.5378368203087184,
"grad_norm": 1.0354745388031006,
"learning_rate": 3.125e-05,
"loss": 4.4881,
"step": 10000
},
{
"epoch": 0.5916205023395902,
"grad_norm": 1.5056177377700806,
"learning_rate": 3.4371875e-05,
"loss": 4.4158,
"step": 11000
},
{
"epoch": 0.645404184370462,
"grad_norm": 0.9814821481704712,
"learning_rate": 3.7496875e-05,
"loss": 4.3495,
"step": 12000
},
{
"epoch": 0.6991878664013338,
"grad_norm": 1.0727801322937012,
"learning_rate": 4.0621875e-05,
"loss": 4.2913,
"step": 13000
},
{
"epoch": 0.7529715484322057,
"grad_norm": 1.0662927627563477,
"learning_rate": 4.374375e-05,
"loss": 4.2355,
"step": 14000
},
{
"epoch": 0.8067552304630775,
"grad_norm": 1.030743956565857,
"learning_rate": 4.686875e-05,
"loss": 4.1862,
"step": 15000
},
{
"epoch": 0.8605389124939493,
"grad_norm": 1.0385651588439941,
"learning_rate": 4.9990625000000004e-05,
"loss": 4.1341,
"step": 16000
},
{
"epoch": 0.9143225945248211,
"grad_norm": 0.9680750966072083,
"learning_rate": 5.3115625000000005e-05,
"loss": 4.0955,
"step": 17000
},
{
"epoch": 0.968106276555693,
"grad_norm": 1.0905612707138062,
"learning_rate": 5.6240625e-05,
"loss": 4.0555,
"step": 18000
},
{
"epoch": 1.0,
"eval_accuracy": 0.30862948193360185,
"eval_loss": 4.260831832885742,
"eval_runtime": 152.7708,
"eval_samples_per_second": 379.117,
"eval_steps_per_second": 5.924,
"step": 18593
},
{
"epoch": 1.0218899585865648,
"grad_norm": 0.9744522571563721,
"learning_rate": 5.93625e-05,
"loss": 4.02,
"step": 19000
},
{
"epoch": 1.0756736406174368,
"grad_norm": 0.952312707901001,
"learning_rate": 6.24875e-05,
"loss": 3.9723,
"step": 20000
},
{
"epoch": 1.1294573226483084,
"grad_norm": 0.9960818290710449,
"learning_rate": 6.56125e-05,
"loss": 3.9319,
"step": 21000
},
{
"epoch": 1.1832410046791804,
"grad_norm": 1.0043728351593018,
"learning_rate": 6.8734375e-05,
"loss": 3.8906,
"step": 22000
},
{
"epoch": 1.2370246867100523,
"grad_norm": 0.9806647300720215,
"learning_rate": 7.185937500000001e-05,
"loss": 3.8556,
"step": 23000
},
{
"epoch": 1.290808368740924,
"grad_norm": 0.9609583020210266,
"learning_rate": 7.4978125e-05,
"loss": 3.8292,
"step": 24000
},
{
"epoch": 1.3445920507717959,
"grad_norm": 0.9188491106033325,
"learning_rate": 7.8103125e-05,
"loss": 3.8004,
"step": 25000
},
{
"epoch": 1.3983757328026676,
"grad_norm": 0.932732880115509,
"learning_rate": 8.122500000000001e-05,
"loss": 3.769,
"step": 26000
},
{
"epoch": 1.4521594148335395,
"grad_norm": 0.8833909034729004,
"learning_rate": 8.435e-05,
"loss": 3.748,
"step": 27000
},
{
"epoch": 1.5059430968644114,
"grad_norm": 0.9042672514915466,
"learning_rate": 8.746875e-05,
"loss": 3.7239,
"step": 28000
},
{
"epoch": 1.5597267788952833,
"grad_norm": 0.9524121880531311,
"learning_rate": 9.059375e-05,
"loss": 3.7076,
"step": 29000
},
{
"epoch": 1.613510460926155,
"grad_norm": 0.8914125561714172,
"learning_rate": 9.3715625e-05,
"loss": 3.6853,
"step": 30000
},
{
"epoch": 1.6672941429570267,
"grad_norm": 0.8666671514511108,
"learning_rate": 9.68375e-05,
"loss": 3.6694,
"step": 31000
},
{
"epoch": 1.7210778249878986,
"grad_norm": 0.8737355470657349,
"learning_rate": 9.99625e-05,
"loss": 3.6462,
"step": 32000
},
{
"epoch": 1.7748615070187705,
"grad_norm": 0.8654928803443909,
"learning_rate": 9.970929206143706e-05,
"loss": 3.6316,
"step": 33000
},
{
"epoch": 1.8286451890496425,
"grad_norm": 0.8172135949134827,
"learning_rate": 9.941505325722356e-05,
"loss": 3.616,
"step": 34000
},
{
"epoch": 1.8824288710805142,
"grad_norm": 0.8614993691444397,
"learning_rate": 9.912110869181429e-05,
"loss": 3.5943,
"step": 35000
},
{
"epoch": 1.9362125531113858,
"grad_norm": 0.8271329998970032,
"learning_rate": 9.882716412640499e-05,
"loss": 3.5805,
"step": 36000
},
{
"epoch": 1.9899962351422578,
"grad_norm": 0.8484081029891968,
"learning_rate": 9.85329253221915e-05,
"loss": 3.5626,
"step": 37000
},
{
"epoch": 2.0,
"eval_accuracy": 0.3633626394917919,
"eval_loss": 3.7489588260650635,
"eval_runtime": 153.9094,
"eval_samples_per_second": 376.312,
"eval_steps_per_second": 5.88,
"step": 37186
},
{
"epoch": 2.0437799171731297,
"grad_norm": 0.834202766418457,
"learning_rate": 9.8238686517978e-05,
"loss": 3.5259,
"step": 38000
},
{
"epoch": 2.0975635992040016,
"grad_norm": 0.8602707982063293,
"learning_rate": 9.79444477137645e-05,
"loss": 3.514,
"step": 39000
},
{
"epoch": 2.1513472812348735,
"grad_norm": 0.7993927001953125,
"learning_rate": 9.765050314835522e-05,
"loss": 3.4999,
"step": 40000
},
{
"epoch": 2.205130963265745,
"grad_norm": 0.8245725035667419,
"learning_rate": 9.735626434414172e-05,
"loss": 3.4915,
"step": 41000
},
{
"epoch": 2.258914645296617,
"grad_norm": 0.8083192110061646,
"learning_rate": 9.706261401753663e-05,
"loss": 3.4863,
"step": 42000
},
{
"epoch": 2.312698327327489,
"grad_norm": 0.8514248728752136,
"learning_rate": 9.676837521332313e-05,
"loss": 3.4754,
"step": 43000
},
{
"epoch": 2.3664820093583607,
"grad_norm": 0.8821371793746948,
"learning_rate": 9.647413640910963e-05,
"loss": 3.468,
"step": 44000
},
{
"epoch": 2.4202656913892326,
"grad_norm": 0.8014013767242432,
"learning_rate": 9.617989760489615e-05,
"loss": 3.4582,
"step": 45000
},
{
"epoch": 2.4740493734201046,
"grad_norm": 0.8032485246658325,
"learning_rate": 9.588565880068265e-05,
"loss": 3.4497,
"step": 46000
},
{
"epoch": 2.527833055450976,
"grad_norm": 0.8060674667358398,
"learning_rate": 9.559171423527336e-05,
"loss": 3.4412,
"step": 47000
},
{
"epoch": 2.581616737481848,
"grad_norm": 0.7642372250556946,
"learning_rate": 9.529747543105986e-05,
"loss": 3.4374,
"step": 48000
},
{
"epoch": 2.63540041951272,
"grad_norm": 0.8085050582885742,
"learning_rate": 9.500382510445478e-05,
"loss": 3.4257,
"step": 49000
},
{
"epoch": 2.6891841015435918,
"grad_norm": 0.7765064239501953,
"learning_rate": 9.470958630024128e-05,
"loss": 3.4264,
"step": 50000
},
{
"epoch": 2.7429677835744632,
"grad_norm": 0.7633680105209351,
"learning_rate": 9.441534749602779e-05,
"loss": 3.4157,
"step": 51000
},
{
"epoch": 2.796751465605335,
"grad_norm": 0.7525299191474915,
"learning_rate": 9.412140293061849e-05,
"loss": 3.4093,
"step": 52000
},
{
"epoch": 2.850535147636207,
"grad_norm": 0.8231662511825562,
"learning_rate": 9.3827164126405e-05,
"loss": 3.4054,
"step": 53000
},
{
"epoch": 2.904318829667079,
"grad_norm": 0.7820568084716797,
"learning_rate": 9.35329253221915e-05,
"loss": 3.3949,
"step": 54000
},
{
"epoch": 2.958102511697951,
"grad_norm": 0.7471756935119629,
"learning_rate": 9.323898075678222e-05,
"loss": 3.3926,
"step": 55000
},
{
"epoch": 3.0,
"eval_accuracy": 0.38042721878496405,
"eval_loss": 3.5728847980499268,
"eval_runtime": 153.8413,
"eval_samples_per_second": 376.479,
"eval_steps_per_second": 5.883,
"step": 55779
},
{
"epoch": 3.011886193728823,
"grad_norm": 0.7675151824951172,
"learning_rate": 9.294474195256872e-05,
"loss": 3.373,
"step": 56000
},
{
"epoch": 3.0656698757596943,
"grad_norm": 0.8319113850593567,
"learning_rate": 9.26505031483552e-05,
"loss": 3.3312,
"step": 57000
},
{
"epoch": 3.119453557790566,
"grad_norm": 0.745871365070343,
"learning_rate": 9.23562643441417e-05,
"loss": 3.3325,
"step": 58000
},
{
"epoch": 3.173237239821438,
"grad_norm": 0.7976841926574707,
"learning_rate": 9.206231977873242e-05,
"loss": 3.3371,
"step": 59000
},
{
"epoch": 3.22702092185231,
"grad_norm": 0.7604719996452332,
"learning_rate": 9.176808097451893e-05,
"loss": 3.3258,
"step": 60000
},
{
"epoch": 3.280804603883182,
"grad_norm": 0.7433556318283081,
"learning_rate": 9.147413640910963e-05,
"loss": 3.329,
"step": 61000
},
{
"epoch": 3.3345882859140534,
"grad_norm": 0.7537267804145813,
"learning_rate": 9.117989760489613e-05,
"loss": 3.3206,
"step": 62000
},
{
"epoch": 3.3883719679449253,
"grad_norm": 0.7688448429107666,
"learning_rate": 9.088595303948685e-05,
"loss": 3.3202,
"step": 63000
},
{
"epoch": 3.4421556499757973,
"grad_norm": 0.7358716726303101,
"learning_rate": 9.059171423527336e-05,
"loss": 3.3166,
"step": 64000
},
{
"epoch": 3.495939332006669,
"grad_norm": 0.7672792673110962,
"learning_rate": 9.029747543105986e-05,
"loss": 3.317,
"step": 65000
},
{
"epoch": 3.549723014037541,
"grad_norm": 0.7670078873634338,
"learning_rate": 9.000353086565056e-05,
"loss": 3.3139,
"step": 66000
},
{
"epoch": 3.603506696068413,
"grad_norm": 0.7238633632659912,
"learning_rate": 8.970929206143706e-05,
"loss": 3.3109,
"step": 67000
},
{
"epoch": 3.657290378099285,
"grad_norm": 0.6910108923912048,
"learning_rate": 8.941505325722357e-05,
"loss": 3.31,
"step": 68000
},
{
"epoch": 3.7110740601301564,
"grad_norm": 0.7354035973548889,
"learning_rate": 8.912110869181429e-05,
"loss": 3.3028,
"step": 69000
},
{
"epoch": 3.7648577421610283,
"grad_norm": 0.7346329092979431,
"learning_rate": 8.882686988760079e-05,
"loss": 3.3016,
"step": 70000
},
{
"epoch": 3.8186414241919002,
"grad_norm": 0.7276666164398193,
"learning_rate": 8.85329253221915e-05,
"loss": 3.2962,
"step": 71000
},
{
"epoch": 3.872425106222772,
"grad_norm": 0.7881675958633423,
"learning_rate": 8.823927499558642e-05,
"loss": 3.2929,
"step": 72000
},
{
"epoch": 3.9262087882536436,
"grad_norm": 0.731143593788147,
"learning_rate": 8.794503619137292e-05,
"loss": 3.2931,
"step": 73000
},
{
"epoch": 3.9799924702845155,
"grad_norm": 0.7707085013389587,
"learning_rate": 8.765079738715942e-05,
"loss": 3.2863,
"step": 74000
},
{
"epoch": 4.0,
"eval_accuracy": 0.38832393254759884,
"eval_loss": 3.512221574783325,
"eval_runtime": 153.8816,
"eval_samples_per_second": 376.38,
"eval_steps_per_second": 5.881,
"step": 74372
},
{
"epoch": 4.033776152315387,
"grad_norm": 0.8132415413856506,
"learning_rate": 8.735655858294592e-05,
"loss": 3.2565,
"step": 75000
},
{
"epoch": 4.087559834346259,
"grad_norm": 0.7561783790588379,
"learning_rate": 8.706231977873243e-05,
"loss": 3.2326,
"step": 76000
},
{
"epoch": 4.141343516377131,
"grad_norm": 0.757048487663269,
"learning_rate": 8.676808097451893e-05,
"loss": 3.2321,
"step": 77000
},
{
"epoch": 4.195127198408003,
"grad_norm": 0.7583024501800537,
"learning_rate": 8.647384217030541e-05,
"loss": 3.2371,
"step": 78000
},
{
"epoch": 4.248910880438875,
"grad_norm": 0.7448434233665466,
"learning_rate": 8.617989760489615e-05,
"loss": 3.2365,
"step": 79000
},
{
"epoch": 4.302694562469747,
"grad_norm": 0.7461341619491577,
"learning_rate": 8.588565880068264e-05,
"loss": 3.2369,
"step": 80000
},
{
"epoch": 4.356478244500618,
"grad_norm": 0.7581353187561035,
"learning_rate": 8.559141999646914e-05,
"loss": 3.2349,
"step": 81000
},
{
"epoch": 4.41026192653149,
"grad_norm": 0.7130771279335022,
"learning_rate": 8.529718119225564e-05,
"loss": 3.2389,
"step": 82000
},
{
"epoch": 4.464045608562362,
"grad_norm": 0.7467326521873474,
"learning_rate": 8.500323662684634e-05,
"loss": 3.2338,
"step": 83000
},
{
"epoch": 4.517829290593234,
"grad_norm": 0.7349050641059875,
"learning_rate": 8.470929206143707e-05,
"loss": 3.231,
"step": 84000
},
{
"epoch": 4.571612972624106,
"grad_norm": 0.7301473021507263,
"learning_rate": 8.441505325722357e-05,
"loss": 3.2323,
"step": 85000
},
{
"epoch": 4.625396654654978,
"grad_norm": 0.7459990978240967,
"learning_rate": 8.412110869181427e-05,
"loss": 3.2319,
"step": 86000
},
{
"epoch": 4.6791803366858495,
"grad_norm": 0.7310500144958496,
"learning_rate": 8.382686988760077e-05,
"loss": 3.2316,
"step": 87000
},
{
"epoch": 4.7329640187167215,
"grad_norm": 0.7355625033378601,
"learning_rate": 8.35329253221915e-05,
"loss": 3.2298,
"step": 88000
},
{
"epoch": 4.786747700747593,
"grad_norm": 0.7653241157531738,
"learning_rate": 8.3238686517978e-05,
"loss": 3.2223,
"step": 89000
},
{
"epoch": 4.840531382778465,
"grad_norm": 0.7360557913780212,
"learning_rate": 8.29447419525687e-05,
"loss": 3.2246,
"step": 90000
},
{
"epoch": 4.894315064809337,
"grad_norm": 0.726395308971405,
"learning_rate": 8.26505031483552e-05,
"loss": 3.2265,
"step": 91000
},
{
"epoch": 4.948098746840209,
"grad_norm": 0.7324568033218384,
"learning_rate": 8.23562643441417e-05,
"loss": 3.2223,
"step": 92000
},
{
"epoch": 5.0,
"eval_accuracy": 0.3931525087864058,
"eval_loss": 3.470435857772827,
"eval_runtime": 153.6492,
"eval_samples_per_second": 376.95,
"eval_steps_per_second": 5.89,
"step": 92965
},
{
"epoch": 5.00188242887108,
"grad_norm": 0.7105480432510376,
"learning_rate": 8.20620255399282e-05,
"loss": 3.2157,
"step": 93000
},
{
"epoch": 5.055666110901952,
"grad_norm": 0.77333664894104,
"learning_rate": 8.176778673571471e-05,
"loss": 3.167,
"step": 94000
},
{
"epoch": 5.109449792932824,
"grad_norm": 0.7714924812316895,
"learning_rate": 8.147384217030543e-05,
"loss": 3.1654,
"step": 95000
},
{
"epoch": 5.163233474963696,
"grad_norm": 0.7432717084884644,
"learning_rate": 8.117960336609193e-05,
"loss": 3.1702,
"step": 96000
},
{
"epoch": 5.217017156994568,
"grad_norm": 0.7248101830482483,
"learning_rate": 8.088536456187843e-05,
"loss": 3.1683,
"step": 97000
},
{
"epoch": 5.27080083902544,
"grad_norm": 0.7558987140655518,
"learning_rate": 8.059112575766492e-05,
"loss": 3.1737,
"step": 98000
},
{
"epoch": 5.324584521056312,
"grad_norm": 0.7432435750961304,
"learning_rate": 8.029718119225564e-05,
"loss": 3.1774,
"step": 99000
},
{
"epoch": 5.3783682030871836,
"grad_norm": 0.7622554898262024,
"learning_rate": 8.000323662684636e-05,
"loss": 3.1746,
"step": 100000
},
{
"epoch": 5.4321518851180555,
"grad_norm": 0.742205798625946,
"learning_rate": 7.970899782263285e-05,
"loss": 3.1724,
"step": 101000
},
{
"epoch": 5.485935567148927,
"grad_norm": 0.7343482971191406,
"learning_rate": 7.941475901841935e-05,
"loss": 3.1735,
"step": 102000
},
{
"epoch": 5.539719249179798,
"grad_norm": 0.7449206709861755,
"learning_rate": 7.912052021420585e-05,
"loss": 3.1736,
"step": 103000
},
{
"epoch": 5.59350293121067,
"grad_norm": 0.7648908495903015,
"learning_rate": 7.882628140999235e-05,
"loss": 3.1748,
"step": 104000
},
{
"epoch": 5.647286613241542,
"grad_norm": 0.706194281578064,
"learning_rate": 7.853233684458307e-05,
"loss": 3.1724,
"step": 105000
},
{
"epoch": 5.701070295272414,
"grad_norm": 0.7112085819244385,
"learning_rate": 7.823839227917378e-05,
"loss": 3.173,
"step": 106000
},
{
"epoch": 5.754853977303286,
"grad_norm": 0.7374659776687622,
"learning_rate": 7.794415347496028e-05,
"loss": 3.174,
"step": 107000
},
{
"epoch": 5.808637659334158,
"grad_norm": 0.7422733902931213,
"learning_rate": 7.764991467074678e-05,
"loss": 3.1727,
"step": 108000
},
{
"epoch": 5.86242134136503,
"grad_norm": 0.7205289602279663,
"learning_rate": 7.735567586653328e-05,
"loss": 3.1717,
"step": 109000
},
{
"epoch": 5.916205023395902,
"grad_norm": 0.7607922554016113,
"learning_rate": 7.706143706231978e-05,
"loss": 3.1679,
"step": 110000
},
{
"epoch": 5.969988705426774,
"grad_norm": 0.7205678224563599,
"learning_rate": 7.67674924969105e-05,
"loss": 3.1687,
"step": 111000
},
{
"epoch": 6.0,
"eval_accuracy": 0.39598443418529594,
"eval_loss": 3.4542243480682373,
"eval_runtime": 154.6543,
"eval_samples_per_second": 374.5,
"eval_steps_per_second": 5.852,
"step": 111558
},
{
"epoch": 6.023772387457646,
"grad_norm": 0.7311733365058899,
"learning_rate": 7.6473253692697e-05,
"loss": 3.143,
"step": 112000
},
{
"epoch": 6.077556069488518,
"grad_norm": 0.7482650876045227,
"learning_rate": 7.617930912728771e-05,
"loss": 3.1172,
"step": 113000
},
{
"epoch": 6.131339751519389,
"grad_norm": 0.768342912197113,
"learning_rate": 7.588507032307421e-05,
"loss": 3.1151,
"step": 114000
},
{
"epoch": 6.1851234335502605,
"grad_norm": 0.769808828830719,
"learning_rate": 7.559112575766493e-05,
"loss": 3.1213,
"step": 115000
},
{
"epoch": 6.238907115581132,
"grad_norm": 0.7565628290176392,
"learning_rate": 7.529688695345143e-05,
"loss": 3.1255,
"step": 116000
},
{
"epoch": 6.292690797612004,
"grad_norm": 0.7597582340240479,
"learning_rate": 7.500264814923792e-05,
"loss": 3.1226,
"step": 117000
},
{
"epoch": 6.346474479642876,
"grad_norm": 0.7350876331329346,
"learning_rate": 7.470840934502442e-05,
"loss": 3.1263,
"step": 118000
},
{
"epoch": 6.400258161673748,
"grad_norm": 0.734434962272644,
"learning_rate": 7.441475901841936e-05,
"loss": 3.1267,
"step": 119000
},
{
"epoch": 6.45404184370462,
"grad_norm": 0.7643101215362549,
"learning_rate": 7.412052021420586e-05,
"loss": 3.13,
"step": 120000
},
{
"epoch": 6.507825525735492,
"grad_norm": 0.7487729787826538,
"learning_rate": 7.382628140999235e-05,
"loss": 3.1309,
"step": 121000
},
{
"epoch": 6.561609207766364,
"grad_norm": 0.7111514806747437,
"learning_rate": 7.353204260577885e-05,
"loss": 3.1298,
"step": 122000
},
{
"epoch": 6.615392889797236,
"grad_norm": 0.7280795574188232,
"learning_rate": 7.323780380156535e-05,
"loss": 3.1316,
"step": 123000
},
{
"epoch": 6.669176571828107,
"grad_norm": 0.7801093459129333,
"learning_rate": 7.294385923615607e-05,
"loss": 3.1235,
"step": 124000
},
{
"epoch": 6.722960253858979,
"grad_norm": 0.7695817351341248,
"learning_rate": 7.264962043194257e-05,
"loss": 3.1298,
"step": 125000
},
{
"epoch": 6.776743935889851,
"grad_norm": 0.7277592420578003,
"learning_rate": 7.235538162772907e-05,
"loss": 3.1337,
"step": 126000
},
{
"epoch": 6.830527617920723,
"grad_norm": 0.7386214137077332,
"learning_rate": 7.206143706231978e-05,
"loss": 3.1248,
"step": 127000
},
{
"epoch": 6.8843112999515945,
"grad_norm": 0.7697268128395081,
"learning_rate": 7.176719825810628e-05,
"loss": 3.1267,
"step": 128000
},
{
"epoch": 6.938094981982466,
"grad_norm": 0.7416918873786926,
"learning_rate": 7.147325369269699e-05,
"loss": 3.1255,
"step": 129000
},
{
"epoch": 6.991878664013338,
"grad_norm": 0.7437503933906555,
"learning_rate": 7.11790148884835e-05,
"loss": 3.1265,
"step": 130000
},
{
"epoch": 7.0,
"eval_accuracy": 0.3987135038494649,
"eval_loss": 3.430708169937134,
"eval_runtime": 154.0366,
"eval_samples_per_second": 376.001,
"eval_steps_per_second": 5.875,
"step": 130151
},
{
"epoch": 7.04566234604421,
"grad_norm": 0.7688168883323669,
"learning_rate": 7.088507032307421e-05,
"loss": 3.0768,
"step": 131000
},
{
"epoch": 7.099446028075082,
"grad_norm": 0.7706940174102783,
"learning_rate": 7.059083151886071e-05,
"loss": 3.0727,
"step": 132000
},
{
"epoch": 7.153229710105954,
"grad_norm": 0.7756544947624207,
"learning_rate": 7.029688695345142e-05,
"loss": 3.0784,
"step": 133000
},
{
"epoch": 7.207013392136826,
"grad_norm": 0.7629918456077576,
"learning_rate": 7.000264814923792e-05,
"loss": 3.0812,
"step": 134000
},
{
"epoch": 7.260797074167698,
"grad_norm": 0.7643315196037292,
"learning_rate": 6.970840934502443e-05,
"loss": 3.0791,
"step": 135000
},
{
"epoch": 7.314580756198569,
"grad_norm": 0.7508428692817688,
"learning_rate": 6.941417054081093e-05,
"loss": 3.0904,
"step": 136000
},
{
"epoch": 7.368364438229441,
"grad_norm": 0.749332070350647,
"learning_rate": 6.912022597540164e-05,
"loss": 3.0907,
"step": 137000
},
{
"epoch": 7.422148120260313,
"grad_norm": 0.7576011419296265,
"learning_rate": 6.882598717118814e-05,
"loss": 3.0874,
"step": 138000
},
{
"epoch": 7.475931802291185,
"grad_norm": 0.7459414601325989,
"learning_rate": 6.853174836697463e-05,
"loss": 3.0893,
"step": 139000
},
{
"epoch": 7.529715484322057,
"grad_norm": 0.7699885964393616,
"learning_rate": 6.823750956276113e-05,
"loss": 3.0894,
"step": 140000
},
{
"epoch": 7.5834991663529285,
"grad_norm": 0.7432721853256226,
"learning_rate": 6.794327075854765e-05,
"loss": 3.0884,
"step": 141000
},
{
"epoch": 7.6372828483838004,
"grad_norm": 0.7425631880760193,
"learning_rate": 6.764903195433415e-05,
"loss": 3.0955,
"step": 142000
},
{
"epoch": 7.691066530414672,
"grad_norm": 0.7397525906562805,
"learning_rate": 6.735508738892485e-05,
"loss": 3.0927,
"step": 143000
},
{
"epoch": 7.744850212445544,
"grad_norm": 0.8293583989143372,
"learning_rate": 6.706084858471135e-05,
"loss": 3.0944,
"step": 144000
},
{
"epoch": 7.798633894476416,
"grad_norm": 0.7823474407196045,
"learning_rate": 6.676690401930206e-05,
"loss": 3.092,
"step": 145000
},
{
"epoch": 7.852417576507287,
"grad_norm": 0.7494142651557922,
"learning_rate": 6.647266521508858e-05,
"loss": 3.0927,
"step": 146000
},
{
"epoch": 7.906201258538159,
"grad_norm": 0.7707638144493103,
"learning_rate": 6.617872064967928e-05,
"loss": 3.0927,
"step": 147000
},
{
"epoch": 7.959984940569031,
"grad_norm": 0.7771040797233582,
"learning_rate": 6.588448184546578e-05,
"loss": 3.0955,
"step": 148000
},
{
"epoch": 8.0,
"eval_accuracy": 0.4013845282133079,
"eval_loss": 3.4010231494903564,
"eval_runtime": 154.2403,
"eval_samples_per_second": 375.505,
"eval_steps_per_second": 5.867,
"step": 148744
},
{
"epoch": 8.013768622599903,
"grad_norm": 0.7500186562538147,
"learning_rate": 6.559024304125229e-05,
"loss": 3.0751,
"step": 149000
},
{
"epoch": 8.067552304630775,
"grad_norm": 0.7857389450073242,
"learning_rate": 6.529600423703879e-05,
"loss": 3.038,
"step": 150000
},
{
"epoch": 8.121335986661647,
"grad_norm": 0.7659834027290344,
"learning_rate": 6.500205967162951e-05,
"loss": 3.0429,
"step": 151000
},
{
"epoch": 8.175119668692519,
"grad_norm": 0.7773808240890503,
"learning_rate": 6.470811510622021e-05,
"loss": 3.0451,
"step": 152000
},
{
"epoch": 8.22890335072339,
"grad_norm": 0.7654848694801331,
"learning_rate": 6.441387630200672e-05,
"loss": 3.0463,
"step": 153000
},
{
"epoch": 8.282687032754263,
"grad_norm": 0.7545380592346191,
"learning_rate": 6.411963749779322e-05,
"loss": 3.0458,
"step": 154000
},
{
"epoch": 8.336470714785134,
"grad_norm": 0.7594432830810547,
"learning_rate": 6.382569293238392e-05,
"loss": 3.0503,
"step": 155000
},
{
"epoch": 8.390254396816006,
"grad_norm": 0.7385092973709106,
"learning_rate": 6.353145412817044e-05,
"loss": 3.0529,
"step": 156000
},
{
"epoch": 8.444038078846878,
"grad_norm": 0.7623139023780823,
"learning_rate": 6.323750956276114e-05,
"loss": 3.056,
"step": 157000
},
{
"epoch": 8.49782176087775,
"grad_norm": 0.7708114385604858,
"learning_rate": 6.294327075854765e-05,
"loss": 3.0551,
"step": 158000
},
{
"epoch": 8.551605442908622,
"grad_norm": 0.7581725120544434,
"learning_rate": 6.264903195433413e-05,
"loss": 3.0606,
"step": 159000
},
{
"epoch": 8.605389124939494,
"grad_norm": 0.7970029711723328,
"learning_rate": 6.235479315012063e-05,
"loss": 3.0621,
"step": 160000
},
{
"epoch": 8.659172806970366,
"grad_norm": 0.759104311466217,
"learning_rate": 6.206084858471135e-05,
"loss": 3.0587,
"step": 161000
},
{
"epoch": 8.712956489001236,
"grad_norm": 0.7619072794914246,
"learning_rate": 6.176660978049786e-05,
"loss": 3.0615,
"step": 162000
},
{
"epoch": 8.766740171032108,
"grad_norm": 0.7338131070137024,
"learning_rate": 6.147266521508856e-05,
"loss": 3.0647,
"step": 163000
},
{
"epoch": 8.82052385306298,
"grad_norm": 0.7602887153625488,
"learning_rate": 6.117872064967928e-05,
"loss": 3.0653,
"step": 164000
},
{
"epoch": 8.874307535093852,
"grad_norm": 0.7433264255523682,
"learning_rate": 6.088448184546578e-05,
"loss": 3.0627,
"step": 165000
},
{
"epoch": 8.928091217124724,
"grad_norm": 0.7426097989082336,
"learning_rate": 6.059024304125228e-05,
"loss": 3.0664,
"step": 166000
},
{
"epoch": 8.981874899155596,
"grad_norm": 0.7173585891723633,
"learning_rate": 6.0296004237038787e-05,
"loss": 3.0614,
"step": 167000
},
{
"epoch": 9.0,
"eval_accuracy": 0.4025592065422428,
"eval_loss": 3.394737958908081,
"eval_runtime": 154.2459,
"eval_samples_per_second": 375.491,
"eval_steps_per_second": 5.867,
"step": 167337
},
{
"epoch": 9.035658581186468,
"grad_norm": 0.8185796141624451,
"learning_rate": 6.000176543282529e-05,
"loss": 3.0225,
"step": 168000
},
{
"epoch": 9.08944226321734,
"grad_norm": 0.7936908602714539,
"learning_rate": 5.970752662861179e-05,
"loss": 3.0087,
"step": 169000
},
{
"epoch": 9.143225945248211,
"grad_norm": 0.7915844321250916,
"learning_rate": 5.94135820632025e-05,
"loss": 3.0149,
"step": 170000
},
{
"epoch": 9.197009627279083,
"grad_norm": 0.7934896945953369,
"learning_rate": 5.9119343258989e-05,
"loss": 3.0156,
"step": 171000
},
{
"epoch": 9.250793309309955,
"grad_norm": 0.7754538059234619,
"learning_rate": 5.882539869357972e-05,
"loss": 3.0215,
"step": 172000
},
{
"epoch": 9.304576991340827,
"grad_norm": 0.7899085879325867,
"learning_rate": 5.853115988936622e-05,
"loss": 3.0223,
"step": 173000
},
{
"epoch": 9.358360673371699,
"grad_norm": 0.7922378182411194,
"learning_rate": 5.823750956276114e-05,
"loss": 3.0266,
"step": 174000
},
{
"epoch": 9.412144355402571,
"grad_norm": 0.8085660338401794,
"learning_rate": 5.794327075854764e-05,
"loss": 3.0204,
"step": 175000
},
{
"epoch": 9.465928037433443,
"grad_norm": 0.8308489322662354,
"learning_rate": 5.764903195433414e-05,
"loss": 3.0297,
"step": 176000
},
{
"epoch": 9.519711719464315,
"grad_norm": 0.7885105609893799,
"learning_rate": 5.735479315012065e-05,
"loss": 3.0309,
"step": 177000
},
{
"epoch": 9.573495401495187,
"grad_norm": 0.7959656715393066,
"learning_rate": 5.7060554345907135e-05,
"loss": 3.0299,
"step": 178000
},
{
"epoch": 9.627279083526059,
"grad_norm": 0.8052105903625488,
"learning_rate": 5.6766609780497856e-05,
"loss": 3.0341,
"step": 179000
},
{
"epoch": 9.68106276555693,
"grad_norm": 0.77768474817276,
"learning_rate": 5.647237097628435e-05,
"loss": 3.0311,
"step": 180000
},
{
"epoch": 9.734846447587802,
"grad_norm": 0.7868794202804565,
"learning_rate": 5.617842641087507e-05,
"loss": 3.037,
"step": 181000
},
{
"epoch": 9.788630129618674,
"grad_norm": 0.7672579884529114,
"learning_rate": 5.5884187606661565e-05,
"loss": 3.0332,
"step": 182000
},
{
"epoch": 9.842413811649546,
"grad_norm": 0.7784843444824219,
"learning_rate": 5.5589948802448066e-05,
"loss": 3.0331,
"step": 183000
},
{
"epoch": 9.896197493680418,
"grad_norm": 0.8073210120201111,
"learning_rate": 5.529600423703878e-05,
"loss": 3.0338,
"step": 184000
},
{
"epoch": 9.949981175711288,
"grad_norm": 0.7723698616027832,
"learning_rate": 5.500176543282528e-05,
"loss": 3.0346,
"step": 185000
},
{
"epoch": 10.0,
"eval_accuracy": 0.40372259828500323,
"eval_loss": 3.386077642440796,
"eval_runtime": 153.8424,
"eval_samples_per_second": 376.476,
"eval_steps_per_second": 5.883,
"step": 185930
},
{
"epoch": 10.00376485774216,
"grad_norm": 0.7879741191864014,
"learning_rate": 5.470752662861178e-05,
"loss": 3.035,
"step": 186000
},
{
"epoch": 10.057548539773032,
"grad_norm": 0.7588198781013489,
"learning_rate": 5.441328782439828e-05,
"loss": 2.9797,
"step": 187000
},
{
"epoch": 10.111332221803904,
"grad_norm": 0.7911401987075806,
"learning_rate": 5.4119343258989e-05,
"loss": 2.9867,
"step": 188000
},
{
"epoch": 10.165115903834776,
"grad_norm": 0.834837794303894,
"learning_rate": 5.38251044547755e-05,
"loss": 2.9866,
"step": 189000
},
{
"epoch": 10.218899585865648,
"grad_norm": 0.785953938961029,
"learning_rate": 5.353115988936621e-05,
"loss": 2.9908,
"step": 190000
},
{
"epoch": 10.27268326789652,
"grad_norm": 0.7968313694000244,
"learning_rate": 5.323692108515271e-05,
"loss": 2.9986,
"step": 191000
},
{
"epoch": 10.326466949927392,
"grad_norm": 0.815880298614502,
"learning_rate": 5.294297651974343e-05,
"loss": 2.9967,
"step": 192000
},
{
"epoch": 10.380250631958264,
"grad_norm": 0.8155861496925354,
"learning_rate": 5.264873771552993e-05,
"loss": 3.0001,
"step": 193000
},
{
"epoch": 10.434034313989136,
"grad_norm": 0.8102470636367798,
"learning_rate": 5.235479315012064e-05,
"loss": 3.0023,
"step": 194000
},
{
"epoch": 10.487817996020008,
"grad_norm": 0.8228176832199097,
"learning_rate": 5.206055434590714e-05,
"loss": 3.0047,
"step": 195000
},
{
"epoch": 10.54160167805088,
"grad_norm": 0.810368537902832,
"learning_rate": 5.1766315541693643e-05,
"loss": 3.0041,
"step": 196000
},
{
"epoch": 10.595385360081751,
"grad_norm": 0.8073120713233948,
"learning_rate": 5.1472076737480144e-05,
"loss": 3.0071,
"step": 197000
},
{
"epoch": 10.649169042112623,
"grad_norm": 0.7942905426025391,
"learning_rate": 5.117813217207086e-05,
"loss": 3.0055,
"step": 198000
},
{
"epoch": 10.702952724143495,
"grad_norm": 0.8009095788002014,
"learning_rate": 5.088389336785736e-05,
"loss": 3.0065,
"step": 199000
},
{
"epoch": 10.756736406174367,
"grad_norm": 0.7769667506217957,
"learning_rate": 5.0589654563643853e-05,
"loss": 3.0097,
"step": 200000
},
{
"epoch": 10.810520088205239,
"grad_norm": 0.7919924259185791,
"learning_rate": 5.0295415759430354e-05,
"loss": 3.0101,
"step": 201000
},
{
"epoch": 10.864303770236111,
"grad_norm": 0.7941082715988159,
"learning_rate": 5.000147119402107e-05,
"loss": 3.0089,
"step": 202000
},
{
"epoch": 10.918087452266983,
"grad_norm": 0.764107346534729,
"learning_rate": 4.970752662861178e-05,
"loss": 3.0077,
"step": 203000
},
{
"epoch": 10.971871134297855,
"grad_norm": 0.7957196235656738,
"learning_rate": 4.941328782439828e-05,
"loss": 3.0121,
"step": 204000
},
{
"epoch": 11.0,
"eval_accuracy": 0.40473771922010227,
"eval_loss": 3.37876033782959,
"eval_runtime": 154.4072,
"eval_samples_per_second": 375.099,
"eval_steps_per_second": 5.861,
"step": 204523
},
{
"epoch": 11.025654816328727,
"grad_norm": 0.826215922832489,
"learning_rate": 4.9119049020184784e-05,
"loss": 2.9851,
"step": 205000
},
{
"epoch": 11.079438498359597,
"grad_norm": 0.8113217353820801,
"learning_rate": 4.88251044547755e-05,
"loss": 2.9589,
"step": 206000
},
{
"epoch": 11.133222180390469,
"grad_norm": 0.8072004318237305,
"learning_rate": 4.8530865650562e-05,
"loss": 2.9636,
"step": 207000
},
{
"epoch": 11.18700586242134,
"grad_norm": 0.8238457441329956,
"learning_rate": 4.82366268463485e-05,
"loss": 2.969,
"step": 208000
},
{
"epoch": 11.240789544452213,
"grad_norm": 0.8087642788887024,
"learning_rate": 4.7942388042135e-05,
"loss": 2.9709,
"step": 209000
},
{
"epoch": 11.294573226483084,
"grad_norm": 0.7844156622886658,
"learning_rate": 4.764873771552993e-05,
"loss": 2.9751,
"step": 210000
},
{
"epoch": 11.348356908513956,
"grad_norm": 0.8092362284660339,
"learning_rate": 4.735449891131642e-05,
"loss": 2.9723,
"step": 211000
},
{
"epoch": 11.402140590544828,
"grad_norm": 0.833483874797821,
"learning_rate": 4.706055434590714e-05,
"loss": 2.9742,
"step": 212000
},
{
"epoch": 11.4559242725757,
"grad_norm": 0.8131833672523499,
"learning_rate": 4.676631554169364e-05,
"loss": 2.981,
"step": 213000
},
{
"epoch": 11.509707954606572,
"grad_norm": 0.8103277683258057,
"learning_rate": 4.647207673748014e-05,
"loss": 2.98,
"step": 214000
},
{
"epoch": 11.563491636637444,
"grad_norm": 0.8259956240653992,
"learning_rate": 4.6177837933266646e-05,
"loss": 2.9833,
"step": 215000
},
{
"epoch": 11.617275318668316,
"grad_norm": 0.8347487449645996,
"learning_rate": 4.588389336785735e-05,
"loss": 2.978,
"step": 216000
},
{
"epoch": 11.671059000699188,
"grad_norm": 0.8020747303962708,
"learning_rate": 4.558965456364386e-05,
"loss": 2.9826,
"step": 217000
},
{
"epoch": 11.72484268273006,
"grad_norm": 0.7874395251274109,
"learning_rate": 4.529570999823457e-05,
"loss": 2.9874,
"step": 218000
},
{
"epoch": 11.778626364760932,
"grad_norm": 0.816592812538147,
"learning_rate": 4.500147119402107e-05,
"loss": 2.9812,
"step": 219000
},
{
"epoch": 11.832410046791804,
"grad_norm": 0.8037729263305664,
"learning_rate": 4.470752662861178e-05,
"loss": 2.9872,
"step": 220000
},
{
"epoch": 11.886193728822676,
"grad_norm": 0.7837305068969727,
"learning_rate": 4.44135820632025e-05,
"loss": 2.9866,
"step": 221000
},
{
"epoch": 11.939977410853547,
"grad_norm": 0.7976572513580322,
"learning_rate": 4.4119343258989e-05,
"loss": 2.9896,
"step": 222000
},
{
"epoch": 11.99376109288442,
"grad_norm": 0.802457869052887,
"learning_rate": 4.382539869357971e-05,
"loss": 2.9917,
"step": 223000
},
{
"epoch": 12.0,
"eval_accuracy": 0.40500792546768455,
"eval_loss": 3.3736560344696045,
"eval_runtime": 154.2381,
"eval_samples_per_second": 375.51,
"eval_steps_per_second": 5.868,
"step": 223116
},
{
"epoch": 12.047544774915291,
"grad_norm": 0.8179975748062134,
"learning_rate": 4.353115988936621e-05,
"loss": 2.9436,
"step": 224000
},
{
"epoch": 12.101328456946163,
"grad_norm": 0.8451590538024902,
"learning_rate": 4.323692108515271e-05,
"loss": 2.9435,
"step": 225000
},
{
"epoch": 12.155112138977035,
"grad_norm": 0.8380730748176575,
"learning_rate": 4.2942682280939214e-05,
"loss": 2.9481,
"step": 226000
},
{
"epoch": 12.208895821007907,
"grad_norm": 0.8392196297645569,
"learning_rate": 4.264873771552992e-05,
"loss": 2.9472,
"step": 227000
},
{
"epoch": 12.262679503038777,
"grad_norm": 0.8227624893188477,
"learning_rate": 4.235449891131642e-05,
"loss": 2.9494,
"step": 228000
},
{
"epoch": 12.316463185069649,
"grad_norm": 0.824691653251648,
"learning_rate": 4.206026010710293e-05,
"loss": 2.9491,
"step": 229000
},
{
"epoch": 12.370246867100521,
"grad_norm": 0.829526960849762,
"learning_rate": 4.176631554169364e-05,
"loss": 2.957,
"step": 230000
},
{
"epoch": 12.424030549131393,
"grad_norm": 0.8544576168060303,
"learning_rate": 4.1472076737480145e-05,
"loss": 2.9542,
"step": 231000
},
{
"epoch": 12.477814231162265,
"grad_norm": 0.8392364978790283,
"learning_rate": 4.117813217207085e-05,
"loss": 2.9559,
"step": 232000
},
{
"epoch": 12.531597913193137,
"grad_norm": 0.8318558931350708,
"learning_rate": 4.0883893367857353e-05,
"loss": 2.9587,
"step": 233000
},
{
"epoch": 12.585381595224009,
"grad_norm": 0.8154683709144592,
"learning_rate": 4.0589654563643854e-05,
"loss": 2.9603,
"step": 234000
},
{
"epoch": 12.63916527725488,
"grad_norm": 0.8392585515975952,
"learning_rate": 4.0295415759430355e-05,
"loss": 2.9579,
"step": 235000
},
{
"epoch": 12.692948959285753,
"grad_norm": 0.8337314128875732,
"learning_rate": 4.0001176955216856e-05,
"loss": 2.9643,
"step": 236000
},
{
"epoch": 12.746732641316624,
"grad_norm": 0.8414183259010315,
"learning_rate": 3.970752662861178e-05,
"loss": 2.9609,
"step": 237000
},
{
"epoch": 12.800516323347496,
"grad_norm": 0.8429349064826965,
"learning_rate": 3.9413287824398284e-05,
"loss": 2.9656,
"step": 238000
},
{
"epoch": 12.854300005378368,
"grad_norm": 0.8262794613838196,
"learning_rate": 3.9119049020184785e-05,
"loss": 2.9668,
"step": 239000
},
{
"epoch": 12.90808368740924,
"grad_norm": 0.8269763588905334,
"learning_rate": 3.88251044547755e-05,
"loss": 2.9689,
"step": 240000
},
{
"epoch": 12.961867369440112,
"grad_norm": 0.8158543109893799,
"learning_rate": 3.8530865650562e-05,
"loss": 2.968,
"step": 241000
},
{
"epoch": 13.0,
"eval_accuracy": 0.4055080959085722,
"eval_loss": 3.3828203678131104,
"eval_runtime": 154.2308,
"eval_samples_per_second": 375.528,
"eval_steps_per_second": 5.868,
"step": 241709
},
{
"epoch": 13.015651051470984,
"grad_norm": 0.8760387897491455,
"learning_rate": 3.82366268463485e-05,
"loss": 2.9507,
"step": 242000
},
{
"epoch": 13.069434733501856,
"grad_norm": 0.8574272394180298,
"learning_rate": 3.7942388042134995e-05,
"loss": 2.9228,
"step": 243000
},
{
"epoch": 13.123218415532728,
"grad_norm": 0.8364537358283997,
"learning_rate": 3.7648443476725716e-05,
"loss": 2.9249,
"step": 244000
},
{
"epoch": 13.1770020975636,
"grad_norm": 0.8717691898345947,
"learning_rate": 3.735420467251221e-05,
"loss": 2.9258,
"step": 245000
},
{
"epoch": 13.230785779594472,
"grad_norm": 0.8629365563392639,
"learning_rate": 3.705996586829871e-05,
"loss": 2.9303,
"step": 246000
},
{
"epoch": 13.284569461625344,
"grad_norm": 0.8226146101951599,
"learning_rate": 3.6766021302889425e-05,
"loss": 2.9324,
"step": 247000
},
{
"epoch": 13.338353143656215,
"grad_norm": 0.8641866445541382,
"learning_rate": 3.6471782498675926e-05,
"loss": 2.9361,
"step": 248000
},
{
"epoch": 13.392136825687086,
"grad_norm": 0.8602815866470337,
"learning_rate": 3.6177543694462427e-05,
"loss": 2.9348,
"step": 249000
},
{
"epoch": 13.445920507717958,
"grad_norm": 0.8341040015220642,
"learning_rate": 3.588359912905314e-05,
"loss": 2.9366,
"step": 250000
},
{
"epoch": 13.49970418974883,
"grad_norm": 0.8433042764663696,
"learning_rate": 3.558936032483964e-05,
"loss": 2.9371,
"step": 251000
},
{
"epoch": 13.553487871779701,
"grad_norm": 0.8445100784301758,
"learning_rate": 3.529512152062614e-05,
"loss": 2.937,
"step": 252000
},
{
"epoch": 13.607271553810573,
"grad_norm": 0.8464850783348083,
"learning_rate": 3.5001176955216856e-05,
"loss": 2.939,
"step": 253000
},
{
"epoch": 13.661055235841445,
"grad_norm": 0.8083788156509399,
"learning_rate": 3.470693815100336e-05,
"loss": 2.9434,
"step": 254000
},
{
"epoch": 13.714838917872317,
"grad_norm": 0.8728957176208496,
"learning_rate": 3.441269934678986e-05,
"loss": 2.941,
"step": 255000
},
{
"epoch": 13.768622599903189,
"grad_norm": 0.8241551518440247,
"learning_rate": 3.411846054257636e-05,
"loss": 2.9452,
"step": 256000
},
{
"epoch": 13.822406281934061,
"grad_norm": 0.8384578227996826,
"learning_rate": 3.382451597716707e-05,
"loss": 2.9448,
"step": 257000
},
{
"epoch": 13.876189963964933,
"grad_norm": 0.810941219329834,
"learning_rate": 3.3530277172953574e-05,
"loss": 2.9405,
"step": 258000
},
{
"epoch": 13.929973645995805,
"grad_norm": 0.8232195973396301,
"learning_rate": 3.323603836874007e-05,
"loss": 2.9492,
"step": 259000
},
{
"epoch": 13.983757328026677,
"grad_norm": 0.8487170338630676,
"learning_rate": 3.294179956452657e-05,
"loss": 2.9462,
"step": 260000
},
{
"epoch": 14.0,
"eval_accuracy": 0.4059775641477828,
"eval_loss": 3.392092227935791,
"eval_runtime": 154.2386,
"eval_samples_per_second": 375.509,
"eval_steps_per_second": 5.868,
"step": 260302
},
{
"epoch": 14.037541010057549,
"grad_norm": 0.8577378988265991,
"learning_rate": 3.264785499911728e-05,
"loss": 2.9154,
"step": 261000
},
{
"epoch": 14.09132469208842,
"grad_norm": 0.868270993232727,
"learning_rate": 3.2353910433708004e-05,
"loss": 2.9065,
"step": 262000
},
{
"epoch": 14.145108374119292,
"grad_norm": 0.8782519102096558,
"learning_rate": 3.20596716294945e-05,
"loss": 2.9078,
"step": 263000
},
{
"epoch": 14.198892056150164,
"grad_norm": 0.8726826906204224,
"learning_rate": 3.1765432825281e-05,
"loss": 2.9074,
"step": 264000
},
{
"epoch": 14.252675738181036,
"grad_norm": 0.8613258600234985,
"learning_rate": 3.14711940210675e-05,
"loss": 2.9139,
"step": 265000
},
{
"epoch": 14.306459420211908,
"grad_norm": 0.8485667109489441,
"learning_rate": 3.1177249455658214e-05,
"loss": 2.9174,
"step": 266000
},
{
"epoch": 14.36024310224278,
"grad_norm": 0.9009717702865601,
"learning_rate": 3.0883010651444715e-05,
"loss": 2.9158,
"step": 267000
},
{
"epoch": 14.414026784273652,
"grad_norm": 0.8538597822189331,
"learning_rate": 3.058936032483964e-05,
"loss": 2.9185,
"step": 268000
},
{
"epoch": 14.467810466304524,
"grad_norm": 0.856895387172699,
"learning_rate": 3.029512152062614e-05,
"loss": 2.9169,
"step": 269000
},
{
"epoch": 14.521594148335396,
"grad_norm": 0.8506320714950562,
"learning_rate": 3.0000882716412644e-05,
"loss": 2.9192,
"step": 270000
},
{
"epoch": 14.575377830366268,
"grad_norm": 0.8925907611846924,
"learning_rate": 2.9706643912199145e-05,
"loss": 2.918,
"step": 271000
},
{
"epoch": 14.629161512397138,
"grad_norm": 0.8765429854393005,
"learning_rate": 2.941269934678986e-05,
"loss": 2.9245,
"step": 272000
},
{
"epoch": 14.68294519442801,
"grad_norm": 0.8704999089241028,
"learning_rate": 2.911875478138057e-05,
"loss": 2.9222,
"step": 273000
},
{
"epoch": 14.736728876458882,
"grad_norm": 0.8442783951759338,
"learning_rate": 2.882451597716707e-05,
"loss": 2.921,
"step": 274000
},
{
"epoch": 14.790512558489754,
"grad_norm": 0.8700312376022339,
"learning_rate": 2.8530277172953568e-05,
"loss": 2.9234,
"step": 275000
},
{
"epoch": 14.844296240520626,
"grad_norm": 0.8738580346107483,
"learning_rate": 2.823603836874007e-05,
"loss": 2.9259,
"step": 276000
},
{
"epoch": 14.898079922551497,
"grad_norm": 0.8423880934715271,
"learning_rate": 2.7941799564526573e-05,
"loss": 2.9253,
"step": 277000
},
{
"epoch": 14.95186360458237,
"grad_norm": 0.8413381576538086,
"learning_rate": 2.7647560760313074e-05,
"loss": 2.9308,
"step": 278000
},
{
"epoch": 15.0,
"eval_accuracy": 0.40731603234036906,
"eval_loss": 3.384235382080078,
"eval_runtime": 153.8779,
"eval_samples_per_second": 376.389,
"eval_steps_per_second": 5.881,
"step": 278895
},
{
"epoch": 15.005647286613241,
"grad_norm": 0.8703798651695251,
"learning_rate": 2.7353616194903788e-05,
"loss": 2.925,
"step": 279000
},
{
"epoch": 15.059430968644113,
"grad_norm": 0.8850764036178589,
"learning_rate": 2.705937739069029e-05,
"loss": 2.8835,
"step": 280000
},
{
"epoch": 15.113214650674985,
"grad_norm": 0.8852105140686035,
"learning_rate": 2.6765138586476786e-05,
"loss": 2.8864,
"step": 281000
},
{
"epoch": 15.166998332705857,
"grad_norm": 0.9038397669792175,
"learning_rate": 2.6470899782263287e-05,
"loss": 2.8946,
"step": 282000
},
{
"epoch": 15.220782014736729,
"grad_norm": 0.8719667792320251,
"learning_rate": 2.6176955216853998e-05,
"loss": 2.894,
"step": 283000
},
{
"epoch": 15.274565696767601,
"grad_norm": 0.8941630125045776,
"learning_rate": 2.5882716412640502e-05,
"loss": 2.8992,
"step": 284000
},
{
"epoch": 15.328349378798473,
"grad_norm": 0.8849285840988159,
"learning_rate": 2.5588771847231213e-05,
"loss": 2.8982,
"step": 285000
},
{
"epoch": 15.382133060829345,
"grad_norm": 0.916803240776062,
"learning_rate": 2.5294533043017717e-05,
"loss": 2.8998,
"step": 286000
},
{
"epoch": 15.435916742860217,
"grad_norm": 0.871529757976532,
"learning_rate": 2.5000882716412644e-05,
"loss": 2.9004,
"step": 287000
},
{
"epoch": 15.489700424891089,
"grad_norm": 0.8877633213996887,
"learning_rate": 2.470664391219914e-05,
"loss": 2.9036,
"step": 288000
},
{
"epoch": 15.54348410692196,
"grad_norm": 0.8539232015609741,
"learning_rate": 2.4412405107985643e-05,
"loss": 2.901,
"step": 289000
},
{
"epoch": 15.597267788952832,
"grad_norm": 0.8996196985244751,
"learning_rate": 2.4118460542576357e-05,
"loss": 2.9012,
"step": 290000
},
{
"epoch": 15.651051470983704,
"grad_norm": 0.9288415908813477,
"learning_rate": 2.3824221738362857e-05,
"loss": 2.9059,
"step": 291000
},
{
"epoch": 15.704835153014574,
"grad_norm": 0.83979731798172,
"learning_rate": 2.3529982934149355e-05,
"loss": 2.9078,
"step": 292000
},
{
"epoch": 15.758618835045446,
"grad_norm": 0.871395468711853,
"learning_rate": 2.3235744129935856e-05,
"loss": 2.905,
"step": 293000
},
{
"epoch": 15.812402517076318,
"grad_norm": 0.8687715530395508,
"learning_rate": 2.294179956452657e-05,
"loss": 2.9092,
"step": 294000
},
{
"epoch": 15.86618619910719,
"grad_norm": 0.8818358778953552,
"learning_rate": 2.2647854999117284e-05,
"loss": 2.9087,
"step": 295000
},
{
"epoch": 15.919969881138062,
"grad_norm": 0.8488963842391968,
"learning_rate": 2.2353910433707998e-05,
"loss": 2.9098,
"step": 296000
},
{
"epoch": 15.973753563168934,
"grad_norm": 0.8631708025932312,
"learning_rate": 2.20596716294945e-05,
"loss": 2.9096,
"step": 297000
},
{
"epoch": 16.0,
"eval_accuracy": 0.4075240279998705,
"eval_loss": 3.3800442218780518,
"eval_runtime": 154.4151,
"eval_samples_per_second": 375.08,
"eval_steps_per_second": 5.861,
"step": 297488
},
{
"epoch": 16.027537245199806,
"grad_norm": 0.9190370440483093,
"learning_rate": 2.1765432825281e-05,
"loss": 2.8902,
"step": 298000
},
{
"epoch": 16.081320927230678,
"grad_norm": 0.9171285033226013,
"learning_rate": 2.14711940210675e-05,
"loss": 2.8722,
"step": 299000
},
{
"epoch": 16.13510460926155,
"grad_norm": 0.8986324071884155,
"learning_rate": 2.1176955216853998e-05,
"loss": 2.8795,
"step": 300000
},
{
"epoch": 16.18888829129242,
"grad_norm": 0.9201930165290833,
"learning_rate": 2.08827164126405e-05,
"loss": 2.8797,
"step": 301000
},
{
"epoch": 16.242671973323294,
"grad_norm": 0.9046939611434937,
"learning_rate": 2.0588771847231213e-05,
"loss": 2.8811,
"step": 302000
},
{
"epoch": 16.296455655354166,
"grad_norm": 0.9016453623771667,
"learning_rate": 2.0294533043017714e-05,
"loss": 2.8794,
"step": 303000
},
{
"epoch": 16.350239337385037,
"grad_norm": 0.9240383505821228,
"learning_rate": 2.0000294238804215e-05,
"loss": 2.8855,
"step": 304000
},
{
"epoch": 16.40402301941591,
"grad_norm": 0.9093482494354248,
"learning_rate": 1.9706055434590716e-05,
"loss": 2.8801,
"step": 305000
},
{
"epoch": 16.45780670144678,
"grad_norm": 0.8959923386573792,
"learning_rate": 1.941211086918143e-05,
"loss": 2.8851,
"step": 306000
},
{
"epoch": 16.511590383477653,
"grad_norm": 0.9291424751281738,
"learning_rate": 1.9117872064967927e-05,
"loss": 2.8823,
"step": 307000
},
{
"epoch": 16.565374065508525,
"grad_norm": 0.9166994094848633,
"learning_rate": 1.8823633260754428e-05,
"loss": 2.8914,
"step": 308000
},
{
"epoch": 16.619157747539397,
"grad_norm": 0.9259293675422668,
"learning_rate": 1.852939445654093e-05,
"loss": 2.8881,
"step": 309000
},
{
"epoch": 16.67294142957027,
"grad_norm": 0.9145235419273376,
"learning_rate": 1.8235449891131643e-05,
"loss": 2.8876,
"step": 310000
},
{
"epoch": 16.72672511160114,
"grad_norm": 0.926196813583374,
"learning_rate": 1.7941211086918144e-05,
"loss": 2.8875,
"step": 311000
},
{
"epoch": 16.780508793632013,
"grad_norm": 0.8985670208930969,
"learning_rate": 1.7647266521508858e-05,
"loss": 2.8897,
"step": 312000
},
{
"epoch": 16.834292475662885,
"grad_norm": 0.9112594723701477,
"learning_rate": 1.735302771729536e-05,
"loss": 2.891,
"step": 313000
},
{
"epoch": 16.888076157693757,
"grad_norm": 0.9346410036087036,
"learning_rate": 1.705908315188607e-05,
"loss": 2.8916,
"step": 314000
},
{
"epoch": 16.94185983972463,
"grad_norm": 0.9073229432106018,
"learning_rate": 1.6764844347672574e-05,
"loss": 2.8898,
"step": 315000
},
{
"epoch": 16.9956435217555,
"grad_norm": 0.8861480951309204,
"learning_rate": 1.647060554345907e-05,
"loss": 2.889,
"step": 316000
},
{
"epoch": 17.0,
"eval_accuracy": 0.4077118018591426,
"eval_loss": 3.384974479675293,
"eval_runtime": 154.1381,
"eval_samples_per_second": 375.754,
"eval_steps_per_second": 5.871,
"step": 316081
},
{
"epoch": 17.049427203786372,
"grad_norm": 0.9185024499893188,
"learning_rate": 1.6176660978049785e-05,
"loss": 2.8642,
"step": 317000
},
{
"epoch": 17.103210885817244,
"grad_norm": 0.8915971517562866,
"learning_rate": 1.5882422173836286e-05,
"loss": 2.8621,
"step": 318000
},
{
"epoch": 17.156994567848116,
"grad_norm": 0.9077499508857727,
"learning_rate": 1.5588183369622787e-05,
"loss": 2.8628,
"step": 319000
},
{
"epoch": 17.210778249878988,
"grad_norm": 0.9534841775894165,
"learning_rate": 1.5294533043017714e-05,
"loss": 2.8685,
"step": 320000
},
{
"epoch": 17.26456193190986,
"grad_norm": 0.924517035484314,
"learning_rate": 1.5000294238804214e-05,
"loss": 2.8611,
"step": 321000
},
{
"epoch": 17.318345613940732,
"grad_norm": 0.9032144546508789,
"learning_rate": 1.4706055434590716e-05,
"loss": 2.8672,
"step": 322000
},
{
"epoch": 17.372129295971604,
"grad_norm": 0.9569965600967407,
"learning_rate": 1.4411816630377215e-05,
"loss": 2.8669,
"step": 323000
},
{
"epoch": 17.425912978002476,
"grad_norm": 0.9252744913101196,
"learning_rate": 1.4117577826163716e-05,
"loss": 2.8681,
"step": 324000
},
{
"epoch": 17.479696660033348,
"grad_norm": 0.9229059815406799,
"learning_rate": 1.3823633260754429e-05,
"loss": 2.8733,
"step": 325000
},
{
"epoch": 17.533480342064216,
"grad_norm": 0.9757437705993652,
"learning_rate": 1.3529394456540928e-05,
"loss": 2.873,
"step": 326000
},
{
"epoch": 17.587264024095088,
"grad_norm": 0.9332020878791809,
"learning_rate": 1.323515565232743e-05,
"loss": 2.8693,
"step": 327000
},
{
"epoch": 17.64104770612596,
"grad_norm": 0.9381711483001709,
"learning_rate": 1.294091684811393e-05,
"loss": 2.8694,
"step": 328000
},
{
"epoch": 17.69483138815683,
"grad_norm": 0.9215328693389893,
"learning_rate": 1.2646972282704644e-05,
"loss": 2.8712,
"step": 329000
},
{
"epoch": 17.748615070187704,
"grad_norm": 0.9105529189109802,
"learning_rate": 1.2353027717295358e-05,
"loss": 2.873,
"step": 330000
},
{
"epoch": 17.802398752218576,
"grad_norm": 0.9045984148979187,
"learning_rate": 1.2058788913081857e-05,
"loss": 2.8739,
"step": 331000
},
{
"epoch": 17.856182434249448,
"grad_norm": 0.8981735110282898,
"learning_rate": 1.1764550108868358e-05,
"loss": 2.8732,
"step": 332000
},
{
"epoch": 17.90996611628032,
"grad_norm": 0.9405637383460999,
"learning_rate": 1.1470311304654859e-05,
"loss": 2.8759,
"step": 333000
},
{
"epoch": 17.96374979831119,
"grad_norm": 0.880014955997467,
"learning_rate": 1.1176366739245573e-05,
"loss": 2.8779,
"step": 334000
},
{
"epoch": 18.0,
"eval_accuracy": 0.407625137001017,
"eval_loss": 3.392023801803589,
"eval_runtime": 154.3691,
"eval_samples_per_second": 375.192,
"eval_steps_per_second": 5.863,
"step": 334674
},
{
"epoch": 18.017533480342063,
"grad_norm": 0.9213481545448303,
"learning_rate": 1.0882127935032074e-05,
"loss": 2.8646,
"step": 335000
},
{
"epoch": 18.071317162372935,
"grad_norm": 0.926913321018219,
"learning_rate": 1.0587889130818573e-05,
"loss": 2.8498,
"step": 336000
},
{
"epoch": 18.125100844403807,
"grad_norm": 0.953425943851471,
"learning_rate": 1.0293650326605072e-05,
"loss": 2.8487,
"step": 337000
},
{
"epoch": 18.17888452643468,
"grad_norm": 0.9122514724731445,
"learning_rate": 9.999705761195786e-06,
"loss": 2.8491,
"step": 338000
},
{
"epoch": 18.23266820846555,
"grad_norm": 0.9349797964096069,
"learning_rate": 9.7057611957865e-06,
"loss": 2.8526,
"step": 339000
},
{
"epoch": 18.286451890496423,
"grad_norm": 0.9712046384811401,
"learning_rate": 9.411522391573001e-06,
"loss": 2.8535,
"step": 340000
},
{
"epoch": 18.340235572527295,
"grad_norm": 0.9234364628791809,
"learning_rate": 9.117577826163715e-06,
"loss": 2.857,
"step": 341000
},
{
"epoch": 18.394019254558167,
"grad_norm": 0.9454054832458496,
"learning_rate": 8.823339021950216e-06,
"loss": 2.8539,
"step": 342000
},
{
"epoch": 18.44780293658904,
"grad_norm": 0.9207014441490173,
"learning_rate": 8.529100217736715e-06,
"loss": 2.8589,
"step": 343000
},
{
"epoch": 18.50158661861991,
"grad_norm": 0.895506739616394,
"learning_rate": 8.235155652327429e-06,
"loss": 2.8582,
"step": 344000
},
{
"epoch": 18.555370300650782,
"grad_norm": 0.9252230525016785,
"learning_rate": 7.941211086918143e-06,
"loss": 2.8572,
"step": 345000
},
{
"epoch": 18.609153982681654,
"grad_norm": 0.954779863357544,
"learning_rate": 7.646972282704644e-06,
"loss": 2.8593,
"step": 346000
},
{
"epoch": 18.662937664712526,
"grad_norm": 0.9090393781661987,
"learning_rate": 7.352733478491143e-06,
"loss": 2.8606,
"step": 347000
},
{
"epoch": 18.716721346743398,
"grad_norm": 0.9491481184959412,
"learning_rate": 7.058494674277644e-06,
"loss": 2.8569,
"step": 348000
},
{
"epoch": 18.77050502877427,
"grad_norm": 0.9368701577186584,
"learning_rate": 6.764255870064144e-06,
"loss": 2.857,
"step": 349000
},
{
"epoch": 18.824288710805142,
"grad_norm": 0.9315699934959412,
"learning_rate": 6.470017065850645e-06,
"loss": 2.8571,
"step": 350000
},
{
"epoch": 18.878072392836014,
"grad_norm": 0.9450963735580444,
"learning_rate": 6.176366739245571e-06,
"loss": 2.8616,
"step": 351000
},
{
"epoch": 18.931856074866886,
"grad_norm": 0.958202064037323,
"learning_rate": 5.882127935032072e-06,
"loss": 2.8551,
"step": 352000
},
{
"epoch": 18.985639756897758,
"grad_norm": 0.916106104850769,
"learning_rate": 5.587889130818573e-06,
"loss": 2.8585,
"step": 353000
},
{
"epoch": 19.0,
"eval_accuracy": 0.4083989399745756,
"eval_loss": 3.389803171157837,
"eval_runtime": 154.6817,
"eval_samples_per_second": 374.433,
"eval_steps_per_second": 5.851,
"step": 353267
},
{
"epoch": 19.03942343892863,
"grad_norm": 0.9335721731185913,
"learning_rate": 5.293944565409286e-06,
"loss": 2.8473,
"step": 354000
},
{
"epoch": 19.0932071209595,
"grad_norm": 0.9438668489456177,
"learning_rate": 4.999705761195786e-06,
"loss": 2.8388,
"step": 355000
},
{
"epoch": 19.146990802990373,
"grad_norm": 0.9449619054794312,
"learning_rate": 4.705466956982287e-06,
"loss": 2.842,
"step": 356000
},
{
"epoch": 19.200774485021245,
"grad_norm": 0.9449966549873352,
"learning_rate": 4.411522391573001e-06,
"loss": 2.8438,
"step": 357000
},
{
"epoch": 19.254558167052117,
"grad_norm": 0.9282692074775696,
"learning_rate": 4.1172835873595005e-06,
"loss": 2.8424,
"step": 358000
},
{
"epoch": 19.30834184908299,
"grad_norm": 0.9687463641166687,
"learning_rate": 3.823044783146001e-06,
"loss": 2.8429,
"step": 359000
},
{
"epoch": 19.36212553111386,
"grad_norm": 0.9648985862731934,
"learning_rate": 3.528805978932502e-06,
"loss": 2.8414,
"step": 360000
},
{
"epoch": 19.415909213144733,
"grad_norm": 0.933380126953125,
"learning_rate": 3.234567174719002e-06,
"loss": 2.844,
"step": 361000
},
{
"epoch": 19.469692895175605,
"grad_norm": 0.9169676303863525,
"learning_rate": 2.940622609309716e-06,
"loss": 2.8437,
"step": 362000
},
{
"epoch": 19.523476577206477,
"grad_norm": 0.9346097707748413,
"learning_rate": 2.6463838050962164e-06,
"loss": 2.844,
"step": 363000
},
{
"epoch": 19.57726025923735,
"grad_norm": 0.9195376634597778,
"learning_rate": 2.35243923968693e-06,
"loss": 2.8418,
"step": 364000
},
{
"epoch": 19.63104394126822,
"grad_norm": 0.9312041401863098,
"learning_rate": 2.0582004354734305e-06,
"loss": 2.8455,
"step": 365000
},
{
"epoch": 19.684827623299093,
"grad_norm": 0.9278233647346497,
"learning_rate": 1.7639616312599305e-06,
"loss": 2.8454,
"step": 366000
},
{
"epoch": 19.738611305329965,
"grad_norm": 0.9816317558288574,
"learning_rate": 1.4700170658506446e-06,
"loss": 2.8422,
"step": 367000
},
{
"epoch": 19.792394987360836,
"grad_norm": 0.9713099002838135,
"learning_rate": 1.1760725004413582e-06,
"loss": 2.8441,
"step": 368000
},
{
"epoch": 19.846178669391705,
"grad_norm": 0.9396886825561523,
"learning_rate": 8.818336962278584e-07,
"loss": 2.844,
"step": 369000
},
{
"epoch": 19.899962351422577,
"grad_norm": 0.9674928784370422,
"learning_rate": 5.875948920143589e-07,
"loss": 2.8451,
"step": 370000
},
{
"epoch": 19.95374603345345,
"grad_norm": 0.9253866076469421,
"learning_rate": 2.933560878008592e-07,
"loss": 2.8469,
"step": 371000
},
{
"epoch": 20.0,
"eval_accuracy": 0.4080815719204785,
"eval_loss": 3.3971192836761475,
"eval_runtime": 154.407,
"eval_samples_per_second": 375.1,
"eval_steps_per_second": 5.861,
"step": 371860
},
{
"epoch": 20.0,
"step": 371860,
"total_flos": 1.5667414205184e+18,
"train_loss": 3.1547193204799435,
"train_runtime": 81066.2548,
"train_samples_per_second": 146.785,
"train_steps_per_second": 4.587
}
],
"logging_steps": 1000,
"max_steps": 371860,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000,
"total_flos": 1.5667414205184e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}