{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999981467376096, "global_step": 269794, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.446997776130467e-06, "loss": 4.2339, "step": 500 }, { "epoch": 0.0, "learning_rate": 2.9295774647887324e-06, "loss": 3.743, "step": 1000 }, { "epoch": 0.0, "learning_rate": 4.4121571534469985e-06, "loss": 3.5488, "step": 1500 }, { "epoch": 0.01, "learning_rate": 5.8947368421052634e-06, "loss": 3.441, "step": 2000 }, { "epoch": 0.01, "learning_rate": 7.377316530763528e-06, "loss": 3.3861, "step": 2500 }, { "epoch": 0.01, "learning_rate": 8.859896219421795e-06, "loss": 3.3338, "step": 3000 }, { "epoch": 0.01, "learning_rate": 1.034247590808006e-05, "loss": 3.2739, "step": 3500 }, { "epoch": 0.01, "learning_rate": 1.1825055596738326e-05, "loss": 3.2384, "step": 4000 }, { "epoch": 0.01, "learning_rate": 1.3307635285396591e-05, "loss": 3.1955, "step": 4500 }, { "epoch": 0.01, "learning_rate": 1.4790214974054856e-05, "loss": 3.1733, "step": 5000 }, { "epoch": 0.02, "learning_rate": 1.6272794662713124e-05, "loss": 3.1648, "step": 5500 }, { "epoch": 0.02, "learning_rate": 1.7755374351371386e-05, "loss": 3.1262, "step": 6000 }, { "epoch": 0.02, "learning_rate": 1.923795404002965e-05, "loss": 3.112, "step": 6500 }, { "epoch": 0.02, "learning_rate": 2.072053372868792e-05, "loss": 3.0858, "step": 7000 }, { "epoch": 0.02, "learning_rate": 2.2203113417346184e-05, "loss": 3.0709, "step": 7500 }, { "epoch": 0.02, "learning_rate": 2.368569310600445e-05, "loss": 3.0446, "step": 8000 }, { "epoch": 0.03, "learning_rate": 2.516827279466271e-05, "loss": 3.0337, "step": 8500 }, { "epoch": 0.03, "learning_rate": 2.6650852483320982e-05, "loss": 3.0176, "step": 9000 }, { "epoch": 0.03, "learning_rate": 2.8133432171979247e-05, "loss": 2.9892, "step": 9500 }, { "epoch": 0.03, "learning_rate": 2.9613046701260195e-05, "loss": 2.9966, "step": 10000 }, { "epoch": 0.03, "learning_rate": 3.1095626389918456e-05, "loss": 2.9685, "step": 10500 }, { "epoch": 0.03, "learning_rate": 3.257820607857672e-05, "loss": 2.9461, "step": 11000 }, { "epoch": 0.03, "learning_rate": 3.406078576723499e-05, "loss": 2.9472, "step": 11500 }, { "epoch": 0.04, "learning_rate": 3.554336545589326e-05, "loss": 2.9346, "step": 12000 }, { "epoch": 0.04, "learning_rate": 3.7022979985174206e-05, "loss": 2.9418, "step": 12500 }, { "epoch": 0.04, "learning_rate": 3.850555967383247e-05, "loss": 2.9088, "step": 13000 }, { "epoch": 0.04, "learning_rate": 3.9988139362490736e-05, "loss": 2.8908, "step": 13500 }, { "epoch": 0.04, "learning_rate": 4.147071905114901e-05, "loss": 2.8886, "step": 14000 }, { "epoch": 0.04, "learning_rate": 4.2953298739807265e-05, "loss": 2.8681, "step": 14500 }, { "epoch": 0.04, "learning_rate": 4.4432913269088213e-05, "loss": 2.8626, "step": 15000 }, { "epoch": 0.05, "learning_rate": 4.591549295774648e-05, "loss": 2.8519, "step": 15500 }, { "epoch": 0.05, "learning_rate": 4.739807264640474e-05, "loss": 2.8402, "step": 16000 }, { "epoch": 0.05, "learning_rate": 4.888065233506301e-05, "loss": 2.8259, "step": 16500 }, { "epoch": 0.05, "learning_rate": 5.036323202372127e-05, "loss": 2.836, "step": 17000 }, { "epoch": 0.05, "learning_rate": 5.1845811712379545e-05, "loss": 2.8166, "step": 17500 }, { "epoch": 0.05, "learning_rate": 5.3328391401037816e-05, "loss": 2.796, "step": 18000 }, { "epoch": 0.05, "learning_rate": 5.4810971089696074e-05, "loss": 2.7931, "step": 18500 }, { "epoch": 0.06, "learning_rate": 5.629058561897702e-05, "loss": 2.7947, "step": 19000 }, { "epoch": 0.06, "learning_rate": 5.777020014825797e-05, "loss": 2.7701, "step": 19500 }, { "epoch": 0.06, "learning_rate": 5.925277983691624e-05, "loss": 2.7674, "step": 20000 }, { "epoch": 0.06, "learning_rate": 6.07353595255745e-05, "loss": 2.7625, "step": 20500 }, { "epoch": 0.06, "learning_rate": 6.221793921423277e-05, "loss": 2.754, "step": 21000 }, { "epoch": 0.06, "learning_rate": 6.370051890289104e-05, "loss": 2.7621, "step": 21500 }, { "epoch": 0.07, "learning_rate": 6.51830985915493e-05, "loss": 2.7329, "step": 22000 }, { "epoch": 0.07, "learning_rate": 6.666567828020757e-05, "loss": 2.7392, "step": 22500 }, { "epoch": 0.07, "learning_rate": 6.814825796886583e-05, "loss": 2.7446, "step": 23000 }, { "epoch": 0.07, "learning_rate": 6.96308376575241e-05, "loss": 2.7346, "step": 23500 }, { "epoch": 0.07, "learning_rate": 7.111341734618236e-05, "loss": 2.7246, "step": 24000 }, { "epoch": 0.07, "learning_rate": 7.259599703484063e-05, "loss": 2.7393, "step": 24500 }, { "epoch": 0.07, "learning_rate": 7.407857672349889e-05, "loss": 2.7271, "step": 25000 }, { "epoch": 0.07, "eval_loss": 2.5492873191833496, "eval_runtime": 278.1602, "eval_samples_per_second": 359.505, "eval_steps_per_second": 44.938, "step": 25000 }, { "epoch": 0.08, "learning_rate": 7.556115641215716e-05, "loss": 2.7127, "step": 25500 }, { "epoch": 0.08, "learning_rate": 7.704373610081542e-05, "loss": 2.713, "step": 26000 }, { "epoch": 0.08, "learning_rate": 7.852631578947369e-05, "loss": 2.7002, "step": 26500 }, { "epoch": 0.08, "learning_rate": 8.000593031875464e-05, "loss": 2.7042, "step": 27000 }, { "epoch": 0.08, "learning_rate": 8.14885100074129e-05, "loss": 2.7054, "step": 27500 }, { "epoch": 0.08, "learning_rate": 8.297108969607117e-05, "loss": 2.6925, "step": 28000 }, { "epoch": 0.08, "learning_rate": 8.445366938472942e-05, "loss": 2.6927, "step": 28500 }, { "epoch": 0.09, "learning_rate": 8.59362490733877e-05, "loss": 2.6843, "step": 29000 }, { "epoch": 0.09, "learning_rate": 8.741882876204596e-05, "loss": 2.6848, "step": 29500 }, { "epoch": 0.09, "learning_rate": 8.890140845070423e-05, "loss": 2.6851, "step": 30000 }, { "epoch": 0.09, "learning_rate": 9.03839881393625e-05, "loss": 2.676, "step": 30500 }, { "epoch": 0.09, "learning_rate": 9.186656782802076e-05, "loss": 2.6811, "step": 31000 }, { "epoch": 0.09, "learning_rate": 9.334914751667902e-05, "loss": 2.6687, "step": 31500 }, { "epoch": 0.09, "learning_rate": 9.483172720533729e-05, "loss": 2.6751, "step": 32000 }, { "epoch": 0.1, "learning_rate": 9.631430689399555e-05, "loss": 2.6746, "step": 32500 }, { "epoch": 0.1, "learning_rate": 9.779688658265382e-05, "loss": 2.6662, "step": 33000 }, { "epoch": 0.1, "learning_rate": 9.927946627131208e-05, "loss": 2.6685, "step": 33500 }, { "epoch": 0.1, "learning_rate": 9.999982309633072e-05, "loss": 2.6702, "step": 34000 }, { "epoch": 0.1, "learning_rate": 9.999846516961786e-05, "loss": 2.6692, "step": 34500 }, { "epoch": 0.1, "learning_rate": 9.999577483170183e-05, "loss": 2.6732, "step": 35000 }, { "epoch": 0.11, "learning_rate": 9.9991750774992e-05, "loss": 2.6676, "step": 35500 }, { "epoch": 0.11, "learning_rate": 9.998638103115459e-05, "loss": 2.65, "step": 36000 }, { "epoch": 0.11, "learning_rate": 9.997967246921306e-05, "loss": 2.6493, "step": 36500 }, { "epoch": 0.11, "learning_rate": 9.997162526884725e-05, "loss": 2.657, "step": 37000 }, { "epoch": 0.11, "learning_rate": 9.996223964559059e-05, "loss": 2.6521, "step": 37500 }, { "epoch": 0.11, "learning_rate": 9.995151585082442e-05, "loss": 2.6412, "step": 38000 }, { "epoch": 0.11, "learning_rate": 9.993947963012904e-05, "loss": 2.6418, "step": 38500 }, { "epoch": 0.12, "learning_rate": 9.992608306462086e-05, "loss": 2.6442, "step": 39000 }, { "epoch": 0.12, "learning_rate": 9.991134929600863e-05, "loss": 2.6362, "step": 39500 }, { "epoch": 0.12, "learning_rate": 9.989527871891659e-05, "loss": 2.6354, "step": 40000 }, { "epoch": 0.12, "learning_rate": 9.987787176377355e-05, "loss": 2.6167, "step": 40500 }, { "epoch": 0.12, "learning_rate": 9.985912889680143e-05, "loss": 2.6316, "step": 41000 }, { "epoch": 0.12, "learning_rate": 9.983905062000275e-05, "loss": 2.6355, "step": 41500 }, { "epoch": 0.12, "learning_rate": 9.981763747114724e-05, "loss": 2.6183, "step": 42000 }, { "epoch": 0.13, "learning_rate": 9.979493684988302e-05, "loss": 2.6111, "step": 42500 }, { "epoch": 0.13, "learning_rate": 9.977085837996527e-05, "loss": 2.6053, "step": 43000 }, { "epoch": 0.13, "learning_rate": 9.97454990173924e-05, "loss": 2.6019, "step": 43500 }, { "epoch": 0.13, "learning_rate": 9.971875780087649e-05, "loss": 2.6166, "step": 44000 }, { "epoch": 0.13, "learning_rate": 9.969074240841182e-05, "loss": 2.6378, "step": 44500 }, { "epoch": 0.13, "learning_rate": 9.966134130446202e-05, "loss": 2.6069, "step": 45000 }, { "epoch": 0.13, "learning_rate": 9.963067287750436e-05, "loss": 2.6201, "step": 45500 }, { "epoch": 0.14, "learning_rate": 9.959861502967945e-05, "loss": 2.5973, "step": 46000 }, { "epoch": 0.14, "learning_rate": 9.956522874954218e-05, "loss": 2.5926, "step": 46500 }, { "epoch": 0.14, "learning_rate": 9.95305149312992e-05, "loss": 2.5856, "step": 47000 }, { "epoch": 0.14, "learning_rate": 9.949447450471355e-05, "loss": 2.5929, "step": 47500 }, { "epoch": 0.14, "learning_rate": 9.945710843507967e-05, "loss": 2.5851, "step": 48000 }, { "epoch": 0.14, "learning_rate": 9.94184964259314e-05, "loss": 2.5704, "step": 48500 }, { "epoch": 0.15, "learning_rate": 9.9378484754234e-05, "loss": 2.5823, "step": 49000 }, { "epoch": 0.15, "learning_rate": 9.933715054611808e-05, "loss": 2.5716, "step": 49500 }, { "epoch": 0.15, "learning_rate": 9.929458153797284e-05, "loss": 2.6217, "step": 50000 }, { "epoch": 0.15, "eval_loss": 2.431061267852783, "eval_runtime": 279.1709, "eval_samples_per_second": 358.204, "eval_steps_per_second": 44.775, "step": 50000 }, { "epoch": 0.19, "learning_rate": 9.77079560136976e-05, "loss": 2.5993, "step": 50500 }, { "epoch": 0.19, "learning_rate": 9.76101478820416e-05, "loss": 2.5537, "step": 51000 }, { "epoch": 0.19, "learning_rate": 9.751034728712014e-05, "loss": 2.5581, "step": 51500 }, { "epoch": 0.19, "learning_rate": 9.740855840554321e-05, "loss": 2.6137, "step": 52000 }, { "epoch": 0.19, "learning_rate": 9.730478549712973e-05, "loss": 2.5886, "step": 52500 }, { "epoch": 0.2, "learning_rate": 9.719903290472921e-05, "loss": 2.5534, "step": 53000 }, { "epoch": 0.2, "learning_rate": 9.709130505404002e-05, "loss": 2.5475, "step": 53500 }, { "epoch": 0.2, "learning_rate": 9.698160645342426e-05, "loss": 2.5613, "step": 54000 }, { "epoch": 0.2, "learning_rate": 9.686994169371903e-05, "loss": 2.5487, "step": 54500 }, { "epoch": 0.2, "learning_rate": 9.675631544804424e-05, "loss": 2.5659, "step": 55000 }, { "epoch": 0.21, "learning_rate": 9.664073247160717e-05, "loss": 2.5319, "step": 55500 }, { "epoch": 0.21, "learning_rate": 9.652319760150334e-05, "loss": 2.5536, "step": 56000 }, { "epoch": 0.21, "learning_rate": 9.640371575651422e-05, "loss": 2.53, "step": 56500 }, { "epoch": 0.21, "learning_rate": 9.628229193690122e-05, "loss": 2.524, "step": 57000 }, { "epoch": 0.21, "learning_rate": 9.615893122419657e-05, "loss": 2.5219, "step": 57500 }, { "epoch": 0.21, "learning_rate": 9.60336387809906e-05, "loss": 2.51, "step": 58000 }, { "epoch": 0.22, "learning_rate": 9.590641985071566e-05, "loss": 2.5131, "step": 58500 }, { "epoch": 0.22, "learning_rate": 9.577727975742674e-05, "loss": 2.506, "step": 59000 }, { "epoch": 0.22, "learning_rate": 9.564622390557863e-05, "loss": 2.5086, "step": 59500 }, { "epoch": 0.22, "learning_rate": 9.551325777979978e-05, "loss": 2.4997, "step": 60000 }, { "epoch": 0.22, "learning_rate": 9.53783869446627e-05, "loss": 2.4888, "step": 60500 }, { "epoch": 0.23, "learning_rate": 9.524161704445116e-05, "loss": 2.4952, "step": 61000 }, { "epoch": 0.23, "learning_rate": 9.510295380292393e-05, "loss": 2.4868, "step": 61500 }, { "epoch": 0.23, "learning_rate": 9.496240302307531e-05, "loss": 2.4906, "step": 62000 }, { "epoch": 0.23, "learning_rate": 9.481997058689214e-05, "loss": 2.4815, "step": 62500 }, { "epoch": 0.23, "learning_rate": 9.467566245510782e-05, "loss": 2.4772, "step": 63000 }, { "epoch": 0.24, "learning_rate": 9.45294846669527e-05, "loss": 2.4919, "step": 63500 }, { "epoch": 0.24, "learning_rate": 9.438144333990148e-05, "loss": 2.4889, "step": 64000 }, { "epoch": 0.24, "learning_rate": 9.423154466941704e-05, "loss": 2.4811, "step": 64500 }, { "epoch": 0.24, "learning_rate": 9.407979492869132e-05, "loss": 2.4721, "step": 65000 }, { "epoch": 0.24, "learning_rate": 9.392620046838267e-05, "loss": 2.4696, "step": 65500 }, { "epoch": 0.24, "learning_rate": 9.377076771635013e-05, "loss": 2.4739, "step": 66000 }, { "epoch": 0.25, "learning_rate": 9.361350317738446e-05, "loss": 2.4704, "step": 66500 }, { "epoch": 0.25, "learning_rate": 9.345441343293581e-05, "loss": 2.4727, "step": 67000 }, { "epoch": 0.25, "learning_rate": 9.329350514083845e-05, "loss": 2.4649, "step": 67500 }, { "epoch": 0.25, "learning_rate": 9.313078503503196e-05, "loss": 2.4619, "step": 68000 }, { "epoch": 0.25, "learning_rate": 9.296625992527957e-05, "loss": 2.4577, "step": 68500 }, { "epoch": 0.26, "learning_rate": 9.279993669688308e-05, "loss": 2.46, "step": 69000 }, { "epoch": 0.26, "learning_rate": 9.263182231039476e-05, "loss": 2.4562, "step": 69500 }, { "epoch": 0.26, "learning_rate": 9.246192380132604e-05, "loss": 2.4431, "step": 70000 }, { "epoch": 0.26, "learning_rate": 9.229024827985306e-05, "loss": 2.4523, "step": 70500 }, { "epoch": 0.26, "learning_rate": 9.211680293051915e-05, "loss": 2.4558, "step": 71000 }, { "epoch": 0.27, "learning_rate": 9.194159501193414e-05, "loss": 2.4508, "step": 71500 }, { "epoch": 0.27, "learning_rate": 9.17646318564706e-05, "loss": 2.4526, "step": 72000 }, { "epoch": 0.27, "learning_rate": 9.158592086995692e-05, "loss": 2.4455, "step": 72500 }, { "epoch": 0.27, "learning_rate": 9.140546953136748e-05, "loss": 2.4423, "step": 73000 }, { "epoch": 0.27, "learning_rate": 9.122328539250962e-05, "loss": 2.4291, "step": 73500 }, { "epoch": 0.27, "learning_rate": 9.103937607770756e-05, "loss": 2.4521, "step": 74000 }, { "epoch": 0.28, "learning_rate": 9.085374928348338e-05, "loss": 2.4427, "step": 74500 }, { "epoch": 0.28, "learning_rate": 9.066641277823487e-05, "loss": 2.4378, "step": 75000 }, { "epoch": 0.28, "eval_loss": 2.274932622909546, "eval_runtime": 278.4521, "eval_samples_per_second": 359.128, "eval_steps_per_second": 44.891, "step": 75000 }, { "epoch": 0.28, "learning_rate": 9.047737440191048e-05, "loss": 2.4375, "step": 75500 }, { "epoch": 0.28, "learning_rate": 9.028664206568123e-05, "loss": 2.4262, "step": 76000 }, { "epoch": 0.28, "learning_rate": 9.009422375160955e-05, "loss": 2.4342, "step": 76500 }, { "epoch": 0.29, "learning_rate": 8.990012751231527e-05, "loss": 2.427, "step": 77000 }, { "epoch": 0.29, "learning_rate": 8.970436147063869e-05, "loss": 2.419, "step": 77500 }, { "epoch": 0.29, "learning_rate": 8.950693381930058e-05, "loss": 2.4241, "step": 78000 }, { "epoch": 0.29, "learning_rate": 8.930785282055928e-05, "loss": 2.4192, "step": 78500 }, { "epoch": 0.29, "learning_rate": 8.910712680586502e-05, "loss": 2.4179, "step": 79000 }, { "epoch": 0.29, "learning_rate": 8.890476417551119e-05, "loss": 2.4176, "step": 79500 }, { "epoch": 0.3, "learning_rate": 8.870077339828284e-05, "loss": 2.4179, "step": 80000 }, { "epoch": 0.3, "learning_rate": 8.849516301110216e-05, "loss": 2.418, "step": 80500 }, { "epoch": 0.3, "learning_rate": 8.828794161867136e-05, "loss": 2.4116, "step": 81000 }, { "epoch": 0.3, "learning_rate": 8.807911789311245e-05, "loss": 2.4114, "step": 81500 }, { "epoch": 0.3, "learning_rate": 8.786870057360441e-05, "loss": 2.4091, "step": 82000 }, { "epoch": 0.31, "learning_rate": 8.765669846601735e-05, "loss": 2.4125, "step": 82500 }, { "epoch": 0.31, "learning_rate": 8.744312044254409e-05, "loss": 2.4115, "step": 83000 }, { "epoch": 0.31, "learning_rate": 8.722797544132881e-05, "loss": 2.4195, "step": 83500 }, { "epoch": 0.31, "learning_rate": 8.701127246609299e-05, "loss": 2.4048, "step": 84000 }, { "epoch": 0.31, "learning_rate": 8.679302058575865e-05, "loss": 2.4075, "step": 84500 }, { "epoch": 0.32, "learning_rate": 8.657322893406876e-05, "loss": 2.3983, "step": 85000 }, { "epoch": 0.32, "learning_rate": 8.635190670920503e-05, "loss": 2.3999, "step": 85500 }, { "epoch": 0.32, "learning_rate": 8.6129063173403e-05, "loss": 2.4055, "step": 86000 }, { "epoch": 0.32, "learning_rate": 8.590470765256439e-05, "loss": 2.3899, "step": 86500 }, { "epoch": 0.32, "learning_rate": 8.567884953586675e-05, "loss": 2.3935, "step": 87000 }, { "epoch": 0.32, "learning_rate": 8.545149827537065e-05, "loss": 2.3872, "step": 87500 }, { "epoch": 0.33, "learning_rate": 8.522266338562404e-05, "loss": 2.3877, "step": 88000 }, { "epoch": 0.33, "learning_rate": 8.499235444326407e-05, "loss": 2.3939, "step": 88500 }, { "epoch": 0.33, "learning_rate": 8.476058108661639e-05, "loss": 2.3855, "step": 89000 }, { "epoch": 0.33, "learning_rate": 8.452735301529164e-05, "loss": 2.388, "step": 89500 }, { "epoch": 0.33, "learning_rate": 8.429267998977967e-05, "loss": 2.3801, "step": 90000 }, { "epoch": 0.34, "learning_rate": 8.4056571831041e-05, "loss": 2.3723, "step": 90500 }, { "epoch": 0.34, "learning_rate": 8.381903842009583e-05, "loss": 2.3838, "step": 91000 }, { "epoch": 0.34, "learning_rate": 8.358008969761053e-05, "loss": 2.3857, "step": 91500 }, { "epoch": 0.34, "learning_rate": 8.333973566348161e-05, "loss": 2.3678, "step": 92000 }, { "epoch": 0.34, "learning_rate": 8.309798637641725e-05, "loss": 2.3804, "step": 92500 }, { "epoch": 0.34, "learning_rate": 8.285485195351632e-05, "loss": 2.3667, "step": 93000 }, { "epoch": 0.35, "learning_rate": 8.261034256984503e-05, "loss": 2.3648, "step": 93500 }, { "epoch": 0.35, "learning_rate": 8.236446845801104e-05, "loss": 2.4219, "step": 94000 }, { "epoch": 0.35, "learning_rate": 8.211723990773533e-05, "loss": 2.4274, "step": 94500 }, { "epoch": 0.35, "learning_rate": 8.186866726542143e-05, "loss": 2.4006, "step": 95000 }, { "epoch": 0.35, "learning_rate": 8.161876093372264e-05, "loss": 2.3849, "step": 95500 }, { "epoch": 0.36, "learning_rate": 8.136753137110643e-05, "loss": 2.3937, "step": 96000 }, { "epoch": 0.36, "learning_rate": 8.111498909141696e-05, "loss": 2.3867, "step": 96500 }, { "epoch": 0.36, "learning_rate": 8.086114466343502e-05, "loss": 2.3914, "step": 97000 }, { "epoch": 0.36, "learning_rate": 8.060600871043566e-05, "loss": 2.3946, "step": 97500 }, { "epoch": 0.36, "learning_rate": 8.034959190974374e-05, "loss": 2.397, "step": 98000 }, { "epoch": 0.37, "learning_rate": 8.009190499228698e-05, "loss": 2.3854, "step": 98500 }, { "epoch": 0.37, "learning_rate": 7.983295874214692e-05, "loss": 2.4079, "step": 99000 }, { "epoch": 0.37, "learning_rate": 7.95727639961076e-05, "loss": 2.394, "step": 99500 }, { "epoch": 0.37, "learning_rate": 7.931133164320208e-05, "loss": 2.394, "step": 100000 }, { "epoch": 0.37, "eval_loss": 2.2282919883728027, "eval_runtime": 285.8926, "eval_samples_per_second": 349.782, "eval_steps_per_second": 43.723, "step": 100000 }, { "epoch": 0.37, "learning_rate": 7.904867262425669e-05, "loss": 2.3871, "step": 100500 }, { "epoch": 0.37, "learning_rate": 7.878479793143314e-05, "loss": 2.3768, "step": 101000 }, { "epoch": 0.38, "learning_rate": 7.851971860776864e-05, "loss": 2.3783, "step": 101500 }, { "epoch": 0.38, "learning_rate": 7.82534457467136e-05, "loss": 2.3733, "step": 102000 }, { "epoch": 0.38, "learning_rate": 7.798599049166741e-05, "loss": 2.3717, "step": 102500 }, { "epoch": 0.38, "learning_rate": 7.771736403551216e-05, "loss": 2.3689, "step": 103000 }, { "epoch": 0.38, "learning_rate": 7.744757762014416e-05, "loss": 2.3714, "step": 103500 }, { "epoch": 0.39, "learning_rate": 7.717664253600352e-05, "loss": 2.3792, "step": 104000 }, { "epoch": 0.39, "learning_rate": 7.690457012160156e-05, "loss": 2.3711, "step": 104500 }, { "epoch": 0.39, "learning_rate": 7.663137176304642e-05, "loss": 2.3829, "step": 105000 }, { "epoch": 0.39, "learning_rate": 7.635705889356646e-05, "loss": 2.4541, "step": 105500 }, { "epoch": 0.39, "learning_rate": 7.608164299303187e-05, "loss": 2.386, "step": 106000 }, { "epoch": 0.39, "learning_rate": 7.580513558747409e-05, "loss": 2.3839, "step": 106500 }, { "epoch": 0.4, "learning_rate": 7.552754824860368e-05, "loss": 2.3896, "step": 107000 }, { "epoch": 0.4, "learning_rate": 7.524889259332584e-05, "loss": 2.3839, "step": 107500 }, { "epoch": 0.4, "learning_rate": 7.496918028325434e-05, "loss": 2.3821, "step": 108000 }, { "epoch": 0.4, "learning_rate": 7.468842302422355e-05, "loss": 2.3797, "step": 108500 }, { "epoch": 0.4, "learning_rate": 7.440663256579836e-05, "loss": 2.348, "step": 109000 }, { "epoch": 0.41, "learning_rate": 7.412382070078269e-05, "loss": 2.3413, "step": 109500 }, { "epoch": 0.41, "learning_rate": 7.383999926472585e-05, "loss": 2.3497, "step": 110000 }, { "epoch": 0.41, "learning_rate": 7.355518013542717e-05, "loss": 2.3303, "step": 110500 }, { "epoch": 0.41, "learning_rate": 7.326937523243908e-05, "loss": 2.3348, "step": 111000 }, { "epoch": 0.41, "learning_rate": 7.29825965165682e-05, "loss": 2.3375, "step": 111500 }, { "epoch": 0.42, "learning_rate": 7.269485598937468e-05, "loss": 2.3271, "step": 112000 }, { "epoch": 0.42, "learning_rate": 7.240616569267015e-05, "loss": 2.3184, "step": 112500 }, { "epoch": 0.42, "learning_rate": 7.211653770801363e-05, "loss": 2.333, "step": 113000 }, { "epoch": 0.42, "learning_rate": 7.182598415620591e-05, "loss": 2.3712, "step": 113500 }, { "epoch": 0.42, "learning_rate": 7.153451719678243e-05, "loss": 2.3387, "step": 114000 }, { "epoch": 0.42, "learning_rate": 7.124214902750428e-05, "loss": 2.3274, "step": 114500 }, { "epoch": 0.43, "learning_rate": 7.094889188384774e-05, "loss": 2.326, "step": 115000 }, { "epoch": 0.43, "learning_rate": 7.065475803849241e-05, "loss": 2.3131, "step": 115500 }, { "epoch": 0.43, "learning_rate": 7.035975980080728e-05, "loss": 2.3186, "step": 116000 }, { "epoch": 0.43, "learning_rate": 7.006390951633589e-05, "loss": 2.3156, "step": 116500 }, { "epoch": 0.43, "learning_rate": 6.976721956627952e-05, "loss": 2.3211, "step": 117000 }, { "epoch": 0.44, "learning_rate": 6.946970236697905e-05, "loss": 2.3116, "step": 117500 }, { "epoch": 0.44, "learning_rate": 6.917137036939542e-05, "loss": 2.3189, "step": 118000 }, { "epoch": 0.44, "learning_rate": 6.88722360585885e-05, "loss": 2.3286, "step": 118500 }, { "epoch": 0.44, "learning_rate": 6.857231195319457e-05, "loss": 2.3186, "step": 119000 }, { "epoch": 0.44, "learning_rate": 6.827161060490248e-05, "loss": 2.3102, "step": 119500 }, { "epoch": 0.44, "learning_rate": 6.797014459792836e-05, "loss": 2.3146, "step": 120000 }, { "epoch": 0.45, "learning_rate": 6.766792654848896e-05, "loss": 2.2962, "step": 120500 }, { "epoch": 0.45, "learning_rate": 6.736496910427364e-05, "loss": 2.2984, "step": 121000 }, { "epoch": 0.45, "learning_rate": 6.70612849439151e-05, "loss": 2.308, "step": 121500 }, { "epoch": 0.45, "learning_rate": 6.675688677645883e-05, "loss": 2.3029, "step": 122000 }, { "epoch": 0.45, "learning_rate": 6.645178734083117e-05, "loss": 2.2933, "step": 122500 }, { "epoch": 0.46, "learning_rate": 6.614599940530619e-05, "loss": 2.3026, "step": 123000 }, { "epoch": 0.46, "learning_rate": 6.583953576697141e-05, "loss": 2.3084, "step": 123500 }, { "epoch": 0.46, "learning_rate": 6.553240925119219e-05, "loss": 2.2985, "step": 124000 }, { "epoch": 0.46, "learning_rate": 6.522463271107502e-05, "loss": 2.3035, "step": 124500 }, { "epoch": 0.46, "learning_rate": 6.49162190269296e-05, "loss": 2.2891, "step": 125000 }, { "epoch": 0.46, "eval_loss": 2.1396679878234863, "eval_runtime": 305.6778, "eval_samples_per_second": 327.142, "eval_steps_per_second": 40.893, "step": 125000 }, { "epoch": 0.47, "learning_rate": 6.460718110572983e-05, "loss": 2.2987, "step": 125500 }, { "epoch": 0.47, "learning_rate": 6.429753188057368e-05, "loss": 2.2899, "step": 126000 }, { "epoch": 0.47, "learning_rate": 6.398728431014187e-05, "loss": 2.2919, "step": 126500 }, { "epoch": 0.47, "learning_rate": 6.367645137815561e-05, "loss": 2.2878, "step": 127000 }, { "epoch": 0.47, "learning_rate": 6.336504609283325e-05, "loss": 2.2848, "step": 127500 }, { "epoch": 0.47, "learning_rate": 6.305308148634585e-05, "loss": 2.2813, "step": 128000 }, { "epoch": 0.48, "learning_rate": 6.274057061427182e-05, "loss": 2.2804, "step": 128500 }, { "epoch": 0.48, "learning_rate": 6.242752655505053e-05, "loss": 2.2888, "step": 129000 }, { "epoch": 0.48, "learning_rate": 6.211396240943499e-05, "loss": 2.2884, "step": 129500 }, { "epoch": 0.48, "learning_rate": 6.17998912999436e-05, "loss": 2.28, "step": 130000 }, { "epoch": 0.48, "learning_rate": 6.148532637031098e-05, "loss": 2.2855, "step": 130500 }, { "epoch": 0.49, "learning_rate": 6.117028078493787e-05, "loss": 2.3124, "step": 131000 }, { "epoch": 0.49, "learning_rate": 6.085476772834029e-05, "loss": 2.2886, "step": 131500 }, { "epoch": 0.49, "learning_rate": 6.053880040459764e-05, "loss": 2.3043, "step": 132000 }, { "epoch": 0.49, "learning_rate": 6.022239203680027e-05, "loss": 2.3534, "step": 132500 }, { "epoch": 0.49, "learning_rate": 5.990555586649599e-05, "loss": 2.569, "step": 133000 }, { "epoch": 0.49, "learning_rate": 5.958830515313596e-05, "loss": 2.341, "step": 133500 }, { "epoch": 0.5, "learning_rate": 5.927065317351976e-05, "loss": 2.2866, "step": 134000 }, { "epoch": 0.5, "learning_rate": 5.8952613221239826e-05, "loss": 2.2786, "step": 134500 }, { "epoch": 0.5, "learning_rate": 5.863419860612506e-05, "loss": 2.2707, "step": 135000 }, { "epoch": 0.5, "learning_rate": 5.831542265368378e-05, "loss": 2.277, "step": 135500 }, { "epoch": 0.5, "learning_rate": 5.799629870454619e-05, "loss": 2.2702, "step": 136000 }, { "epoch": 0.51, "learning_rate": 5.7676840113905974e-05, "loss": 2.2673, "step": 136500 }, { "epoch": 0.51, "learning_rate": 5.735706025096136e-05, "loss": 2.2517, "step": 137000 }, { "epoch": 0.51, "learning_rate": 5.7036972498355744e-05, "loss": 2.2768, "step": 137500 }, { "epoch": 0.51, "learning_rate": 5.671659025161755e-05, "loss": 2.2569, "step": 138000 }, { "epoch": 0.51, "learning_rate": 5.6395926918599606e-05, "loss": 2.2527, "step": 138500 }, { "epoch": 0.52, "learning_rate": 5.607499591891816e-05, "loss": 2.2662, "step": 139000 }, { "epoch": 0.52, "learning_rate": 5.5753810683391104e-05, "loss": 2.2446, "step": 139500 }, { "epoch": 0.52, "learning_rate": 5.5432384653476e-05, "loss": 2.2506, "step": 140000 }, { "epoch": 0.52, "learning_rate": 5.5110731280707605e-05, "loss": 2.2495, "step": 140500 }, { "epoch": 0.52, "learning_rate": 5.4788864026134824e-05, "loss": 2.2513, "step": 141000 }, { "epoch": 0.52, "learning_rate": 5.446679635975741e-05, "loss": 2.2512, "step": 141500 }, { "epoch": 0.53, "learning_rate": 5.41445417599623e-05, "loss": 2.2497, "step": 142000 }, { "epoch": 0.53, "learning_rate": 5.3822113712959466e-05, "loss": 2.2491, "step": 142500 }, { "epoch": 0.53, "learning_rate": 5.349952571221761e-05, "loss": 2.2425, "step": 143000 }, { "epoch": 0.53, "learning_rate": 5.3176791257899405e-05, "loss": 2.2491, "step": 143500 }, { "epoch": 0.53, "learning_rate": 5.285392385629653e-05, "loss": 2.2542, "step": 144000 }, { "epoch": 0.54, "learning_rate": 5.253093701926446e-05, "loss": 2.2498, "step": 144500 }, { "epoch": 0.54, "learning_rate": 5.2207844263657e-05, "loss": 2.2456, "step": 145000 }, { "epoch": 0.54, "learning_rate": 5.188465911076059e-05, "loss": 2.235, "step": 145500 }, { "epoch": 0.54, "learning_rate": 5.156139508572844e-05, "loss": 2.2463, "step": 146000 }, { "epoch": 0.54, "learning_rate": 5.1238065717014526e-05, "loss": 2.232, "step": 146500 }, { "epoch": 0.54, "learning_rate": 5.091468453580748e-05, "loss": 2.227, "step": 147000 }, { "epoch": 0.55, "learning_rate": 5.0591265075464167e-05, "loss": 2.2354, "step": 147500 }, { "epoch": 0.55, "learning_rate": 5.026782087094353e-05, "loss": 2.2266, "step": 148000 }, { "epoch": 0.55, "learning_rate": 4.9944365458239946e-05, "loss": 2.2319, "step": 148500 }, { "epoch": 0.55, "learning_rate": 4.9620912373816894e-05, "loss": 2.2383, "step": 149000 }, { "epoch": 0.55, "learning_rate": 4.929747515404043e-05, "loss": 2.2315, "step": 149500 }, { "epoch": 0.56, "learning_rate": 4.897406733461264e-05, "loss": 2.2286, "step": 150000 }, { "epoch": 0.56, "eval_loss": 2.0796430110931396, "eval_runtime": 286.5966, "eval_samples_per_second": 348.923, "eval_steps_per_second": 43.615, "step": 150000 }, { "epoch": 0.56, "learning_rate": 4.8650702450005264e-05, "loss": 2.2269, "step": 150500 }, { "epoch": 0.56, "learning_rate": 4.832739403289318e-05, "loss": 2.2294, "step": 151000 }, { "epoch": 0.56, "learning_rate": 4.8004155613588214e-05, "loss": 2.216, "step": 151500 }, { "epoch": 0.56, "learning_rate": 4.7681000719472726e-05, "loss": 2.2128, "step": 152000 }, { "epoch": 0.57, "learning_rate": 4.7357942874433634e-05, "loss": 2.2088, "step": 152500 }, { "epoch": 0.57, "learning_rate": 4.703499559829639e-05, "loss": 2.2112, "step": 153000 }, { "epoch": 0.57, "learning_rate": 4.671217240625916e-05, "loss": 2.2168, "step": 153500 }, { "epoch": 0.57, "learning_rate": 4.6389486808327304e-05, "loss": 2.2034, "step": 154000 }, { "epoch": 0.57, "learning_rate": 4.606695230874788e-05, "loss": 2.2045, "step": 154500 }, { "epoch": 0.57, "learning_rate": 4.5744582405444544e-05, "loss": 2.2043, "step": 155000 }, { "epoch": 0.58, "learning_rate": 4.542239058945272e-05, "loss": 2.2116, "step": 155500 }, { "epoch": 0.58, "learning_rate": 4.51003903443549e-05, "loss": 2.2046, "step": 156000 }, { "epoch": 0.58, "learning_rate": 4.4778595145716465e-05, "loss": 2.1967, "step": 156500 }, { "epoch": 0.58, "learning_rate": 4.4457018460521684e-05, "loss": 2.2046, "step": 157000 }, { "epoch": 0.58, "learning_rate": 4.4135673746610115e-05, "loss": 2.1952, "step": 157500 }, { "epoch": 0.59, "learning_rate": 4.381457445211346e-05, "loss": 2.2071, "step": 158000 }, { "epoch": 0.59, "learning_rate": 4.349373401489269e-05, "loss": 2.1993, "step": 158500 }, { "epoch": 0.59, "learning_rate": 4.317316586197571e-05, "loss": 2.1998, "step": 159000 }, { "epoch": 0.59, "learning_rate": 4.2852883408995515e-05, "loss": 2.1938, "step": 159500 }, { "epoch": 0.59, "learning_rate": 4.253290005962863e-05, "loss": 2.1901, "step": 160000 }, { "epoch": 0.59, "learning_rate": 4.221322920503423e-05, "loss": 2.1933, "step": 160500 }, { "epoch": 0.6, "learning_rate": 4.1893884223293746e-05, "loss": 2.1884, "step": 161000 }, { "epoch": 0.6, "learning_rate": 4.157487847885094e-05, "loss": 2.1855, "step": 161500 }, { "epoch": 0.6, "learning_rate": 4.1256225321952705e-05, "loss": 2.1968, "step": 162000 }, { "epoch": 0.6, "learning_rate": 4.093793808809028e-05, "loss": 2.186, "step": 162500 }, { "epoch": 0.6, "learning_rate": 4.062003009744115e-05, "loss": 2.1905, "step": 163000 }, { "epoch": 0.61, "learning_rate": 4.0302514654311675e-05, "loss": 2.1828, "step": 163500 }, { "epoch": 0.61, "learning_rate": 3.998540504658027e-05, "loss": 2.1878, "step": 164000 }, { "epoch": 0.61, "learning_rate": 3.966871454514137e-05, "loss": 2.1918, "step": 164500 }, { "epoch": 0.61, "learning_rate": 3.935245640334991e-05, "loss": 2.182, "step": 165000 }, { "epoch": 0.61, "learning_rate": 3.903664385646685e-05, "loss": 2.1872, "step": 165500 }, { "epoch": 0.62, "learning_rate": 3.872129012110515e-05, "loss": 2.18, "step": 166000 }, { "epoch": 0.62, "learning_rate": 3.8406408394676724e-05, "loss": 2.1794, "step": 166500 }, { "epoch": 0.62, "learning_rate": 3.8092011854840135e-05, "loss": 2.1778, "step": 167000 }, { "epoch": 0.62, "learning_rate": 3.7778113658949145e-05, "loss": 2.1764, "step": 167500 }, { "epoch": 0.62, "learning_rate": 3.7464726943501955e-05, "loss": 2.1727, "step": 168000 }, { "epoch": 0.62, "learning_rate": 3.715186482359162e-05, "loss": 2.1718, "step": 168500 }, { "epoch": 0.63, "learning_rate": 3.683954039235707e-05, "loss": 2.1766, "step": 169000 }, { "epoch": 0.63, "learning_rate": 3.6527766720435186e-05, "loss": 2.1838, "step": 169500 }, { "epoch": 0.63, "learning_rate": 3.6216556855413906e-05, "loss": 2.1786, "step": 170000 }, { "epoch": 0.63, "learning_rate": 3.5905923821286006e-05, "loss": 2.1748, "step": 170500 }, { "epoch": 0.63, "learning_rate": 3.559588061790419e-05, "loss": 2.1708, "step": 171000 }, { "epoch": 0.64, "learning_rate": 3.528644022043701e-05, "loss": 2.1749, "step": 171500 }, { "epoch": 0.64, "learning_rate": 3.497761557882584e-05, "loss": 2.1668, "step": 172000 }, { "epoch": 0.64, "learning_rate": 3.466941961724301e-05, "loss": 2.1703, "step": 172500 }, { "epoch": 0.64, "learning_rate": 3.4361865233550814e-05, "loss": 2.1682, "step": 173000 }, { "epoch": 0.64, "learning_rate": 3.40549652987618e-05, "loss": 2.1705, "step": 173500 }, { "epoch": 0.64, "learning_rate": 3.374873265650016e-05, "loss": 2.1671, "step": 174000 }, { "epoch": 0.65, "learning_rate": 3.3443180122464156e-05, "loss": 2.16, "step": 174500 }, { "epoch": 0.65, "learning_rate": 3.3138320483889874e-05, "loss": 2.1536, "step": 175000 }, { "epoch": 0.65, "eval_loss": 2.0123212337493896, "eval_runtime": 294.337, "eval_samples_per_second": 339.747, "eval_steps_per_second": 42.468, "step": 175000 }, { "epoch": 0.65, "learning_rate": 3.283416649901599e-05, "loss": 2.152, "step": 175500 }, { "epoch": 0.65, "learning_rate": 3.253073089654992e-05, "loss": 2.1519, "step": 176000 }, { "epoch": 0.65, "learning_rate": 3.222802637513508e-05, "loss": 2.1638, "step": 176500 }, { "epoch": 0.66, "learning_rate": 3.192606560281948e-05, "loss": 2.1527, "step": 177000 }, { "epoch": 0.66, "learning_rate": 3.162486121652556e-05, "loss": 2.1499, "step": 177500 }, { "epoch": 0.66, "learning_rate": 3.1324425821521375e-05, "loss": 2.1614, "step": 178000 }, { "epoch": 0.66, "learning_rate": 3.1024771990893e-05, "loss": 2.1556, "step": 178500 }, { "epoch": 0.66, "learning_rate": 3.072591226501842e-05, "loss": 2.1514, "step": 179000 }, { "epoch": 0.67, "learning_rate": 3.042785915104267e-05, "loss": 2.1533, "step": 179500 }, { "epoch": 0.67, "learning_rate": 3.013062512235445e-05, "loss": 2.1413, "step": 180000 }, { "epoch": 0.67, "learning_rate": 2.9834222618064146e-05, "loss": 2.1461, "step": 180500 }, { "epoch": 0.67, "learning_rate": 2.9538664042483145e-05, "loss": 2.1475, "step": 181000 }, { "epoch": 0.67, "learning_rate": 2.9243961764604878e-05, "loss": 2.1452, "step": 181500 }, { "epoch": 0.67, "learning_rate": 2.895012811758705e-05, "loss": 2.1488, "step": 182000 }, { "epoch": 0.68, "learning_rate": 2.8657175398235548e-05, "loss": 2.1496, "step": 182500 }, { "epoch": 0.68, "learning_rate": 2.8365115866489895e-05, "loss": 2.1348, "step": 183000 }, { "epoch": 0.68, "learning_rate": 2.8073961744910036e-05, "loss": 2.1406, "step": 183500 }, { "epoch": 0.68, "learning_rate": 2.7783725218164992e-05, "loss": 2.1484, "step": 184000 }, { "epoch": 0.68, "learning_rate": 2.7494418432522773e-05, "loss": 2.1482, "step": 184500 }, { "epoch": 0.69, "learning_rate": 2.7206053495342176e-05, "loss": 2.1491, "step": 185000 }, { "epoch": 0.69, "learning_rate": 2.691864247456609e-05, "loss": 2.1316, "step": 185500 }, { "epoch": 0.69, "learning_rate": 2.6632197398216403e-05, "loss": 2.1459, "step": 186000 }, { "epoch": 0.69, "learning_rate": 2.6346730253890626e-05, "loss": 2.1308, "step": 186500 }, { "epoch": 0.69, "learning_rate": 2.6062252988260348e-05, "loss": 2.1389, "step": 187000 }, { "epoch": 0.69, "learning_rate": 2.5778777506571112e-05, "loss": 2.1374, "step": 187500 }, { "epoch": 0.7, "learning_rate": 2.5496315672144238e-05, "loss": 2.1322, "step": 188000 }, { "epoch": 0.7, "learning_rate": 2.521487930588044e-05, "loss": 2.1235, "step": 188500 }, { "epoch": 0.7, "learning_rate": 2.4934480185764976e-05, "loss": 2.1291, "step": 189000 }, { "epoch": 0.7, "learning_rate": 2.465513004637487e-05, "loss": 2.1366, "step": 189500 }, { "epoch": 0.7, "learning_rate": 2.4376840578387754e-05, "loss": 2.1256, "step": 190000 }, { "epoch": 0.71, "learning_rate": 2.409962342809261e-05, "loss": 2.1304, "step": 190500 }, { "epoch": 0.71, "learning_rate": 2.382349019690248e-05, "loss": 2.1229, "step": 191000 }, { "epoch": 0.71, "learning_rate": 2.3548452440868816e-05, "loss": 2.1268, "step": 191500 }, { "epoch": 0.71, "learning_rate": 2.3274521670197923e-05, "loss": 2.1278, "step": 192000 }, { "epoch": 0.71, "learning_rate": 2.300170934876934e-05, "loss": 2.1277, "step": 192500 }, { "epoch": 0.72, "learning_rate": 2.2730026893655916e-05, "loss": 2.1195, "step": 193000 }, { "epoch": 0.72, "learning_rate": 2.2459485674646187e-05, "loss": 2.1253, "step": 193500 }, { "epoch": 0.72, "learning_rate": 2.2190097013768403e-05, "loss": 2.1252, "step": 194000 }, { "epoch": 0.72, "learning_rate": 2.1921872184816817e-05, "loss": 2.1168, "step": 194500 }, { "epoch": 0.72, "learning_rate": 2.1654822412879798e-05, "loss": 2.1288, "step": 195000 }, { "epoch": 0.72, "learning_rate": 2.138895887387018e-05, "loss": 2.1195, "step": 195500 }, { "epoch": 0.73, "learning_rate": 2.112429269405739e-05, "loss": 2.1196, "step": 196000 }, { "epoch": 0.73, "learning_rate": 2.0860834949602015e-05, "loss": 2.1175, "step": 196500 }, { "epoch": 0.73, "learning_rate": 2.0598596666092075e-05, "loss": 2.1234, "step": 197000 }, { "epoch": 0.73, "learning_rate": 2.0337588818081744e-05, "loss": 2.1122, "step": 197500 }, { "epoch": 0.73, "learning_rate": 2.007782232863199e-05, "loss": 2.1185, "step": 198000 }, { "epoch": 0.74, "learning_rate": 1.9819308068853526e-05, "loss": 2.1201, "step": 198500 }, { "epoch": 0.74, "learning_rate": 1.956205685745183e-05, "loss": 2.1079, "step": 199000 }, { "epoch": 0.74, "learning_rate": 1.9306079460274302e-05, "loss": 2.1137, "step": 199500 }, { "epoch": 0.74, "learning_rate": 1.9051386589859843e-05, "loss": 2.114, "step": 200000 }, { "epoch": 0.74, "eval_loss": 1.9818874597549438, "eval_runtime": 287.1851, "eval_samples_per_second": 348.208, "eval_steps_per_second": 43.526, "step": 200000 }, { "epoch": 0.74, "learning_rate": 1.879798890499046e-05, "loss": 2.1094, "step": 200500 }, { "epoch": 0.75, "learning_rate": 1.8545897010245273e-05, "loss": 2.1185, "step": 201000 }, { "epoch": 0.75, "learning_rate": 1.8295121455556607e-05, "loss": 2.1187, "step": 201500 }, { "epoch": 0.75, "learning_rate": 1.8045672735768616e-05, "loss": 2.1096, "step": 202000 }, { "epoch": 0.75, "learning_rate": 1.7797561290197957e-05, "loss": 2.1058, "step": 202500 }, { "epoch": 0.75, "learning_rate": 1.755079750219699e-05, "loss": 2.1065, "step": 203000 }, { "epoch": 0.75, "learning_rate": 1.7305391698719187e-05, "loss": 2.104, "step": 203500 }, { "epoch": 0.76, "learning_rate": 1.706135414988701e-05, "loss": 2.1047, "step": 204000 }, { "epoch": 0.76, "learning_rate": 1.6818695068562084e-05, "loss": 2.1033, "step": 204500 }, { "epoch": 0.76, "learning_rate": 1.6577424609917756e-05, "loss": 2.1106, "step": 205000 }, { "epoch": 0.76, "learning_rate": 1.633755287101416e-05, "loss": 2.1068, "step": 205500 }, { "epoch": 0.76, "learning_rate": 1.6099089890375623e-05, "loss": 2.1013, "step": 206000 }, { "epoch": 0.77, "learning_rate": 1.5862045647570574e-05, "loss": 2.1019, "step": 206500 }, { "epoch": 0.77, "learning_rate": 1.562643006279392e-05, "loss": 2.0999, "step": 207000 }, { "epoch": 0.77, "learning_rate": 1.5392252996451884e-05, "loss": 2.1059, "step": 207500 }, { "epoch": 0.77, "learning_rate": 1.5159524248749296e-05, "loss": 2.1054, "step": 208000 }, { "epoch": 0.77, "learning_rate": 1.4928253559279532e-05, "loss": 2.1043, "step": 208500 }, { "epoch": 0.77, "learning_rate": 1.469845060661686e-05, "loss": 2.105, "step": 209000 }, { "epoch": 0.78, "learning_rate": 1.4470125007911478e-05, "loss": 2.1091, "step": 209500 }, { "epoch": 0.78, "learning_rate": 1.4243286318486915e-05, "loss": 2.101, "step": 210000 }, { "epoch": 0.78, "learning_rate": 1.4017944031440283e-05, "loss": 2.1054, "step": 210500 }, { "epoch": 0.78, "learning_rate": 1.3794107577244886e-05, "loss": 2.0959, "step": 211000 }, { "epoch": 0.78, "learning_rate": 1.3571786323355596e-05, "loss": 2.0913, "step": 211500 }, { "epoch": 0.79, "learning_rate": 1.335098957381687e-05, "loss": 2.0924, "step": 212000 }, { "epoch": 0.79, "learning_rate": 1.3131726568873315e-05, "loss": 2.0939, "step": 212500 }, { "epoch": 0.79, "learning_rate": 1.2914006484583013e-05, "loss": 2.0917, "step": 213000 }, { "epoch": 0.79, "learning_rate": 1.2697838432433545e-05, "loss": 2.0899, "step": 213500 }, { "epoch": 0.79, "learning_rate": 1.2483231458960599e-05, "loss": 2.0961, "step": 214000 }, { "epoch": 0.8, "learning_rate": 1.2270194545369473e-05, "loss": 2.0955, "step": 214500 }, { "epoch": 0.8, "learning_rate": 1.2058736607159133e-05, "loss": 2.0848, "step": 215000 }, { "epoch": 0.8, "learning_rate": 1.1848866493749111e-05, "loss": 2.0841, "step": 215500 }, { "epoch": 0.8, "learning_rate": 1.1640592988109261e-05, "loss": 2.088, "step": 216000 }, { "epoch": 0.8, "learning_rate": 1.1433924806392054e-05, "loss": 2.092, "step": 216500 }, { "epoch": 0.8, "learning_rate": 1.1228870597567887e-05, "loss": 2.0807, "step": 217000 }, { "epoch": 0.81, "learning_rate": 1.102543894306316e-05, "loss": 2.0881, "step": 217500 }, { "epoch": 0.81, "learning_rate": 1.0823638356401056e-05, "loss": 2.084, "step": 218000 }, { "epoch": 0.81, "learning_rate": 1.0623477282845312e-05, "loss": 2.0816, "step": 218500 }, { "epoch": 0.81, "learning_rate": 1.0424964099046813e-05, "loss": 2.0868, "step": 219000 }, { "epoch": 0.81, "learning_rate": 1.0228107112692953e-05, "loss": 2.0883, "step": 219500 }, { "epoch": 0.82, "learning_rate": 1.0032914562160051e-05, "loss": 2.0836, "step": 220000 }, { "epoch": 0.82, "learning_rate": 9.839394616168506e-06, "loss": 2.0877, "step": 220500 }, { "epoch": 0.82, "learning_rate": 9.647555373440976e-06, "loss": 2.082, "step": 221000 }, { "epoch": 0.82, "learning_rate": 9.457404862363428e-06, "loss": 2.0798, "step": 221500 }, { "epoch": 0.82, "learning_rate": 9.26895104064921e-06, "loss": 2.0855, "step": 222000 }, { "epoch": 0.82, "learning_rate": 9.082201795005968e-06, "loss": 2.0892, "step": 222500 }, { "epoch": 0.83, "learning_rate": 8.897164940805591e-06, "loss": 2.0816, "step": 223000 }, { "epoch": 0.83, "learning_rate": 8.71384822175716e-06, "loss": 2.0886, "step": 223500 }, { "epoch": 0.83, "learning_rate": 8.532259309582886e-06, "loss": 2.0848, "step": 224000 }, { "epoch": 0.83, "learning_rate": 8.35240580369701e-06, "loss": 2.0806, "step": 224500 }, { "epoch": 0.83, "learning_rate": 8.174295230887846e-06, "loss": 2.0845, "step": 225000 }, { "epoch": 0.83, "eval_loss": 1.9471186399459839, "eval_runtime": 284.8288, "eval_samples_per_second": 351.088, "eval_steps_per_second": 43.886, "step": 225000 }, { "epoch": 0.84, "learning_rate": 7.997935045002724e-06, "loss": 2.0879, "step": 225500 }, { "epoch": 0.84, "learning_rate": 7.823332626636065e-06, "loss": 2.0804, "step": 226000 }, { "epoch": 0.84, "learning_rate": 7.650495282820502e-06, "loss": 2.0835, "step": 226500 }, { "epoch": 0.84, "learning_rate": 7.479430246721092e-06, "loss": 2.0834, "step": 227000 }, { "epoch": 0.84, "learning_rate": 7.310144677332631e-06, "loss": 2.0755, "step": 227500 }, { "epoch": 0.85, "learning_rate": 7.142645659180036e-06, "loss": 2.0744, "step": 228000 }, { "epoch": 0.85, "learning_rate": 6.9769402020218314e-06, "loss": 2.073, "step": 228500 }, { "epoch": 0.85, "learning_rate": 6.813035240556842e-06, "loss": 2.075, "step": 229000 }, { "epoch": 0.85, "learning_rate": 6.650937634133952e-06, "loss": 2.0767, "step": 229500 }, { "epoch": 0.85, "learning_rate": 6.4906541664650325e-06, "loss": 2.0683, "step": 230000 }, { "epoch": 0.85, "learning_rate": 6.332191545341093e-06, "loss": 2.0783, "step": 230500 }, { "epoch": 0.86, "learning_rate": 6.175556402351546e-06, "loss": 2.0665, "step": 231000 }, { "epoch": 0.86, "learning_rate": 6.020755292606617e-06, "loss": 2.0818, "step": 231500 }, { "epoch": 0.86, "learning_rate": 5.867794694463102e-06, "loss": 2.0693, "step": 232000 }, { "epoch": 0.86, "learning_rate": 5.716681009253189e-06, "loss": 2.0705, "step": 232500 }, { "epoch": 0.86, "learning_rate": 5.567420561016629e-06, "loss": 2.0818, "step": 233000 }, { "epoch": 0.87, "learning_rate": 5.420019596236003e-06, "loss": 2.0723, "step": 233500 }, { "epoch": 0.87, "learning_rate": 5.274484283575371e-06, "loss": 2.0833, "step": 234000 }, { "epoch": 0.87, "learning_rate": 5.1308207136220755e-06, "loss": 2.0764, "step": 234500 }, { "epoch": 0.87, "learning_rate": 4.989034898631872e-06, "loss": 2.0724, "step": 235000 }, { "epoch": 0.87, "learning_rate": 4.84913277227731e-06, "loss": 2.0718, "step": 235500 }, { "epoch": 0.87, "learning_rate": 4.711120189399443e-06, "loss": 2.0752, "step": 236000 }, { "epoch": 0.88, "learning_rate": 4.575002925762739e-06, "loss": 2.0718, "step": 236500 }, { "epoch": 0.88, "learning_rate": 4.440786677813458e-06, "loss": 2.0747, "step": 237000 }, { "epoch": 0.88, "learning_rate": 4.308477062441168e-06, "loss": 2.0714, "step": 237500 }, { "epoch": 0.88, "learning_rate": 4.17807961674373e-06, "loss": 2.0652, "step": 238000 }, { "epoch": 0.88, "learning_rate": 4.049599797795589e-06, "loss": 2.0726, "step": 238500 }, { "epoch": 0.89, "learning_rate": 3.923042982419334e-06, "loss": 2.0692, "step": 239000 }, { "epoch": 0.89, "learning_rate": 3.798414466960759e-06, "loss": 2.076, "step": 239500 }, { "epoch": 0.89, "learning_rate": 3.6757194670671513e-06, "loss": 2.0695, "step": 240000 }, { "epoch": 0.89, "learning_rate": 3.55496311746904e-06, "loss": 2.066, "step": 240500 }, { "epoch": 0.89, "learning_rate": 3.436150471765326e-06, "loss": 2.0671, "step": 241000 }, { "epoch": 0.9, "learning_rate": 3.319286502211766e-06, "loss": 2.0682, "step": 241500 }, { "epoch": 0.9, "learning_rate": 3.2043760995128903e-06, "loss": 2.0716, "step": 242000 }, { "epoch": 0.9, "learning_rate": 3.091424072617366e-06, "loss": 2.0684, "step": 242500 }, { "epoch": 0.9, "learning_rate": 2.9804351485166747e-06, "loss": 2.0631, "step": 243000 }, { "epoch": 0.9, "learning_rate": 2.8714139720473597e-06, "loss": 2.0686, "step": 243500 }, { "epoch": 0.9, "learning_rate": 2.7643651056965924e-06, "loss": 2.0651, "step": 244000 }, { "epoch": 0.91, "learning_rate": 2.659293029411264e-06, "loss": 2.0695, "step": 244500 }, { "epoch": 0.91, "learning_rate": 2.556202140410474e-06, "loss": 2.0704, "step": 245000 }, { "epoch": 0.91, "learning_rate": 2.4550967530015623e-06, "loss": 2.0668, "step": 245500 }, { "epoch": 0.91, "learning_rate": 2.3559810983994732e-06, "loss": 2.0656, "step": 246000 }, { "epoch": 0.91, "learning_rate": 2.258859324549778e-06, "loss": 2.0691, "step": 246500 }, { "epoch": 0.92, "learning_rate": 2.1637354959549884e-06, "loss": 2.0589, "step": 247000 }, { "epoch": 0.92, "learning_rate": 2.0706135935045333e-06, "loss": 2.067, "step": 247500 }, { "epoch": 0.92, "learning_rate": 1.9794975143081264e-06, "loss": 2.0654, "step": 248000 }, { "epoch": 0.92, "learning_rate": 1.8903910715326823e-06, "loss": 2.0635, "step": 248500 }, { "epoch": 0.92, "learning_rate": 1.8032979942427475e-06, "loss": 2.0673, "step": 249000 }, { "epoch": 0.92, "learning_rate": 1.7182219272444011e-06, "loss": 2.0662, "step": 249500 }, { "epoch": 0.93, "learning_rate": 1.635166430932772e-06, "loss": 2.0697, "step": 250000 }, { "epoch": 0.93, "eval_loss": 1.9344754219055176, "eval_runtime": 284.7729, "eval_samples_per_second": 351.157, "eval_steps_per_second": 43.895, "step": 250000 }, { "epoch": 0.93, "learning_rate": 1.5541349811430016e-06, "loss": 2.061, "step": 250500 }, { "epoch": 0.93, "learning_rate": 1.475130969004812e-06, "loss": 2.0696, "step": 251000 }, { "epoch": 0.93, "learning_rate": 1.3981577008005563e-06, "loss": 2.0691, "step": 251500 }, { "epoch": 0.93, "learning_rate": 1.3232183978268698e-06, "loss": 2.0622, "step": 252000 }, { "epoch": 0.94, "learning_rate": 1.2503161962598653e-06, "loss": 2.0714, "step": 252500 }, { "epoch": 0.94, "learning_rate": 1.1794541470238729e-06, "loss": 2.0664, "step": 253000 }, { "epoch": 0.94, "learning_rate": 1.1106352156637633e-06, "loss": 2.0606, "step": 253500 }, { "epoch": 0.94, "learning_rate": 1.0438622822208478e-06, "loss": 2.0645, "step": 254000 }, { "epoch": 0.94, "learning_rate": 9.791381411123513e-07, "loss": 2.0724, "step": 254500 }, { "epoch": 0.95, "learning_rate": 9.164655010144518e-07, "loss": 2.0629, "step": 255000 }, { "epoch": 0.95, "learning_rate": 8.558469847489314e-07, "loss": 2.0688, "step": 255500 }, { "epoch": 0.95, "learning_rate": 7.972851291734252e-07, "loss": 2.0566, "step": 256000 }, { "epoch": 0.95, "learning_rate": 7.407823850752338e-07, "loss": 2.0689, "step": 256500 }, { "epoch": 0.95, "learning_rate": 6.863411170687673e-07, "loss": 2.071, "step": 257000 }, { "epoch": 0.95, "learning_rate": 6.339636034966123e-07, "loss": 2.0665, "step": 257500 }, { "epoch": 0.96, "learning_rate": 5.836520363341258e-07, "loss": 2.0606, "step": 258000 }, { "epoch": 0.96, "learning_rate": 5.354085210977633e-07, "loss": 2.0646, "step": 258500 }, { "epoch": 0.96, "learning_rate": 4.892350767569276e-07, "loss": 2.0663, "step": 259000 }, { "epoch": 0.96, "learning_rate": 4.451336356494862e-07, "loss": 2.0683, "step": 259500 }, { "epoch": 0.96, "learning_rate": 4.031060434009026e-07, "loss": 2.0595, "step": 260000 }, { "epoch": 0.97, "learning_rate": 3.631540588470039e-07, "loss": 2.0691, "step": 260500 }, { "epoch": 0.97, "learning_rate": 3.252793539603671e-07, "loss": 2.0651, "step": 261000 }, { "epoch": 0.97, "learning_rate": 2.8948351378034753e-07, "loss": 2.0611, "step": 261500 }, { "epoch": 0.97, "learning_rate": 2.557680363467485e-07, "loss": 2.0733, "step": 262000 }, { "epoch": 0.97, "learning_rate": 2.2413433263713257e-07, "loss": 2.0591, "step": 262500 }, { "epoch": 0.97, "learning_rate": 1.9458372650776325e-07, "loss": 2.0626, "step": 263000 }, { "epoch": 0.98, "learning_rate": 1.6711745463821593e-07, "loss": 2.0669, "step": 263500 }, { "epoch": 0.98, "learning_rate": 1.4173666647959715e-07, "loss": 2.0701, "step": 264000 }, { "epoch": 0.98, "learning_rate": 1.1844242420647745e-07, "loss": 2.0679, "step": 264500 }, { "epoch": 0.98, "learning_rate": 9.723570267241577e-08, "loss": 2.0679, "step": 265000 }, { "epoch": 0.98, "learning_rate": 7.811738936916447e-08, "loss": 2.0643, "step": 265500 }, { "epoch": 0.99, "learning_rate": 6.108828438952663e-08, "loss": 2.0615, "step": 266000 }, { "epoch": 0.99, "learning_rate": 4.614910039389409e-08, "loss": 2.0511, "step": 266500 }, { "epoch": 0.99, "learning_rate": 3.3300462580387884e-08, "loss": 2.0594, "step": 267000 }, { "epoch": 0.99, "learning_rate": 2.254290865871811e-08, "loss": 2.0665, "step": 267500 }, { "epoch": 0.99, "learning_rate": 1.3876888827679679e-08, "loss": 2.0572, "step": 268000 }, { "epoch": 1.0, "learning_rate": 7.302765756300733e-09, "loss": 2.0682, "step": 268500 }, { "epoch": 1.0, "learning_rate": 2.820814568671448e-09, "loss": 2.0611, "step": 269000 }, { "epoch": 1.0, "learning_rate": 4.31222832436573e-10, "loss": 2.0658, "step": 269500 }, { "epoch": 1.0, "step": 269794, "total_flos": 7.264837577610363e+18, "train_loss": 1.8196985500431868, "train_runtime": 434920.5349, "train_samples_per_second": 148.879, "train_steps_per_second": 0.62 } ], "max_steps": 269794, "num_train_epochs": 1, "total_flos": 7.264837577610363e+18, "trial_name": null, "trial_params": null }