{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.613999632555576, "global_step": 72000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 9.999999632555577e-05, "loss": 1.5591, "step": 200 }, { "epoch": 0.04, "learning_rate": 9.999999265111153e-05, "loss": 1.3368, "step": 400 }, { "epoch": 0.06, "learning_rate": 9.999998897666729e-05, "loss": 1.2706, "step": 600 }, { "epoch": 0.07, "learning_rate": 9.999998530222305e-05, "loss": 1.2055, "step": 800 }, { "epoch": 0.09, "learning_rate": 9.999998162777881e-05, "loss": 1.1738, "step": 1000 }, { "epoch": 0.11, "learning_rate": 9.999997795333457e-05, "loss": 1.1357, "step": 1200 }, { "epoch": 0.13, "learning_rate": 9.999997427889031e-05, "loss": 1.1226, "step": 1400 }, { "epoch": 0.15, "learning_rate": 9.999997060444607e-05, "loss": 1.0901, "step": 1600 }, { "epoch": 0.17, "learning_rate": 9.999996693000185e-05, "loss": 1.0793, "step": 1800 }, { "epoch": 0.18, "learning_rate": 9.999996325555761e-05, "loss": 1.0587, "step": 2000 }, { "epoch": 0.2, "learning_rate": 9.999995958111336e-05, "loss": 1.0522, "step": 2200 }, { "epoch": 0.22, "learning_rate": 9.999995590666912e-05, "loss": 1.0277, "step": 2400 }, { "epoch": 0.24, "learning_rate": 9.999995223222488e-05, "loss": 1.0177, "step": 2600 }, { "epoch": 0.26, "learning_rate": 9.999994855778064e-05, "loss": 1.0164, "step": 2800 }, { "epoch": 0.28, "learning_rate": 9.999994488333641e-05, "loss": 0.9814, "step": 3000 }, { "epoch": 0.29, "learning_rate": 9.999994120889216e-05, "loss": 0.9802, "step": 3200 }, { "epoch": 0.31, "learning_rate": 9.999993753444792e-05, "loss": 0.9808, "step": 3400 }, { "epoch": 0.33, "learning_rate": 9.999993386000368e-05, "loss": 0.9686, "step": 3600 }, { "epoch": 0.35, "learning_rate": 9.999993018555944e-05, "loss": 0.9531, "step": 3800 }, { "epoch": 0.37, "learning_rate": 9.99999265111152e-05, "loss": 0.9332, "step": 4000 }, { "epoch": 0.39, "learning_rate": 9.999992283667096e-05, "loss": 0.9486, "step": 4200 }, { "epoch": 0.4, "learning_rate": 9.999991916222672e-05, "loss": 0.9145, "step": 4400 }, { "epoch": 0.42, "learning_rate": 9.999991548778248e-05, "loss": 0.9164, "step": 4600 }, { "epoch": 0.44, "learning_rate": 9.999991181333824e-05, "loss": 0.8984, "step": 4800 }, { "epoch": 0.46, "learning_rate": 9.999990813889399e-05, "loss": 0.9016, "step": 5000 }, { "epoch": 0.48, "learning_rate": 9.999990446444975e-05, "loss": 0.9019, "step": 5200 }, { "epoch": 0.5, "learning_rate": 9.999990079000552e-05, "loss": 0.8835, "step": 5400 }, { "epoch": 0.51, "learning_rate": 9.999989711556128e-05, "loss": 0.8829, "step": 5600 }, { "epoch": 0.53, "learning_rate": 9.999989344111703e-05, "loss": 0.8872, "step": 5800 }, { "epoch": 0.55, "learning_rate": 9.999988976667279e-05, "loss": 0.8895, "step": 6000 }, { "epoch": 0.57, "learning_rate": 9.999988609222855e-05, "loss": 0.8533, "step": 6200 }, { "epoch": 0.59, "learning_rate": 9.999988241778431e-05, "loss": 0.8515, "step": 6400 }, { "epoch": 0.61, "learning_rate": 9.999987874334007e-05, "loss": 0.8682, "step": 6600 }, { "epoch": 0.62, "learning_rate": 9.999987506889583e-05, "loss": 0.8345, "step": 6800 }, { "epoch": 0.64, "learning_rate": 9.99998713944516e-05, "loss": 0.8614, "step": 7000 }, { "epoch": 0.66, "learning_rate": 9.999986772000735e-05, "loss": 0.8481, "step": 7200 }, { "epoch": 0.68, "learning_rate": 9.999986404556311e-05, "loss": 0.8479, "step": 7400 }, { "epoch": 0.7, "learning_rate": 9.999986037111887e-05, "loss": 0.8467, "step": 7600 }, { "epoch": 0.72, "learning_rate": 9.999985669667464e-05, "loss": 0.8441, "step": 7800 }, { "epoch": 0.73, "learning_rate": 9.99998530222304e-05, "loss": 0.8087, "step": 8000 }, { "epoch": 0.75, "learning_rate": 9.999984934778616e-05, "loss": 0.8202, "step": 8200 }, { "epoch": 0.77, "learning_rate": 9.99998456733419e-05, "loss": 0.8231, "step": 8400 }, { "epoch": 0.79, "learning_rate": 9.999984199889766e-05, "loss": 0.8188, "step": 8600 }, { "epoch": 0.81, "learning_rate": 9.999983832445344e-05, "loss": 0.8103, "step": 8800 }, { "epoch": 0.83, "learning_rate": 9.99998346500092e-05, "loss": 0.8158, "step": 9000 }, { "epoch": 0.85, "learning_rate": 9.999983097556495e-05, "loss": 0.808, "step": 9200 }, { "epoch": 0.86, "learning_rate": 9.99998273011207e-05, "loss": 0.8146, "step": 9400 }, { "epoch": 0.88, "learning_rate": 9.999982362667647e-05, "loss": 0.797, "step": 9600 }, { "epoch": 0.9, "learning_rate": 9.999981995223223e-05, "loss": 0.7784, "step": 9800 }, { "epoch": 0.92, "learning_rate": 9.9999816277788e-05, "loss": 0.7864, "step": 10000 }, { "epoch": 0.94, "learning_rate": 9.999981260334375e-05, "loss": 0.7987, "step": 10200 }, { "epoch": 0.96, "learning_rate": 9.999980892889951e-05, "loss": 0.7757, "step": 10400 }, { "epoch": 0.97, "learning_rate": 9.999980525445527e-05, "loss": 0.7812, "step": 10600 }, { "epoch": 0.99, "learning_rate": 9.999980158001103e-05, "loss": 0.7801, "step": 10800 }, { "epoch": 1.0, "eval_loss": 0.6798496246337891, "eval_runtime": 1461.6898, "eval_samples_per_second": 119.156, "eval_steps_per_second": 7.448, "step": 10886 }, { "epoch": 1.01, "learning_rate": 9.999979790556679e-05, "loss": 0.7013, "step": 11000 }, { "epoch": 1.03, "learning_rate": 9.999979423112255e-05, "loss": 0.6525, "step": 11200 }, { "epoch": 1.05, "learning_rate": 9.999979055667831e-05, "loss": 0.6677, "step": 11400 }, { "epoch": 1.07, "learning_rate": 9.999978688223407e-05, "loss": 0.6573, "step": 11600 }, { "epoch": 1.08, "learning_rate": 9.999978320778982e-05, "loss": 0.652, "step": 11800 }, { "epoch": 1.1, "learning_rate": 9.999977953334558e-05, "loss": 0.6554, "step": 12000 }, { "epoch": 1.12, "learning_rate": 9.999977585890135e-05, "loss": 0.6593, "step": 12200 }, { "epoch": 1.14, "learning_rate": 9.999977218445711e-05, "loss": 0.6634, "step": 12400 }, { "epoch": 1.16, "learning_rate": 9.999976851001287e-05, "loss": 0.653, "step": 12600 }, { "epoch": 1.18, "learning_rate": 9.999976483556862e-05, "loss": 0.6617, "step": 12800 }, { "epoch": 1.19, "learning_rate": 9.999976116112438e-05, "loss": 0.6567, "step": 13000 }, { "epoch": 1.21, "learning_rate": 9.999975748668014e-05, "loss": 0.6683, "step": 13200 }, { "epoch": 1.23, "learning_rate": 9.999975381223591e-05, "loss": 0.6681, "step": 13400 }, { "epoch": 1.25, "learning_rate": 9.999975013779166e-05, "loss": 0.6501, "step": 13600 }, { "epoch": 1.27, "learning_rate": 9.999974646334742e-05, "loss": 0.664, "step": 13800 }, { "epoch": 1.29, "learning_rate": 9.999974278890318e-05, "loss": 0.6702, "step": 14000 }, { "epoch": 1.3, "learning_rate": 9.999973911445894e-05, "loss": 0.6535, "step": 14200 }, { "epoch": 1.32, "learning_rate": 9.99997354400147e-05, "loss": 0.6614, "step": 14400 }, { "epoch": 1.34, "learning_rate": 9.999973176557046e-05, "loss": 0.6584, "step": 14600 }, { "epoch": 1.36, "learning_rate": 9.999972809112622e-05, "loss": 0.649, "step": 14800 }, { "epoch": 1.38, "learning_rate": 9.999972441668198e-05, "loss": 0.6489, "step": 15000 }, { "epoch": 1.4, "learning_rate": 9.999972074223775e-05, "loss": 0.6568, "step": 15200 }, { "epoch": 1.41, "learning_rate": 9.999971706779349e-05, "loss": 0.645, "step": 15400 }, { "epoch": 1.43, "learning_rate": 9.999971339334925e-05, "loss": 0.6554, "step": 15600 }, { "epoch": 1.45, "learning_rate": 9.999970971890503e-05, "loss": 0.6395, "step": 15800 }, { "epoch": 1.47, "learning_rate": 9.999970604446079e-05, "loss": 0.6497, "step": 16000 }, { "epoch": 1.49, "learning_rate": 9.999970237001653e-05, "loss": 0.6485, "step": 16200 }, { "epoch": 1.51, "learning_rate": 9.99996986955723e-05, "loss": 0.6383, "step": 16400 }, { "epoch": 1.52, "learning_rate": 9.999969502112805e-05, "loss": 0.64, "step": 16600 }, { "epoch": 1.54, "learning_rate": 9.999969134668382e-05, "loss": 0.6318, "step": 16800 }, { "epoch": 1.56, "learning_rate": 9.999968767223959e-05, "loss": 0.6381, "step": 17000 }, { "epoch": 1.58, "learning_rate": 9.999968399779534e-05, "loss": 0.6484, "step": 17200 }, { "epoch": 1.6, "learning_rate": 9.99996803233511e-05, "loss": 0.6491, "step": 17400 }, { "epoch": 1.62, "learning_rate": 9.999967664890686e-05, "loss": 0.6428, "step": 17600 }, { "epoch": 1.64, "learning_rate": 9.999967297446262e-05, "loss": 0.6356, "step": 17800 }, { "epoch": 1.65, "learning_rate": 9.999966930001838e-05, "loss": 0.637, "step": 18000 }, { "epoch": 1.67, "learning_rate": 9.999966562557414e-05, "loss": 0.6364, "step": 18200 }, { "epoch": 1.69, "learning_rate": 9.99996619511299e-05, "loss": 0.6473, "step": 18400 }, { "epoch": 1.71, "learning_rate": 9.999965827668566e-05, "loss": 0.6342, "step": 18600 }, { "epoch": 1.73, "learning_rate": 9.99996546022414e-05, "loss": 0.6395, "step": 18800 }, { "epoch": 1.75, "learning_rate": 9.999965092779717e-05, "loss": 0.6441, "step": 19000 }, { "epoch": 1.76, "learning_rate": 9.999964725335294e-05, "loss": 0.6461, "step": 19200 }, { "epoch": 1.78, "learning_rate": 9.99996435789087e-05, "loss": 0.6157, "step": 19400 }, { "epoch": 1.8, "learning_rate": 9.999963990446446e-05, "loss": 0.6233, "step": 19600 }, { "epoch": 1.82, "learning_rate": 9.999963623002021e-05, "loss": 0.6305, "step": 19800 }, { "epoch": 1.84, "learning_rate": 9.999963255557597e-05, "loss": 0.6277, "step": 20000 }, { "epoch": 1.86, "learning_rate": 9.999962888113173e-05, "loss": 0.6339, "step": 20200 }, { "epoch": 1.87, "learning_rate": 9.99996252066875e-05, "loss": 0.6271, "step": 20400 }, { "epoch": 1.89, "learning_rate": 9.999962153224325e-05, "loss": 0.6145, "step": 20600 }, { "epoch": 1.91, "learning_rate": 9.999961785779901e-05, "loss": 0.6299, "step": 20800 }, { "epoch": 1.93, "learning_rate": 9.999961418335477e-05, "loss": 0.6216, "step": 21000 }, { "epoch": 1.95, "learning_rate": 9.999961050891053e-05, "loss": 0.6167, "step": 21200 }, { "epoch": 1.97, "learning_rate": 9.999960683446629e-05, "loss": 0.6254, "step": 21400 }, { "epoch": 1.98, "learning_rate": 9.999960316002205e-05, "loss": 0.6376, "step": 21600 }, { "epoch": 2.0, "eval_loss": 0.5107570290565491, "eval_runtime": 1433.9446, "eval_samples_per_second": 121.461, "eval_steps_per_second": 7.592, "step": 21772 }, { "epoch": 2.0, "learning_rate": 9.999959948557781e-05, "loss": 0.6037, "step": 21800 }, { "epoch": 2.02, "learning_rate": 9.999959581113357e-05, "loss": 0.4903, "step": 22000 }, { "epoch": 2.04, "learning_rate": 9.999959213668933e-05, "loss": 0.4901, "step": 22200 }, { "epoch": 2.06, "learning_rate": 9.999958846224508e-05, "loss": 0.4826, "step": 22400 }, { "epoch": 2.08, "learning_rate": 9.999958478780085e-05, "loss": 0.4881, "step": 22600 }, { "epoch": 2.09, "learning_rate": 9.999958111335662e-05, "loss": 0.4921, "step": 22800 }, { "epoch": 2.11, "learning_rate": 9.999957743891238e-05, "loss": 0.502, "step": 23000 }, { "epoch": 2.13, "learning_rate": 9.999957376446812e-05, "loss": 0.4976, "step": 23200 }, { "epoch": 2.15, "learning_rate": 9.999957009002388e-05, "loss": 0.491, "step": 23400 }, { "epoch": 2.17, "learning_rate": 9.999956641557964e-05, "loss": 0.5033, "step": 23600 }, { "epoch": 2.19, "learning_rate": 9.99995627411354e-05, "loss": 0.4924, "step": 23800 }, { "epoch": 2.2, "learning_rate": 9.999955906669118e-05, "loss": 0.5026, "step": 24000 }, { "epoch": 2.22, "learning_rate": 9.999955539224692e-05, "loss": 0.4966, "step": 24200 }, { "epoch": 2.24, "learning_rate": 9.999955171780269e-05, "loss": 0.4963, "step": 24400 }, { "epoch": 2.26, "learning_rate": 9.999954804335845e-05, "loss": 0.5072, "step": 24600 }, { "epoch": 2.28, "learning_rate": 9.99995443689142e-05, "loss": 0.4907, "step": 24800 }, { "epoch": 2.3, "learning_rate": 9.999954069446997e-05, "loss": 0.4938, "step": 25000 }, { "epoch": 2.31, "learning_rate": 9.999953702002573e-05, "loss": 0.5035, "step": 25200 }, { "epoch": 2.33, "learning_rate": 9.999953334558149e-05, "loss": 0.5006, "step": 25400 }, { "epoch": 2.35, "learning_rate": 9.999952967113725e-05, "loss": 0.4992, "step": 25600 }, { "epoch": 2.37, "learning_rate": 9.9999525996693e-05, "loss": 0.5109, "step": 25800 }, { "epoch": 2.39, "learning_rate": 9.999952232224876e-05, "loss": 0.4994, "step": 26000 }, { "epoch": 2.41, "learning_rate": 9.999951864780453e-05, "loss": 0.4925, "step": 26200 }, { "epoch": 2.43, "learning_rate": 9.999951497336029e-05, "loss": 0.5073, "step": 26400 }, { "epoch": 2.44, "learning_rate": 9.999951129891605e-05, "loss": 0.5061, "step": 26600 }, { "epoch": 2.46, "learning_rate": 9.99995076244718e-05, "loss": 0.4954, "step": 26800 }, { "epoch": 2.48, "learning_rate": 9.999950395002756e-05, "loss": 0.5123, "step": 27000 }, { "epoch": 2.5, "learning_rate": 9.999950027558332e-05, "loss": 0.5049, "step": 27200 }, { "epoch": 2.52, "learning_rate": 9.999949660113909e-05, "loss": 0.4972, "step": 27400 }, { "epoch": 2.54, "learning_rate": 9.999949292669484e-05, "loss": 0.505, "step": 27600 }, { "epoch": 2.55, "learning_rate": 9.99994892522506e-05, "loss": 0.52, "step": 27800 }, { "epoch": 2.57, "learning_rate": 9.999948557780636e-05, "loss": 0.5077, "step": 28000 }, { "epoch": 2.59, "learning_rate": 9.999948190336212e-05, "loss": 0.5159, "step": 28200 }, { "epoch": 2.61, "learning_rate": 9.999947822891788e-05, "loss": 0.5054, "step": 28400 }, { "epoch": 2.63, "learning_rate": 9.999947455447364e-05, "loss": 0.4999, "step": 28600 }, { "epoch": 2.65, "learning_rate": 9.99994708800294e-05, "loss": 0.5035, "step": 28800 }, { "epoch": 2.66, "learning_rate": 9.999946720558516e-05, "loss": 0.5041, "step": 29000 }, { "epoch": 2.68, "learning_rate": 9.999946353114092e-05, "loss": 0.4998, "step": 29200 }, { "epoch": 2.7, "learning_rate": 9.999945985669667e-05, "loss": 0.5098, "step": 29400 }, { "epoch": 2.72, "learning_rate": 9.999945618225244e-05, "loss": 0.5102, "step": 29600 }, { "epoch": 2.74, "learning_rate": 9.99994525078082e-05, "loss": 0.5031, "step": 29800 }, { "epoch": 2.76, "learning_rate": 9.999944883336396e-05, "loss": 0.5015, "step": 30000 }, { "epoch": 2.77, "learning_rate": 9.999944515891971e-05, "loss": 0.505, "step": 30200 }, { "epoch": 2.79, "learning_rate": 9.999944148447547e-05, "loss": 0.5146, "step": 30400 }, { "epoch": 2.81, "learning_rate": 9.999943781003123e-05, "loss": 0.5101, "step": 30600 }, { "epoch": 2.83, "learning_rate": 9.9999434135587e-05, "loss": 0.5155, "step": 30800 }, { "epoch": 2.85, "learning_rate": 9.999943046114277e-05, "loss": 0.5076, "step": 31000 }, { "epoch": 2.87, "learning_rate": 9.999942678669851e-05, "loss": 0.5157, "step": 31200 }, { "epoch": 2.88, "learning_rate": 9.999942311225427e-05, "loss": 0.5046, "step": 31400 }, { "epoch": 2.9, "learning_rate": 9.999941943781003e-05, "loss": 0.5078, "step": 31600 }, { "epoch": 2.92, "learning_rate": 9.99994157633658e-05, "loss": 0.5096, "step": 31800 }, { "epoch": 2.94, "learning_rate": 9.999941208892156e-05, "loss": 0.5057, "step": 32000 }, { "epoch": 2.96, "learning_rate": 9.999940841447732e-05, "loss": 0.5171, "step": 32200 }, { "epoch": 2.98, "learning_rate": 9.999940474003308e-05, "loss": 0.5212, "step": 32400 }, { "epoch": 2.99, "learning_rate": 9.999940106558884e-05, "loss": 0.5132, "step": 32600 }, { "epoch": 3.0, "eval_loss": 0.39297839999198914, "eval_runtime": 1401.043, "eval_samples_per_second": 124.314, "eval_steps_per_second": 7.77, "step": 32658 }, { "epoch": 3.01, "learning_rate": 9.999939739114458e-05, "loss": 0.4123, "step": 32800 }, { "epoch": 3.03, "learning_rate": 9.999939371670036e-05, "loss": 0.3688, "step": 33000 }, { "epoch": 3.05, "learning_rate": 9.999939004225612e-05, "loss": 0.3752, "step": 33200 }, { "epoch": 3.07, "learning_rate": 9.999938636781188e-05, "loss": 0.3733, "step": 33400 }, { "epoch": 3.09, "learning_rate": 9.999938269336764e-05, "loss": 0.371, "step": 33600 }, { "epoch": 3.1, "learning_rate": 9.999937901892339e-05, "loss": 0.3838, "step": 33800 }, { "epoch": 3.12, "learning_rate": 9.999937534447915e-05, "loss": 0.3865, "step": 34000 }, { "epoch": 3.14, "learning_rate": 9.999937167003491e-05, "loss": 0.3859, "step": 34200 }, { "epoch": 3.16, "learning_rate": 9.999936799559068e-05, "loss": 0.3882, "step": 34400 }, { "epoch": 3.18, "learning_rate": 9.999936432114643e-05, "loss": 0.3942, "step": 34600 }, { "epoch": 3.2, "learning_rate": 9.999936064670219e-05, "loss": 0.3843, "step": 34800 }, { "epoch": 3.22, "learning_rate": 9.999935697225795e-05, "loss": 0.3859, "step": 35000 }, { "epoch": 3.23, "learning_rate": 9.999935329781371e-05, "loss": 0.3947, "step": 35200 }, { "epoch": 3.25, "learning_rate": 9.999934962336947e-05, "loss": 0.3934, "step": 35400 }, { "epoch": 3.27, "learning_rate": 9.999934594892523e-05, "loss": 0.3932, "step": 35600 }, { "epoch": 3.29, "learning_rate": 9.999934227448099e-05, "loss": 0.3977, "step": 35800 }, { "epoch": 3.31, "learning_rate": 9.999933860003675e-05, "loss": 0.4046, "step": 36000 }, { "epoch": 3.33, "learning_rate": 9.999933492559251e-05, "loss": 0.3961, "step": 36200 }, { "epoch": 3.34, "learning_rate": 9.999933125114826e-05, "loss": 0.398, "step": 36400 }, { "epoch": 3.36, "learning_rate": 9.999932757670403e-05, "loss": 0.393, "step": 36600 }, { "epoch": 3.38, "learning_rate": 9.999932390225979e-05, "loss": 0.4043, "step": 36800 }, { "epoch": 3.4, "learning_rate": 9.999932022781555e-05, "loss": 0.4042, "step": 37000 }, { "epoch": 3.42, "learning_rate": 9.99993165533713e-05, "loss": 0.3996, "step": 37200 }, { "epoch": 3.44, "learning_rate": 9.999931287892706e-05, "loss": 0.3995, "step": 37400 }, { "epoch": 3.45, "learning_rate": 9.999930920448282e-05, "loss": 0.398, "step": 37600 }, { "epoch": 3.47, "learning_rate": 9.99993055300386e-05, "loss": 0.3978, "step": 37800 }, { "epoch": 3.49, "learning_rate": 9.999930185559434e-05, "loss": 0.4039, "step": 38000 }, { "epoch": 3.51, "learning_rate": 9.99992981811501e-05, "loss": 0.3983, "step": 38200 }, { "epoch": 3.53, "learning_rate": 9.999929450670586e-05, "loss": 0.4099, "step": 38400 }, { "epoch": 3.55, "learning_rate": 9.999929083226162e-05, "loss": 0.3965, "step": 38600 }, { "epoch": 3.56, "learning_rate": 9.999928715781738e-05, "loss": 0.4027, "step": 38800 }, { "epoch": 3.58, "learning_rate": 9.999928348337314e-05, "loss": 0.406, "step": 39000 }, { "epoch": 3.6, "learning_rate": 9.99992798089289e-05, "loss": 0.415, "step": 39200 }, { "epoch": 3.62, "learning_rate": 9.999927613448467e-05, "loss": 0.4055, "step": 39400 }, { "epoch": 3.64, "learning_rate": 9.999927246004043e-05, "loss": 0.4058, "step": 39600 }, { "epoch": 3.66, "learning_rate": 9.999926878559617e-05, "loss": 0.4051, "step": 39800 }, { "epoch": 3.67, "learning_rate": 9.999926511115195e-05, "loss": 0.4088, "step": 40000 }, { "epoch": 3.69, "learning_rate": 9.999926143670771e-05, "loss": 0.4073, "step": 40200 }, { "epoch": 3.71, "learning_rate": 9.999925776226347e-05, "loss": 0.4075, "step": 40400 }, { "epoch": 3.73, "learning_rate": 9.999925408781923e-05, "loss": 0.4071, "step": 40600 }, { "epoch": 3.75, "learning_rate": 9.999925041337498e-05, "loss": 0.4136, "step": 40800 }, { "epoch": 3.77, "learning_rate": 9.999924673893074e-05, "loss": 0.4143, "step": 41000 }, { "epoch": 3.78, "learning_rate": 9.999924306448651e-05, "loss": 0.415, "step": 41200 }, { "epoch": 3.8, "learning_rate": 9.999923939004227e-05, "loss": 0.4157, "step": 41400 }, { "epoch": 3.82, "learning_rate": 9.999923571559802e-05, "loss": 0.406, "step": 41600 }, { "epoch": 3.84, "learning_rate": 9.999923204115378e-05, "loss": 0.416, "step": 41800 }, { "epoch": 3.86, "learning_rate": 9.999922836670954e-05, "loss": 0.4142, "step": 42000 }, { "epoch": 3.88, "learning_rate": 9.99992246922653e-05, "loss": 0.4109, "step": 42200 }, { "epoch": 3.89, "learning_rate": 9.999922101782106e-05, "loss": 0.4161, "step": 42400 }, { "epoch": 3.91, "learning_rate": 9.999921734337682e-05, "loss": 0.408, "step": 42600 }, { "epoch": 3.93, "learning_rate": 9.999921366893258e-05, "loss": 0.4162, "step": 42800 }, { "epoch": 3.95, "learning_rate": 9.999920999448834e-05, "loss": 0.4165, "step": 43000 }, { "epoch": 3.97, "learning_rate": 9.99992063200441e-05, "loss": 0.4167, "step": 43200 }, { "epoch": 3.99, "learning_rate": 9.999920264559986e-05, "loss": 0.4179, "step": 43400 }, { "epoch": 4.0, "eval_loss": 0.3084418773651123, "eval_runtime": 1411.8768, "eval_samples_per_second": 123.36, "eval_steps_per_second": 7.71, "step": 43544 }, { "epoch": 4.01, "learning_rate": 9.999919897115562e-05, "loss": 0.3814, "step": 43600 }, { "epoch": 4.02, "learning_rate": 9.999919529671138e-05, "loss": 0.2863, "step": 43800 }, { "epoch": 4.04, "learning_rate": 9.999919162226714e-05, "loss": 0.2924, "step": 44000 }, { "epoch": 4.06, "learning_rate": 9.999918794782289e-05, "loss": 0.2942, "step": 44200 }, { "epoch": 4.08, "learning_rate": 9.999918427337865e-05, "loss": 0.3034, "step": 44400 }, { "epoch": 4.1, "learning_rate": 9.999918059893441e-05, "loss": 0.3002, "step": 44600 }, { "epoch": 4.12, "learning_rate": 9.999917692449018e-05, "loss": 0.3029, "step": 44800 }, { "epoch": 4.13, "learning_rate": 9.999917325004593e-05, "loss": 0.2977, "step": 45000 }, { "epoch": 4.15, "learning_rate": 9.999916957560169e-05, "loss": 0.3062, "step": 45200 }, { "epoch": 4.17, "learning_rate": 9.999916590115745e-05, "loss": 0.3075, "step": 45400 }, { "epoch": 4.19, "learning_rate": 9.999916222671321e-05, "loss": 0.3137, "step": 45600 }, { "epoch": 4.21, "learning_rate": 9.999915855226897e-05, "loss": 0.3037, "step": 45800 }, { "epoch": 4.23, "learning_rate": 9.999915487782473e-05, "loss": 0.31, "step": 46000 }, { "epoch": 4.24, "learning_rate": 9.99991512033805e-05, "loss": 0.3141, "step": 46200 }, { "epoch": 4.26, "learning_rate": 9.999914752893625e-05, "loss": 0.3166, "step": 46400 }, { "epoch": 4.28, "learning_rate": 9.999914385449201e-05, "loss": 0.3125, "step": 46600 }, { "epoch": 4.3, "learning_rate": 9.999914018004776e-05, "loss": 0.3109, "step": 46800 }, { "epoch": 4.32, "learning_rate": 9.999913650560354e-05, "loss": 0.3169, "step": 47000 }, { "epoch": 4.34, "learning_rate": 9.99991328311593e-05, "loss": 0.3211, "step": 47200 }, { "epoch": 4.35, "learning_rate": 9.999912915671506e-05, "loss": 0.3278, "step": 47400 }, { "epoch": 4.37, "learning_rate": 9.999912548227082e-05, "loss": 0.3154, "step": 47600 }, { "epoch": 4.39, "learning_rate": 9.999912180782656e-05, "loss": 0.3167, "step": 47800 }, { "epoch": 4.41, "learning_rate": 9.999911813338232e-05, "loss": 0.3172, "step": 48000 }, { "epoch": 4.43, "learning_rate": 9.99991144589381e-05, "loss": 0.3243, "step": 48200 }, { "epoch": 4.45, "learning_rate": 9.999911078449386e-05, "loss": 0.3286, "step": 48400 }, { "epoch": 4.46, "learning_rate": 9.99991071100496e-05, "loss": 0.3268, "step": 48600 }, { "epoch": 4.48, "learning_rate": 9.999910343560537e-05, "loss": 0.3204, "step": 48800 }, { "epoch": 4.5, "learning_rate": 9.999909976116113e-05, "loss": 0.3311, "step": 49000 }, { "epoch": 4.52, "learning_rate": 9.999909608671689e-05, "loss": 0.3241, "step": 49200 }, { "epoch": 4.54, "learning_rate": 9.999909241227265e-05, "loss": 0.3305, "step": 49400 }, { "epoch": 4.56, "learning_rate": 9.999908873782841e-05, "loss": 0.3238, "step": 49600 }, { "epoch": 4.57, "learning_rate": 9.999908506338417e-05, "loss": 0.3283, "step": 49800 }, { "epoch": 4.59, "learning_rate": 9.999908138893993e-05, "loss": 0.3275, "step": 50000 }, { "epoch": 4.61, "learning_rate": 9.999907771449569e-05, "loss": 0.3302, "step": 50200 }, { "epoch": 4.63, "learning_rate": 9.999907404005145e-05, "loss": 0.3342, "step": 50400 }, { "epoch": 4.65, "learning_rate": 9.999907036560721e-05, "loss": 0.3363, "step": 50600 }, { "epoch": 4.67, "learning_rate": 9.999906669116297e-05, "loss": 0.3319, "step": 50800 }, { "epoch": 4.68, "learning_rate": 9.999906301671873e-05, "loss": 0.3288, "step": 51000 }, { "epoch": 4.7, "learning_rate": 9.999905934227448e-05, "loss": 0.3349, "step": 51200 }, { "epoch": 4.72, "learning_rate": 9.999905566783024e-05, "loss": 0.3289, "step": 51400 }, { "epoch": 4.74, "learning_rate": 9.999905199338601e-05, "loss": 0.3393, "step": 51600 }, { "epoch": 4.76, "learning_rate": 9.999904831894177e-05, "loss": 0.3388, "step": 51800 }, { "epoch": 4.78, "learning_rate": 9.999904464449752e-05, "loss": 0.3322, "step": 52000 }, { "epoch": 4.8, "learning_rate": 9.999904097005328e-05, "loss": 0.332, "step": 52200 }, { "epoch": 4.81, "learning_rate": 9.999903729560904e-05, "loss": 0.3378, "step": 52400 }, { "epoch": 4.83, "learning_rate": 9.99990336211648e-05, "loss": 0.3369, "step": 52600 }, { "epoch": 4.85, "learning_rate": 9.999902994672056e-05, "loss": 0.337, "step": 52800 }, { "epoch": 4.87, "learning_rate": 9.999902627227632e-05, "loss": 0.3434, "step": 53000 }, { "epoch": 4.89, "learning_rate": 9.999902259783208e-05, "loss": 0.3326, "step": 53200 }, { "epoch": 4.91, "learning_rate": 9.999901892338784e-05, "loss": 0.3382, "step": 53400 }, { "epoch": 4.92, "learning_rate": 9.99990152489436e-05, "loss": 0.3443, "step": 53600 }, { "epoch": 4.94, "learning_rate": 9.999901157449936e-05, "loss": 0.3405, "step": 53800 }, { "epoch": 4.96, "learning_rate": 9.999900790005512e-05, "loss": 0.3454, "step": 54000 }, { "epoch": 4.98, "learning_rate": 9.999900422561088e-05, "loss": 0.3499, "step": 54200 }, { "epoch": 5.0, "learning_rate": 9.999900055116665e-05, "loss": 0.3442, "step": 54400 }, { "epoch": 5.0, "eval_loss": 0.24815598130226135, "eval_runtime": 1424.5159, "eval_samples_per_second": 122.265, "eval_steps_per_second": 7.642, "step": 54430 }, { "epoch": 5.02, "learning_rate": 9.999899687672239e-05, "loss": 0.2559, "step": 54600 }, { "epoch": 5.03, "learning_rate": 9.999899320227815e-05, "loss": 0.2383, "step": 54800 }, { "epoch": 5.05, "learning_rate": 9.999898952783391e-05, "loss": 0.2416, "step": 55000 }, { "epoch": 5.07, "learning_rate": 9.999898585338969e-05, "loss": 0.237, "step": 55200 }, { "epoch": 5.09, "learning_rate": 9.999898217894545e-05, "loss": 0.2445, "step": 55400 }, { "epoch": 5.11, "learning_rate": 9.99989785045012e-05, "loss": 0.2412, "step": 55600 }, { "epoch": 5.13, "learning_rate": 9.999897483005695e-05, "loss": 0.2431, "step": 55800 }, { "epoch": 5.14, "learning_rate": 9.999897115561272e-05, "loss": 0.2449, "step": 56000 }, { "epoch": 5.16, "learning_rate": 9.999896748116848e-05, "loss": 0.2461, "step": 56200 }, { "epoch": 5.18, "learning_rate": 9.999896380672424e-05, "loss": 0.2527, "step": 56400 }, { "epoch": 5.2, "learning_rate": 9.999896013228e-05, "loss": 0.2522, "step": 56600 }, { "epoch": 5.22, "learning_rate": 9.999895645783576e-05, "loss": 0.2485, "step": 56800 }, { "epoch": 5.24, "learning_rate": 9.999895278339152e-05, "loss": 0.2517, "step": 57000 }, { "epoch": 5.25, "learning_rate": 9.999894910894728e-05, "loss": 0.2572, "step": 57200 }, { "epoch": 5.27, "learning_rate": 9.999894543450304e-05, "loss": 0.2574, "step": 57400 }, { "epoch": 5.29, "learning_rate": 9.99989417600588e-05, "loss": 0.2551, "step": 57600 }, { "epoch": 5.31, "learning_rate": 9.999893808561456e-05, "loss": 0.2584, "step": 57800 }, { "epoch": 5.33, "learning_rate": 9.999893441117032e-05, "loss": 0.2607, "step": 58000 }, { "epoch": 5.35, "learning_rate": 9.999893073672607e-05, "loss": 0.2631, "step": 58200 }, { "epoch": 5.36, "learning_rate": 9.999892706228183e-05, "loss": 0.2609, "step": 58400 }, { "epoch": 5.38, "learning_rate": 9.99989233878376e-05, "loss": 0.265, "step": 58600 }, { "epoch": 5.4, "learning_rate": 9.999891971339336e-05, "loss": 0.2625, "step": 58800 }, { "epoch": 5.42, "learning_rate": 9.999891603894911e-05, "loss": 0.2648, "step": 59000 }, { "epoch": 5.44, "learning_rate": 9.999891236450487e-05, "loss": 0.2677, "step": 59200 }, { "epoch": 5.46, "learning_rate": 9.999890869006063e-05, "loss": 0.2667, "step": 59400 }, { "epoch": 5.47, "learning_rate": 9.999890501561639e-05, "loss": 0.2623, "step": 59600 }, { "epoch": 5.49, "learning_rate": 9.999890134117216e-05, "loss": 0.2713, "step": 59800 }, { "epoch": 5.51, "learning_rate": 9.999889766672791e-05, "loss": 0.2659, "step": 60000 }, { "epoch": 5.53, "learning_rate": 9.999889399228367e-05, "loss": 0.2688, "step": 60200 }, { "epoch": 5.55, "learning_rate": 9.999889031783943e-05, "loss": 0.2716, "step": 60400 }, { "epoch": 5.57, "learning_rate": 9.999888664339519e-05, "loss": 0.2723, "step": 60600 }, { "epoch": 5.59, "learning_rate": 9.999888296895095e-05, "loss": 0.2724, "step": 60800 }, { "epoch": 5.6, "learning_rate": 9.999887929450671e-05, "loss": 0.2697, "step": 61000 }, { "epoch": 5.62, "learning_rate": 9.999887562006247e-05, "loss": 0.2749, "step": 61200 }, { "epoch": 5.64, "learning_rate": 9.999887194561823e-05, "loss": 0.273, "step": 61400 }, { "epoch": 5.66, "learning_rate": 9.999886827117398e-05, "loss": 0.2816, "step": 61600 }, { "epoch": 5.68, "learning_rate": 9.999886459672974e-05, "loss": 0.2742, "step": 61800 }, { "epoch": 5.7, "learning_rate": 9.999886092228552e-05, "loss": 0.2788, "step": 62000 }, { "epoch": 5.71, "learning_rate": 9.999885724784128e-05, "loss": 0.2782, "step": 62200 }, { "epoch": 5.73, "learning_rate": 9.999885357339704e-05, "loss": 0.2804, "step": 62400 }, { "epoch": 5.75, "learning_rate": 9.999884989895278e-05, "loss": 0.2777, "step": 62600 }, { "epoch": 5.77, "learning_rate": 9.999884622450854e-05, "loss": 0.2822, "step": 62800 }, { "epoch": 5.79, "learning_rate": 9.99988425500643e-05, "loss": 0.2811, "step": 63000 }, { "epoch": 5.81, "learning_rate": 9.999883887562006e-05, "loss": 0.2795, "step": 63200 }, { "epoch": 5.82, "learning_rate": 9.999883520117583e-05, "loss": 0.2846, "step": 63400 }, { "epoch": 5.84, "learning_rate": 9.999883152673159e-05, "loss": 0.2887, "step": 63600 }, { "epoch": 5.86, "learning_rate": 9.999882785228735e-05, "loss": 0.2811, "step": 63800 }, { "epoch": 5.88, "learning_rate": 9.99988241778431e-05, "loss": 0.283, "step": 64000 }, { "epoch": 5.9, "learning_rate": 9.999882050339887e-05, "loss": 0.2822, "step": 64200 }, { "epoch": 5.92, "learning_rate": 9.999881682895463e-05, "loss": 0.2853, "step": 64400 }, { "epoch": 5.93, "learning_rate": 9.999881315451039e-05, "loss": 0.2869, "step": 64600 }, { "epoch": 5.95, "learning_rate": 9.999880948006615e-05, "loss": 0.283, "step": 64800 }, { "epoch": 5.97, "learning_rate": 9.999880580562191e-05, "loss": 0.2893, "step": 65000 }, { "epoch": 5.99, "learning_rate": 9.999880213117766e-05, "loss": 0.2863, "step": 65200 }, { "epoch": 6.0, "eval_loss": 0.20682939887046814, "eval_runtime": 1457.2635, "eval_samples_per_second": 119.518, "eval_steps_per_second": 7.47, "step": 65316 }, { "epoch": 6.01, "learning_rate": 9.999879845673342e-05, "loss": 0.2505, "step": 65400 }, { "epoch": 6.03, "learning_rate": 9.999879478228919e-05, "loss": 0.1957, "step": 65600 }, { "epoch": 6.04, "learning_rate": 9.999879110784495e-05, "loss": 0.1986, "step": 65800 }, { "epoch": 6.06, "learning_rate": 9.99987874334007e-05, "loss": 0.1984, "step": 66000 }, { "epoch": 6.08, "learning_rate": 9.999878375895646e-05, "loss": 0.1983, "step": 66200 }, { "epoch": 6.1, "learning_rate": 9.999878008451222e-05, "loss": 0.203, "step": 66400 }, { "epoch": 6.12, "learning_rate": 9.999877641006798e-05, "loss": 0.203, "step": 66600 }, { "epoch": 6.14, "learning_rate": 9.999877273562375e-05, "loss": 0.2047, "step": 66800 }, { "epoch": 6.15, "learning_rate": 9.99987690611795e-05, "loss": 0.208, "step": 67000 }, { "epoch": 6.17, "learning_rate": 9.999876538673526e-05, "loss": 0.2103, "step": 67200 }, { "epoch": 6.19, "learning_rate": 9.999876171229102e-05, "loss": 0.2141, "step": 67400 }, { "epoch": 6.21, "learning_rate": 9.999875803784678e-05, "loss": 0.2101, "step": 67600 }, { "epoch": 6.23, "learning_rate": 9.999875436340254e-05, "loss": 0.2157, "step": 67800 }, { "epoch": 6.25, "learning_rate": 9.99987506889583e-05, "loss": 0.2173, "step": 68000 }, { "epoch": 6.26, "learning_rate": 9.999874701451406e-05, "loss": 0.2143, "step": 68200 }, { "epoch": 6.28, "learning_rate": 9.999874334006982e-05, "loss": 0.2156, "step": 68400 }, { "epoch": 6.3, "learning_rate": 9.999873966562557e-05, "loss": 0.2181, "step": 68600 }, { "epoch": 6.32, "learning_rate": 9.999873599118133e-05, "loss": 0.2215, "step": 68800 }, { "epoch": 6.34, "learning_rate": 9.99987323167371e-05, "loss": 0.2214, "step": 69000 }, { "epoch": 6.36, "learning_rate": 9.999872864229286e-05, "loss": 0.221, "step": 69200 }, { "epoch": 6.38, "learning_rate": 9.999872496784863e-05, "loss": 0.2254, "step": 69400 }, { "epoch": 6.39, "learning_rate": 9.999872129340437e-05, "loss": 0.2235, "step": 69600 }, { "epoch": 6.41, "learning_rate": 9.999871761896013e-05, "loss": 0.2231, "step": 69800 }, { "epoch": 6.43, "learning_rate": 9.999871394451589e-05, "loss": 0.2267, "step": 70000 }, { "epoch": 6.45, "learning_rate": 9.999871027007167e-05, "loss": 0.2239, "step": 70200 }, { "epoch": 6.47, "learning_rate": 9.999870659562741e-05, "loss": 0.227, "step": 70400 }, { "epoch": 6.49, "learning_rate": 9.999870292118317e-05, "loss": 0.2277, "step": 70600 }, { "epoch": 6.5, "learning_rate": 9.999869924673893e-05, "loss": 0.2245, "step": 70800 }, { "epoch": 6.52, "learning_rate": 9.99986955722947e-05, "loss": 0.2284, "step": 71000 }, { "epoch": 6.54, "learning_rate": 9.999869189785046e-05, "loss": 0.2308, "step": 71200 }, { "epoch": 6.56, "learning_rate": 9.999868822340622e-05, "loss": 0.2348, "step": 71400 }, { "epoch": 6.58, "learning_rate": 9.999868454896198e-05, "loss": 0.2289, "step": 71600 }, { "epoch": 6.6, "learning_rate": 9.999868087451774e-05, "loss": 0.2311, "step": 71800 }, { "epoch": 6.61, "learning_rate": 9.99986772000735e-05, "loss": 0.2308, "step": 72000 } ], "max_steps": 5443000000, "num_train_epochs": 500000, "total_flos": 2.6739082122913382e+17, "trial_name": null, "trial_params": null }