diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14856 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.9736303748052775, + "global_step": 530000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.9953078958728256e-05, + "loss": 6.7877, + "step": 500 + }, + { + "epoch": 0.0, + "eval_loss": 6.12216329574585, + "eval_runtime": 227.3514, + "eval_samples_per_second": 439.848, + "eval_steps_per_second": 13.745, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 4.990615791745651e-05, + "loss": 5.9665, + "step": 1000 + }, + { + "epoch": 0.01, + "eval_loss": 5.832594394683838, + "eval_runtime": 227.8781, + "eval_samples_per_second": 438.831, + "eval_steps_per_second": 13.713, + "step": 1000 + }, + { + "epoch": 0.01, + "learning_rate": 4.9859236876184755e-05, + "loss": 5.7813, + "step": 1500 + }, + { + "epoch": 0.01, + "eval_loss": 5.70578145980835, + "eval_runtime": 227.8691, + "eval_samples_per_second": 438.848, + "eval_steps_per_second": 13.714, + "step": 1500 + }, + { + "epoch": 0.02, + "learning_rate": 4.981231583491301e-05, + "loss": 5.6811, + "step": 2000 + }, + { + "epoch": 0.02, + "eval_loss": 5.598133563995361, + "eval_runtime": 228.0113, + "eval_samples_per_second": 438.575, + "eval_steps_per_second": 13.705, + "step": 2000 + }, + { + "epoch": 0.02, + "learning_rate": 4.976539479364126e-05, + "loss": 5.5624, + "step": 2500 + }, + { + "epoch": 0.02, + "eval_loss": 5.457843780517578, + "eval_runtime": 230.6878, + "eval_samples_per_second": 433.486, + "eval_steps_per_second": 13.546, + "step": 2500 + }, + { + "epoch": 0.03, + "learning_rate": 4.9718473752369515e-05, + "loss": 5.4101, + "step": 3000 + }, + { + "epoch": 0.03, + "eval_loss": 5.203133583068848, + "eval_runtime": 230.8416, + "eval_samples_per_second": 433.198, + "eval_steps_per_second": 13.537, + "step": 3000 + }, + { + "epoch": 0.03, + "learning_rate": 4.967155271109777e-05, + "loss": 5.1461, + "step": 3500 + }, + { + "epoch": 0.03, + "eval_loss": 4.861812591552734, + "eval_runtime": 231.3953, + "eval_samples_per_second": 432.161, + "eval_steps_per_second": 13.505, + "step": 3500 + }, + { + "epoch": 0.04, + "learning_rate": 4.962463166982602e-05, + "loss": 4.8415, + "step": 4000 + }, + { + "epoch": 0.04, + "eval_loss": 4.487979888916016, + "eval_runtime": 231.5686, + "eval_samples_per_second": 431.837, + "eval_steps_per_second": 13.495, + "step": 4000 + }, + { + "epoch": 0.04, + "learning_rate": 4.957771062855427e-05, + "loss": 4.5424, + "step": 4500 + }, + { + "epoch": 0.04, + "eval_loss": 4.209322452545166, + "eval_runtime": 232.2861, + "eval_samples_per_second": 430.504, + "eval_steps_per_second": 13.453, + "step": 4500 + }, + { + "epoch": 0.05, + "learning_rate": 4.953078958728252e-05, + "loss": 4.286, + "step": 5000 + }, + { + "epoch": 0.05, + "eval_loss": 3.9922587871551514, + "eval_runtime": 231.4165, + "eval_samples_per_second": 432.121, + "eval_steps_per_second": 13.504, + "step": 5000 + }, + { + "epoch": 0.05, + "learning_rate": 4.9483868546010775e-05, + "loss": 4.0996, + "step": 5500 + }, + { + "epoch": 0.05, + "eval_loss": 3.8145012855529785, + "eval_runtime": 230.7704, + "eval_samples_per_second": 433.331, + "eval_steps_per_second": 13.542, + "step": 5500 + }, + { + "epoch": 0.06, + "learning_rate": 4.943694750473903e-05, + "loss": 3.9396, + "step": 6000 + }, + { + "epoch": 0.06, + "eval_loss": 3.652764320373535, + "eval_runtime": 230.0751, + "eval_samples_per_second": 434.641, + "eval_steps_per_second": 13.583, + "step": 6000 + }, + { + "epoch": 0.06, + "learning_rate": 4.939002646346728e-05, + "loss": 3.8204, + "step": 6500 + }, + { + "epoch": 0.06, + "eval_loss": 3.5459558963775635, + "eval_runtime": 227.8247, + "eval_samples_per_second": 438.934, + "eval_steps_per_second": 13.717, + "step": 6500 + }, + { + "epoch": 0.07, + "learning_rate": 4.934310542219553e-05, + "loss": 3.7161, + "step": 7000 + }, + { + "epoch": 0.07, + "eval_loss": 3.432518720626831, + "eval_runtime": 228.0814, + "eval_samples_per_second": 438.44, + "eval_steps_per_second": 13.701, + "step": 7000 + }, + { + "epoch": 0.07, + "learning_rate": 4.929618438092378e-05, + "loss": 3.602, + "step": 7500 + }, + { + "epoch": 0.07, + "eval_loss": 3.3445019721984863, + "eval_runtime": 227.7955, + "eval_samples_per_second": 438.99, + "eval_steps_per_second": 13.718, + "step": 7500 + }, + { + "epoch": 0.08, + "learning_rate": 4.9249263339652035e-05, + "loss": 3.519, + "step": 8000 + }, + { + "epoch": 0.08, + "eval_loss": 3.2647743225097656, + "eval_runtime": 227.9434, + "eval_samples_per_second": 438.705, + "eval_steps_per_second": 13.71, + "step": 8000 + }, + { + "epoch": 0.08, + "learning_rate": 4.920234229838029e-05, + "loss": 3.4347, + "step": 8500 + }, + { + "epoch": 0.08, + "eval_loss": 3.188307762145996, + "eval_runtime": 228.5169, + "eval_samples_per_second": 437.604, + "eval_steps_per_second": 13.675, + "step": 8500 + }, + { + "epoch": 0.08, + "learning_rate": 4.915542125710854e-05, + "loss": 3.3642, + "step": 9000 + }, + { + "epoch": 0.08, + "eval_loss": 3.1089322566986084, + "eval_runtime": 228.7192, + "eval_samples_per_second": 437.217, + "eval_steps_per_second": 13.663, + "step": 9000 + }, + { + "epoch": 0.09, + "learning_rate": 4.9108500215836794e-05, + "loss": 3.29, + "step": 9500 + }, + { + "epoch": 0.09, + "eval_loss": 3.044532299041748, + "eval_runtime": 229.7935, + "eval_samples_per_second": 435.173, + "eval_steps_per_second": 13.599, + "step": 9500 + }, + { + "epoch": 0.09, + "learning_rate": 4.906157917456504e-05, + "loss": 3.2545, + "step": 10000 + }, + { + "epoch": 0.09, + "eval_loss": 3.0254080295562744, + "eval_runtime": 229.6489, + "eval_samples_per_second": 435.447, + "eval_steps_per_second": 13.608, + "step": 10000 + }, + { + "epoch": 0.1, + "learning_rate": 4.9014658133293294e-05, + "loss": 3.2036, + "step": 10500 + }, + { + "epoch": 0.1, + "eval_loss": 2.9655680656433105, + "eval_runtime": 228.7959, + "eval_samples_per_second": 437.071, + "eval_steps_per_second": 13.658, + "step": 10500 + }, + { + "epoch": 0.1, + "learning_rate": 4.896773709202155e-05, + "loss": 3.1292, + "step": 11000 + }, + { + "epoch": 0.1, + "eval_loss": 2.92611026763916, + "eval_runtime": 224.7807, + "eval_samples_per_second": 444.878, + "eval_steps_per_second": 13.902, + "step": 11000 + }, + { + "epoch": 0.11, + "learning_rate": 4.89208160507498e-05, + "loss": 3.1273, + "step": 11500 + }, + { + "epoch": 0.11, + "eval_loss": 2.880321502685547, + "eval_runtime": 225.1803, + "eval_samples_per_second": 444.089, + "eval_steps_per_second": 13.878, + "step": 11500 + }, + { + "epoch": 0.11, + "learning_rate": 4.8873895009478054e-05, + "loss": 3.0435, + "step": 12000 + }, + { + "epoch": 0.11, + "eval_loss": 2.8387928009033203, + "eval_runtime": 229.766, + "eval_samples_per_second": 435.225, + "eval_steps_per_second": 13.601, + "step": 12000 + }, + { + "epoch": 0.12, + "learning_rate": 4.88269739682063e-05, + "loss": 2.9999, + "step": 12500 + }, + { + "epoch": 0.12, + "eval_loss": 2.7962071895599365, + "eval_runtime": 226.0122, + "eval_samples_per_second": 442.454, + "eval_steps_per_second": 13.827, + "step": 12500 + }, + { + "epoch": 0.12, + "learning_rate": 4.8780052926934554e-05, + "loss": 2.9642, + "step": 13000 + }, + { + "epoch": 0.12, + "eval_loss": 2.7659239768981934, + "eval_runtime": 225.1957, + "eval_samples_per_second": 444.058, + "eval_steps_per_second": 13.877, + "step": 13000 + }, + { + "epoch": 0.13, + "learning_rate": 4.873313188566281e-05, + "loss": 2.9657, + "step": 13500 + }, + { + "epoch": 0.13, + "eval_loss": 2.729302167892456, + "eval_runtime": 225.23, + "eval_samples_per_second": 443.99, + "eval_steps_per_second": 13.875, + "step": 13500 + }, + { + "epoch": 0.13, + "learning_rate": 4.868621084439106e-05, + "loss": 2.9025, + "step": 14000 + }, + { + "epoch": 0.13, + "eval_loss": 2.698721170425415, + "eval_runtime": 225.2627, + "eval_samples_per_second": 443.926, + "eval_steps_per_second": 13.873, + "step": 14000 + }, + { + "epoch": 0.14, + "learning_rate": 4.8639289803119314e-05, + "loss": 2.9036, + "step": 14500 + }, + { + "epoch": 0.14, + "eval_loss": 2.668630361557007, + "eval_runtime": 225.2216, + "eval_samples_per_second": 444.007, + "eval_steps_per_second": 13.875, + "step": 14500 + }, + { + "epoch": 0.14, + "learning_rate": 4.859236876184757e-05, + "loss": 2.8405, + "step": 15000 + }, + { + "epoch": 0.14, + "eval_loss": 2.6363720893859863, + "eval_runtime": 226.5807, + "eval_samples_per_second": 441.344, + "eval_steps_per_second": 13.792, + "step": 15000 + }, + { + "epoch": 0.15, + "learning_rate": 4.8545447720575814e-05, + "loss": 2.8067, + "step": 15500 + }, + { + "epoch": 0.15, + "eval_loss": 2.6151633262634277, + "eval_runtime": 230.6298, + "eval_samples_per_second": 433.595, + "eval_steps_per_second": 13.55, + "step": 15500 + }, + { + "epoch": 0.15, + "learning_rate": 4.849852667930407e-05, + "loss": 2.7782, + "step": 16000 + }, + { + "epoch": 0.15, + "eval_loss": 2.586013078689575, + "eval_runtime": 237.3092, + "eval_samples_per_second": 421.391, + "eval_steps_per_second": 13.168, + "step": 16000 + }, + { + "epoch": 0.15, + "learning_rate": 4.845160563803232e-05, + "loss": 2.7647, + "step": 16500 + }, + { + "epoch": 0.15, + "eval_loss": 2.5545260906219482, + "eval_runtime": 239.3104, + "eval_samples_per_second": 417.867, + "eval_steps_per_second": 13.058, + "step": 16500 + }, + { + "epoch": 0.16, + "learning_rate": 4.8404684596760573e-05, + "loss": 2.7115, + "step": 17000 + }, + { + "epoch": 0.16, + "eval_loss": 2.5350353717803955, + "eval_runtime": 242.6234, + "eval_samples_per_second": 412.161, + "eval_steps_per_second": 12.88, + "step": 17000 + }, + { + "epoch": 0.16, + "learning_rate": 4.835776355548883e-05, + "loss": 2.6993, + "step": 17500 + }, + { + "epoch": 0.16, + "eval_loss": 2.502089738845825, + "eval_runtime": 236.8183, + "eval_samples_per_second": 422.265, + "eval_steps_per_second": 13.196, + "step": 17500 + }, + { + "epoch": 0.17, + "learning_rate": 4.831084251421707e-05, + "loss": 2.7097, + "step": 18000 + }, + { + "epoch": 0.17, + "eval_loss": 2.493856191635132, + "eval_runtime": 236.3441, + "eval_samples_per_second": 423.112, + "eval_steps_per_second": 13.222, + "step": 18000 + }, + { + "epoch": 0.17, + "learning_rate": 4.8263921472945327e-05, + "loss": 2.6829, + "step": 18500 + }, + { + "epoch": 0.17, + "eval_loss": 2.484680652618408, + "eval_runtime": 234.9827, + "eval_samples_per_second": 425.563, + "eval_steps_per_second": 13.299, + "step": 18500 + }, + { + "epoch": 0.18, + "learning_rate": 4.821700043167358e-05, + "loss": 2.6755, + "step": 19000 + }, + { + "epoch": 0.18, + "eval_loss": 2.461320161819458, + "eval_runtime": 233.7755, + "eval_samples_per_second": 427.761, + "eval_steps_per_second": 13.368, + "step": 19000 + }, + { + "epoch": 0.18, + "learning_rate": 4.817007939040183e-05, + "loss": 2.6365, + "step": 19500 + }, + { + "epoch": 0.18, + "eval_loss": 2.435065269470215, + "eval_runtime": 238.7041, + "eval_samples_per_second": 418.929, + "eval_steps_per_second": 13.092, + "step": 19500 + }, + { + "epoch": 0.19, + "learning_rate": 4.8123158349130086e-05, + "loss": 2.6216, + "step": 20000 + }, + { + "epoch": 0.19, + "eval_loss": 2.4164962768554688, + "eval_runtime": 234.7619, + "eval_samples_per_second": 425.964, + "eval_steps_per_second": 13.311, + "step": 20000 + }, + { + "epoch": 0.19, + "learning_rate": 4.807623730785834e-05, + "loss": 2.6014, + "step": 20500 + }, + { + "epoch": 0.19, + "eval_loss": 2.402653932571411, + "eval_runtime": 234.6149, + "eval_samples_per_second": 426.23, + "eval_steps_per_second": 13.32, + "step": 20500 + }, + { + "epoch": 0.2, + "learning_rate": 4.8029316266586586e-05, + "loss": 2.5833, + "step": 21000 + }, + { + "epoch": 0.2, + "eval_loss": 2.3744804859161377, + "eval_runtime": 231.112, + "eval_samples_per_second": 432.691, + "eval_steps_per_second": 13.522, + "step": 21000 + }, + { + "epoch": 0.2, + "learning_rate": 4.798239522531484e-05, + "loss": 2.5663, + "step": 21500 + }, + { + "epoch": 0.2, + "eval_loss": 2.35831880569458, + "eval_runtime": 233.0868, + "eval_samples_per_second": 429.025, + "eval_steps_per_second": 13.407, + "step": 21500 + }, + { + "epoch": 0.21, + "learning_rate": 4.793547418404309e-05, + "loss": 2.5626, + "step": 22000 + }, + { + "epoch": 0.21, + "eval_loss": 2.3559041023254395, + "eval_runtime": 234.7406, + "eval_samples_per_second": 426.002, + "eval_steps_per_second": 13.313, + "step": 22000 + }, + { + "epoch": 0.21, + "learning_rate": 4.7888553142771346e-05, + "loss": 2.5453, + "step": 22500 + }, + { + "epoch": 0.21, + "eval_loss": 2.3476722240448, + "eval_runtime": 231.9484, + "eval_samples_per_second": 431.13, + "eval_steps_per_second": 13.473, + "step": 22500 + }, + { + "epoch": 0.22, + "learning_rate": 4.78416321014996e-05, + "loss": 2.492, + "step": 23000 + }, + { + "epoch": 0.22, + "eval_loss": 2.3358097076416016, + "eval_runtime": 231.7601, + "eval_samples_per_second": 431.481, + "eval_steps_per_second": 13.484, + "step": 23000 + }, + { + "epoch": 0.22, + "learning_rate": 4.779471106022785e-05, + "loss": 2.4961, + "step": 23500 + }, + { + "epoch": 0.22, + "eval_loss": 2.31193208694458, + "eval_runtime": 233.823, + "eval_samples_per_second": 427.674, + "eval_steps_per_second": 13.365, + "step": 23500 + }, + { + "epoch": 0.23, + "learning_rate": 4.77477900189561e-05, + "loss": 2.4764, + "step": 24000 + }, + { + "epoch": 0.23, + "eval_loss": 2.2874138355255127, + "eval_runtime": 232.1465, + "eval_samples_per_second": 430.762, + "eval_steps_per_second": 13.461, + "step": 24000 + }, + { + "epoch": 0.23, + "learning_rate": 4.770086897768435e-05, + "loss": 2.4725, + "step": 24500 + }, + { + "epoch": 0.23, + "eval_loss": 2.2882509231567383, + "eval_runtime": 233.6105, + "eval_samples_per_second": 428.063, + "eval_steps_per_second": 13.377, + "step": 24500 + }, + { + "epoch": 0.23, + "learning_rate": 4.7653947936412606e-05, + "loss": 2.4483, + "step": 25000 + }, + { + "epoch": 0.23, + "eval_loss": 2.2681896686553955, + "eval_runtime": 230.7325, + "eval_samples_per_second": 433.402, + "eval_steps_per_second": 13.544, + "step": 25000 + }, + { + "epoch": 0.24, + "learning_rate": 4.760702689514086e-05, + "loss": 2.4408, + "step": 25500 + }, + { + "epoch": 0.24, + "eval_loss": 2.2558085918426514, + "eval_runtime": 231.5715, + "eval_samples_per_second": 431.832, + "eval_steps_per_second": 13.495, + "step": 25500 + }, + { + "epoch": 0.24, + "learning_rate": 4.756010585386911e-05, + "loss": 2.4344, + "step": 26000 + }, + { + "epoch": 0.24, + "eval_loss": 2.2374377250671387, + "eval_runtime": 233.7469, + "eval_samples_per_second": 427.813, + "eval_steps_per_second": 13.369, + "step": 26000 + }, + { + "epoch": 0.25, + "learning_rate": 4.751318481259736e-05, + "loss": 2.4257, + "step": 26500 + }, + { + "epoch": 0.25, + "eval_loss": 2.229942560195923, + "eval_runtime": 231.8416, + "eval_samples_per_second": 431.329, + "eval_steps_per_second": 13.479, + "step": 26500 + }, + { + "epoch": 0.25, + "learning_rate": 4.746626377132561e-05, + "loss": 2.3836, + "step": 27000 + }, + { + "epoch": 0.25, + "eval_loss": 2.2122104167938232, + "eval_runtime": 232.1939, + "eval_samples_per_second": 430.675, + "eval_steps_per_second": 13.459, + "step": 27000 + }, + { + "epoch": 0.26, + "learning_rate": 4.7419342730053865e-05, + "loss": 2.3901, + "step": 27500 + }, + { + "epoch": 0.26, + "eval_loss": 2.1925010681152344, + "eval_runtime": 228.7288, + "eval_samples_per_second": 437.199, + "eval_steps_per_second": 13.662, + "step": 27500 + }, + { + "epoch": 0.26, + "learning_rate": 4.737242168878212e-05, + "loss": 2.3796, + "step": 28000 + }, + { + "epoch": 0.26, + "eval_loss": 2.195193290710449, + "eval_runtime": 227.6921, + "eval_samples_per_second": 439.19, + "eval_steps_per_second": 13.725, + "step": 28000 + }, + { + "epoch": 0.27, + "learning_rate": 4.732550064751037e-05, + "loss": 2.3792, + "step": 28500 + }, + { + "epoch": 0.27, + "eval_loss": 2.178008794784546, + "eval_runtime": 227.653, + "eval_samples_per_second": 439.265, + "eval_steps_per_second": 13.727, + "step": 28500 + }, + { + "epoch": 0.27, + "learning_rate": 4.7278579606238625e-05, + "loss": 2.3715, + "step": 29000 + }, + { + "epoch": 0.27, + "eval_loss": 2.175689220428467, + "eval_runtime": 227.4287, + "eval_samples_per_second": 439.698, + "eval_steps_per_second": 13.741, + "step": 29000 + }, + { + "epoch": 0.28, + "learning_rate": 4.723165856496687e-05, + "loss": 2.387, + "step": 29500 + }, + { + "epoch": 0.28, + "eval_loss": 2.17189884185791, + "eval_runtime": 227.5833, + "eval_samples_per_second": 439.399, + "eval_steps_per_second": 13.731, + "step": 29500 + }, + { + "epoch": 0.28, + "learning_rate": 4.7184737523695125e-05, + "loss": 2.3689, + "step": 30000 + }, + { + "epoch": 0.28, + "eval_loss": 2.1487345695495605, + "eval_runtime": 227.5611, + "eval_samples_per_second": 439.442, + "eval_steps_per_second": 13.733, + "step": 30000 + }, + { + "epoch": 0.29, + "learning_rate": 4.713781648242338e-05, + "loss": 2.3086, + "step": 30500 + }, + { + "epoch": 0.29, + "eval_loss": 2.1480958461761475, + "eval_runtime": 227.4844, + "eval_samples_per_second": 439.591, + "eval_steps_per_second": 13.737, + "step": 30500 + }, + { + "epoch": 0.29, + "learning_rate": 4.709089544115163e-05, + "loss": 2.3212, + "step": 31000 + }, + { + "epoch": 0.29, + "eval_loss": 2.140958547592163, + "eval_runtime": 227.7332, + "eval_samples_per_second": 439.11, + "eval_steps_per_second": 13.722, + "step": 31000 + }, + { + "epoch": 0.3, + "learning_rate": 4.7043974399879885e-05, + "loss": 2.321, + "step": 31500 + }, + { + "epoch": 0.3, + "eval_loss": 2.1231191158294678, + "eval_runtime": 227.7051, + "eval_samples_per_second": 439.164, + "eval_steps_per_second": 13.724, + "step": 31500 + }, + { + "epoch": 0.3, + "learning_rate": 4.699705335860813e-05, + "loss": 2.2855, + "step": 32000 + }, + { + "epoch": 0.3, + "eval_loss": 2.1169111728668213, + "eval_runtime": 227.254, + "eval_samples_per_second": 440.036, + "eval_steps_per_second": 13.751, + "step": 32000 + }, + { + "epoch": 0.3, + "learning_rate": 4.6950132317336385e-05, + "loss": 2.2906, + "step": 32500 + }, + { + "epoch": 0.3, + "eval_loss": 2.104003667831421, + "eval_runtime": 227.1649, + "eval_samples_per_second": 440.209, + "eval_steps_per_second": 13.757, + "step": 32500 + }, + { + "epoch": 0.31, + "learning_rate": 4.690321127606464e-05, + "loss": 2.2771, + "step": 33000 + }, + { + "epoch": 0.31, + "eval_loss": 2.0972249507904053, + "eval_runtime": 227.2504, + "eval_samples_per_second": 440.043, + "eval_steps_per_second": 13.751, + "step": 33000 + }, + { + "epoch": 0.31, + "learning_rate": 4.685629023479289e-05, + "loss": 2.2485, + "step": 33500 + }, + { + "epoch": 0.31, + "eval_loss": 2.074732780456543, + "eval_runtime": 227.2974, + "eval_samples_per_second": 439.952, + "eval_steps_per_second": 13.749, + "step": 33500 + }, + { + "epoch": 0.32, + "learning_rate": 4.6809369193521144e-05, + "loss": 2.2783, + "step": 34000 + }, + { + "epoch": 0.32, + "eval_loss": 2.077274799346924, + "eval_runtime": 227.3063, + "eval_samples_per_second": 439.935, + "eval_steps_per_second": 13.748, + "step": 34000 + }, + { + "epoch": 0.32, + "learning_rate": 4.67624481522494e-05, + "loss": 2.2668, + "step": 34500 + }, + { + "epoch": 0.32, + "eval_loss": 2.075364828109741, + "eval_runtime": 227.2985, + "eval_samples_per_second": 439.95, + "eval_steps_per_second": 13.748, + "step": 34500 + }, + { + "epoch": 0.33, + "learning_rate": 4.6715527110977644e-05, + "loss": 2.2458, + "step": 35000 + }, + { + "epoch": 0.33, + "eval_loss": 2.060246467590332, + "eval_runtime": 227.0553, + "eval_samples_per_second": 440.421, + "eval_steps_per_second": 13.763, + "step": 35000 + }, + { + "epoch": 0.33, + "learning_rate": 4.66686060697059e-05, + "loss": 2.2365, + "step": 35500 + }, + { + "epoch": 0.33, + "eval_loss": 2.046921968460083, + "eval_runtime": 227.3148, + "eval_samples_per_second": 439.919, + "eval_steps_per_second": 13.747, + "step": 35500 + }, + { + "epoch": 0.34, + "learning_rate": 4.662168502843415e-05, + "loss": 2.2652, + "step": 36000 + }, + { + "epoch": 0.34, + "eval_loss": 2.049044370651245, + "eval_runtime": 227.288, + "eval_samples_per_second": 439.97, + "eval_steps_per_second": 13.749, + "step": 36000 + }, + { + "epoch": 0.34, + "learning_rate": 4.6574763987162404e-05, + "loss": 2.2276, + "step": 36500 + }, + { + "epoch": 0.34, + "eval_loss": 2.0362257957458496, + "eval_runtime": 227.1732, + "eval_samples_per_second": 440.193, + "eval_steps_per_second": 13.756, + "step": 36500 + }, + { + "epoch": 0.35, + "learning_rate": 4.652784294589066e-05, + "loss": 2.226, + "step": 37000 + }, + { + "epoch": 0.35, + "eval_loss": 2.0287837982177734, + "eval_runtime": 227.1386, + "eval_samples_per_second": 440.26, + "eval_steps_per_second": 13.758, + "step": 37000 + }, + { + "epoch": 0.35, + "learning_rate": 4.6480921904618904e-05, + "loss": 2.2358, + "step": 37500 + }, + { + "epoch": 0.35, + "eval_loss": 2.016014337539673, + "eval_runtime": 227.3457, + "eval_samples_per_second": 439.859, + "eval_steps_per_second": 13.746, + "step": 37500 + }, + { + "epoch": 0.36, + "learning_rate": 4.643400086334716e-05, + "loss": 2.1975, + "step": 38000 + }, + { + "epoch": 0.36, + "eval_loss": 2.0150697231292725, + "eval_runtime": 227.4193, + "eval_samples_per_second": 439.716, + "eval_steps_per_second": 13.741, + "step": 38000 + }, + { + "epoch": 0.36, + "learning_rate": 4.638707982207541e-05, + "loss": 2.1991, + "step": 38500 + }, + { + "epoch": 0.36, + "eval_loss": 2.007798910140991, + "eval_runtime": 227.3815, + "eval_samples_per_second": 439.789, + "eval_steps_per_second": 13.743, + "step": 38500 + }, + { + "epoch": 0.37, + "learning_rate": 4.6340158780803664e-05, + "loss": 2.1907, + "step": 39000 + }, + { + "epoch": 0.37, + "eval_loss": 2.0014853477478027, + "eval_runtime": 227.3276, + "eval_samples_per_second": 439.894, + "eval_steps_per_second": 13.747, + "step": 39000 + }, + { + "epoch": 0.37, + "learning_rate": 4.629323773953192e-05, + "loss": 2.1551, + "step": 39500 + }, + { + "epoch": 0.37, + "eval_loss": 2.0021867752075195, + "eval_runtime": 227.2631, + "eval_samples_per_second": 440.019, + "eval_steps_per_second": 13.751, + "step": 39500 + }, + { + "epoch": 0.38, + "learning_rate": 4.624631669826017e-05, + "loss": 2.1818, + "step": 40000 + }, + { + "epoch": 0.38, + "eval_loss": 1.9883029460906982, + "eval_runtime": 227.0668, + "eval_samples_per_second": 440.399, + "eval_steps_per_second": 13.762, + "step": 40000 + }, + { + "epoch": 0.38, + "learning_rate": 4.619939565698842e-05, + "loss": 2.1678, + "step": 40500 + }, + { + "epoch": 0.38, + "eval_loss": 1.9864903688430786, + "eval_runtime": 226.6949, + "eval_samples_per_second": 441.122, + "eval_steps_per_second": 13.785, + "step": 40500 + }, + { + "epoch": 0.38, + "learning_rate": 4.615247461571667e-05, + "loss": 2.173, + "step": 41000 + }, + { + "epoch": 0.38, + "eval_loss": 1.9766247272491455, + "eval_runtime": 226.7708, + "eval_samples_per_second": 440.974, + "eval_steps_per_second": 13.78, + "step": 41000 + }, + { + "epoch": 0.39, + "learning_rate": 4.6105553574444923e-05, + "loss": 2.1469, + "step": 41500 + }, + { + "epoch": 0.39, + "eval_loss": 1.962270975112915, + "eval_runtime": 227.1551, + "eval_samples_per_second": 440.228, + "eval_steps_per_second": 13.757, + "step": 41500 + }, + { + "epoch": 0.39, + "learning_rate": 4.605863253317318e-05, + "loss": 2.1414, + "step": 42000 + }, + { + "epoch": 0.39, + "eval_loss": 1.9634250402450562, + "eval_runtime": 227.1053, + "eval_samples_per_second": 440.324, + "eval_steps_per_second": 13.76, + "step": 42000 + }, + { + "epoch": 0.4, + "learning_rate": 4.601171149190143e-05, + "loss": 2.156, + "step": 42500 + }, + { + "epoch": 0.4, + "eval_loss": 1.9548449516296387, + "eval_runtime": 226.8657, + "eval_samples_per_second": 440.789, + "eval_steps_per_second": 13.775, + "step": 42500 + }, + { + "epoch": 0.4, + "learning_rate": 4.596479045062968e-05, + "loss": 2.1155, + "step": 43000 + }, + { + "epoch": 0.4, + "eval_loss": 1.9472512006759644, + "eval_runtime": 227.1058, + "eval_samples_per_second": 440.323, + "eval_steps_per_second": 13.76, + "step": 43000 + }, + { + "epoch": 0.41, + "learning_rate": 4.591786940935793e-05, + "loss": 2.114, + "step": 43500 + }, + { + "epoch": 0.41, + "eval_loss": 1.943420171737671, + "eval_runtime": 227.1581, + "eval_samples_per_second": 440.222, + "eval_steps_per_second": 13.757, + "step": 43500 + }, + { + "epoch": 0.41, + "learning_rate": 4.587094836808618e-05, + "loss": 2.1146, + "step": 44000 + }, + { + "epoch": 0.41, + "eval_loss": 1.9389326572418213, + "eval_runtime": 227.1632, + "eval_samples_per_second": 440.212, + "eval_steps_per_second": 13.757, + "step": 44000 + }, + { + "epoch": 0.42, + "learning_rate": 4.5824027326814436e-05, + "loss": 2.1018, + "step": 44500 + }, + { + "epoch": 0.42, + "eval_loss": 1.9317586421966553, + "eval_runtime": 227.2543, + "eval_samples_per_second": 440.036, + "eval_steps_per_second": 13.751, + "step": 44500 + }, + { + "epoch": 0.42, + "learning_rate": 4.577710628554269e-05, + "loss": 2.0955, + "step": 45000 + }, + { + "epoch": 0.42, + "eval_loss": 1.9257296323776245, + "eval_runtime": 227.0518, + "eval_samples_per_second": 440.428, + "eval_steps_per_second": 13.763, + "step": 45000 + }, + { + "epoch": 0.43, + "learning_rate": 4.573018524427094e-05, + "loss": 2.0788, + "step": 45500 + }, + { + "epoch": 0.43, + "eval_loss": 1.9265824556350708, + "eval_runtime": 226.9844, + "eval_samples_per_second": 440.559, + "eval_steps_per_second": 13.767, + "step": 45500 + }, + { + "epoch": 0.43, + "learning_rate": 4.5683264202999196e-05, + "loss": 2.0983, + "step": 46000 + }, + { + "epoch": 0.43, + "eval_loss": 1.9132778644561768, + "eval_runtime": 226.9226, + "eval_samples_per_second": 440.679, + "eval_steps_per_second": 13.771, + "step": 46000 + }, + { + "epoch": 0.44, + "learning_rate": 4.563634316172745e-05, + "loss": 2.0877, + "step": 46500 + }, + { + "epoch": 0.44, + "eval_loss": 1.9130772352218628, + "eval_runtime": 227.0233, + "eval_samples_per_second": 440.483, + "eval_steps_per_second": 13.765, + "step": 46500 + }, + { + "epoch": 0.44, + "learning_rate": 4.5589422120455696e-05, + "loss": 2.0831, + "step": 47000 + }, + { + "epoch": 0.44, + "eval_loss": 1.9047877788543701, + "eval_runtime": 227.0434, + "eval_samples_per_second": 440.444, + "eval_steps_per_second": 13.764, + "step": 47000 + }, + { + "epoch": 0.45, + "learning_rate": 4.554250107918395e-05, + "loss": 2.0843, + "step": 47500 + }, + { + "epoch": 0.45, + "eval_loss": 1.9129490852355957, + "eval_runtime": 226.7451, + "eval_samples_per_second": 441.024, + "eval_steps_per_second": 13.782, + "step": 47500 + }, + { + "epoch": 0.45, + "learning_rate": 4.54955800379122e-05, + "loss": 2.0583, + "step": 48000 + }, + { + "epoch": 0.45, + "eval_loss": 1.8967323303222656, + "eval_runtime": 226.9943, + "eval_samples_per_second": 440.54, + "eval_steps_per_second": 13.767, + "step": 48000 + }, + { + "epoch": 0.46, + "learning_rate": 4.5448658996640456e-05, + "loss": 2.0814, + "step": 48500 + }, + { + "epoch": 0.46, + "eval_loss": 1.8939955234527588, + "eval_runtime": 226.7956, + "eval_samples_per_second": 440.926, + "eval_steps_per_second": 13.779, + "step": 48500 + }, + { + "epoch": 0.46, + "learning_rate": 4.540173795536871e-05, + "loss": 2.0656, + "step": 49000 + }, + { + "epoch": 0.46, + "eval_loss": 1.8772351741790771, + "eval_runtime": 226.9522, + "eval_samples_per_second": 440.621, + "eval_steps_per_second": 13.769, + "step": 49000 + }, + { + "epoch": 0.46, + "learning_rate": 4.535481691409696e-05, + "loss": 2.0654, + "step": 49500 + }, + { + "epoch": 0.46, + "eval_loss": 1.8778467178344727, + "eval_runtime": 227.1023, + "eval_samples_per_second": 440.33, + "eval_steps_per_second": 13.76, + "step": 49500 + }, + { + "epoch": 0.47, + "learning_rate": 4.5307895872825216e-05, + "loss": 2.0695, + "step": 50000 + }, + { + "epoch": 0.47, + "eval_loss": 1.877779245376587, + "eval_runtime": 226.8595, + "eval_samples_per_second": 440.801, + "eval_steps_per_second": 13.775, + "step": 50000 + }, + { + "epoch": 0.47, + "learning_rate": 4.526097483155346e-05, + "loss": 2.0571, + "step": 50500 + }, + { + "epoch": 0.47, + "eval_loss": 1.878071904182434, + "eval_runtime": 226.8228, + "eval_samples_per_second": 440.873, + "eval_steps_per_second": 13.777, + "step": 50500 + }, + { + "epoch": 0.48, + "learning_rate": 4.5214053790281715e-05, + "loss": 2.0379, + "step": 51000 + }, + { + "epoch": 0.48, + "eval_loss": 1.8661781549453735, + "eval_runtime": 226.9895, + "eval_samples_per_second": 440.549, + "eval_steps_per_second": 13.767, + "step": 51000 + }, + { + "epoch": 0.48, + "learning_rate": 4.516713274900997e-05, + "loss": 2.0439, + "step": 51500 + }, + { + "epoch": 0.48, + "eval_loss": 1.8483119010925293, + "eval_runtime": 227.0054, + "eval_samples_per_second": 440.518, + "eval_steps_per_second": 13.766, + "step": 51500 + }, + { + "epoch": 0.49, + "learning_rate": 4.512021170773822e-05, + "loss": 2.0472, + "step": 52000 + }, + { + "epoch": 0.49, + "eval_loss": 1.8448209762573242, + "eval_runtime": 226.9827, + "eval_samples_per_second": 440.562, + "eval_steps_per_second": 13.768, + "step": 52000 + }, + { + "epoch": 0.49, + "learning_rate": 4.5073290666466475e-05, + "loss": 2.0281, + "step": 52500 + }, + { + "epoch": 0.49, + "eval_loss": 1.8498718738555908, + "eval_runtime": 227.1635, + "eval_samples_per_second": 440.212, + "eval_steps_per_second": 13.757, + "step": 52500 + }, + { + "epoch": 0.5, + "learning_rate": 4.502636962519473e-05, + "loss": 2.0077, + "step": 53000 + }, + { + "epoch": 0.5, + "eval_loss": 1.848965048789978, + "eval_runtime": 226.7848, + "eval_samples_per_second": 440.947, + "eval_steps_per_second": 13.78, + "step": 53000 + }, + { + "epoch": 0.5, + "learning_rate": 4.4979448583922975e-05, + "loss": 2.0407, + "step": 53500 + }, + { + "epoch": 0.5, + "eval_loss": 1.8440462350845337, + "eval_runtime": 227.0578, + "eval_samples_per_second": 440.416, + "eval_steps_per_second": 13.763, + "step": 53500 + }, + { + "epoch": 0.51, + "learning_rate": 4.493252754265123e-05, + "loss": 2.0314, + "step": 54000 + }, + { + "epoch": 0.51, + "eval_loss": 1.8299942016601562, + "eval_runtime": 226.8309, + "eval_samples_per_second": 440.857, + "eval_steps_per_second": 13.777, + "step": 54000 + }, + { + "epoch": 0.51, + "learning_rate": 4.488560650137948e-05, + "loss": 2.0178, + "step": 54500 + }, + { + "epoch": 0.51, + "eval_loss": 1.8236579895019531, + "eval_runtime": 226.9072, + "eval_samples_per_second": 440.709, + "eval_steps_per_second": 13.772, + "step": 54500 + }, + { + "epoch": 0.52, + "learning_rate": 4.4838685460107735e-05, + "loss": 2.0133, + "step": 55000 + }, + { + "epoch": 0.52, + "eval_loss": 1.826443076133728, + "eval_runtime": 226.9793, + "eval_samples_per_second": 440.569, + "eval_steps_per_second": 13.768, + "step": 55000 + }, + { + "epoch": 0.52, + "learning_rate": 4.479176441883599e-05, + "loss": 2.0298, + "step": 55500 + }, + { + "epoch": 0.52, + "eval_loss": 1.816024661064148, + "eval_runtime": 226.7364, + "eval_samples_per_second": 441.041, + "eval_steps_per_second": 13.783, + "step": 55500 + }, + { + "epoch": 0.53, + "learning_rate": 4.474484337756424e-05, + "loss": 1.9824, + "step": 56000 + }, + { + "epoch": 0.53, + "eval_loss": 1.8234096765518188, + "eval_runtime": 226.9723, + "eval_samples_per_second": 440.582, + "eval_steps_per_second": 13.768, + "step": 56000 + }, + { + "epoch": 0.53, + "learning_rate": 4.4697922336292495e-05, + "loss": 1.9908, + "step": 56500 + }, + { + "epoch": 0.53, + "eval_loss": 1.8151246309280396, + "eval_runtime": 226.9337, + "eval_samples_per_second": 440.657, + "eval_steps_per_second": 13.771, + "step": 56500 + }, + { + "epoch": 0.53, + "learning_rate": 4.465100129502074e-05, + "loss": 1.9782, + "step": 57000 + }, + { + "epoch": 0.53, + "eval_loss": 1.808193564414978, + "eval_runtime": 226.7205, + "eval_samples_per_second": 441.072, + "eval_steps_per_second": 13.783, + "step": 57000 + }, + { + "epoch": 0.54, + "learning_rate": 4.4604080253748995e-05, + "loss": 1.9985, + "step": 57500 + }, + { + "epoch": 0.54, + "eval_loss": 1.7979633808135986, + "eval_runtime": 226.818, + "eval_samples_per_second": 440.882, + "eval_steps_per_second": 13.778, + "step": 57500 + }, + { + "epoch": 0.54, + "learning_rate": 4.455715921247725e-05, + "loss": 1.978, + "step": 58000 + }, + { + "epoch": 0.54, + "eval_loss": 1.7955108880996704, + "eval_runtime": 226.7306, + "eval_samples_per_second": 441.052, + "eval_steps_per_second": 13.783, + "step": 58000 + }, + { + "epoch": 0.55, + "learning_rate": 4.45102381712055e-05, + "loss": 1.9746, + "step": 58500 + }, + { + "epoch": 0.55, + "eval_loss": 1.794771671295166, + "eval_runtime": 226.9956, + "eval_samples_per_second": 440.537, + "eval_steps_per_second": 13.767, + "step": 58500 + }, + { + "epoch": 0.55, + "learning_rate": 4.4463317129933754e-05, + "loss": 1.9838, + "step": 59000 + }, + { + "epoch": 0.55, + "eval_loss": 1.7896528244018555, + "eval_runtime": 226.9957, + "eval_samples_per_second": 440.537, + "eval_steps_per_second": 13.767, + "step": 59000 + }, + { + "epoch": 0.56, + "learning_rate": 4.4416396088662e-05, + "loss": 1.944, + "step": 59500 + }, + { + "epoch": 0.56, + "eval_loss": 1.7889646291732788, + "eval_runtime": 227.1746, + "eval_samples_per_second": 440.19, + "eval_steps_per_second": 13.756, + "step": 59500 + }, + { + "epoch": 0.56, + "learning_rate": 4.4369475047390254e-05, + "loss": 1.965, + "step": 60000 + }, + { + "epoch": 0.56, + "eval_loss": 1.7891310453414917, + "eval_runtime": 227.2101, + "eval_samples_per_second": 440.121, + "eval_steps_per_second": 13.754, + "step": 60000 + }, + { + "epoch": 0.57, + "learning_rate": 4.432255400611851e-05, + "loss": 1.9501, + "step": 60500 + }, + { + "epoch": 0.57, + "eval_loss": 1.7808892726898193, + "eval_runtime": 226.9599, + "eval_samples_per_second": 440.607, + "eval_steps_per_second": 13.769, + "step": 60500 + }, + { + "epoch": 0.57, + "learning_rate": 4.427563296484676e-05, + "loss": 1.9541, + "step": 61000 + }, + { + "epoch": 0.57, + "eval_loss": 1.7755423784255981, + "eval_runtime": 226.9182, + "eval_samples_per_second": 440.688, + "eval_steps_per_second": 13.771, + "step": 61000 + }, + { + "epoch": 0.58, + "learning_rate": 4.4228711923575014e-05, + "loss": 1.9483, + "step": 61500 + }, + { + "epoch": 0.58, + "eval_loss": 1.7737643718719482, + "eval_runtime": 226.5336, + "eval_samples_per_second": 441.436, + "eval_steps_per_second": 13.795, + "step": 61500 + }, + { + "epoch": 0.58, + "learning_rate": 4.418179088230327e-05, + "loss": 1.9568, + "step": 62000 + }, + { + "epoch": 0.58, + "eval_loss": 1.7714828252792358, + "eval_runtime": 226.801, + "eval_samples_per_second": 440.915, + "eval_steps_per_second": 13.779, + "step": 62000 + }, + { + "epoch": 0.59, + "learning_rate": 4.4134869841031514e-05, + "loss": 1.9418, + "step": 62500 + }, + { + "epoch": 0.59, + "eval_loss": 1.7680498361587524, + "eval_runtime": 226.5426, + "eval_samples_per_second": 441.418, + "eval_steps_per_second": 13.794, + "step": 62500 + }, + { + "epoch": 0.59, + "learning_rate": 4.408794879975977e-05, + "loss": 1.946, + "step": 63000 + }, + { + "epoch": 0.59, + "eval_loss": 1.756327509880066, + "eval_runtime": 226.7739, + "eval_samples_per_second": 440.968, + "eval_steps_per_second": 13.78, + "step": 63000 + }, + { + "epoch": 0.6, + "learning_rate": 4.404102775848802e-05, + "loss": 1.9331, + "step": 63500 + }, + { + "epoch": 0.6, + "eval_loss": 1.7607768774032593, + "eval_runtime": 226.7955, + "eval_samples_per_second": 440.926, + "eval_steps_per_second": 13.779, + "step": 63500 + }, + { + "epoch": 0.6, + "learning_rate": 4.3994106717216274e-05, + "loss": 1.9287, + "step": 64000 + }, + { + "epoch": 0.6, + "eval_loss": 1.7537603378295898, + "eval_runtime": 226.7832, + "eval_samples_per_second": 440.95, + "eval_steps_per_second": 13.78, + "step": 64000 + }, + { + "epoch": 0.61, + "learning_rate": 4.394718567594453e-05, + "loss": 1.919, + "step": 64500 + }, + { + "epoch": 0.61, + "eval_loss": 1.7554343938827515, + "eval_runtime": 226.8184, + "eval_samples_per_second": 440.881, + "eval_steps_per_second": 13.778, + "step": 64500 + }, + { + "epoch": 0.61, + "learning_rate": 4.390026463467278e-05, + "loss": 1.9506, + "step": 65000 + }, + { + "epoch": 0.61, + "eval_loss": 1.744460105895996, + "eval_runtime": 226.7808, + "eval_samples_per_second": 440.954, + "eval_steps_per_second": 13.78, + "step": 65000 + }, + { + "epoch": 0.61, + "learning_rate": 4.385334359340103e-05, + "loss": 1.9358, + "step": 65500 + }, + { + "epoch": 0.61, + "eval_loss": 1.7381893396377563, + "eval_runtime": 226.843, + "eval_samples_per_second": 440.833, + "eval_steps_per_second": 13.776, + "step": 65500 + }, + { + "epoch": 0.62, + "learning_rate": 4.380642255212928e-05, + "loss": 1.9002, + "step": 66000 + }, + { + "epoch": 0.62, + "eval_loss": 1.7369801998138428, + "eval_runtime": 226.5756, + "eval_samples_per_second": 441.354, + "eval_steps_per_second": 13.792, + "step": 66000 + }, + { + "epoch": 0.62, + "learning_rate": 4.3759501510857533e-05, + "loss": 1.9122, + "step": 66500 + }, + { + "epoch": 0.62, + "eval_loss": 1.7368305921554565, + "eval_runtime": 226.767, + "eval_samples_per_second": 440.981, + "eval_steps_per_second": 13.781, + "step": 66500 + }, + { + "epoch": 0.63, + "learning_rate": 4.371258046958579e-05, + "loss": 1.9225, + "step": 67000 + }, + { + "epoch": 0.63, + "eval_loss": 1.7325348854064941, + "eval_runtime": 226.9377, + "eval_samples_per_second": 440.65, + "eval_steps_per_second": 13.77, + "step": 67000 + }, + { + "epoch": 0.63, + "learning_rate": 4.366565942831404e-05, + "loss": 1.9213, + "step": 67500 + }, + { + "epoch": 0.63, + "eval_loss": 1.7244173288345337, + "eval_runtime": 226.9908, + "eval_samples_per_second": 440.547, + "eval_steps_per_second": 13.767, + "step": 67500 + }, + { + "epoch": 0.64, + "learning_rate": 4.3618738387042287e-05, + "loss": 1.9084, + "step": 68000 + }, + { + "epoch": 0.64, + "eval_loss": 1.725306510925293, + "eval_runtime": 226.9071, + "eval_samples_per_second": 440.709, + "eval_steps_per_second": 13.772, + "step": 68000 + }, + { + "epoch": 0.64, + "learning_rate": 4.357181734577054e-05, + "loss": 1.9056, + "step": 68500 + }, + { + "epoch": 0.64, + "eval_loss": 1.716600775718689, + "eval_runtime": 226.8672, + "eval_samples_per_second": 440.786, + "eval_steps_per_second": 13.775, + "step": 68500 + }, + { + "epoch": 0.65, + "learning_rate": 4.352489630449879e-05, + "loss": 1.9175, + "step": 69000 + }, + { + "epoch": 0.65, + "eval_loss": 1.7228271961212158, + "eval_runtime": 226.6113, + "eval_samples_per_second": 441.284, + "eval_steps_per_second": 13.79, + "step": 69000 + }, + { + "epoch": 0.65, + "learning_rate": 4.3477975263227046e-05, + "loss": 1.8865, + "step": 69500 + }, + { + "epoch": 0.65, + "eval_loss": 1.7209404706954956, + "eval_runtime": 226.7014, + "eval_samples_per_second": 441.109, + "eval_steps_per_second": 13.785, + "step": 69500 + }, + { + "epoch": 0.66, + "learning_rate": 4.34310542219553e-05, + "loss": 1.8866, + "step": 70000 + }, + { + "epoch": 0.66, + "eval_loss": 1.7054914236068726, + "eval_runtime": 226.4302, + "eval_samples_per_second": 441.637, + "eval_steps_per_second": 13.801, + "step": 70000 + }, + { + "epoch": 0.66, + "learning_rate": 4.338413318068355e-05, + "loss": 1.8733, + "step": 70500 + }, + { + "epoch": 0.66, + "eval_loss": 1.712384819984436, + "eval_runtime": 226.2537, + "eval_samples_per_second": 441.982, + "eval_steps_per_second": 13.812, + "step": 70500 + }, + { + "epoch": 0.67, + "learning_rate": 4.33372121394118e-05, + "loss": 1.9032, + "step": 71000 + }, + { + "epoch": 0.67, + "eval_loss": 1.6990084648132324, + "eval_runtime": 225.9511, + "eval_samples_per_second": 442.574, + "eval_steps_per_second": 13.83, + "step": 71000 + }, + { + "epoch": 0.67, + "learning_rate": 4.329029109814005e-05, + "loss": 1.8713, + "step": 71500 + }, + { + "epoch": 0.67, + "eval_loss": 1.6956583261489868, + "eval_runtime": 226.2467, + "eval_samples_per_second": 441.995, + "eval_steps_per_second": 13.812, + "step": 71500 + }, + { + "epoch": 0.68, + "learning_rate": 4.3243370056868306e-05, + "loss": 1.8766, + "step": 72000 + }, + { + "epoch": 0.68, + "eval_loss": 1.6951245069503784, + "eval_runtime": 226.2025, + "eval_samples_per_second": 442.082, + "eval_steps_per_second": 13.815, + "step": 72000 + }, + { + "epoch": 0.68, + "learning_rate": 4.319644901559656e-05, + "loss": 1.9066, + "step": 72500 + }, + { + "epoch": 0.68, + "eval_loss": 1.6982998847961426, + "eval_runtime": 226.163, + "eval_samples_per_second": 442.159, + "eval_steps_per_second": 13.817, + "step": 72500 + }, + { + "epoch": 0.69, + "learning_rate": 4.314952797432481e-05, + "loss": 1.889, + "step": 73000 + }, + { + "epoch": 0.69, + "eval_loss": 1.7046481370925903, + "eval_runtime": 226.2165, + "eval_samples_per_second": 442.054, + "eval_steps_per_second": 13.814, + "step": 73000 + }, + { + "epoch": 0.69, + "learning_rate": 4.310260693305306e-05, + "loss": 1.8583, + "step": 73500 + }, + { + "epoch": 0.69, + "eval_loss": 1.6921665668487549, + "eval_runtime": 226.1668, + "eval_samples_per_second": 442.152, + "eval_steps_per_second": 13.817, + "step": 73500 + }, + { + "epoch": 0.69, + "learning_rate": 4.305568589178131e-05, + "loss": 1.846, + "step": 74000 + }, + { + "epoch": 0.69, + "eval_loss": 1.6901285648345947, + "eval_runtime": 225.9398, + "eval_samples_per_second": 442.596, + "eval_steps_per_second": 13.831, + "step": 74000 + }, + { + "epoch": 0.7, + "learning_rate": 4.3008764850509566e-05, + "loss": 1.8748, + "step": 74500 + }, + { + "epoch": 0.7, + "eval_loss": 1.6798553466796875, + "eval_runtime": 226.2378, + "eval_samples_per_second": 442.013, + "eval_steps_per_second": 13.813, + "step": 74500 + }, + { + "epoch": 0.7, + "learning_rate": 4.296184380923782e-05, + "loss": 1.8596, + "step": 75000 + }, + { + "epoch": 0.7, + "eval_loss": 1.681208848953247, + "eval_runtime": 226.5798, + "eval_samples_per_second": 441.346, + "eval_steps_per_second": 13.792, + "step": 75000 + }, + { + "epoch": 0.71, + "learning_rate": 4.291492276796607e-05, + "loss": 1.843, + "step": 75500 + }, + { + "epoch": 0.71, + "eval_loss": 1.6777198314666748, + "eval_runtime": 226.8856, + "eval_samples_per_second": 440.751, + "eval_steps_per_second": 13.773, + "step": 75500 + }, + { + "epoch": 0.71, + "learning_rate": 4.2868001726694326e-05, + "loss": 1.8458, + "step": 76000 + }, + { + "epoch": 0.71, + "eval_loss": 1.6699376106262207, + "eval_runtime": 226.7938, + "eval_samples_per_second": 440.929, + "eval_steps_per_second": 13.779, + "step": 76000 + }, + { + "epoch": 0.72, + "learning_rate": 4.282108068542257e-05, + "loss": 1.8544, + "step": 76500 + }, + { + "epoch": 0.72, + "eval_loss": 1.6637232303619385, + "eval_runtime": 226.7851, + "eval_samples_per_second": 440.946, + "eval_steps_per_second": 13.78, + "step": 76500 + }, + { + "epoch": 0.72, + "learning_rate": 4.2774159644150825e-05, + "loss": 1.8533, + "step": 77000 + }, + { + "epoch": 0.72, + "eval_loss": 1.6666810512542725, + "eval_runtime": 226.4743, + "eval_samples_per_second": 441.551, + "eval_steps_per_second": 13.798, + "step": 77000 + }, + { + "epoch": 0.73, + "learning_rate": 4.272723860287908e-05, + "loss": 1.8645, + "step": 77500 + }, + { + "epoch": 0.73, + "eval_loss": 1.66965651512146, + "eval_runtime": 226.5459, + "eval_samples_per_second": 441.412, + "eval_steps_per_second": 13.794, + "step": 77500 + }, + { + "epoch": 0.73, + "learning_rate": 4.268031756160733e-05, + "loss": 1.8535, + "step": 78000 + }, + { + "epoch": 0.73, + "eval_loss": 1.665113091468811, + "eval_runtime": 226.5387, + "eval_samples_per_second": 441.426, + "eval_steps_per_second": 13.795, + "step": 78000 + }, + { + "epoch": 0.74, + "learning_rate": 4.2633396520335585e-05, + "loss": 1.8425, + "step": 78500 + }, + { + "epoch": 0.74, + "eval_loss": 1.655056118965149, + "eval_runtime": 226.5932, + "eval_samples_per_second": 441.319, + "eval_steps_per_second": 13.791, + "step": 78500 + }, + { + "epoch": 0.74, + "learning_rate": 4.258647547906383e-05, + "loss": 1.8642, + "step": 79000 + }, + { + "epoch": 0.74, + "eval_loss": 1.6554076671600342, + "eval_runtime": 226.5747, + "eval_samples_per_second": 441.356, + "eval_steps_per_second": 13.792, + "step": 79000 + }, + { + "epoch": 0.75, + "learning_rate": 4.2539554437792085e-05, + "loss": 1.8259, + "step": 79500 + }, + { + "epoch": 0.75, + "eval_loss": 1.6575286388397217, + "eval_runtime": 226.3838, + "eval_samples_per_second": 441.728, + "eval_steps_per_second": 13.804, + "step": 79500 + }, + { + "epoch": 0.75, + "learning_rate": 4.249263339652034e-05, + "loss": 1.8199, + "step": 80000 + }, + { + "epoch": 0.75, + "eval_loss": 1.6621437072753906, + "eval_runtime": 226.5957, + "eval_samples_per_second": 441.315, + "eval_steps_per_second": 13.791, + "step": 80000 + }, + { + "epoch": 0.76, + "learning_rate": 4.244571235524859e-05, + "loss": 1.8384, + "step": 80500 + }, + { + "epoch": 0.76, + "eval_loss": 1.6512243747711182, + "eval_runtime": 226.3597, + "eval_samples_per_second": 441.775, + "eval_steps_per_second": 13.805, + "step": 80500 + }, + { + "epoch": 0.76, + "learning_rate": 4.2398791313976845e-05, + "loss": 1.8349, + "step": 81000 + }, + { + "epoch": 0.76, + "eval_loss": 1.6566110849380493, + "eval_runtime": 226.3468, + "eval_samples_per_second": 441.8, + "eval_steps_per_second": 13.806, + "step": 81000 + }, + { + "epoch": 0.76, + "learning_rate": 4.23518702727051e-05, + "loss": 1.8342, + "step": 81500 + }, + { + "epoch": 0.76, + "eval_loss": 1.6478888988494873, + "eval_runtime": 226.3371, + "eval_samples_per_second": 441.819, + "eval_steps_per_second": 13.807, + "step": 81500 + }, + { + "epoch": 0.77, + "learning_rate": 4.2304949231433345e-05, + "loss": 1.8508, + "step": 82000 + }, + { + "epoch": 0.77, + "eval_loss": 1.6489554643630981, + "eval_runtime": 226.3981, + "eval_samples_per_second": 441.7, + "eval_steps_per_second": 13.803, + "step": 82000 + }, + { + "epoch": 0.77, + "learning_rate": 4.22580281901616e-05, + "loss": 1.8135, + "step": 82500 + }, + { + "epoch": 0.77, + "eval_loss": 1.6381429433822632, + "eval_runtime": 226.3351, + "eval_samples_per_second": 441.823, + "eval_steps_per_second": 13.807, + "step": 82500 + }, + { + "epoch": 0.78, + "learning_rate": 4.221110714888985e-05, + "loss": 1.8332, + "step": 83000 + }, + { + "epoch": 0.78, + "eval_loss": 1.6337202787399292, + "eval_runtime": 226.073, + "eval_samples_per_second": 442.335, + "eval_steps_per_second": 13.823, + "step": 83000 + }, + { + "epoch": 0.78, + "learning_rate": 4.2164186107618104e-05, + "loss": 1.8315, + "step": 83500 + }, + { + "epoch": 0.78, + "eval_loss": 1.637961506843567, + "eval_runtime": 226.4701, + "eval_samples_per_second": 441.559, + "eval_steps_per_second": 13.799, + "step": 83500 + }, + { + "epoch": 0.79, + "learning_rate": 4.211726506634636e-05, + "loss": 1.8363, + "step": 84000 + }, + { + "epoch": 0.79, + "eval_loss": 1.6368101835250854, + "eval_runtime": 226.1996, + "eval_samples_per_second": 442.087, + "eval_steps_per_second": 13.815, + "step": 84000 + }, + { + "epoch": 0.79, + "learning_rate": 4.2070344025074604e-05, + "loss": 1.8248, + "step": 84500 + }, + { + "epoch": 0.79, + "eval_loss": 1.626143455505371, + "eval_runtime": 226.4446, + "eval_samples_per_second": 441.609, + "eval_steps_per_second": 13.8, + "step": 84500 + }, + { + "epoch": 0.8, + "learning_rate": 4.202342298380286e-05, + "loss": 1.8045, + "step": 85000 + }, + { + "epoch": 0.8, + "eval_loss": 1.6303765773773193, + "eval_runtime": 226.3791, + "eval_samples_per_second": 441.737, + "eval_steps_per_second": 13.804, + "step": 85000 + }, + { + "epoch": 0.8, + "learning_rate": 4.197650194253111e-05, + "loss": 1.8172, + "step": 85500 + }, + { + "epoch": 0.8, + "eval_loss": 1.6250081062316895, + "eval_runtime": 226.1683, + "eval_samples_per_second": 442.149, + "eval_steps_per_second": 13.817, + "step": 85500 + }, + { + "epoch": 0.81, + "learning_rate": 4.1929580901259364e-05, + "loss": 1.8161, + "step": 86000 + }, + { + "epoch": 0.81, + "eval_loss": 1.615513801574707, + "eval_runtime": 226.0153, + "eval_samples_per_second": 442.448, + "eval_steps_per_second": 13.826, + "step": 86000 + }, + { + "epoch": 0.81, + "learning_rate": 4.188265985998762e-05, + "loss": 1.7852, + "step": 86500 + }, + { + "epoch": 0.81, + "eval_loss": 1.6129599809646606, + "eval_runtime": 225.9925, + "eval_samples_per_second": 442.493, + "eval_steps_per_second": 13.828, + "step": 86500 + }, + { + "epoch": 0.82, + "learning_rate": 4.183573881871587e-05, + "loss": 1.7935, + "step": 87000 + }, + { + "epoch": 0.82, + "eval_loss": 1.6187821626663208, + "eval_runtime": 226.2683, + "eval_samples_per_second": 441.953, + "eval_steps_per_second": 13.811, + "step": 87000 + }, + { + "epoch": 0.82, + "learning_rate": 4.178881777744412e-05, + "loss": 1.791, + "step": 87500 + }, + { + "epoch": 0.82, + "eval_loss": 1.6135245561599731, + "eval_runtime": 226.3101, + "eval_samples_per_second": 441.872, + "eval_steps_per_second": 13.808, + "step": 87500 + }, + { + "epoch": 0.83, + "learning_rate": 4.174189673617237e-05, + "loss": 1.8019, + "step": 88000 + }, + { + "epoch": 0.83, + "eval_loss": 1.6148468255996704, + "eval_runtime": 225.9824, + "eval_samples_per_second": 442.512, + "eval_steps_per_second": 13.829, + "step": 88000 + }, + { + "epoch": 0.83, + "learning_rate": 4.1694975694900624e-05, + "loss": 1.7808, + "step": 88500 + }, + { + "epoch": 0.83, + "eval_loss": 1.6106659173965454, + "eval_runtime": 226.0203, + "eval_samples_per_second": 442.438, + "eval_steps_per_second": 13.826, + "step": 88500 + }, + { + "epoch": 0.84, + "learning_rate": 4.164805465362888e-05, + "loss": 1.7844, + "step": 89000 + }, + { + "epoch": 0.84, + "eval_loss": 1.6136785745620728, + "eval_runtime": 226.2859, + "eval_samples_per_second": 441.919, + "eval_steps_per_second": 13.81, + "step": 89000 + }, + { + "epoch": 0.84, + "learning_rate": 4.160113361235713e-05, + "loss": 1.774, + "step": 89500 + }, + { + "epoch": 0.84, + "eval_loss": 1.6069365739822388, + "eval_runtime": 226.2313, + "eval_samples_per_second": 442.025, + "eval_steps_per_second": 13.813, + "step": 89500 + }, + { + "epoch": 0.84, + "learning_rate": 4.155421257108538e-05, + "loss": 1.7877, + "step": 90000 + }, + { + "epoch": 0.84, + "eval_loss": 1.6087855100631714, + "eval_runtime": 226.3039, + "eval_samples_per_second": 441.884, + "eval_steps_per_second": 13.809, + "step": 90000 + }, + { + "epoch": 0.85, + "learning_rate": 4.150729152981363e-05, + "loss": 1.7757, + "step": 90500 + }, + { + "epoch": 0.85, + "eval_loss": 1.590334177017212, + "eval_runtime": 226.2681, + "eval_samples_per_second": 441.954, + "eval_steps_per_second": 13.811, + "step": 90500 + }, + { + "epoch": 0.85, + "learning_rate": 4.1460370488541883e-05, + "loss": 1.7731, + "step": 91000 + }, + { + "epoch": 0.85, + "eval_loss": 1.5941905975341797, + "eval_runtime": 226.0068, + "eval_samples_per_second": 442.465, + "eval_steps_per_second": 13.827, + "step": 91000 + }, + { + "epoch": 0.86, + "learning_rate": 4.141344944727014e-05, + "loss": 1.7639, + "step": 91500 + }, + { + "epoch": 0.86, + "eval_loss": 1.6017308235168457, + "eval_runtime": 226.0481, + "eval_samples_per_second": 442.384, + "eval_steps_per_second": 13.824, + "step": 91500 + }, + { + "epoch": 0.86, + "learning_rate": 4.136652840599839e-05, + "loss": 1.7889, + "step": 92000 + }, + { + "epoch": 0.86, + "eval_loss": 1.592837929725647, + "eval_runtime": 226.2492, + "eval_samples_per_second": 441.991, + "eval_steps_per_second": 13.812, + "step": 92000 + }, + { + "epoch": 0.87, + "learning_rate": 4.131960736472664e-05, + "loss": 1.7788, + "step": 92500 + }, + { + "epoch": 0.87, + "eval_loss": 1.5879290103912354, + "eval_runtime": 226.1946, + "eval_samples_per_second": 442.097, + "eval_steps_per_second": 13.816, + "step": 92500 + }, + { + "epoch": 0.87, + "learning_rate": 4.127268632345489e-05, + "loss": 1.7976, + "step": 93000 + }, + { + "epoch": 0.87, + "eval_loss": 1.5884945392608643, + "eval_runtime": 226.1587, + "eval_samples_per_second": 442.167, + "eval_steps_per_second": 13.818, + "step": 93000 + }, + { + "epoch": 0.88, + "learning_rate": 4.122576528218314e-05, + "loss": 1.7698, + "step": 93500 + }, + { + "epoch": 0.88, + "eval_loss": 1.5887948274612427, + "eval_runtime": 226.2905, + "eval_samples_per_second": 441.91, + "eval_steps_per_second": 13.81, + "step": 93500 + }, + { + "epoch": 0.88, + "learning_rate": 4.1178844240911396e-05, + "loss": 1.7684, + "step": 94000 + }, + { + "epoch": 0.88, + "eval_loss": 1.576420783996582, + "eval_runtime": 226.1872, + "eval_samples_per_second": 442.112, + "eval_steps_per_second": 13.816, + "step": 94000 + }, + { + "epoch": 0.89, + "learning_rate": 4.113192319963965e-05, + "loss": 1.7679, + "step": 94500 + }, + { + "epoch": 0.89, + "eval_loss": 1.5760667324066162, + "eval_runtime": 226.1526, + "eval_samples_per_second": 442.179, + "eval_steps_per_second": 13.818, + "step": 94500 + }, + { + "epoch": 0.89, + "learning_rate": 4.10850021583679e-05, + "loss": 1.7532, + "step": 95000 + }, + { + "epoch": 0.89, + "eval_loss": 1.576305627822876, + "eval_runtime": 226.1826, + "eval_samples_per_second": 442.121, + "eval_steps_per_second": 13.816, + "step": 95000 + }, + { + "epoch": 0.9, + "learning_rate": 4.103808111709615e-05, + "loss": 1.765, + "step": 95500 + }, + { + "epoch": 0.9, + "eval_loss": 1.580979347229004, + "eval_runtime": 226.2119, + "eval_samples_per_second": 442.063, + "eval_steps_per_second": 13.814, + "step": 95500 + }, + { + "epoch": 0.9, + "learning_rate": 4.09911600758244e-05, + "loss": 1.7429, + "step": 96000 + }, + { + "epoch": 0.9, + "eval_loss": 1.5754783153533936, + "eval_runtime": 226.2117, + "eval_samples_per_second": 442.064, + "eval_steps_per_second": 13.814, + "step": 96000 + }, + { + "epoch": 0.91, + "learning_rate": 4.0944239034552656e-05, + "loss": 1.7522, + "step": 96500 + }, + { + "epoch": 0.91, + "eval_loss": 1.5753768682479858, + "eval_runtime": 225.941, + "eval_samples_per_second": 442.593, + "eval_steps_per_second": 13.831, + "step": 96500 + }, + { + "epoch": 0.91, + "learning_rate": 4.089731799328091e-05, + "loss": 1.7401, + "step": 97000 + }, + { + "epoch": 0.91, + "eval_loss": 1.5701993703842163, + "eval_runtime": 225.8905, + "eval_samples_per_second": 442.692, + "eval_steps_per_second": 13.834, + "step": 97000 + }, + { + "epoch": 0.91, + "learning_rate": 4.085039695200916e-05, + "loss": 1.7533, + "step": 97500 + }, + { + "epoch": 0.91, + "eval_loss": 1.5690381526947021, + "eval_runtime": 226.1613, + "eval_samples_per_second": 442.162, + "eval_steps_per_second": 13.818, + "step": 97500 + }, + { + "epoch": 0.92, + "learning_rate": 4.0803475910737416e-05, + "loss": 1.7558, + "step": 98000 + }, + { + "epoch": 0.92, + "eval_loss": 1.564376711845398, + "eval_runtime": 226.2821, + "eval_samples_per_second": 441.926, + "eval_steps_per_second": 13.81, + "step": 98000 + }, + { + "epoch": 0.92, + "learning_rate": 4.075655486946566e-05, + "loss": 1.7256, + "step": 98500 + }, + { + "epoch": 0.92, + "eval_loss": 1.562330722808838, + "eval_runtime": 226.2172, + "eval_samples_per_second": 442.053, + "eval_steps_per_second": 13.814, + "step": 98500 + }, + { + "epoch": 0.93, + "learning_rate": 4.0709633828193916e-05, + "loss": 1.7487, + "step": 99000 + }, + { + "epoch": 0.93, + "eval_loss": 1.5604215860366821, + "eval_runtime": 225.9865, + "eval_samples_per_second": 442.504, + "eval_steps_per_second": 13.828, + "step": 99000 + }, + { + "epoch": 0.93, + "learning_rate": 4.066271278692217e-05, + "loss": 1.748, + "step": 99500 + }, + { + "epoch": 0.93, + "eval_loss": 1.55665922164917, + "eval_runtime": 226.1873, + "eval_samples_per_second": 442.111, + "eval_steps_per_second": 13.816, + "step": 99500 + }, + { + "epoch": 0.94, + "learning_rate": 4.061579174565042e-05, + "loss": 1.7337, + "step": 100000 + }, + { + "epoch": 0.94, + "eval_loss": 1.5678207874298096, + "eval_runtime": 226.1956, + "eval_samples_per_second": 442.095, + "eval_steps_per_second": 13.815, + "step": 100000 + }, + { + "epoch": 0.94, + "learning_rate": 4.0568870704378675e-05, + "loss": 1.7551, + "step": 100500 + }, + { + "epoch": 0.94, + "eval_loss": 1.5610854625701904, + "eval_runtime": 226.1994, + "eval_samples_per_second": 442.088, + "eval_steps_per_second": 13.815, + "step": 100500 + }, + { + "epoch": 0.95, + "learning_rate": 4.052194966310692e-05, + "loss": 1.7394, + "step": 101000 + }, + { + "epoch": 0.95, + "eval_loss": 1.5528591871261597, + "eval_runtime": 226.4258, + "eval_samples_per_second": 441.646, + "eval_steps_per_second": 13.801, + "step": 101000 + }, + { + "epoch": 0.95, + "learning_rate": 4.0475028621835175e-05, + "loss": 1.7447, + "step": 101500 + }, + { + "epoch": 0.95, + "eval_loss": 1.5571315288543701, + "eval_runtime": 226.3464, + "eval_samples_per_second": 441.801, + "eval_steps_per_second": 13.806, + "step": 101500 + }, + { + "epoch": 0.96, + "learning_rate": 4.042810758056343e-05, + "loss": 1.7411, + "step": 102000 + }, + { + "epoch": 0.96, + "eval_loss": 1.5496081113815308, + "eval_runtime": 226.2985, + "eval_samples_per_second": 441.894, + "eval_steps_per_second": 13.809, + "step": 102000 + }, + { + "epoch": 0.96, + "learning_rate": 4.038118653929168e-05, + "loss": 1.7496, + "step": 102500 + }, + { + "epoch": 0.96, + "eval_loss": 1.551464319229126, + "eval_runtime": 226.3296, + "eval_samples_per_second": 441.834, + "eval_steps_per_second": 13.807, + "step": 102500 + }, + { + "epoch": 0.97, + "learning_rate": 4.0334265498019935e-05, + "loss": 1.7381, + "step": 103000 + }, + { + "epoch": 0.97, + "eval_loss": 1.5396504402160645, + "eval_runtime": 225.928, + "eval_samples_per_second": 442.619, + "eval_steps_per_second": 13.832, + "step": 103000 + }, + { + "epoch": 0.97, + "learning_rate": 4.028734445674819e-05, + "loss": 1.7513, + "step": 103500 + }, + { + "epoch": 0.97, + "eval_loss": 1.5501888990402222, + "eval_runtime": 226.1976, + "eval_samples_per_second": 442.091, + "eval_steps_per_second": 13.815, + "step": 103500 + }, + { + "epoch": 0.98, + "learning_rate": 4.0240423415476435e-05, + "loss": 1.7126, + "step": 104000 + }, + { + "epoch": 0.98, + "eval_loss": 1.5375173091888428, + "eval_runtime": 226.1628, + "eval_samples_per_second": 442.159, + "eval_steps_per_second": 13.817, + "step": 104000 + }, + { + "epoch": 0.98, + "learning_rate": 4.019350237420469e-05, + "loss": 1.7644, + "step": 104500 + }, + { + "epoch": 0.98, + "eval_loss": 1.5411220788955688, + "eval_runtime": 226.1986, + "eval_samples_per_second": 442.089, + "eval_steps_per_second": 13.815, + "step": 104500 + }, + { + "epoch": 0.99, + "learning_rate": 4.014658133293294e-05, + "loss": 1.7142, + "step": 105000 + }, + { + "epoch": 0.99, + "eval_loss": 1.5340408086776733, + "eval_runtime": 226.2255, + "eval_samples_per_second": 442.037, + "eval_steps_per_second": 13.814, + "step": 105000 + }, + { + "epoch": 0.99, + "learning_rate": 4.0099660291661195e-05, + "loss": 1.7053, + "step": 105500 + }, + { + "epoch": 0.99, + "eval_loss": 1.5434354543685913, + "eval_runtime": 226.2654, + "eval_samples_per_second": 441.959, + "eval_steps_per_second": 13.811, + "step": 105500 + }, + { + "epoch": 0.99, + "learning_rate": 4.005273925038945e-05, + "loss": 1.7352, + "step": 106000 + }, + { + "epoch": 0.99, + "eval_loss": 1.5321134328842163, + "eval_runtime": 226.0566, + "eval_samples_per_second": 442.367, + "eval_steps_per_second": 13.824, + "step": 106000 + }, + { + "epoch": 1.0, + "learning_rate": 4.00058182091177e-05, + "loss": 1.7152, + "step": 106500 + }, + { + "epoch": 1.0, + "eval_loss": 1.5397337675094604, + "eval_runtime": 226.363, + "eval_samples_per_second": 441.768, + "eval_steps_per_second": 13.805, + "step": 106500 + }, + { + "epoch": 1.0, + "learning_rate": 3.995889716784595e-05, + "loss": 1.6927, + "step": 107000 + }, + { + "epoch": 1.0, + "eval_loss": 1.542881727218628, + "eval_runtime": 226.3007, + "eval_samples_per_second": 441.89, + "eval_steps_per_second": 13.809, + "step": 107000 + }, + { + "epoch": 1.01, + "learning_rate": 3.99119761265742e-05, + "loss": 1.7013, + "step": 107500 + }, + { + "epoch": 1.01, + "eval_loss": 1.528303861618042, + "eval_runtime": 226.2289, + "eval_samples_per_second": 442.03, + "eval_steps_per_second": 13.813, + "step": 107500 + }, + { + "epoch": 1.01, + "learning_rate": 3.9865055085302454e-05, + "loss": 1.7318, + "step": 108000 + }, + { + "epoch": 1.01, + "eval_loss": 1.5268659591674805, + "eval_runtime": 226.1768, + "eval_samples_per_second": 442.132, + "eval_steps_per_second": 13.817, + "step": 108000 + }, + { + "epoch": 1.02, + "learning_rate": 3.981813404403071e-05, + "loss": 1.6903, + "step": 108500 + }, + { + "epoch": 1.02, + "eval_loss": 1.528219223022461, + "eval_runtime": 226.4877, + "eval_samples_per_second": 441.525, + "eval_steps_per_second": 13.798, + "step": 108500 + }, + { + "epoch": 1.02, + "learning_rate": 3.977121300275896e-05, + "loss": 1.7105, + "step": 109000 + }, + { + "epoch": 1.02, + "eval_loss": 1.5189323425292969, + "eval_runtime": 226.1979, + "eval_samples_per_second": 442.091, + "eval_steps_per_second": 13.815, + "step": 109000 + }, + { + "epoch": 1.03, + "learning_rate": 3.972429196148721e-05, + "loss": 1.6979, + "step": 109500 + }, + { + "epoch": 1.03, + "eval_loss": 1.5255582332611084, + "eval_runtime": 226.4286, + "eval_samples_per_second": 441.64, + "eval_steps_per_second": 13.801, + "step": 109500 + }, + { + "epoch": 1.03, + "learning_rate": 3.967737092021546e-05, + "loss": 1.6922, + "step": 110000 + }, + { + "epoch": 1.03, + "eval_loss": 1.5171970129013062, + "eval_runtime": 226.4396, + "eval_samples_per_second": 441.619, + "eval_steps_per_second": 13.801, + "step": 110000 + }, + { + "epoch": 1.04, + "learning_rate": 3.9630449878943714e-05, + "loss": 1.7161, + "step": 110500 + }, + { + "epoch": 1.04, + "eval_loss": 1.517843246459961, + "eval_runtime": 226.4686, + "eval_samples_per_second": 441.562, + "eval_steps_per_second": 13.799, + "step": 110500 + }, + { + "epoch": 1.04, + "learning_rate": 3.958352883767197e-05, + "loss": 1.719, + "step": 111000 + }, + { + "epoch": 1.04, + "eval_loss": 1.5142602920532227, + "eval_runtime": 226.7344, + "eval_samples_per_second": 441.045, + "eval_steps_per_second": 13.783, + "step": 111000 + }, + { + "epoch": 1.05, + "learning_rate": 3.953660779640022e-05, + "loss": 1.6857, + "step": 111500 + }, + { + "epoch": 1.05, + "eval_loss": 1.5159624814987183, + "eval_runtime": 226.7078, + "eval_samples_per_second": 441.096, + "eval_steps_per_second": 13.784, + "step": 111500 + }, + { + "epoch": 1.05, + "learning_rate": 3.9489686755128474e-05, + "loss": 1.6983, + "step": 112000 + }, + { + "epoch": 1.05, + "eval_loss": 1.510577917098999, + "eval_runtime": 226.2373, + "eval_samples_per_second": 442.014, + "eval_steps_per_second": 13.813, + "step": 112000 + }, + { + "epoch": 1.06, + "learning_rate": 3.944276571385672e-05, + "loss": 1.6838, + "step": 112500 + }, + { + "epoch": 1.06, + "eval_loss": 1.5174448490142822, + "eval_runtime": 226.647, + "eval_samples_per_second": 441.215, + "eval_steps_per_second": 13.788, + "step": 112500 + }, + { + "epoch": 1.06, + "learning_rate": 3.9395844672584974e-05, + "loss": 1.6936, + "step": 113000 + }, + { + "epoch": 1.06, + "eval_loss": 1.5134094953536987, + "eval_runtime": 226.7825, + "eval_samples_per_second": 440.951, + "eval_steps_per_second": 13.78, + "step": 113000 + }, + { + "epoch": 1.07, + "learning_rate": 3.934892363131323e-05, + "loss": 1.675, + "step": 113500 + }, + { + "epoch": 1.07, + "eval_loss": 1.5090696811676025, + "eval_runtime": 226.8241, + "eval_samples_per_second": 440.87, + "eval_steps_per_second": 13.777, + "step": 113500 + }, + { + "epoch": 1.07, + "learning_rate": 3.930200259004148e-05, + "loss": 1.6706, + "step": 114000 + }, + { + "epoch": 1.07, + "eval_loss": 1.5159416198730469, + "eval_runtime": 226.8746, + "eval_samples_per_second": 440.772, + "eval_steps_per_second": 13.774, + "step": 114000 + }, + { + "epoch": 1.07, + "learning_rate": 3.9255081548769734e-05, + "loss": 1.6763, + "step": 114500 + }, + { + "epoch": 1.07, + "eval_loss": 1.4984365701675415, + "eval_runtime": 226.8627, + "eval_samples_per_second": 440.795, + "eval_steps_per_second": 13.775, + "step": 114500 + }, + { + "epoch": 1.08, + "learning_rate": 3.920816050749798e-05, + "loss": 1.6888, + "step": 115000 + }, + { + "epoch": 1.08, + "eval_loss": 1.504223108291626, + "eval_runtime": 228.3239, + "eval_samples_per_second": 437.974, + "eval_steps_per_second": 13.687, + "step": 115000 + }, + { + "epoch": 1.08, + "learning_rate": 3.916123946622623e-05, + "loss": 1.6627, + "step": 115500 + }, + { + "epoch": 1.08, + "eval_loss": 1.5011541843414307, + "eval_runtime": 226.9808, + "eval_samples_per_second": 440.566, + "eval_steps_per_second": 13.768, + "step": 115500 + }, + { + "epoch": 1.09, + "learning_rate": 3.911431842495449e-05, + "loss": 1.6739, + "step": 116000 + }, + { + "epoch": 1.09, + "eval_loss": 1.5042829513549805, + "eval_runtime": 227.232, + "eval_samples_per_second": 440.079, + "eval_steps_per_second": 13.752, + "step": 116000 + }, + { + "epoch": 1.09, + "learning_rate": 3.906739738368274e-05, + "loss": 1.6465, + "step": 116500 + }, + { + "epoch": 1.09, + "eval_loss": 1.496393084526062, + "eval_runtime": 227.0422, + "eval_samples_per_second": 440.447, + "eval_steps_per_second": 13.764, + "step": 116500 + }, + { + "epoch": 1.1, + "learning_rate": 3.902047634241099e-05, + "loss": 1.681, + "step": 117000 + }, + { + "epoch": 1.1, + "eval_loss": 1.501774549484253, + "eval_runtime": 226.8713, + "eval_samples_per_second": 440.779, + "eval_steps_per_second": 13.774, + "step": 117000 + }, + { + "epoch": 1.1, + "learning_rate": 3.8973555301139247e-05, + "loss": 1.6674, + "step": 117500 + }, + { + "epoch": 1.1, + "eval_loss": 1.492223858833313, + "eval_runtime": 227.2955, + "eval_samples_per_second": 439.956, + "eval_steps_per_second": 13.749, + "step": 117500 + }, + { + "epoch": 1.11, + "learning_rate": 3.892663425986749e-05, + "loss": 1.6808, + "step": 118000 + }, + { + "epoch": 1.11, + "eval_loss": 1.5016900300979614, + "eval_runtime": 227.0887, + "eval_samples_per_second": 440.356, + "eval_steps_per_second": 13.761, + "step": 118000 + }, + { + "epoch": 1.11, + "learning_rate": 3.8879713218595746e-05, + "loss": 1.6687, + "step": 118500 + }, + { + "epoch": 1.11, + "eval_loss": 1.4928853511810303, + "eval_runtime": 235.0541, + "eval_samples_per_second": 425.434, + "eval_steps_per_second": 13.295, + "step": 118500 + }, + { + "epoch": 1.12, + "learning_rate": 3.8832792177324e-05, + "loss": 1.6711, + "step": 119000 + }, + { + "epoch": 1.12, + "eval_loss": 1.4951835870742798, + "eval_runtime": 231.8391, + "eval_samples_per_second": 431.334, + "eval_steps_per_second": 13.479, + "step": 119000 + }, + { + "epoch": 1.12, + "learning_rate": 3.878587113605225e-05, + "loss": 1.6629, + "step": 119500 + }, + { + "epoch": 1.12, + "eval_loss": 1.49068284034729, + "eval_runtime": 232.7614, + "eval_samples_per_second": 429.624, + "eval_steps_per_second": 13.426, + "step": 119500 + }, + { + "epoch": 1.13, + "learning_rate": 3.8738950094780506e-05, + "loss": 1.6762, + "step": 120000 + }, + { + "epoch": 1.13, + "eval_loss": 1.497752070426941, + "eval_runtime": 226.4934, + "eval_samples_per_second": 441.514, + "eval_steps_per_second": 13.797, + "step": 120000 + }, + { + "epoch": 1.13, + "learning_rate": 3.869202905350875e-05, + "loss": 1.6806, + "step": 120500 + }, + { + "epoch": 1.13, + "eval_loss": 1.5004570484161377, + "eval_runtime": 231.6639, + "eval_samples_per_second": 431.66, + "eval_steps_per_second": 13.489, + "step": 120500 + }, + { + "epoch": 1.14, + "learning_rate": 3.8645108012237006e-05, + "loss": 1.6322, + "step": 121000 + }, + { + "epoch": 1.14, + "eval_loss": 1.491471767425537, + "eval_runtime": 223.6465, + "eval_samples_per_second": 447.134, + "eval_steps_per_second": 13.973, + "step": 121000 + }, + { + "epoch": 1.14, + "learning_rate": 3.859818697096526e-05, + "loss": 1.6823, + "step": 121500 + }, + { + "epoch": 1.14, + "eval_loss": 1.4876571893692017, + "eval_runtime": 223.7062, + "eval_samples_per_second": 447.015, + "eval_steps_per_second": 13.969, + "step": 121500 + }, + { + "epoch": 1.14, + "learning_rate": 3.855126592969351e-05, + "loss": 1.6659, + "step": 122000 + }, + { + "epoch": 1.14, + "eval_loss": 1.4812325239181519, + "eval_runtime": 223.6584, + "eval_samples_per_second": 447.11, + "eval_steps_per_second": 13.972, + "step": 122000 + }, + { + "epoch": 1.15, + "learning_rate": 3.8504344888421766e-05, + "loss": 1.6801, + "step": 122500 + }, + { + "epoch": 1.15, + "eval_loss": 1.4771298170089722, + "eval_runtime": 223.7401, + "eval_samples_per_second": 446.947, + "eval_steps_per_second": 13.967, + "step": 122500 + }, + { + "epoch": 1.15, + "learning_rate": 3.845742384715002e-05, + "loss": 1.6685, + "step": 123000 + }, + { + "epoch": 1.15, + "eval_loss": 1.473673939704895, + "eval_runtime": 223.7681, + "eval_samples_per_second": 446.891, + "eval_steps_per_second": 13.965, + "step": 123000 + }, + { + "epoch": 1.16, + "learning_rate": 3.8410502805878266e-05, + "loss": 1.6686, + "step": 123500 + }, + { + "epoch": 1.16, + "eval_loss": 1.4780343770980835, + "eval_runtime": 223.8692, + "eval_samples_per_second": 446.689, + "eval_steps_per_second": 13.959, + "step": 123500 + }, + { + "epoch": 1.16, + "learning_rate": 3.836358176460652e-05, + "loss": 1.6603, + "step": 124000 + }, + { + "epoch": 1.16, + "eval_loss": 1.478870153427124, + "eval_runtime": 237.4256, + "eval_samples_per_second": 421.185, + "eval_steps_per_second": 13.162, + "step": 124000 + }, + { + "epoch": 1.17, + "learning_rate": 3.831666072333477e-05, + "loss": 1.6402, + "step": 124500 + }, + { + "epoch": 1.17, + "eval_loss": 1.4783562421798706, + "eval_runtime": 236.4563, + "eval_samples_per_second": 422.911, + "eval_steps_per_second": 13.216, + "step": 124500 + }, + { + "epoch": 1.17, + "learning_rate": 3.8269739682063025e-05, + "loss": 1.6585, + "step": 125000 + }, + { + "epoch": 1.17, + "eval_loss": 1.465975046157837, + "eval_runtime": 236.3659, + "eval_samples_per_second": 423.073, + "eval_steps_per_second": 13.221, + "step": 125000 + }, + { + "epoch": 1.18, + "learning_rate": 3.822281864079128e-05, + "loss": 1.6569, + "step": 125500 + }, + { + "epoch": 1.18, + "eval_loss": 1.4764187335968018, + "eval_runtime": 224.8017, + "eval_samples_per_second": 444.837, + "eval_steps_per_second": 13.901, + "step": 125500 + }, + { + "epoch": 1.18, + "learning_rate": 3.8175897599519525e-05, + "loss": 1.6557, + "step": 126000 + }, + { + "epoch": 1.18, + "eval_loss": 1.4717729091644287, + "eval_runtime": 221.9623, + "eval_samples_per_second": 450.527, + "eval_steps_per_second": 14.079, + "step": 126000 + }, + { + "epoch": 1.19, + "learning_rate": 3.812897655824778e-05, + "loss": 1.6409, + "step": 126500 + }, + { + "epoch": 1.19, + "eval_loss": 1.4730979204177856, + "eval_runtime": 221.9654, + "eval_samples_per_second": 450.521, + "eval_steps_per_second": 14.079, + "step": 126500 + }, + { + "epoch": 1.19, + "learning_rate": 3.808205551697603e-05, + "loss": 1.6487, + "step": 127000 + }, + { + "epoch": 1.19, + "eval_loss": 1.4701308012008667, + "eval_runtime": 222.0375, + "eval_samples_per_second": 450.374, + "eval_steps_per_second": 14.074, + "step": 127000 + }, + { + "epoch": 1.2, + "learning_rate": 3.8035134475704285e-05, + "loss": 1.645, + "step": 127500 + }, + { + "epoch": 1.2, + "eval_loss": 1.4616634845733643, + "eval_runtime": 222.0725, + "eval_samples_per_second": 450.303, + "eval_steps_per_second": 14.072, + "step": 127500 + }, + { + "epoch": 1.2, + "learning_rate": 3.798821343443254e-05, + "loss": 1.6402, + "step": 128000 + }, + { + "epoch": 1.2, + "eval_loss": 1.4659932851791382, + "eval_runtime": 222.1612, + "eval_samples_per_second": 450.124, + "eval_steps_per_second": 14.066, + "step": 128000 + }, + { + "epoch": 1.21, + "learning_rate": 3.794129239316079e-05, + "loss": 1.6373, + "step": 128500 + }, + { + "epoch": 1.21, + "eval_loss": 1.4619441032409668, + "eval_runtime": 222.2305, + "eval_samples_per_second": 449.983, + "eval_steps_per_second": 14.062, + "step": 128500 + }, + { + "epoch": 1.21, + "learning_rate": 3.789437135188904e-05, + "loss": 1.6473, + "step": 129000 + }, + { + "epoch": 1.21, + "eval_loss": 1.4622013568878174, + "eval_runtime": 222.2288, + "eval_samples_per_second": 449.987, + "eval_steps_per_second": 14.062, + "step": 129000 + }, + { + "epoch": 1.22, + "learning_rate": 3.784745031061729e-05, + "loss": 1.6421, + "step": 129500 + }, + { + "epoch": 1.22, + "eval_loss": 1.4600987434387207, + "eval_runtime": 222.2845, + "eval_samples_per_second": 449.874, + "eval_steps_per_second": 14.059, + "step": 129500 + }, + { + "epoch": 1.22, + "learning_rate": 3.7800529269345545e-05, + "loss": 1.6401, + "step": 130000 + }, + { + "epoch": 1.22, + "eval_loss": 1.4546868801116943, + "eval_runtime": 229.4754, + "eval_samples_per_second": 435.777, + "eval_steps_per_second": 13.618, + "step": 130000 + }, + { + "epoch": 1.22, + "learning_rate": 3.77536082280738e-05, + "loss": 1.6446, + "step": 130500 + }, + { + "epoch": 1.22, + "eval_loss": 1.4492428302764893, + "eval_runtime": 228.4821, + "eval_samples_per_second": 437.671, + "eval_steps_per_second": 13.677, + "step": 130500 + }, + { + "epoch": 1.23, + "learning_rate": 3.770668718680205e-05, + "loss": 1.6274, + "step": 131000 + }, + { + "epoch": 1.23, + "eval_loss": 1.4530912637710571, + "eval_runtime": 228.7476, + "eval_samples_per_second": 437.163, + "eval_steps_per_second": 13.661, + "step": 131000 + }, + { + "epoch": 1.23, + "learning_rate": 3.76597661455303e-05, + "loss": 1.6548, + "step": 131500 + }, + { + "epoch": 1.23, + "eval_loss": 1.453102946281433, + "eval_runtime": 232.9785, + "eval_samples_per_second": 429.224, + "eval_steps_per_second": 13.413, + "step": 131500 + }, + { + "epoch": 1.24, + "learning_rate": 3.761284510425855e-05, + "loss": 1.6128, + "step": 132000 + }, + { + "epoch": 1.24, + "eval_loss": 1.445427656173706, + "eval_runtime": 232.8681, + "eval_samples_per_second": 429.428, + "eval_steps_per_second": 13.42, + "step": 132000 + }, + { + "epoch": 1.24, + "learning_rate": 3.7565924062986804e-05, + "loss": 1.6217, + "step": 132500 + }, + { + "epoch": 1.24, + "eval_loss": 1.4480894804000854, + "eval_runtime": 229.5409, + "eval_samples_per_second": 435.652, + "eval_steps_per_second": 13.614, + "step": 132500 + }, + { + "epoch": 1.25, + "learning_rate": 3.751900302171506e-05, + "loss": 1.6292, + "step": 133000 + }, + { + "epoch": 1.25, + "eval_loss": 1.4402817487716675, + "eval_runtime": 223.3255, + "eval_samples_per_second": 447.777, + "eval_steps_per_second": 13.993, + "step": 133000 + }, + { + "epoch": 1.25, + "learning_rate": 3.747208198044331e-05, + "loss": 1.6262, + "step": 133500 + }, + { + "epoch": 1.25, + "eval_loss": 1.4454220533370972, + "eval_runtime": 223.1317, + "eval_samples_per_second": 448.166, + "eval_steps_per_second": 14.005, + "step": 133500 + }, + { + "epoch": 1.26, + "learning_rate": 3.7425160939171564e-05, + "loss": 1.6355, + "step": 134000 + }, + { + "epoch": 1.26, + "eval_loss": 1.4490951299667358, + "eval_runtime": 222.8039, + "eval_samples_per_second": 448.825, + "eval_steps_per_second": 14.026, + "step": 134000 + }, + { + "epoch": 1.26, + "learning_rate": 3.737823989789981e-05, + "loss": 1.6361, + "step": 134500 + }, + { + "epoch": 1.26, + "eval_loss": 1.446262001991272, + "eval_runtime": 222.7683, + "eval_samples_per_second": 448.897, + "eval_steps_per_second": 14.028, + "step": 134500 + }, + { + "epoch": 1.27, + "learning_rate": 3.7331318856628064e-05, + "loss": 1.6147, + "step": 135000 + }, + { + "epoch": 1.27, + "eval_loss": 1.4396119117736816, + "eval_runtime": 222.7955, + "eval_samples_per_second": 448.842, + "eval_steps_per_second": 14.026, + "step": 135000 + }, + { + "epoch": 1.27, + "learning_rate": 3.728439781535632e-05, + "loss": 1.6442, + "step": 135500 + }, + { + "epoch": 1.27, + "eval_loss": 1.4377689361572266, + "eval_runtime": 222.8075, + "eval_samples_per_second": 448.818, + "eval_steps_per_second": 14.026, + "step": 135500 + }, + { + "epoch": 1.28, + "learning_rate": 3.723747677408457e-05, + "loss": 1.6069, + "step": 136000 + }, + { + "epoch": 1.28, + "eval_loss": 1.436558723449707, + "eval_runtime": 222.7834, + "eval_samples_per_second": 448.867, + "eval_steps_per_second": 14.027, + "step": 136000 + }, + { + "epoch": 1.28, + "learning_rate": 3.7190555732812824e-05, + "loss": 1.6183, + "step": 136500 + }, + { + "epoch": 1.28, + "eval_loss": 1.4427673816680908, + "eval_runtime": 222.8527, + "eval_samples_per_second": 448.727, + "eval_steps_per_second": 14.023, + "step": 136500 + }, + { + "epoch": 1.29, + "learning_rate": 3.714363469154107e-05, + "loss": 1.6183, + "step": 137000 + }, + { + "epoch": 1.29, + "eval_loss": 1.4434157609939575, + "eval_runtime": 222.9556, + "eval_samples_per_second": 448.52, + "eval_steps_per_second": 14.016, + "step": 137000 + }, + { + "epoch": 1.29, + "learning_rate": 3.7096713650269324e-05, + "loss": 1.6068, + "step": 137500 + }, + { + "epoch": 1.29, + "eval_loss": 1.4364444017410278, + "eval_runtime": 222.8985, + "eval_samples_per_second": 448.635, + "eval_steps_per_second": 14.02, + "step": 137500 + }, + { + "epoch": 1.3, + "learning_rate": 3.704979260899758e-05, + "loss": 1.6363, + "step": 138000 + }, + { + "epoch": 1.3, + "eval_loss": 1.4355006217956543, + "eval_runtime": 222.8369, + "eval_samples_per_second": 448.759, + "eval_steps_per_second": 14.024, + "step": 138000 + }, + { + "epoch": 1.3, + "learning_rate": 3.700287156772583e-05, + "loss": 1.6119, + "step": 138500 + }, + { + "epoch": 1.3, + "eval_loss": 1.4310338497161865, + "eval_runtime": 222.7256, + "eval_samples_per_second": 448.983, + "eval_steps_per_second": 14.031, + "step": 138500 + }, + { + "epoch": 1.3, + "learning_rate": 3.6955950526454084e-05, + "loss": 1.6483, + "step": 139000 + }, + { + "epoch": 1.3, + "eval_loss": 1.4362512826919556, + "eval_runtime": 222.6329, + "eval_samples_per_second": 449.17, + "eval_steps_per_second": 14.037, + "step": 139000 + }, + { + "epoch": 1.31, + "learning_rate": 3.690902948518234e-05, + "loss": 1.612, + "step": 139500 + }, + { + "epoch": 1.31, + "eval_loss": 1.4332929849624634, + "eval_runtime": 222.5859, + "eval_samples_per_second": 449.265, + "eval_steps_per_second": 14.04, + "step": 139500 + }, + { + "epoch": 1.31, + "learning_rate": 3.686210844391059e-05, + "loss": 1.61, + "step": 140000 + }, + { + "epoch": 1.31, + "eval_loss": 1.4201843738555908, + "eval_runtime": 226.6701, + "eval_samples_per_second": 441.17, + "eval_steps_per_second": 13.787, + "step": 140000 + }, + { + "epoch": 1.32, + "learning_rate": 3.681518740263884e-05, + "loss": 1.6236, + "step": 140500 + }, + { + "epoch": 1.32, + "eval_loss": 1.4394793510437012, + "eval_runtime": 234.5172, + "eval_samples_per_second": 426.408, + "eval_steps_per_second": 13.325, + "step": 140500 + }, + { + "epoch": 1.32, + "learning_rate": 3.676826636136709e-05, + "loss": 1.6185, + "step": 141000 + }, + { + "epoch": 1.32, + "eval_loss": 1.433275818824768, + "eval_runtime": 234.8559, + "eval_samples_per_second": 425.793, + "eval_steps_per_second": 13.306, + "step": 141000 + }, + { + "epoch": 1.33, + "learning_rate": 3.672134532009534e-05, + "loss": 1.6061, + "step": 141500 + }, + { + "epoch": 1.33, + "eval_loss": 1.418621301651001, + "eval_runtime": 236.7519, + "eval_samples_per_second": 422.383, + "eval_steps_per_second": 13.199, + "step": 141500 + }, + { + "epoch": 1.33, + "learning_rate": 3.6674424278823596e-05, + "loss": 1.6104, + "step": 142000 + }, + { + "epoch": 1.33, + "eval_loss": 1.4307106733322144, + "eval_runtime": 235.4095, + "eval_samples_per_second": 424.792, + "eval_steps_per_second": 13.275, + "step": 142000 + }, + { + "epoch": 1.34, + "learning_rate": 3.662750323755185e-05, + "loss": 1.6079, + "step": 142500 + }, + { + "epoch": 1.34, + "eval_loss": 1.4250682592391968, + "eval_runtime": 235.4454, + "eval_samples_per_second": 424.727, + "eval_steps_per_second": 13.273, + "step": 142500 + }, + { + "epoch": 1.34, + "learning_rate": 3.65805821962801e-05, + "loss": 1.5905, + "step": 143000 + }, + { + "epoch": 1.34, + "eval_loss": 1.4250717163085938, + "eval_runtime": 235.7357, + "eval_samples_per_second": 424.204, + "eval_steps_per_second": 13.256, + "step": 143000 + }, + { + "epoch": 1.35, + "learning_rate": 3.653366115500835e-05, + "loss": 1.6081, + "step": 143500 + }, + { + "epoch": 1.35, + "eval_loss": 1.4208674430847168, + "eval_runtime": 234.1473, + "eval_samples_per_second": 427.082, + "eval_steps_per_second": 13.346, + "step": 143500 + }, + { + "epoch": 1.35, + "learning_rate": 3.64867401137366e-05, + "loss": 1.6312, + "step": 144000 + }, + { + "epoch": 1.35, + "eval_loss": 1.4182296991348267, + "eval_runtime": 235.783, + "eval_samples_per_second": 424.119, + "eval_steps_per_second": 13.254, + "step": 144000 + }, + { + "epoch": 1.36, + "learning_rate": 3.6439819072464856e-05, + "loss": 1.6118, + "step": 144500 + }, + { + "epoch": 1.36, + "eval_loss": 1.4259034395217896, + "eval_runtime": 235.3478, + "eval_samples_per_second": 424.903, + "eval_steps_per_second": 13.278, + "step": 144500 + }, + { + "epoch": 1.36, + "learning_rate": 3.639289803119311e-05, + "loss": 1.6056, + "step": 145000 + }, + { + "epoch": 1.36, + "eval_loss": 1.4210768938064575, + "eval_runtime": 233.3721, + "eval_samples_per_second": 428.5, + "eval_steps_per_second": 13.391, + "step": 145000 + }, + { + "epoch": 1.37, + "learning_rate": 3.634597698992136e-05, + "loss": 1.6058, + "step": 145500 + }, + { + "epoch": 1.37, + "eval_loss": 1.4050568342208862, + "eval_runtime": 231.2948, + "eval_samples_per_second": 432.349, + "eval_steps_per_second": 13.511, + "step": 145500 + }, + { + "epoch": 1.37, + "learning_rate": 3.6299055948649616e-05, + "loss": 1.6005, + "step": 146000 + }, + { + "epoch": 1.37, + "eval_loss": 1.4116266965866089, + "eval_runtime": 231.0158, + "eval_samples_per_second": 432.871, + "eval_steps_per_second": 13.527, + "step": 146000 + }, + { + "epoch": 1.37, + "learning_rate": 3.625213490737787e-05, + "loss": 1.5909, + "step": 146500 + }, + { + "epoch": 1.37, + "eval_loss": 1.4132776260375977, + "eval_runtime": 231.0824, + "eval_samples_per_second": 432.746, + "eval_steps_per_second": 13.523, + "step": 146500 + }, + { + "epoch": 1.38, + "learning_rate": 3.6205213866106116e-05, + "loss": 1.614, + "step": 147000 + }, + { + "epoch": 1.38, + "eval_loss": 1.413307547569275, + "eval_runtime": 222.5174, + "eval_samples_per_second": 449.403, + "eval_steps_per_second": 14.044, + "step": 147000 + }, + { + "epoch": 1.38, + "learning_rate": 3.615829282483437e-05, + "loss": 1.5907, + "step": 147500 + }, + { + "epoch": 1.38, + "eval_loss": 1.4117494821548462, + "eval_runtime": 222.5274, + "eval_samples_per_second": 449.383, + "eval_steps_per_second": 14.043, + "step": 147500 + }, + { + "epoch": 1.39, + "learning_rate": 3.611137178356262e-05, + "loss": 1.5882, + "step": 148000 + }, + { + "epoch": 1.39, + "eval_loss": 1.4053359031677246, + "eval_runtime": 222.505, + "eval_samples_per_second": 449.428, + "eval_steps_per_second": 14.045, + "step": 148000 + }, + { + "epoch": 1.39, + "learning_rate": 3.6064450742290876e-05, + "loss": 1.5864, + "step": 148500 + }, + { + "epoch": 1.39, + "eval_loss": 1.3983163833618164, + "eval_runtime": 222.439, + "eval_samples_per_second": 449.561, + "eval_steps_per_second": 14.049, + "step": 148500 + }, + { + "epoch": 1.4, + "learning_rate": 3.601752970101913e-05, + "loss": 1.5881, + "step": 149000 + }, + { + "epoch": 1.4, + "eval_loss": 1.4097754955291748, + "eval_runtime": 222.6731, + "eval_samples_per_second": 449.089, + "eval_steps_per_second": 14.034, + "step": 149000 + }, + { + "epoch": 1.4, + "learning_rate": 3.597060865974738e-05, + "loss": 1.5866, + "step": 149500 + }, + { + "epoch": 1.4, + "eval_loss": 1.4093042612075806, + "eval_runtime": 222.7781, + "eval_samples_per_second": 448.877, + "eval_steps_per_second": 14.027, + "step": 149500 + }, + { + "epoch": 1.41, + "learning_rate": 3.5923687618475635e-05, + "loss": 1.5835, + "step": 150000 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054986238479614, + "eval_runtime": 228.6714, + "eval_samples_per_second": 437.309, + "eval_steps_per_second": 13.666, + "step": 150000 + }, + { + "epoch": 1.41, + "learning_rate": 3.587676657720388e-05, + "loss": 1.5792, + "step": 150500 + }, + { + "epoch": 1.41, + "eval_loss": 1.40665864944458, + "eval_runtime": 228.4732, + "eval_samples_per_second": 437.688, + "eval_steps_per_second": 13.678, + "step": 150500 + }, + { + "epoch": 1.42, + "learning_rate": 3.5829845535932135e-05, + "loss": 1.5667, + "step": 151000 + }, + { + "epoch": 1.42, + "eval_loss": 1.4050201177597046, + "eval_runtime": 228.2289, + "eval_samples_per_second": 438.157, + "eval_steps_per_second": 13.692, + "step": 151000 + }, + { + "epoch": 1.42, + "learning_rate": 3.578292449466039e-05, + "loss": 1.5771, + "step": 151500 + }, + { + "epoch": 1.42, + "eval_loss": 1.4022421836853027, + "eval_runtime": 233.937, + "eval_samples_per_second": 427.466, + "eval_steps_per_second": 13.358, + "step": 151500 + }, + { + "epoch": 1.43, + "learning_rate": 3.573600345338864e-05, + "loss": 1.5768, + "step": 152000 + }, + { + "epoch": 1.43, + "eval_loss": 1.4057670831680298, + "eval_runtime": 233.6648, + "eval_samples_per_second": 427.963, + "eval_steps_per_second": 13.374, + "step": 152000 + }, + { + "epoch": 1.43, + "learning_rate": 3.5689082412116895e-05, + "loss": 1.5752, + "step": 152500 + }, + { + "epoch": 1.43, + "eval_loss": 1.403328537940979, + "eval_runtime": 239.3336, + "eval_samples_per_second": 417.827, + "eval_steps_per_second": 13.057, + "step": 152500 + }, + { + "epoch": 1.44, + "learning_rate": 3.564216137084515e-05, + "loss": 1.5757, + "step": 153000 + }, + { + "epoch": 1.44, + "eval_loss": 1.3900455236434937, + "eval_runtime": 237.9128, + "eval_samples_per_second": 420.322, + "eval_steps_per_second": 13.135, + "step": 153000 + }, + { + "epoch": 1.44, + "learning_rate": 3.5595240329573395e-05, + "loss": 1.5689, + "step": 153500 + }, + { + "epoch": 1.44, + "eval_loss": 1.3995317220687866, + "eval_runtime": 239.4513, + "eval_samples_per_second": 417.621, + "eval_steps_per_second": 13.051, + "step": 153500 + }, + { + "epoch": 1.45, + "learning_rate": 3.554831928830165e-05, + "loss": 1.5896, + "step": 154000 + }, + { + "epoch": 1.45, + "eval_loss": 1.3904423713684082, + "eval_runtime": 236.1241, + "eval_samples_per_second": 423.506, + "eval_steps_per_second": 13.235, + "step": 154000 + }, + { + "epoch": 1.45, + "learning_rate": 3.55013982470299e-05, + "loss": 1.5916, + "step": 154500 + }, + { + "epoch": 1.45, + "eval_loss": 1.400153636932373, + "eval_runtime": 227.755, + "eval_samples_per_second": 439.068, + "eval_steps_per_second": 13.721, + "step": 154500 + }, + { + "epoch": 1.45, + "learning_rate": 3.5454477205758155e-05, + "loss": 1.5896, + "step": 155000 + }, + { + "epoch": 1.45, + "eval_loss": 1.3952091932296753, + "eval_runtime": 222.2884, + "eval_samples_per_second": 449.866, + "eval_steps_per_second": 14.058, + "step": 155000 + }, + { + "epoch": 1.46, + "learning_rate": 3.540755616448641e-05, + "loss": 1.5947, + "step": 155500 + }, + { + "epoch": 1.46, + "eval_loss": 1.3903526067733765, + "eval_runtime": 222.5604, + "eval_samples_per_second": 449.316, + "eval_steps_per_second": 14.041, + "step": 155500 + }, + { + "epoch": 1.46, + "learning_rate": 3.536063512321466e-05, + "loss": 1.5658, + "step": 156000 + }, + { + "epoch": 1.46, + "eval_loss": 1.388243317604065, + "eval_runtime": 222.4336, + "eval_samples_per_second": 449.572, + "eval_steps_per_second": 14.049, + "step": 156000 + }, + { + "epoch": 1.47, + "learning_rate": 3.531371408194291e-05, + "loss": 1.5711, + "step": 156500 + }, + { + "epoch": 1.47, + "eval_loss": 1.3848968744277954, + "eval_runtime": 222.2993, + "eval_samples_per_second": 449.844, + "eval_steps_per_second": 14.058, + "step": 156500 + }, + { + "epoch": 1.47, + "learning_rate": 3.526679304067116e-05, + "loss": 1.5877, + "step": 157000 + }, + { + "epoch": 1.47, + "eval_loss": 1.3827555179595947, + "eval_runtime": 222.1315, + "eval_samples_per_second": 450.184, + "eval_steps_per_second": 14.068, + "step": 157000 + }, + { + "epoch": 1.48, + "learning_rate": 3.5219871999399414e-05, + "loss": 1.5695, + "step": 157500 + }, + { + "epoch": 1.48, + "eval_loss": 1.390235185623169, + "eval_runtime": 222.0358, + "eval_samples_per_second": 450.378, + "eval_steps_per_second": 14.074, + "step": 157500 + }, + { + "epoch": 1.48, + "learning_rate": 3.517295095812767e-05, + "loss": 1.5652, + "step": 158000 + }, + { + "epoch": 1.48, + "eval_loss": 1.3825620412826538, + "eval_runtime": 221.9295, + "eval_samples_per_second": 450.594, + "eval_steps_per_second": 14.081, + "step": 158000 + }, + { + "epoch": 1.49, + "learning_rate": 3.512602991685592e-05, + "loss": 1.5696, + "step": 158500 + }, + { + "epoch": 1.49, + "eval_loss": 1.3775691986083984, + "eval_runtime": 221.9726, + "eval_samples_per_second": 450.506, + "eval_steps_per_second": 14.078, + "step": 158500 + }, + { + "epoch": 1.49, + "learning_rate": 3.5079108875584174e-05, + "loss": 1.5738, + "step": 159000 + }, + { + "epoch": 1.49, + "eval_loss": 1.3838963508605957, + "eval_runtime": 221.9255, + "eval_samples_per_second": 450.602, + "eval_steps_per_second": 14.081, + "step": 159000 + }, + { + "epoch": 1.5, + "learning_rate": 3.503218783431242e-05, + "loss": 1.5685, + "step": 159500 + }, + { + "epoch": 1.5, + "eval_loss": 1.384406566619873, + "eval_runtime": 221.8809, + "eval_samples_per_second": 450.692, + "eval_steps_per_second": 14.084, + "step": 159500 + }, + { + "epoch": 1.5, + "learning_rate": 3.4985266793040674e-05, + "loss": 1.5653, + "step": 160000 + }, + { + "epoch": 1.5, + "eval_loss": 1.3828558921813965, + "eval_runtime": 221.9019, + "eval_samples_per_second": 450.65, + "eval_steps_per_second": 14.083, + "step": 160000 + }, + { + "epoch": 1.51, + "learning_rate": 3.493834575176893e-05, + "loss": 1.5865, + "step": 160500 + }, + { + "epoch": 1.51, + "eval_loss": 1.385338306427002, + "eval_runtime": 221.7161, + "eval_samples_per_second": 451.027, + "eval_steps_per_second": 14.095, + "step": 160500 + }, + { + "epoch": 1.51, + "learning_rate": 3.489142471049718e-05, + "loss": 1.5501, + "step": 161000 + }, + { + "epoch": 1.51, + "eval_loss": 1.3802553415298462, + "eval_runtime": 221.7084, + "eval_samples_per_second": 451.043, + "eval_steps_per_second": 14.095, + "step": 161000 + }, + { + "epoch": 1.52, + "learning_rate": 3.4844503669225434e-05, + "loss": 1.5556, + "step": 161500 + }, + { + "epoch": 1.52, + "eval_loss": 1.3760793209075928, + "eval_runtime": 221.6941, + "eval_samples_per_second": 451.072, + "eval_steps_per_second": 14.096, + "step": 161500 + }, + { + "epoch": 1.52, + "learning_rate": 3.479758262795368e-05, + "loss": 1.5857, + "step": 162000 + }, + { + "epoch": 1.52, + "eval_loss": 1.38301420211792, + "eval_runtime": 221.6934, + "eval_samples_per_second": 451.073, + "eval_steps_per_second": 14.096, + "step": 162000 + }, + { + "epoch": 1.52, + "learning_rate": 3.4750661586681934e-05, + "loss": 1.5549, + "step": 162500 + }, + { + "epoch": 1.52, + "eval_loss": 1.3752098083496094, + "eval_runtime": 221.8522, + "eval_samples_per_second": 450.751, + "eval_steps_per_second": 14.086, + "step": 162500 + }, + { + "epoch": 1.53, + "learning_rate": 3.470374054541019e-05, + "loss": 1.5498, + "step": 163000 + }, + { + "epoch": 1.53, + "eval_loss": 1.378463625907898, + "eval_runtime": 221.8169, + "eval_samples_per_second": 450.822, + "eval_steps_per_second": 14.088, + "step": 163000 + }, + { + "epoch": 1.53, + "learning_rate": 3.465681950413844e-05, + "loss": 1.5719, + "step": 163500 + }, + { + "epoch": 1.53, + "eval_loss": 1.3748183250427246, + "eval_runtime": 221.783, + "eval_samples_per_second": 450.891, + "eval_steps_per_second": 14.09, + "step": 163500 + }, + { + "epoch": 1.54, + "learning_rate": 3.4609898462866694e-05, + "loss": 1.5751, + "step": 164000 + }, + { + "epoch": 1.54, + "eval_loss": 1.3731558322906494, + "eval_runtime": 221.7643, + "eval_samples_per_second": 450.929, + "eval_steps_per_second": 14.092, + "step": 164000 + }, + { + "epoch": 1.54, + "learning_rate": 3.456297742159495e-05, + "loss": 1.5459, + "step": 164500 + }, + { + "epoch": 1.54, + "eval_loss": 1.3722800016403198, + "eval_runtime": 221.6317, + "eval_samples_per_second": 451.199, + "eval_steps_per_second": 14.1, + "step": 164500 + }, + { + "epoch": 1.55, + "learning_rate": 3.451605638032319e-05, + "loss": 1.5603, + "step": 165000 + }, + { + "epoch": 1.55, + "eval_loss": 1.366198182106018, + "eval_runtime": 221.6334, + "eval_samples_per_second": 451.196, + "eval_steps_per_second": 14.1, + "step": 165000 + }, + { + "epoch": 1.55, + "learning_rate": 3.446913533905145e-05, + "loss": 1.5479, + "step": 165500 + }, + { + "epoch": 1.55, + "eval_loss": 1.3646401166915894, + "eval_runtime": 221.6561, + "eval_samples_per_second": 451.149, + "eval_steps_per_second": 14.098, + "step": 165500 + }, + { + "epoch": 1.56, + "learning_rate": 3.44222142977797e-05, + "loss": 1.5386, + "step": 166000 + }, + { + "epoch": 1.56, + "eval_loss": 1.3601208925247192, + "eval_runtime": 221.6415, + "eval_samples_per_second": 451.179, + "eval_steps_per_second": 14.099, + "step": 166000 + }, + { + "epoch": 1.56, + "learning_rate": 3.437529325650795e-05, + "loss": 1.5442, + "step": 166500 + }, + { + "epoch": 1.56, + "eval_loss": 1.357480764389038, + "eval_runtime": 221.6464, + "eval_samples_per_second": 451.169, + "eval_steps_per_second": 14.099, + "step": 166500 + }, + { + "epoch": 1.57, + "learning_rate": 3.4328372215236207e-05, + "loss": 1.5476, + "step": 167000 + }, + { + "epoch": 1.57, + "eval_loss": 1.3579912185668945, + "eval_runtime": 221.645, + "eval_samples_per_second": 451.172, + "eval_steps_per_second": 14.099, + "step": 167000 + }, + { + "epoch": 1.57, + "learning_rate": 3.428145117396445e-05, + "loss": 1.538, + "step": 167500 + }, + { + "epoch": 1.57, + "eval_loss": 1.3654874563217163, + "eval_runtime": 221.6529, + "eval_samples_per_second": 451.156, + "eval_steps_per_second": 14.099, + "step": 167500 + }, + { + "epoch": 1.58, + "learning_rate": 3.4234530132692706e-05, + "loss": 1.5531, + "step": 168000 + }, + { + "epoch": 1.58, + "eval_loss": 1.3746790885925293, + "eval_runtime": 221.6568, + "eval_samples_per_second": 451.148, + "eval_steps_per_second": 14.098, + "step": 168000 + }, + { + "epoch": 1.58, + "learning_rate": 3.418760909142096e-05, + "loss": 1.5538, + "step": 168500 + }, + { + "epoch": 1.58, + "eval_loss": 1.3619598150253296, + "eval_runtime": 221.6175, + "eval_samples_per_second": 451.228, + "eval_steps_per_second": 14.101, + "step": 168500 + }, + { + "epoch": 1.59, + "learning_rate": 3.414068805014921e-05, + "loss": 1.5497, + "step": 169000 + }, + { + "epoch": 1.59, + "eval_loss": 1.3550140857696533, + "eval_runtime": 221.6445, + "eval_samples_per_second": 451.173, + "eval_steps_per_second": 14.099, + "step": 169000 + }, + { + "epoch": 1.59, + "learning_rate": 3.4093767008877466e-05, + "loss": 1.5373, + "step": 169500 + }, + { + "epoch": 1.59, + "eval_loss": 1.3558791875839233, + "eval_runtime": 221.8117, + "eval_samples_per_second": 450.833, + "eval_steps_per_second": 14.089, + "step": 169500 + }, + { + "epoch": 1.6, + "learning_rate": 3.404684596760572e-05, + "loss": 1.551, + "step": 170000 + }, + { + "epoch": 1.6, + "eval_loss": 1.3610763549804688, + "eval_runtime": 221.8143, + "eval_samples_per_second": 450.828, + "eval_steps_per_second": 14.088, + "step": 170000 + }, + { + "epoch": 1.6, + "learning_rate": 3.3999924926333966e-05, + "loss": 1.5348, + "step": 170500 + }, + { + "epoch": 1.6, + "eval_loss": 1.3658798933029175, + "eval_runtime": 221.7237, + "eval_samples_per_second": 451.012, + "eval_steps_per_second": 14.094, + "step": 170500 + }, + { + "epoch": 1.6, + "learning_rate": 3.395300388506222e-05, + "loss": 1.5391, + "step": 171000 + }, + { + "epoch": 1.6, + "eval_loss": 1.3553804159164429, + "eval_runtime": 221.7256, + "eval_samples_per_second": 451.008, + "eval_steps_per_second": 14.094, + "step": 171000 + }, + { + "epoch": 1.61, + "learning_rate": 3.390608284379047e-05, + "loss": 1.5268, + "step": 171500 + }, + { + "epoch": 1.61, + "eval_loss": 1.3583581447601318, + "eval_runtime": 221.5416, + "eval_samples_per_second": 451.383, + "eval_steps_per_second": 14.106, + "step": 171500 + }, + { + "epoch": 1.61, + "learning_rate": 3.3859161802518726e-05, + "loss": 1.5377, + "step": 172000 + }, + { + "epoch": 1.61, + "eval_loss": 1.3604706525802612, + "eval_runtime": 221.5467, + "eval_samples_per_second": 451.372, + "eval_steps_per_second": 14.105, + "step": 172000 + }, + { + "epoch": 1.62, + "learning_rate": 3.381224076124698e-05, + "loss": 1.5201, + "step": 172500 + }, + { + "epoch": 1.62, + "eval_loss": 1.3594732284545898, + "eval_runtime": 221.5767, + "eval_samples_per_second": 451.311, + "eval_steps_per_second": 14.103, + "step": 172500 + }, + { + "epoch": 1.62, + "learning_rate": 3.3765319719975226e-05, + "loss": 1.5475, + "step": 173000 + }, + { + "epoch": 1.62, + "eval_loss": 1.3577581644058228, + "eval_runtime": 221.5721, + "eval_samples_per_second": 451.32, + "eval_steps_per_second": 14.104, + "step": 173000 + }, + { + "epoch": 1.63, + "learning_rate": 3.371839867870348e-05, + "loss": 1.5442, + "step": 173500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3556467294692993, + "eval_runtime": 221.5702, + "eval_samples_per_second": 451.324, + "eval_steps_per_second": 14.104, + "step": 173500 + }, + { + "epoch": 1.63, + "learning_rate": 3.367147763743173e-05, + "loss": 1.5487, + "step": 174000 + }, + { + "epoch": 1.63, + "eval_loss": 1.35114324092865, + "eval_runtime": 221.5518, + "eval_samples_per_second": 451.362, + "eval_steps_per_second": 14.105, + "step": 174000 + }, + { + "epoch": 1.64, + "learning_rate": 3.3624556596159985e-05, + "loss": 1.5282, + "step": 174500 + }, + { + "epoch": 1.64, + "eval_loss": 1.3505486249923706, + "eval_runtime": 221.5706, + "eval_samples_per_second": 451.323, + "eval_steps_per_second": 14.104, + "step": 174500 + }, + { + "epoch": 1.64, + "learning_rate": 3.357763555488824e-05, + "loss": 1.5335, + "step": 175000 + }, + { + "epoch": 1.64, + "eval_loss": 1.3498034477233887, + "eval_runtime": 221.5638, + "eval_samples_per_second": 451.337, + "eval_steps_per_second": 14.104, + "step": 175000 + }, + { + "epoch": 1.65, + "learning_rate": 3.353071451361649e-05, + "loss": 1.5217, + "step": 175500 + }, + { + "epoch": 1.65, + "eval_loss": 1.3372061252593994, + "eval_runtime": 221.6104, + "eval_samples_per_second": 451.242, + "eval_steps_per_second": 14.101, + "step": 175500 + }, + { + "epoch": 1.65, + "learning_rate": 3.348379347234474e-05, + "loss": 1.5374, + "step": 176000 + }, + { + "epoch": 1.65, + "eval_loss": 1.3543497323989868, + "eval_runtime": 221.5921, + "eval_samples_per_second": 451.28, + "eval_steps_per_second": 14.102, + "step": 176000 + }, + { + "epoch": 1.66, + "learning_rate": 3.343687243107299e-05, + "loss": 1.5282, + "step": 176500 + }, + { + "epoch": 1.66, + "eval_loss": 1.3469191789627075, + "eval_runtime": 221.5417, + "eval_samples_per_second": 451.382, + "eval_steps_per_second": 14.106, + "step": 176500 + }, + { + "epoch": 1.66, + "learning_rate": 3.3389951389801245e-05, + "loss": 1.5135, + "step": 177000 + }, + { + "epoch": 1.66, + "eval_loss": 1.3495582342147827, + "eval_runtime": 221.7135, + "eval_samples_per_second": 451.033, + "eval_steps_per_second": 14.095, + "step": 177000 + }, + { + "epoch": 1.67, + "learning_rate": 3.33430303485295e-05, + "loss": 1.5604, + "step": 177500 + }, + { + "epoch": 1.67, + "eval_loss": 1.3418365716934204, + "eval_runtime": 221.6933, + "eval_samples_per_second": 451.074, + "eval_steps_per_second": 14.096, + "step": 177500 + }, + { + "epoch": 1.67, + "learning_rate": 3.329610930725775e-05, + "loss": 1.5222, + "step": 178000 + }, + { + "epoch": 1.67, + "eval_loss": 1.3411223888397217, + "eval_runtime": 221.6176, + "eval_samples_per_second": 451.228, + "eval_steps_per_second": 14.101, + "step": 178000 + }, + { + "epoch": 1.68, + "learning_rate": 3.3249188265986e-05, + "loss": 1.5207, + "step": 178500 + }, + { + "epoch": 1.68, + "eval_loss": 1.3367658853530884, + "eval_runtime": 221.5787, + "eval_samples_per_second": 451.307, + "eval_steps_per_second": 14.103, + "step": 178500 + }, + { + "epoch": 1.68, + "learning_rate": 3.320226722471425e-05, + "loss": 1.5232, + "step": 179000 + }, + { + "epoch": 1.68, + "eval_loss": 1.3388681411743164, + "eval_runtime": 221.3977, + "eval_samples_per_second": 451.676, + "eval_steps_per_second": 14.115, + "step": 179000 + }, + { + "epoch": 1.68, + "learning_rate": 3.3155346183442505e-05, + "loss": 1.5397, + "step": 179500 + }, + { + "epoch": 1.68, + "eval_loss": 1.3412151336669922, + "eval_runtime": 221.3909, + "eval_samples_per_second": 451.69, + "eval_steps_per_second": 14.115, + "step": 179500 + }, + { + "epoch": 1.69, + "learning_rate": 3.310842514217076e-05, + "loss": 1.5368, + "step": 180000 + }, + { + "epoch": 1.69, + "eval_loss": 1.3428925275802612, + "eval_runtime": 221.3914, + "eval_samples_per_second": 451.689, + "eval_steps_per_second": 14.115, + "step": 180000 + }, + { + "epoch": 1.69, + "learning_rate": 3.306150410089901e-05, + "loss": 1.5595, + "step": 180500 + }, + { + "epoch": 1.69, + "eval_loss": 1.336493730545044, + "eval_runtime": 221.2471, + "eval_samples_per_second": 451.983, + "eval_steps_per_second": 14.124, + "step": 180500 + }, + { + "epoch": 1.7, + "learning_rate": 3.3014583059627265e-05, + "loss": 1.5407, + "step": 181000 + }, + { + "epoch": 1.7, + "eval_loss": 1.3366632461547852, + "eval_runtime": 221.2531, + "eval_samples_per_second": 451.971, + "eval_steps_per_second": 14.124, + "step": 181000 + }, + { + "epoch": 1.7, + "learning_rate": 3.296766201835551e-05, + "loss": 1.5289, + "step": 181500 + }, + { + "epoch": 1.7, + "eval_loss": 1.3385401964187622, + "eval_runtime": 221.3851, + "eval_samples_per_second": 451.702, + "eval_steps_per_second": 14.116, + "step": 181500 + }, + { + "epoch": 1.71, + "learning_rate": 3.2920740977083764e-05, + "loss": 1.5259, + "step": 182000 + }, + { + "epoch": 1.71, + "eval_loss": 1.3364156484603882, + "eval_runtime": 221.3895, + "eval_samples_per_second": 451.693, + "eval_steps_per_second": 14.115, + "step": 182000 + }, + { + "epoch": 1.71, + "learning_rate": 3.287381993581202e-05, + "loss": 1.5351, + "step": 182500 + }, + { + "epoch": 1.71, + "eval_loss": 1.3378472328186035, + "eval_runtime": 221.3747, + "eval_samples_per_second": 451.723, + "eval_steps_per_second": 14.116, + "step": 182500 + }, + { + "epoch": 1.72, + "learning_rate": 3.282689889454027e-05, + "loss": 1.5167, + "step": 183000 + }, + { + "epoch": 1.72, + "eval_loss": 1.3326358795166016, + "eval_runtime": 221.3651, + "eval_samples_per_second": 451.742, + "eval_steps_per_second": 14.117, + "step": 183000 + }, + { + "epoch": 1.72, + "learning_rate": 3.2779977853268524e-05, + "loss": 1.5312, + "step": 183500 + }, + { + "epoch": 1.72, + "eval_loss": 1.3356856107711792, + "eval_runtime": 221.3292, + "eval_samples_per_second": 451.816, + "eval_steps_per_second": 14.119, + "step": 183500 + }, + { + "epoch": 1.73, + "learning_rate": 3.273305681199677e-05, + "loss": 1.5343, + "step": 184000 + }, + { + "epoch": 1.73, + "eval_loss": 1.332999348640442, + "eval_runtime": 221.3046, + "eval_samples_per_second": 451.866, + "eval_steps_per_second": 14.121, + "step": 184000 + }, + { + "epoch": 1.73, + "learning_rate": 3.2686135770725024e-05, + "loss": 1.532, + "step": 184500 + }, + { + "epoch": 1.73, + "eval_loss": 1.3290119171142578, + "eval_runtime": 221.4415, + "eval_samples_per_second": 451.586, + "eval_steps_per_second": 14.112, + "step": 184500 + }, + { + "epoch": 1.74, + "learning_rate": 3.263921472945328e-05, + "loss": 1.5106, + "step": 185000 + }, + { + "epoch": 1.74, + "eval_loss": 1.3379125595092773, + "eval_runtime": 221.5598, + "eval_samples_per_second": 451.345, + "eval_steps_per_second": 14.105, + "step": 185000 + }, + { + "epoch": 1.74, + "learning_rate": 3.259229368818153e-05, + "loss": 1.5146, + "step": 185500 + }, + { + "epoch": 1.74, + "eval_loss": 1.3202601671218872, + "eval_runtime": 221.5182, + "eval_samples_per_second": 451.43, + "eval_steps_per_second": 14.107, + "step": 185500 + }, + { + "epoch": 1.75, + "learning_rate": 3.2545372646909784e-05, + "loss": 1.5059, + "step": 186000 + }, + { + "epoch": 1.75, + "eval_loss": 1.3281402587890625, + "eval_runtime": 221.4506, + "eval_samples_per_second": 451.568, + "eval_steps_per_second": 14.111, + "step": 186000 + }, + { + "epoch": 1.75, + "learning_rate": 3.249845160563804e-05, + "loss": 1.5206, + "step": 186500 + }, + { + "epoch": 1.75, + "eval_loss": 1.3232953548431396, + "eval_runtime": 221.1927, + "eval_samples_per_second": 452.095, + "eval_steps_per_second": 14.128, + "step": 186500 + }, + { + "epoch": 1.75, + "learning_rate": 3.2451530564366284e-05, + "loss": 1.5332, + "step": 187000 + }, + { + "epoch": 1.75, + "eval_loss": 1.328766107559204, + "eval_runtime": 221.1463, + "eval_samples_per_second": 452.189, + "eval_steps_per_second": 14.131, + "step": 187000 + }, + { + "epoch": 1.76, + "learning_rate": 3.240460952309454e-05, + "loss": 1.5253, + "step": 187500 + }, + { + "epoch": 1.76, + "eval_loss": 1.3251526355743408, + "eval_runtime": 221.188, + "eval_samples_per_second": 452.104, + "eval_steps_per_second": 14.128, + "step": 187500 + }, + { + "epoch": 1.76, + "learning_rate": 3.235768848182279e-05, + "loss": 1.5412, + "step": 188000 + }, + { + "epoch": 1.76, + "eval_loss": 1.3271182775497437, + "eval_runtime": 221.1601, + "eval_samples_per_second": 452.161, + "eval_steps_per_second": 14.13, + "step": 188000 + }, + { + "epoch": 1.77, + "learning_rate": 3.2310767440551044e-05, + "loss": 1.5102, + "step": 188500 + }, + { + "epoch": 1.77, + "eval_loss": 1.32902193069458, + "eval_runtime": 221.169, + "eval_samples_per_second": 452.143, + "eval_steps_per_second": 14.129, + "step": 188500 + }, + { + "epoch": 1.77, + "learning_rate": 3.22638463992793e-05, + "loss": 1.5158, + "step": 189000 + }, + { + "epoch": 1.77, + "eval_loss": 1.3175742626190186, + "eval_runtime": 221.1728, + "eval_samples_per_second": 452.135, + "eval_steps_per_second": 14.129, + "step": 189000 + }, + { + "epoch": 1.78, + "learning_rate": 3.221692535800755e-05, + "loss": 1.5015, + "step": 189500 + }, + { + "epoch": 1.78, + "eval_loss": 1.3242403268814087, + "eval_runtime": 221.1038, + "eval_samples_per_second": 452.276, + "eval_steps_per_second": 14.134, + "step": 189500 + }, + { + "epoch": 1.78, + "learning_rate": 3.21700043167358e-05, + "loss": 1.5226, + "step": 190000 + }, + { + "epoch": 1.78, + "eval_loss": 1.3241665363311768, + "eval_runtime": 221.0819, + "eval_samples_per_second": 452.321, + "eval_steps_per_second": 14.135, + "step": 190000 + }, + { + "epoch": 1.79, + "learning_rate": 3.212308327546405e-05, + "loss": 1.5164, + "step": 190500 + }, + { + "epoch": 1.79, + "eval_loss": 1.3255438804626465, + "eval_runtime": 221.0427, + "eval_samples_per_second": 452.401, + "eval_steps_per_second": 14.138, + "step": 190500 + }, + { + "epoch": 1.79, + "learning_rate": 3.20761622341923e-05, + "loss": 1.4973, + "step": 191000 + }, + { + "epoch": 1.79, + "eval_loss": 1.317428469657898, + "eval_runtime": 221.0182, + "eval_samples_per_second": 452.451, + "eval_steps_per_second": 14.139, + "step": 191000 + }, + { + "epoch": 1.8, + "learning_rate": 3.2029241192920556e-05, + "loss": 1.5136, + "step": 191500 + }, + { + "epoch": 1.8, + "eval_loss": 1.3230990171432495, + "eval_runtime": 221.0488, + "eval_samples_per_second": 452.389, + "eval_steps_per_second": 14.137, + "step": 191500 + }, + { + "epoch": 1.8, + "learning_rate": 3.198232015164881e-05, + "loss": 1.4964, + "step": 192000 + }, + { + "epoch": 1.8, + "eval_loss": 1.3161015510559082, + "eval_runtime": 221.0504, + "eval_samples_per_second": 452.385, + "eval_steps_per_second": 14.137, + "step": 192000 + }, + { + "epoch": 1.81, + "learning_rate": 3.1935399110377056e-05, + "loss": 1.5111, + "step": 192500 + }, + { + "epoch": 1.81, + "eval_loss": 1.3233813047409058, + "eval_runtime": 221.2963, + "eval_samples_per_second": 451.883, + "eval_steps_per_second": 14.121, + "step": 192500 + }, + { + "epoch": 1.81, + "learning_rate": 3.188847806910531e-05, + "loss": 1.4848, + "step": 193000 + }, + { + "epoch": 1.81, + "eval_loss": 1.3204160928726196, + "eval_runtime": 221.2711, + "eval_samples_per_second": 451.934, + "eval_steps_per_second": 14.123, + "step": 193000 + }, + { + "epoch": 1.82, + "learning_rate": 3.184155702783356e-05, + "loss": 1.5071, + "step": 193500 + }, + { + "epoch": 1.82, + "eval_loss": 1.3207734823226929, + "eval_runtime": 221.1959, + "eval_samples_per_second": 452.088, + "eval_steps_per_second": 14.128, + "step": 193500 + }, + { + "epoch": 1.82, + "learning_rate": 3.1794635986561816e-05, + "loss": 1.505, + "step": 194000 + }, + { + "epoch": 1.82, + "eval_loss": 1.3126252889633179, + "eval_runtime": 221.1627, + "eval_samples_per_second": 452.156, + "eval_steps_per_second": 14.13, + "step": 194000 + }, + { + "epoch": 1.83, + "learning_rate": 3.174771494529007e-05, + "loss": 1.5166, + "step": 194500 + }, + { + "epoch": 1.83, + "eval_loss": 1.3139557838439941, + "eval_runtime": 221.0163, + "eval_samples_per_second": 452.455, + "eval_steps_per_second": 14.139, + "step": 194500 + }, + { + "epoch": 1.83, + "learning_rate": 3.170079390401832e-05, + "loss": 1.5, + "step": 195000 + }, + { + "epoch": 1.83, + "eval_loss": 1.3125284910202026, + "eval_runtime": 221.0368, + "eval_samples_per_second": 452.413, + "eval_steps_per_second": 14.138, + "step": 195000 + }, + { + "epoch": 1.83, + "learning_rate": 3.165387286274657e-05, + "loss": 1.5055, + "step": 195500 + }, + { + "epoch": 1.83, + "eval_loss": 1.3205658197402954, + "eval_runtime": 221.047, + "eval_samples_per_second": 452.392, + "eval_steps_per_second": 14.137, + "step": 195500 + }, + { + "epoch": 1.84, + "learning_rate": 3.160695182147482e-05, + "loss": 1.4874, + "step": 196000 + }, + { + "epoch": 1.84, + "eval_loss": 1.3127025365829468, + "eval_runtime": 221.0478, + "eval_samples_per_second": 452.391, + "eval_steps_per_second": 14.137, + "step": 196000 + }, + { + "epoch": 1.84, + "learning_rate": 3.1560030780203076e-05, + "loss": 1.4879, + "step": 196500 + }, + { + "epoch": 1.84, + "eval_loss": 1.3064494132995605, + "eval_runtime": 221.0498, + "eval_samples_per_second": 452.387, + "eval_steps_per_second": 14.137, + "step": 196500 + }, + { + "epoch": 1.85, + "learning_rate": 3.151310973893133e-05, + "loss": 1.4988, + "step": 197000 + }, + { + "epoch": 1.85, + "eval_loss": 1.3118481636047363, + "eval_runtime": 221.0593, + "eval_samples_per_second": 452.367, + "eval_steps_per_second": 14.136, + "step": 197000 + }, + { + "epoch": 1.85, + "learning_rate": 3.146618869765958e-05, + "loss": 1.5136, + "step": 197500 + }, + { + "epoch": 1.85, + "eval_loss": 1.3082212209701538, + "eval_runtime": 221.0316, + "eval_samples_per_second": 452.424, + "eval_steps_per_second": 14.138, + "step": 197500 + }, + { + "epoch": 1.86, + "learning_rate": 3.141926765638783e-05, + "loss": 1.4772, + "step": 198000 + }, + { + "epoch": 1.86, + "eval_loss": 1.3054986000061035, + "eval_runtime": 221.0487, + "eval_samples_per_second": 452.389, + "eval_steps_per_second": 14.137, + "step": 198000 + }, + { + "epoch": 1.86, + "learning_rate": 3.137234661511608e-05, + "loss": 1.4835, + "step": 198500 + }, + { + "epoch": 1.86, + "eval_loss": 1.3038787841796875, + "eval_runtime": 221.0318, + "eval_samples_per_second": 452.424, + "eval_steps_per_second": 14.138, + "step": 198500 + }, + { + "epoch": 1.87, + "learning_rate": 3.1325425573844335e-05, + "loss": 1.4953, + "step": 199000 + }, + { + "epoch": 1.87, + "eval_loss": 1.3007208108901978, + "eval_runtime": 221.0384, + "eval_samples_per_second": 452.41, + "eval_steps_per_second": 14.138, + "step": 199000 + }, + { + "epoch": 1.87, + "learning_rate": 3.127850453257259e-05, + "loss": 1.488, + "step": 199500 + }, + { + "epoch": 1.87, + "eval_loss": 1.301313042640686, + "eval_runtime": 221.0649, + "eval_samples_per_second": 452.356, + "eval_steps_per_second": 14.136, + "step": 199500 + }, + { + "epoch": 1.88, + "learning_rate": 3.123158349130084e-05, + "loss": 1.4808, + "step": 200000 + }, + { + "epoch": 1.88, + "eval_loss": 1.307305097579956, + "eval_runtime": 221.0452, + "eval_samples_per_second": 452.396, + "eval_steps_per_second": 14.137, + "step": 200000 + }, + { + "epoch": 1.88, + "learning_rate": 3.1184662450029095e-05, + "loss": 1.49, + "step": 200500 + }, + { + "epoch": 1.88, + "eval_loss": 1.3078263998031616, + "eval_runtime": 220.9228, + "eval_samples_per_second": 452.647, + "eval_steps_per_second": 14.145, + "step": 200500 + }, + { + "epoch": 1.89, + "learning_rate": 3.113774140875734e-05, + "loss": 1.4568, + "step": 201000 + }, + { + "epoch": 1.89, + "eval_loss": 1.3042628765106201, + "eval_runtime": 221.0985, + "eval_samples_per_second": 452.287, + "eval_steps_per_second": 14.134, + "step": 201000 + }, + { + "epoch": 1.89, + "learning_rate": 3.1090820367485595e-05, + "loss": 1.4818, + "step": 201500 + }, + { + "epoch": 1.89, + "eval_loss": 1.3147982358932495, + "eval_runtime": 221.0384, + "eval_samples_per_second": 452.41, + "eval_steps_per_second": 14.138, + "step": 201500 + }, + { + "epoch": 1.9, + "learning_rate": 3.104389932621385e-05, + "loss": 1.4693, + "step": 202000 + }, + { + "epoch": 1.9, + "eval_loss": 1.2982258796691895, + "eval_runtime": 220.9646, + "eval_samples_per_second": 452.561, + "eval_steps_per_second": 14.143, + "step": 202000 + }, + { + "epoch": 1.9, + "learning_rate": 3.09969782849421e-05, + "loss": 1.4645, + "step": 202500 + }, + { + "epoch": 1.9, + "eval_loss": 1.3004374504089355, + "eval_runtime": 220.9286, + "eval_samples_per_second": 452.635, + "eval_steps_per_second": 14.145, + "step": 202500 + }, + { + "epoch": 1.9, + "learning_rate": 3.0950057243670355e-05, + "loss": 1.4802, + "step": 203000 + }, + { + "epoch": 1.9, + "eval_loss": 1.2999825477600098, + "eval_runtime": 220.8399, + "eval_samples_per_second": 452.817, + "eval_steps_per_second": 14.151, + "step": 203000 + }, + { + "epoch": 1.91, + "learning_rate": 3.09031362023986e-05, + "loss": 1.4797, + "step": 203500 + }, + { + "epoch": 1.91, + "eval_loss": 1.2966198921203613, + "eval_runtime": 220.8339, + "eval_samples_per_second": 452.829, + "eval_steps_per_second": 14.151, + "step": 203500 + }, + { + "epoch": 1.91, + "learning_rate": 3.0856215161126855e-05, + "loss": 1.4995, + "step": 204000 + }, + { + "epoch": 1.91, + "eval_loss": 1.2981046438217163, + "eval_runtime": 220.814, + "eval_samples_per_second": 452.87, + "eval_steps_per_second": 14.152, + "step": 204000 + }, + { + "epoch": 1.92, + "learning_rate": 3.080929411985511e-05, + "loss": 1.5087, + "step": 204500 + }, + { + "epoch": 1.92, + "eval_loss": 1.2951112985610962, + "eval_runtime": 220.8383, + "eval_samples_per_second": 452.82, + "eval_steps_per_second": 14.151, + "step": 204500 + }, + { + "epoch": 1.92, + "learning_rate": 3.076237307858336e-05, + "loss": 1.4984, + "step": 205000 + }, + { + "epoch": 1.92, + "eval_loss": 1.2964484691619873, + "eval_runtime": 220.8897, + "eval_samples_per_second": 452.715, + "eval_steps_per_second": 14.147, + "step": 205000 + }, + { + "epoch": 1.93, + "learning_rate": 3.0715452037311615e-05, + "loss": 1.4864, + "step": 205500 + }, + { + "epoch": 1.93, + "eval_loss": 1.2925612926483154, + "eval_runtime": 220.8427, + "eval_samples_per_second": 452.811, + "eval_steps_per_second": 14.15, + "step": 205500 + }, + { + "epoch": 1.93, + "learning_rate": 3.066853099603987e-05, + "loss": 1.4752, + "step": 206000 + }, + { + "epoch": 1.93, + "eval_loss": 1.3018134832382202, + "eval_runtime": 220.8565, + "eval_samples_per_second": 452.783, + "eval_steps_per_second": 14.149, + "step": 206000 + }, + { + "epoch": 1.94, + "learning_rate": 3.0621609954768114e-05, + "loss": 1.5018, + "step": 206500 + }, + { + "epoch": 1.94, + "eval_loss": 1.2949315309524536, + "eval_runtime": 220.8631, + "eval_samples_per_second": 452.769, + "eval_steps_per_second": 14.149, + "step": 206500 + }, + { + "epoch": 1.94, + "learning_rate": 3.057468891349637e-05, + "loss": 1.4902, + "step": 207000 + }, + { + "epoch": 1.94, + "eval_loss": 1.2878872156143188, + "eval_runtime": 220.8333, + "eval_samples_per_second": 452.83, + "eval_steps_per_second": 14.151, + "step": 207000 + }, + { + "epoch": 1.95, + "learning_rate": 3.052776787222462e-05, + "loss": 1.4699, + "step": 207500 + }, + { + "epoch": 1.95, + "eval_loss": 1.2887647151947021, + "eval_runtime": 220.8064, + "eval_samples_per_second": 452.885, + "eval_steps_per_second": 14.153, + "step": 207500 + }, + { + "epoch": 1.95, + "learning_rate": 3.0480846830952874e-05, + "loss": 1.4834, + "step": 208000 + }, + { + "epoch": 1.95, + "eval_loss": 1.290585994720459, + "eval_runtime": 220.7934, + "eval_samples_per_second": 452.912, + "eval_steps_per_second": 14.154, + "step": 208000 + }, + { + "epoch": 1.96, + "learning_rate": 3.0433925789681128e-05, + "loss": 1.4561, + "step": 208500 + }, + { + "epoch": 1.96, + "eval_loss": 1.297606110572815, + "eval_runtime": 220.8078, + "eval_samples_per_second": 452.882, + "eval_steps_per_second": 14.153, + "step": 208500 + }, + { + "epoch": 1.96, + "learning_rate": 3.0387004748409374e-05, + "loss": 1.4673, + "step": 209000 + }, + { + "epoch": 1.96, + "eval_loss": 1.2891823053359985, + "eval_runtime": 220.802, + "eval_samples_per_second": 452.895, + "eval_steps_per_second": 14.153, + "step": 209000 + }, + { + "epoch": 1.97, + "learning_rate": 3.0340083707137627e-05, + "loss": 1.4872, + "step": 209500 + }, + { + "epoch": 1.97, + "eval_loss": 1.2896424531936646, + "eval_runtime": 220.9571, + "eval_samples_per_second": 452.577, + "eval_steps_per_second": 14.143, + "step": 209500 + }, + { + "epoch": 1.97, + "learning_rate": 3.029316266586588e-05, + "loss": 1.4538, + "step": 210000 + }, + { + "epoch": 1.97, + "eval_loss": 1.2892200946807861, + "eval_runtime": 220.9468, + "eval_samples_per_second": 452.598, + "eval_steps_per_second": 14.144, + "step": 210000 + }, + { + "epoch": 1.98, + "learning_rate": 3.0246241624594134e-05, + "loss": 1.4672, + "step": 210500 + }, + { + "epoch": 1.98, + "eval_loss": 1.2804533243179321, + "eval_runtime": 220.8886, + "eval_samples_per_second": 452.717, + "eval_steps_per_second": 14.147, + "step": 210500 + }, + { + "epoch": 1.98, + "learning_rate": 3.0199320583322387e-05, + "loss": 1.46, + "step": 211000 + }, + { + "epoch": 1.98, + "eval_loss": 1.292427659034729, + "eval_runtime": 220.8568, + "eval_samples_per_second": 452.782, + "eval_steps_per_second": 14.149, + "step": 211000 + }, + { + "epoch": 1.98, + "learning_rate": 3.015239954205064e-05, + "loss": 1.4597, + "step": 211500 + }, + { + "epoch": 1.98, + "eval_loss": 1.2845008373260498, + "eval_runtime": 220.714, + "eval_samples_per_second": 453.075, + "eval_steps_per_second": 14.159, + "step": 211500 + }, + { + "epoch": 1.99, + "learning_rate": 3.0105478500778887e-05, + "loss": 1.4603, + "step": 212000 + }, + { + "epoch": 1.99, + "eval_loss": 1.2904783487319946, + "eval_runtime": 220.7365, + "eval_samples_per_second": 453.029, + "eval_steps_per_second": 14.157, + "step": 212000 + }, + { + "epoch": 1.99, + "learning_rate": 3.005855745950714e-05, + "loss": 1.4812, + "step": 212500 + }, + { + "epoch": 1.99, + "eval_loss": 1.2882287502288818, + "eval_runtime": 220.7381, + "eval_samples_per_second": 453.025, + "eval_steps_per_second": 14.157, + "step": 212500 + }, + { + "epoch": 2.0, + "learning_rate": 3.0011636418235394e-05, + "loss": 1.4662, + "step": 213000 + }, + { + "epoch": 2.0, + "eval_loss": 1.277458667755127, + "eval_runtime": 220.7153, + "eval_samples_per_second": 453.072, + "eval_steps_per_second": 14.159, + "step": 213000 + }, + { + "epoch": 2.0, + "learning_rate": 2.9964715376963647e-05, + "loss": 1.4622, + "step": 213500 + }, + { + "epoch": 2.0, + "eval_loss": 1.287771463394165, + "eval_runtime": 220.7913, + "eval_samples_per_second": 452.916, + "eval_steps_per_second": 14.154, + "step": 213500 + }, + { + "epoch": 2.01, + "learning_rate": 2.99177943356919e-05, + "loss": 1.4618, + "step": 214000 + }, + { + "epoch": 2.01, + "eval_loss": 1.2776265144348145, + "eval_runtime": 220.7076, + "eval_samples_per_second": 453.088, + "eval_steps_per_second": 14.159, + "step": 214000 + }, + { + "epoch": 2.01, + "learning_rate": 2.987087329442015e-05, + "loss": 1.4462, + "step": 214500 + }, + { + "epoch": 2.01, + "eval_loss": 1.282547116279602, + "eval_runtime": 220.7335, + "eval_samples_per_second": 453.035, + "eval_steps_per_second": 14.157, + "step": 214500 + }, + { + "epoch": 2.02, + "learning_rate": 2.9823952253148403e-05, + "loss": 1.4502, + "step": 215000 + }, + { + "epoch": 2.02, + "eval_loss": 1.2854583263397217, + "eval_runtime": 220.783, + "eval_samples_per_second": 452.933, + "eval_steps_per_second": 14.154, + "step": 215000 + }, + { + "epoch": 2.02, + "learning_rate": 2.9777031211876653e-05, + "loss": 1.4419, + "step": 215500 + }, + { + "epoch": 2.02, + "eval_loss": 1.282270908355713, + "eval_runtime": 220.7936, + "eval_samples_per_second": 452.912, + "eval_steps_per_second": 14.153, + "step": 215500 + }, + { + "epoch": 2.03, + "learning_rate": 2.9730110170604906e-05, + "loss": 1.4476, + "step": 216000 + }, + { + "epoch": 2.03, + "eval_loss": 1.273934006690979, + "eval_runtime": 220.778, + "eval_samples_per_second": 452.944, + "eval_steps_per_second": 14.154, + "step": 216000 + }, + { + "epoch": 2.03, + "learning_rate": 2.968318912933316e-05, + "loss": 1.4549, + "step": 216500 + }, + { + "epoch": 2.03, + "eval_loss": 1.2787511348724365, + "eval_runtime": 220.7612, + "eval_samples_per_second": 452.978, + "eval_steps_per_second": 14.156, + "step": 216500 + }, + { + "epoch": 2.04, + "learning_rate": 2.9636268088061413e-05, + "loss": 1.4614, + "step": 217000 + }, + { + "epoch": 2.04, + "eval_loss": 1.2720485925674438, + "eval_runtime": 220.7698, + "eval_samples_per_second": 452.96, + "eval_steps_per_second": 14.155, + "step": 217000 + }, + { + "epoch": 2.04, + "learning_rate": 2.9589347046789663e-05, + "loss": 1.4594, + "step": 217500 + }, + { + "epoch": 2.04, + "eval_loss": 1.2739953994750977, + "eval_runtime": 220.7583, + "eval_samples_per_second": 452.984, + "eval_steps_per_second": 14.156, + "step": 217500 + }, + { + "epoch": 2.05, + "learning_rate": 2.9542426005517916e-05, + "loss": 1.4633, + "step": 218000 + }, + { + "epoch": 2.05, + "eval_loss": 1.279288411140442, + "eval_runtime": 220.7177, + "eval_samples_per_second": 453.068, + "eval_steps_per_second": 14.158, + "step": 218000 + }, + { + "epoch": 2.05, + "learning_rate": 2.949550496424617e-05, + "loss": 1.4755, + "step": 218500 + }, + { + "epoch": 2.05, + "eval_loss": 1.2718831300735474, + "eval_runtime": 220.8318, + "eval_samples_per_second": 452.833, + "eval_steps_per_second": 14.151, + "step": 218500 + }, + { + "epoch": 2.06, + "learning_rate": 2.944858392297442e-05, + "loss": 1.4608, + "step": 219000 + }, + { + "epoch": 2.06, + "eval_loss": 1.2677054405212402, + "eval_runtime": 220.8761, + "eval_samples_per_second": 452.742, + "eval_steps_per_second": 14.148, + "step": 219000 + }, + { + "epoch": 2.06, + "learning_rate": 2.9401662881702673e-05, + "loss": 1.4327, + "step": 219500 + }, + { + "epoch": 2.06, + "eval_loss": 1.270250678062439, + "eval_runtime": 220.795, + "eval_samples_per_second": 452.909, + "eval_steps_per_second": 14.153, + "step": 219500 + }, + { + "epoch": 2.06, + "learning_rate": 2.9354741840430923e-05, + "loss": 1.4414, + "step": 220000 + }, + { + "epoch": 2.06, + "eval_loss": 1.2740583419799805, + "eval_runtime": 220.7867, + "eval_samples_per_second": 452.926, + "eval_steps_per_second": 14.154, + "step": 220000 + }, + { + "epoch": 2.07, + "learning_rate": 2.9307820799159176e-05, + "loss": 1.4333, + "step": 220500 + }, + { + "epoch": 2.07, + "eval_loss": 1.2706983089447021, + "eval_runtime": 220.6245, + "eval_samples_per_second": 453.259, + "eval_steps_per_second": 14.164, + "step": 220500 + }, + { + "epoch": 2.07, + "learning_rate": 2.926089975788743e-05, + "loss": 1.445, + "step": 221000 + }, + { + "epoch": 2.07, + "eval_loss": 1.2743273973464966, + "eval_runtime": 220.6137, + "eval_samples_per_second": 453.281, + "eval_steps_per_second": 14.165, + "step": 221000 + }, + { + "epoch": 2.08, + "learning_rate": 2.9213978716615682e-05, + "loss": 1.4417, + "step": 221500 + }, + { + "epoch": 2.08, + "eval_loss": 1.2643128633499146, + "eval_runtime": 220.619, + "eval_samples_per_second": 453.27, + "eval_steps_per_second": 14.165, + "step": 221500 + }, + { + "epoch": 2.08, + "learning_rate": 2.9167057675343932e-05, + "loss": 1.4456, + "step": 222000 + }, + { + "epoch": 2.08, + "eval_loss": 1.2675349712371826, + "eval_runtime": 220.5897, + "eval_samples_per_second": 453.33, + "eval_steps_per_second": 14.167, + "step": 222000 + }, + { + "epoch": 2.09, + "learning_rate": 2.9120136634072186e-05, + "loss": 1.4515, + "step": 222500 + }, + { + "epoch": 2.09, + "eval_loss": 1.2693637609481812, + "eval_runtime": 220.5837, + "eval_samples_per_second": 453.343, + "eval_steps_per_second": 14.167, + "step": 222500 + }, + { + "epoch": 2.09, + "learning_rate": 2.9073215592800436e-05, + "loss": 1.4649, + "step": 223000 + }, + { + "epoch": 2.09, + "eval_loss": 1.269477367401123, + "eval_runtime": 220.5619, + "eval_samples_per_second": 453.387, + "eval_steps_per_second": 14.168, + "step": 223000 + }, + { + "epoch": 2.1, + "learning_rate": 2.902629455152869e-05, + "loss": 1.458, + "step": 223500 + }, + { + "epoch": 2.1, + "eval_loss": 1.270194411277771, + "eval_runtime": 220.6045, + "eval_samples_per_second": 453.3, + "eval_steps_per_second": 14.166, + "step": 223500 + }, + { + "epoch": 2.1, + "learning_rate": 2.8979373510256942e-05, + "loss": 1.4286, + "step": 224000 + }, + { + "epoch": 2.1, + "eval_loss": 1.2670238018035889, + "eval_runtime": 220.6056, + "eval_samples_per_second": 453.298, + "eval_steps_per_second": 14.166, + "step": 224000 + }, + { + "epoch": 2.11, + "learning_rate": 2.8932452468985195e-05, + "loss": 1.444, + "step": 224500 + }, + { + "epoch": 2.11, + "eval_loss": 1.263913869857788, + "eval_runtime": 220.6041, + "eval_samples_per_second": 453.301, + "eval_steps_per_second": 14.166, + "step": 224500 + }, + { + "epoch": 2.11, + "learning_rate": 2.888553142771345e-05, + "loss": 1.4352, + "step": 225000 + }, + { + "epoch": 2.11, + "eval_loss": 1.2642849683761597, + "eval_runtime": 220.5676, + "eval_samples_per_second": 453.376, + "eval_steps_per_second": 14.168, + "step": 225000 + }, + { + "epoch": 2.12, + "learning_rate": 2.8838610386441695e-05, + "loss": 1.4535, + "step": 225500 + }, + { + "epoch": 2.12, + "eval_loss": 1.2708691358566284, + "eval_runtime": 220.6074, + "eval_samples_per_second": 453.294, + "eval_steps_per_second": 14.165, + "step": 225500 + }, + { + "epoch": 2.12, + "learning_rate": 2.879168934516995e-05, + "loss": 1.4289, + "step": 226000 + }, + { + "epoch": 2.12, + "eval_loss": 1.2640670537948608, + "eval_runtime": 220.5867, + "eval_samples_per_second": 453.336, + "eval_steps_per_second": 14.167, + "step": 226000 + }, + { + "epoch": 2.13, + "learning_rate": 2.8744768303898202e-05, + "loss": 1.4569, + "step": 226500 + }, + { + "epoch": 2.13, + "eval_loss": 1.2591434717178345, + "eval_runtime": 220.5876, + "eval_samples_per_second": 453.335, + "eval_steps_per_second": 14.167, + "step": 226500 + }, + { + "epoch": 2.13, + "learning_rate": 2.8697847262626455e-05, + "loss": 1.4474, + "step": 227000 + }, + { + "epoch": 2.13, + "eval_loss": 1.2669652700424194, + "eval_runtime": 220.5728, + "eval_samples_per_second": 453.365, + "eval_steps_per_second": 14.168, + "step": 227000 + }, + { + "epoch": 2.13, + "learning_rate": 2.865092622135471e-05, + "loss": 1.4412, + "step": 227500 + }, + { + "epoch": 2.13, + "eval_loss": 1.2617195844650269, + "eval_runtime": 220.5569, + "eval_samples_per_second": 453.398, + "eval_steps_per_second": 14.169, + "step": 227500 + }, + { + "epoch": 2.14, + "learning_rate": 2.860400518008296e-05, + "loss": 1.4522, + "step": 228000 + }, + { + "epoch": 2.14, + "eval_loss": 1.262453317642212, + "eval_runtime": 220.6524, + "eval_samples_per_second": 453.201, + "eval_steps_per_second": 14.163, + "step": 228000 + }, + { + "epoch": 2.14, + "learning_rate": 2.8557084138811208e-05, + "loss": 1.4223, + "step": 228500 + }, + { + "epoch": 2.14, + "eval_loss": 1.2601869106292725, + "eval_runtime": 220.7247, + "eval_samples_per_second": 453.053, + "eval_steps_per_second": 14.158, + "step": 228500 + }, + { + "epoch": 2.15, + "learning_rate": 2.851016309753946e-05, + "loss": 1.4352, + "step": 229000 + }, + { + "epoch": 2.15, + "eval_loss": 1.2503169775009155, + "eval_runtime": 220.6544, + "eval_samples_per_second": 453.197, + "eval_steps_per_second": 14.162, + "step": 229000 + }, + { + "epoch": 2.15, + "learning_rate": 2.8463242056267715e-05, + "loss": 1.4153, + "step": 229500 + }, + { + "epoch": 2.15, + "eval_loss": 1.2586244344711304, + "eval_runtime": 220.6119, + "eval_samples_per_second": 453.285, + "eval_steps_per_second": 14.165, + "step": 229500 + }, + { + "epoch": 2.16, + "learning_rate": 2.8416321014995968e-05, + "loss": 1.4255, + "step": 230000 + }, + { + "epoch": 2.16, + "eval_loss": 1.2628742456436157, + "eval_runtime": 220.5148, + "eval_samples_per_second": 453.484, + "eval_steps_per_second": 14.171, + "step": 230000 + }, + { + "epoch": 2.16, + "learning_rate": 2.836939997372422e-05, + "loss": 1.4402, + "step": 230500 + }, + { + "epoch": 2.16, + "eval_loss": 1.2623021602630615, + "eval_runtime": 220.3405, + "eval_samples_per_second": 453.843, + "eval_steps_per_second": 14.183, + "step": 230500 + }, + { + "epoch": 2.17, + "learning_rate": 2.8322478932452475e-05, + "loss": 1.4327, + "step": 231000 + }, + { + "epoch": 2.17, + "eval_loss": 1.2573188543319702, + "eval_runtime": 220.4056, + "eval_samples_per_second": 453.709, + "eval_steps_per_second": 14.178, + "step": 231000 + }, + { + "epoch": 2.17, + "learning_rate": 2.827555789118072e-05, + "loss": 1.4522, + "step": 231500 + }, + { + "epoch": 2.17, + "eval_loss": 1.262677550315857, + "eval_runtime": 220.4232, + "eval_samples_per_second": 453.673, + "eval_steps_per_second": 14.177, + "step": 231500 + }, + { + "epoch": 2.18, + "learning_rate": 2.8228636849908974e-05, + "loss": 1.4528, + "step": 232000 + }, + { + "epoch": 2.18, + "eval_loss": 1.259798526763916, + "eval_runtime": 220.4521, + "eval_samples_per_second": 453.613, + "eval_steps_per_second": 14.175, + "step": 232000 + }, + { + "epoch": 2.18, + "learning_rate": 2.8181715808637228e-05, + "loss": 1.4322, + "step": 232500 + }, + { + "epoch": 2.18, + "eval_loss": 1.2561190128326416, + "eval_runtime": 220.4464, + "eval_samples_per_second": 453.625, + "eval_steps_per_second": 14.176, + "step": 232500 + }, + { + "epoch": 2.19, + "learning_rate": 2.813479476736548e-05, + "loss": 1.4285, + "step": 233000 + }, + { + "epoch": 2.19, + "eval_loss": 1.262929916381836, + "eval_runtime": 220.4749, + "eval_samples_per_second": 453.566, + "eval_steps_per_second": 14.174, + "step": 233000 + }, + { + "epoch": 2.19, + "learning_rate": 2.8087873726093734e-05, + "loss": 1.4357, + "step": 233500 + }, + { + "epoch": 2.19, + "eval_loss": 1.2526911497116089, + "eval_runtime": 220.5247, + "eval_samples_per_second": 453.464, + "eval_steps_per_second": 14.171, + "step": 233500 + }, + { + "epoch": 2.2, + "learning_rate": 2.804095268482198e-05, + "loss": 1.4242, + "step": 234000 + }, + { + "epoch": 2.2, + "eval_loss": 1.2577769756317139, + "eval_runtime": 220.5683, + "eval_samples_per_second": 453.374, + "eval_steps_per_second": 14.168, + "step": 234000 + }, + { + "epoch": 2.2, + "learning_rate": 2.7994031643550234e-05, + "loss": 1.4162, + "step": 234500 + }, + { + "epoch": 2.2, + "eval_loss": 1.2519463300704956, + "eval_runtime": 220.6043, + "eval_samples_per_second": 453.3, + "eval_steps_per_second": 14.166, + "step": 234500 + }, + { + "epoch": 2.21, + "learning_rate": 2.7947110602278487e-05, + "loss": 1.4332, + "step": 235000 + }, + { + "epoch": 2.21, + "eval_loss": 1.2554024457931519, + "eval_runtime": 220.6422, + "eval_samples_per_second": 453.222, + "eval_steps_per_second": 14.163, + "step": 235000 + }, + { + "epoch": 2.21, + "learning_rate": 2.790018956100674e-05, + "loss": 1.4313, + "step": 235500 + }, + { + "epoch": 2.21, + "eval_loss": 1.2516902685165405, + "eval_runtime": 220.661, + "eval_samples_per_second": 453.184, + "eval_steps_per_second": 14.162, + "step": 235500 + }, + { + "epoch": 2.21, + "learning_rate": 2.7853268519734994e-05, + "loss": 1.4295, + "step": 236000 + }, + { + "epoch": 2.21, + "eval_loss": 1.246413230895996, + "eval_runtime": 220.6917, + "eval_samples_per_second": 453.121, + "eval_steps_per_second": 14.16, + "step": 236000 + }, + { + "epoch": 2.22, + "learning_rate": 2.7806347478463247e-05, + "loss": 1.4206, + "step": 236500 + }, + { + "epoch": 2.22, + "eval_loss": 1.2490031719207764, + "eval_runtime": 220.6934, + "eval_samples_per_second": 453.117, + "eval_steps_per_second": 14.16, + "step": 236500 + }, + { + "epoch": 2.22, + "learning_rate": 2.7759426437191494e-05, + "loss": 1.4304, + "step": 237000 + }, + { + "epoch": 2.22, + "eval_loss": 1.2488088607788086, + "eval_runtime": 220.7139, + "eval_samples_per_second": 453.075, + "eval_steps_per_second": 14.159, + "step": 237000 + }, + { + "epoch": 2.23, + "learning_rate": 2.7712505395919747e-05, + "loss": 1.4205, + "step": 237500 + }, + { + "epoch": 2.23, + "eval_loss": 1.2458001375198364, + "eval_runtime": 220.8036, + "eval_samples_per_second": 452.891, + "eval_steps_per_second": 14.153, + "step": 237500 + }, + { + "epoch": 2.23, + "learning_rate": 2.7665584354648e-05, + "loss": 1.4287, + "step": 238000 + }, + { + "epoch": 2.23, + "eval_loss": 1.2452707290649414, + "eval_runtime": 220.9497, + "eval_samples_per_second": 452.592, + "eval_steps_per_second": 14.143, + "step": 238000 + }, + { + "epoch": 2.24, + "learning_rate": 2.7618663313376253e-05, + "loss": 1.4384, + "step": 238500 + }, + { + "epoch": 2.24, + "eval_loss": 1.2455310821533203, + "eval_runtime": 220.9115, + "eval_samples_per_second": 452.67, + "eval_steps_per_second": 14.146, + "step": 238500 + }, + { + "epoch": 2.24, + "learning_rate": 2.7571742272104507e-05, + "loss": 1.4297, + "step": 239000 + }, + { + "epoch": 2.24, + "eval_loss": 1.2461364269256592, + "eval_runtime": 220.8712, + "eval_samples_per_second": 452.753, + "eval_steps_per_second": 14.149, + "step": 239000 + }, + { + "epoch": 2.25, + "learning_rate": 2.7524821230832753e-05, + "loss": 1.4192, + "step": 239500 + }, + { + "epoch": 2.25, + "eval_loss": 1.241681456565857, + "eval_runtime": 220.7997, + "eval_samples_per_second": 452.899, + "eval_steps_per_second": 14.153, + "step": 239500 + }, + { + "epoch": 2.25, + "learning_rate": 2.7477900189561007e-05, + "loss": 1.4371, + "step": 240000 + }, + { + "epoch": 2.25, + "eval_loss": 1.241499662399292, + "eval_runtime": 220.8006, + "eval_samples_per_second": 452.897, + "eval_steps_per_second": 14.153, + "step": 240000 + }, + { + "epoch": 2.26, + "learning_rate": 2.743097914828926e-05, + "loss": 1.4134, + "step": 240500 + }, + { + "epoch": 2.26, + "eval_loss": 1.2505980730056763, + "eval_runtime": 220.9405, + "eval_samples_per_second": 452.611, + "eval_steps_per_second": 14.144, + "step": 240500 + }, + { + "epoch": 2.26, + "learning_rate": 2.7384058107017513e-05, + "loss": 1.4282, + "step": 241000 + }, + { + "epoch": 2.26, + "eval_loss": 1.2383712530136108, + "eval_runtime": 220.9785, + "eval_samples_per_second": 452.533, + "eval_steps_per_second": 14.142, + "step": 241000 + }, + { + "epoch": 2.27, + "learning_rate": 2.7337137065745766e-05, + "loss": 1.423, + "step": 241500 + }, + { + "epoch": 2.27, + "eval_loss": 1.2391642332077026, + "eval_runtime": 221.0121, + "eval_samples_per_second": 452.464, + "eval_steps_per_second": 14.14, + "step": 241500 + }, + { + "epoch": 2.27, + "learning_rate": 2.729021602447402e-05, + "loss": 1.3965, + "step": 242000 + }, + { + "epoch": 2.27, + "eval_loss": 1.234950304031372, + "eval_runtime": 221.0193, + "eval_samples_per_second": 452.449, + "eval_steps_per_second": 14.139, + "step": 242000 + }, + { + "epoch": 2.28, + "learning_rate": 2.7243294983202266e-05, + "loss": 1.4213, + "step": 242500 + }, + { + "epoch": 2.28, + "eval_loss": 1.2397407293319702, + "eval_runtime": 221.049, + "eval_samples_per_second": 452.388, + "eval_steps_per_second": 14.137, + "step": 242500 + }, + { + "epoch": 2.28, + "learning_rate": 2.719637394193052e-05, + "loss": 1.4205, + "step": 243000 + }, + { + "epoch": 2.28, + "eval_loss": 1.239810824394226, + "eval_runtime": 221.0771, + "eval_samples_per_second": 452.331, + "eval_steps_per_second": 14.135, + "step": 243000 + }, + { + "epoch": 2.29, + "learning_rate": 2.7149452900658773e-05, + "loss": 1.4043, + "step": 243500 + }, + { + "epoch": 2.29, + "eval_loss": 1.234268307685852, + "eval_runtime": 221.0918, + "eval_samples_per_second": 452.301, + "eval_steps_per_second": 14.134, + "step": 243500 + }, + { + "epoch": 2.29, + "learning_rate": 2.7102531859387026e-05, + "loss": 1.4253, + "step": 244000 + }, + { + "epoch": 2.29, + "eval_loss": 1.2382601499557495, + "eval_runtime": 221.0904, + "eval_samples_per_second": 452.304, + "eval_steps_per_second": 14.134, + "step": 244000 + }, + { + "epoch": 2.29, + "learning_rate": 2.705561081811528e-05, + "loss": 1.417, + "step": 244500 + }, + { + "epoch": 2.29, + "eval_loss": 1.2414758205413818, + "eval_runtime": 221.0991, + "eval_samples_per_second": 452.286, + "eval_steps_per_second": 14.134, + "step": 244500 + }, + { + "epoch": 2.3, + "learning_rate": 2.7008689776843526e-05, + "loss": 1.4246, + "step": 245000 + }, + { + "epoch": 2.3, + "eval_loss": 1.2350417375564575, + "eval_runtime": 221.1225, + "eval_samples_per_second": 452.238, + "eval_steps_per_second": 14.132, + "step": 245000 + }, + { + "epoch": 2.3, + "learning_rate": 2.696176873557178e-05, + "loss": 1.4098, + "step": 245500 + }, + { + "epoch": 2.3, + "eval_loss": 1.231085181236267, + "eval_runtime": 221.136, + "eval_samples_per_second": 452.21, + "eval_steps_per_second": 14.132, + "step": 245500 + }, + { + "epoch": 2.31, + "learning_rate": 2.6914847694300032e-05, + "loss": 1.4127, + "step": 246000 + }, + { + "epoch": 2.31, + "eval_loss": 1.2361029386520386, + "eval_runtime": 221.7853, + "eval_samples_per_second": 450.887, + "eval_steps_per_second": 14.09, + "step": 246000 + }, + { + "epoch": 2.31, + "learning_rate": 2.6867926653028286e-05, + "loss": 1.3934, + "step": 246500 + }, + { + "epoch": 2.31, + "eval_loss": 1.2260022163391113, + "eval_runtime": 221.7286, + "eval_samples_per_second": 451.002, + "eval_steps_per_second": 14.094, + "step": 246500 + }, + { + "epoch": 2.32, + "learning_rate": 2.682100561175654e-05, + "loss": 1.4177, + "step": 247000 + }, + { + "epoch": 2.32, + "eval_loss": 1.2279936075210571, + "eval_runtime": 221.7401, + "eval_samples_per_second": 450.979, + "eval_steps_per_second": 14.093, + "step": 247000 + }, + { + "epoch": 2.32, + "learning_rate": 2.6774084570484792e-05, + "loss": 1.393, + "step": 247500 + }, + { + "epoch": 2.32, + "eval_loss": 1.230800986289978, + "eval_runtime": 221.7152, + "eval_samples_per_second": 451.029, + "eval_steps_per_second": 14.095, + "step": 247500 + }, + { + "epoch": 2.33, + "learning_rate": 2.672716352921304e-05, + "loss": 1.4007, + "step": 248000 + }, + { + "epoch": 2.33, + "eval_loss": 1.2362948656082153, + "eval_runtime": 221.5844, + "eval_samples_per_second": 451.295, + "eval_steps_per_second": 14.103, + "step": 248000 + }, + { + "epoch": 2.33, + "learning_rate": 2.6680242487941292e-05, + "loss": 1.4119, + "step": 248500 + }, + { + "epoch": 2.33, + "eval_loss": 1.2321358919143677, + "eval_runtime": 221.5867, + "eval_samples_per_second": 451.291, + "eval_steps_per_second": 14.103, + "step": 248500 + }, + { + "epoch": 2.34, + "learning_rate": 2.6633321446669545e-05, + "loss": 1.4134, + "step": 249000 + }, + { + "epoch": 2.34, + "eval_loss": 1.2312347888946533, + "eval_runtime": 221.6239, + "eval_samples_per_second": 451.215, + "eval_steps_per_second": 14.1, + "step": 249000 + }, + { + "epoch": 2.34, + "learning_rate": 2.65864004053978e-05, + "loss": 1.3995, + "step": 249500 + }, + { + "epoch": 2.34, + "eval_loss": 1.227805495262146, + "eval_runtime": 221.6235, + "eval_samples_per_second": 451.216, + "eval_steps_per_second": 14.1, + "step": 249500 + }, + { + "epoch": 2.35, + "learning_rate": 2.6539479364126052e-05, + "loss": 1.4042, + "step": 250000 + }, + { + "epoch": 2.35, + "eval_loss": 1.2257417440414429, + "eval_runtime": 221.6629, + "eval_samples_per_second": 451.136, + "eval_steps_per_second": 14.098, + "step": 250000 + }, + { + "epoch": 2.35, + "learning_rate": 2.64925583228543e-05, + "loss": 1.4041, + "step": 250500 + }, + { + "epoch": 2.35, + "eval_loss": 1.2193177938461304, + "eval_runtime": 221.6777, + "eval_samples_per_second": 451.105, + "eval_steps_per_second": 14.097, + "step": 250500 + }, + { + "epoch": 2.36, + "learning_rate": 2.6445637281582552e-05, + "loss": 1.4014, + "step": 251000 + }, + { + "epoch": 2.36, + "eval_loss": 1.2312824726104736, + "eval_runtime": 221.7704, + "eval_samples_per_second": 450.917, + "eval_steps_per_second": 14.091, + "step": 251000 + }, + { + "epoch": 2.36, + "learning_rate": 2.6398716240310805e-05, + "loss": 1.4122, + "step": 251500 + }, + { + "epoch": 2.36, + "eval_loss": 1.2306718826293945, + "eval_runtime": 221.7735, + "eval_samples_per_second": 450.911, + "eval_steps_per_second": 14.091, + "step": 251500 + }, + { + "epoch": 2.36, + "learning_rate": 2.6351795199039058e-05, + "loss": 1.3839, + "step": 252000 + }, + { + "epoch": 2.36, + "eval_loss": 1.2296006679534912, + "eval_runtime": 221.8207, + "eval_samples_per_second": 450.815, + "eval_steps_per_second": 14.088, + "step": 252000 + }, + { + "epoch": 2.37, + "learning_rate": 2.630487415776731e-05, + "loss": 1.4113, + "step": 252500 + }, + { + "epoch": 2.37, + "eval_loss": 1.2209049463272095, + "eval_runtime": 221.8254, + "eval_samples_per_second": 450.805, + "eval_steps_per_second": 14.088, + "step": 252500 + }, + { + "epoch": 2.37, + "learning_rate": 2.6257953116495565e-05, + "loss": 1.3818, + "step": 253000 + }, + { + "epoch": 2.37, + "eval_loss": 1.2337490320205688, + "eval_runtime": 221.8579, + "eval_samples_per_second": 450.739, + "eval_steps_per_second": 14.086, + "step": 253000 + }, + { + "epoch": 2.38, + "learning_rate": 2.621103207522381e-05, + "loss": 1.4197, + "step": 253500 + }, + { + "epoch": 2.38, + "eval_loss": 1.2286659479141235, + "eval_runtime": 222.042, + "eval_samples_per_second": 450.365, + "eval_steps_per_second": 14.074, + "step": 253500 + }, + { + "epoch": 2.38, + "learning_rate": 2.6164111033952065e-05, + "loss": 1.407, + "step": 254000 + }, + { + "epoch": 2.38, + "eval_loss": 1.2235952615737915, + "eval_runtime": 222.0342, + "eval_samples_per_second": 450.381, + "eval_steps_per_second": 14.074, + "step": 254000 + }, + { + "epoch": 2.39, + "learning_rate": 2.6117189992680318e-05, + "loss": 1.4031, + "step": 254500 + }, + { + "epoch": 2.39, + "eval_loss": 1.2171984910964966, + "eval_runtime": 222.0173, + "eval_samples_per_second": 450.415, + "eval_steps_per_second": 14.075, + "step": 254500 + }, + { + "epoch": 2.39, + "learning_rate": 2.607026895140857e-05, + "loss": 1.39, + "step": 255000 + }, + { + "epoch": 2.39, + "eval_loss": 1.2256048917770386, + "eval_runtime": 221.9334, + "eval_samples_per_second": 450.586, + "eval_steps_per_second": 14.081, + "step": 255000 + }, + { + "epoch": 2.4, + "learning_rate": 2.6023347910136824e-05, + "loss": 1.3946, + "step": 255500 + }, + { + "epoch": 2.4, + "eval_loss": 1.2248618602752686, + "eval_runtime": 221.838, + "eval_samples_per_second": 450.779, + "eval_steps_per_second": 14.087, + "step": 255500 + }, + { + "epoch": 2.4, + "learning_rate": 2.597642686886507e-05, + "loss": 1.417, + "step": 256000 + }, + { + "epoch": 2.4, + "eval_loss": 1.2168443202972412, + "eval_runtime": 221.8642, + "eval_samples_per_second": 450.726, + "eval_steps_per_second": 14.085, + "step": 256000 + }, + { + "epoch": 2.41, + "learning_rate": 2.5929505827593324e-05, + "loss": 1.4065, + "step": 256500 + }, + { + "epoch": 2.41, + "eval_loss": 1.2248560190200806, + "eval_runtime": 221.8502, + "eval_samples_per_second": 450.755, + "eval_steps_per_second": 14.086, + "step": 256500 + }, + { + "epoch": 2.41, + "learning_rate": 2.5882584786321578e-05, + "loss": 1.4046, + "step": 257000 + }, + { + "epoch": 2.41, + "eval_loss": 1.2218291759490967, + "eval_runtime": 221.8436, + "eval_samples_per_second": 450.768, + "eval_steps_per_second": 14.086, + "step": 257000 + }, + { + "epoch": 2.42, + "learning_rate": 2.583566374504983e-05, + "loss": 1.4101, + "step": 257500 + }, + { + "epoch": 2.42, + "eval_loss": 1.2202662229537964, + "eval_runtime": 221.8576, + "eval_samples_per_second": 450.74, + "eval_steps_per_second": 14.086, + "step": 257500 + }, + { + "epoch": 2.42, + "learning_rate": 2.5788742703778084e-05, + "loss": 1.3997, + "step": 258000 + }, + { + "epoch": 2.42, + "eval_loss": 1.2214444875717163, + "eval_runtime": 221.8565, + "eval_samples_per_second": 450.742, + "eval_steps_per_second": 14.086, + "step": 258000 + }, + { + "epoch": 2.43, + "learning_rate": 2.5741821662506337e-05, + "loss": 1.4134, + "step": 258500 + }, + { + "epoch": 2.43, + "eval_loss": 1.220430612564087, + "eval_runtime": 221.8826, + "eval_samples_per_second": 450.689, + "eval_steps_per_second": 14.084, + "step": 258500 + }, + { + "epoch": 2.43, + "learning_rate": 2.5694900621234584e-05, + "loss": 1.3823, + "step": 259000 + }, + { + "epoch": 2.43, + "eval_loss": 1.2182176113128662, + "eval_runtime": 221.9231, + "eval_samples_per_second": 450.606, + "eval_steps_per_second": 14.081, + "step": 259000 + }, + { + "epoch": 2.44, + "learning_rate": 2.5647979579962837e-05, + "loss": 1.3854, + "step": 259500 + }, + { + "epoch": 2.44, + "eval_loss": 1.2200777530670166, + "eval_runtime": 221.9107, + "eval_samples_per_second": 450.632, + "eval_steps_per_second": 14.082, + "step": 259500 + }, + { + "epoch": 2.44, + "learning_rate": 2.560105853869109e-05, + "loss": 1.4126, + "step": 260000 + }, + { + "epoch": 2.44, + "eval_loss": 1.2206027507781982, + "eval_runtime": 221.8721, + "eval_samples_per_second": 450.71, + "eval_steps_per_second": 14.085, + "step": 260000 + }, + { + "epoch": 2.44, + "learning_rate": 2.5554137497419344e-05, + "loss": 1.3986, + "step": 260500 + }, + { + "epoch": 2.44, + "eval_loss": 1.2171374559402466, + "eval_runtime": 222.0524, + "eval_samples_per_second": 450.344, + "eval_steps_per_second": 14.073, + "step": 260500 + }, + { + "epoch": 2.45, + "learning_rate": 2.5507216456147597e-05, + "loss": 1.401, + "step": 261000 + }, + { + "epoch": 2.45, + "eval_loss": 1.220663070678711, + "eval_runtime": 222.0724, + "eval_samples_per_second": 450.304, + "eval_steps_per_second": 14.072, + "step": 261000 + }, + { + "epoch": 2.45, + "learning_rate": 2.5460295414875847e-05, + "loss": 1.3983, + "step": 261500 + }, + { + "epoch": 2.45, + "eval_loss": 1.2131149768829346, + "eval_runtime": 222.0269, + "eval_samples_per_second": 450.396, + "eval_steps_per_second": 14.075, + "step": 261500 + }, + { + "epoch": 2.46, + "learning_rate": 2.5413374373604097e-05, + "loss": 1.3837, + "step": 262000 + }, + { + "epoch": 2.46, + "eval_loss": 1.2130707502365112, + "eval_runtime": 222.0086, + "eval_samples_per_second": 450.433, + "eval_steps_per_second": 14.076, + "step": 262000 + }, + { + "epoch": 2.46, + "learning_rate": 2.536645333233235e-05, + "loss": 1.3935, + "step": 262500 + }, + { + "epoch": 2.46, + "eval_loss": 1.2063870429992676, + "eval_runtime": 221.8375, + "eval_samples_per_second": 450.78, + "eval_steps_per_second": 14.087, + "step": 262500 + }, + { + "epoch": 2.47, + "learning_rate": 2.5319532291060603e-05, + "loss": 1.4004, + "step": 263000 + }, + { + "epoch": 2.47, + "eval_loss": 1.21162748336792, + "eval_runtime": 221.8742, + "eval_samples_per_second": 450.706, + "eval_steps_per_second": 14.085, + "step": 263000 + }, + { + "epoch": 2.47, + "learning_rate": 2.5272611249788857e-05, + "loss": 1.3934, + "step": 263500 + }, + { + "epoch": 2.47, + "eval_loss": 1.2112177610397339, + "eval_runtime": 221.8733, + "eval_samples_per_second": 450.708, + "eval_steps_per_second": 14.085, + "step": 263500 + }, + { + "epoch": 2.48, + "learning_rate": 2.522569020851711e-05, + "loss": 1.3747, + "step": 264000 + }, + { + "epoch": 2.48, + "eval_loss": 1.217407464981079, + "eval_runtime": 221.8721, + "eval_samples_per_second": 450.71, + "eval_steps_per_second": 14.085, + "step": 264000 + }, + { + "epoch": 2.48, + "learning_rate": 2.517876916724536e-05, + "loss": 1.3891, + "step": 264500 + }, + { + "epoch": 2.48, + "eval_loss": 1.215196132659912, + "eval_runtime": 221.9107, + "eval_samples_per_second": 450.632, + "eval_steps_per_second": 14.082, + "step": 264500 + }, + { + "epoch": 2.49, + "learning_rate": 2.5131848125973613e-05, + "loss": 1.3753, + "step": 265000 + }, + { + "epoch": 2.49, + "eval_loss": 1.2109936475753784, + "eval_runtime": 221.8926, + "eval_samples_per_second": 450.668, + "eval_steps_per_second": 14.083, + "step": 265000 + }, + { + "epoch": 2.49, + "learning_rate": 2.5084927084701863e-05, + "loss": 1.4029, + "step": 265500 + }, + { + "epoch": 2.49, + "eval_loss": 1.2136719226837158, + "eval_runtime": 221.9088, + "eval_samples_per_second": 450.636, + "eval_steps_per_second": 14.082, + "step": 265500 + }, + { + "epoch": 2.5, + "learning_rate": 2.5038006043430116e-05, + "loss": 1.3716, + "step": 266000 + }, + { + "epoch": 2.5, + "eval_loss": 1.2064990997314453, + "eval_runtime": 221.917, + "eval_samples_per_second": 450.619, + "eval_steps_per_second": 14.082, + "step": 266000 + }, + { + "epoch": 2.5, + "learning_rate": 2.499108500215837e-05, + "loss": 1.3917, + "step": 266500 + }, + { + "epoch": 2.5, + "eval_loss": 1.2140556573867798, + "eval_runtime": 221.8926, + "eval_samples_per_second": 450.668, + "eval_steps_per_second": 14.083, + "step": 266500 + }, + { + "epoch": 2.51, + "learning_rate": 2.494416396088662e-05, + "loss": 1.3958, + "step": 267000 + }, + { + "epoch": 2.51, + "eval_loss": 1.2115426063537598, + "eval_runtime": 221.881, + "eval_samples_per_second": 450.692, + "eval_steps_per_second": 14.084, + "step": 267000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4897242919614873e-05, + "loss": 1.388, + "step": 267500 + }, + { + "epoch": 2.51, + "eval_loss": 1.2005212306976318, + "eval_runtime": 222.0705, + "eval_samples_per_second": 450.307, + "eval_steps_per_second": 14.072, + "step": 267500 + }, + { + "epoch": 2.51, + "learning_rate": 2.4850321878343126e-05, + "loss": 1.3769, + "step": 268000 + }, + { + "epoch": 2.51, + "eval_loss": 1.2068332433700562, + "eval_runtime": 222.0339, + "eval_samples_per_second": 450.382, + "eval_steps_per_second": 14.074, + "step": 268000 + }, + { + "epoch": 2.52, + "learning_rate": 2.480340083707138e-05, + "loss": 1.4106, + "step": 268500 + }, + { + "epoch": 2.52, + "eval_loss": 1.2093212604522705, + "eval_runtime": 222.0101, + "eval_samples_per_second": 450.43, + "eval_steps_per_second": 14.076, + "step": 268500 + }, + { + "epoch": 2.52, + "learning_rate": 2.475647979579963e-05, + "loss": 1.3883, + "step": 269000 + }, + { + "epoch": 2.52, + "eval_loss": 1.2118886709213257, + "eval_runtime": 221.9501, + "eval_samples_per_second": 450.552, + "eval_steps_per_second": 14.08, + "step": 269000 + }, + { + "epoch": 2.53, + "learning_rate": 2.4709558754527883e-05, + "loss": 1.3742, + "step": 269500 + }, + { + "epoch": 2.53, + "eval_loss": 1.2088725566864014, + "eval_runtime": 221.8528, + "eval_samples_per_second": 450.749, + "eval_steps_per_second": 14.086, + "step": 269500 + }, + { + "epoch": 2.53, + "learning_rate": 2.4662637713256136e-05, + "loss": 1.3922, + "step": 270000 + }, + { + "epoch": 2.53, + "eval_loss": 1.2059705257415771, + "eval_runtime": 221.9028, + "eval_samples_per_second": 450.648, + "eval_steps_per_second": 14.083, + "step": 270000 + }, + { + "epoch": 2.54, + "learning_rate": 2.4615716671984386e-05, + "loss": 1.3924, + "step": 270500 + }, + { + "epoch": 2.54, + "eval_loss": 1.2069438695907593, + "eval_runtime": 221.8382, + "eval_samples_per_second": 450.779, + "eval_steps_per_second": 14.087, + "step": 270500 + }, + { + "epoch": 2.54, + "learning_rate": 2.456879563071264e-05, + "loss": 1.4043, + "step": 271000 + }, + { + "epoch": 2.54, + "eval_loss": 1.2041146755218506, + "eval_runtime": 221.8591, + "eval_samples_per_second": 450.736, + "eval_steps_per_second": 14.086, + "step": 271000 + }, + { + "epoch": 2.55, + "learning_rate": 2.4521874589440892e-05, + "loss": 1.3877, + "step": 271500 + }, + { + "epoch": 2.55, + "eval_loss": 1.209468960762024, + "eval_runtime": 221.837, + "eval_samples_per_second": 450.781, + "eval_steps_per_second": 14.087, + "step": 271500 + }, + { + "epoch": 2.55, + "learning_rate": 2.4474953548169142e-05, + "loss": 1.3866, + "step": 272000 + }, + { + "epoch": 2.55, + "eval_loss": 1.2078853845596313, + "eval_runtime": 221.8543, + "eval_samples_per_second": 450.746, + "eval_steps_per_second": 14.086, + "step": 272000 + }, + { + "epoch": 2.56, + "learning_rate": 2.4428032506897396e-05, + "loss": 1.3892, + "step": 272500 + }, + { + "epoch": 2.56, + "eval_loss": 1.203126311302185, + "eval_runtime": 221.8674, + "eval_samples_per_second": 450.72, + "eval_steps_per_second": 14.085, + "step": 272500 + }, + { + "epoch": 2.56, + "learning_rate": 2.438111146562565e-05, + "loss": 1.3876, + "step": 273000 + }, + { + "epoch": 2.56, + "eval_loss": 1.208136796951294, + "eval_runtime": 221.8861, + "eval_samples_per_second": 450.682, + "eval_steps_per_second": 14.084, + "step": 273000 + }, + { + "epoch": 2.57, + "learning_rate": 2.43341904243539e-05, + "loss": 1.3791, + "step": 273500 + }, + { + "epoch": 2.57, + "eval_loss": 1.2000367641448975, + "eval_runtime": 221.8811, + "eval_samples_per_second": 450.692, + "eval_steps_per_second": 14.084, + "step": 273500 + }, + { + "epoch": 2.57, + "learning_rate": 2.4287269383082152e-05, + "loss": 1.3721, + "step": 274000 + }, + { + "epoch": 2.57, + "eval_loss": 1.1960883140563965, + "eval_runtime": 221.8902, + "eval_samples_per_second": 450.673, + "eval_steps_per_second": 14.084, + "step": 274000 + }, + { + "epoch": 2.58, + "learning_rate": 2.4240348341810405e-05, + "loss": 1.3758, + "step": 274500 + }, + { + "epoch": 2.58, + "eval_loss": 1.1972496509552002, + "eval_runtime": 222.0858, + "eval_samples_per_second": 450.276, + "eval_steps_per_second": 14.071, + "step": 274500 + }, + { + "epoch": 2.58, + "learning_rate": 2.4193427300538655e-05, + "loss": 1.3854, + "step": 275000 + }, + { + "epoch": 2.58, + "eval_loss": 1.2011773586273193, + "eval_runtime": 222.0189, + "eval_samples_per_second": 450.412, + "eval_steps_per_second": 14.075, + "step": 275000 + }, + { + "epoch": 2.59, + "learning_rate": 2.414650625926691e-05, + "loss": 1.3815, + "step": 275500 + }, + { + "epoch": 2.59, + "eval_loss": 1.200337290763855, + "eval_runtime": 222.0085, + "eval_samples_per_second": 450.433, + "eval_steps_per_second": 14.076, + "step": 275500 + }, + { + "epoch": 2.59, + "learning_rate": 2.409958521799516e-05, + "loss": 1.37, + "step": 276000 + }, + { + "epoch": 2.59, + "eval_loss": 1.203967809677124, + "eval_runtime": 221.9584, + "eval_samples_per_second": 450.535, + "eval_steps_per_second": 14.079, + "step": 276000 + }, + { + "epoch": 2.59, + "learning_rate": 2.405266417672341e-05, + "loss": 1.3913, + "step": 276500 + }, + { + "epoch": 2.59, + "eval_loss": 1.193058729171753, + "eval_runtime": 221.8004, + "eval_samples_per_second": 450.856, + "eval_steps_per_second": 14.089, + "step": 276500 + }, + { + "epoch": 2.6, + "learning_rate": 2.4005743135451665e-05, + "loss": 1.3707, + "step": 277000 + }, + { + "epoch": 2.6, + "eval_loss": 1.2021199464797974, + "eval_runtime": 221.8184, + "eval_samples_per_second": 450.819, + "eval_steps_per_second": 14.088, + "step": 277000 + }, + { + "epoch": 2.6, + "learning_rate": 2.3958822094179915e-05, + "loss": 1.3903, + "step": 277500 + }, + { + "epoch": 2.6, + "eval_loss": 1.1956475973129272, + "eval_runtime": 221.7913, + "eval_samples_per_second": 450.874, + "eval_steps_per_second": 14.09, + "step": 277500 + }, + { + "epoch": 2.61, + "learning_rate": 2.3911901052908168e-05, + "loss": 1.3792, + "step": 278000 + }, + { + "epoch": 2.61, + "eval_loss": 1.1976265907287598, + "eval_runtime": 221.8164, + "eval_samples_per_second": 450.823, + "eval_steps_per_second": 14.088, + "step": 278000 + }, + { + "epoch": 2.61, + "learning_rate": 2.386498001163642e-05, + "loss": 1.3765, + "step": 278500 + }, + { + "epoch": 2.61, + "eval_loss": 1.2006570100784302, + "eval_runtime": 221.8144, + "eval_samples_per_second": 450.827, + "eval_steps_per_second": 14.088, + "step": 278500 + }, + { + "epoch": 2.62, + "learning_rate": 2.381805897036467e-05, + "loss": 1.3589, + "step": 279000 + }, + { + "epoch": 2.62, + "eval_loss": 1.1913626194000244, + "eval_runtime": 221.8059, + "eval_samples_per_second": 450.845, + "eval_steps_per_second": 14.089, + "step": 279000 + }, + { + "epoch": 2.62, + "learning_rate": 2.3771137929092925e-05, + "loss": 1.3887, + "step": 279500 + }, + { + "epoch": 2.62, + "eval_loss": 1.1945520639419556, + "eval_runtime": 221.8425, + "eval_samples_per_second": 450.77, + "eval_steps_per_second": 14.087, + "step": 279500 + }, + { + "epoch": 2.63, + "learning_rate": 2.3724216887821178e-05, + "loss": 1.3897, + "step": 280000 + }, + { + "epoch": 2.63, + "eval_loss": 1.1894328594207764, + "eval_runtime": 221.8345, + "eval_samples_per_second": 450.786, + "eval_steps_per_second": 14.087, + "step": 280000 + }, + { + "epoch": 2.63, + "learning_rate": 2.3677295846549428e-05, + "loss": 1.3875, + "step": 280500 + }, + { + "epoch": 2.63, + "eval_loss": 1.190177083015442, + "eval_runtime": 221.8514, + "eval_samples_per_second": 450.752, + "eval_steps_per_second": 14.086, + "step": 280500 + }, + { + "epoch": 2.64, + "learning_rate": 2.363037480527768e-05, + "loss": 1.3796, + "step": 281000 + }, + { + "epoch": 2.64, + "eval_loss": 1.1883279085159302, + "eval_runtime": 221.8103, + "eval_samples_per_second": 450.836, + "eval_steps_per_second": 14.089, + "step": 281000 + }, + { + "epoch": 2.64, + "learning_rate": 2.3583453764005934e-05, + "loss": 1.3563, + "step": 281500 + }, + { + "epoch": 2.64, + "eval_loss": 1.1856937408447266, + "eval_runtime": 222.0332, + "eval_samples_per_second": 450.383, + "eval_steps_per_second": 14.074, + "step": 281500 + }, + { + "epoch": 2.65, + "learning_rate": 2.3536532722734184e-05, + "loss": 1.3546, + "step": 282000 + }, + { + "epoch": 2.65, + "eval_loss": 1.1944658756256104, + "eval_runtime": 221.9876, + "eval_samples_per_second": 450.476, + "eval_steps_per_second": 14.077, + "step": 282000 + }, + { + "epoch": 2.65, + "learning_rate": 2.3489611681462438e-05, + "loss": 1.3597, + "step": 282500 + }, + { + "epoch": 2.65, + "eval_loss": 1.1892412900924683, + "eval_runtime": 221.924, + "eval_samples_per_second": 450.605, + "eval_steps_per_second": 14.081, + "step": 282500 + }, + { + "epoch": 2.66, + "learning_rate": 2.3442690640190687e-05, + "loss": 1.368, + "step": 283000 + }, + { + "epoch": 2.66, + "eval_loss": 1.1899220943450928, + "eval_runtime": 221.8992, + "eval_samples_per_second": 450.655, + "eval_steps_per_second": 14.083, + "step": 283000 + }, + { + "epoch": 2.66, + "learning_rate": 2.339576959891894e-05, + "loss": 1.3766, + "step": 283500 + }, + { + "epoch": 2.66, + "eval_loss": 1.1951682567596436, + "eval_runtime": 221.7577, + "eval_samples_per_second": 450.943, + "eval_steps_per_second": 14.092, + "step": 283500 + }, + { + "epoch": 2.67, + "learning_rate": 2.3348848557647194e-05, + "loss": 1.369, + "step": 284000 + }, + { + "epoch": 2.67, + "eval_loss": 1.1905887126922607, + "eval_runtime": 221.7249, + "eval_samples_per_second": 451.009, + "eval_steps_per_second": 14.094, + "step": 284000 + }, + { + "epoch": 2.67, + "learning_rate": 2.3301927516375444e-05, + "loss": 1.3634, + "step": 284500 + }, + { + "epoch": 2.67, + "eval_loss": 1.1932637691497803, + "eval_runtime": 221.7372, + "eval_samples_per_second": 450.984, + "eval_steps_per_second": 14.093, + "step": 284500 + }, + { + "epoch": 2.67, + "learning_rate": 2.3255006475103697e-05, + "loss": 1.3571, + "step": 285000 + }, + { + "epoch": 2.67, + "eval_loss": 1.1916605234146118, + "eval_runtime": 221.7465, + "eval_samples_per_second": 450.965, + "eval_steps_per_second": 14.093, + "step": 285000 + }, + { + "epoch": 2.68, + "learning_rate": 2.320808543383195e-05, + "loss": 1.3663, + "step": 285500 + }, + { + "epoch": 2.68, + "eval_loss": 1.1863957643508911, + "eval_runtime": 221.7586, + "eval_samples_per_second": 450.941, + "eval_steps_per_second": 14.092, + "step": 285500 + }, + { + "epoch": 2.68, + "learning_rate": 2.31611643925602e-05, + "loss": 1.3437, + "step": 286000 + }, + { + "epoch": 2.68, + "eval_loss": 1.186560034751892, + "eval_runtime": 221.7879, + "eval_samples_per_second": 450.881, + "eval_steps_per_second": 14.09, + "step": 286000 + }, + { + "epoch": 2.69, + "learning_rate": 2.3114243351288454e-05, + "loss": 1.3614, + "step": 286500 + }, + { + "epoch": 2.69, + "eval_loss": 1.1834124326705933, + "eval_runtime": 221.7759, + "eval_samples_per_second": 450.906, + "eval_steps_per_second": 14.091, + "step": 286500 + }, + { + "epoch": 2.69, + "learning_rate": 2.3067322310016707e-05, + "loss": 1.3561, + "step": 287000 + }, + { + "epoch": 2.69, + "eval_loss": 1.1831755638122559, + "eval_runtime": 221.7594, + "eval_samples_per_second": 450.939, + "eval_steps_per_second": 14.092, + "step": 287000 + }, + { + "epoch": 2.7, + "learning_rate": 2.3020401268744957e-05, + "loss": 1.3634, + "step": 287500 + }, + { + "epoch": 2.7, + "eval_loss": 1.1881248950958252, + "eval_runtime": 221.7452, + "eval_samples_per_second": 450.968, + "eval_steps_per_second": 14.093, + "step": 287500 + }, + { + "epoch": 2.7, + "learning_rate": 2.297348022747321e-05, + "loss": 1.3574, + "step": 288000 + }, + { + "epoch": 2.7, + "eval_loss": 1.1852173805236816, + "eval_runtime": 221.7114, + "eval_samples_per_second": 451.037, + "eval_steps_per_second": 14.095, + "step": 288000 + }, + { + "epoch": 2.71, + "learning_rate": 2.292655918620146e-05, + "loss": 1.377, + "step": 288500 + }, + { + "epoch": 2.71, + "eval_loss": 1.1913198232650757, + "eval_runtime": 221.8189, + "eval_samples_per_second": 450.818, + "eval_steps_per_second": 14.088, + "step": 288500 + }, + { + "epoch": 2.71, + "learning_rate": 2.2879638144929713e-05, + "loss": 1.3511, + "step": 289000 + }, + { + "epoch": 2.71, + "eval_loss": 1.1883938312530518, + "eval_runtime": 221.884, + "eval_samples_per_second": 450.686, + "eval_steps_per_second": 14.084, + "step": 289000 + }, + { + "epoch": 2.72, + "learning_rate": 2.2832717103657967e-05, + "loss": 1.3486, + "step": 289500 + }, + { + "epoch": 2.72, + "eval_loss": 1.185508370399475, + "eval_runtime": 221.7951, + "eval_samples_per_second": 450.867, + "eval_steps_per_second": 14.09, + "step": 289500 + }, + { + "epoch": 2.72, + "learning_rate": 2.2785796062386216e-05, + "loss": 1.3584, + "step": 290000 + }, + { + "epoch": 2.72, + "eval_loss": 1.1850634813308716, + "eval_runtime": 221.7253, + "eval_samples_per_second": 451.009, + "eval_steps_per_second": 14.094, + "step": 290000 + }, + { + "epoch": 2.73, + "learning_rate": 2.273887502111447e-05, + "loss": 1.3634, + "step": 290500 + }, + { + "epoch": 2.73, + "eval_loss": 1.1823251247406006, + "eval_runtime": 221.5435, + "eval_samples_per_second": 451.379, + "eval_steps_per_second": 14.106, + "step": 290500 + }, + { + "epoch": 2.73, + "learning_rate": 2.2691953979842723e-05, + "loss": 1.3652, + "step": 291000 + }, + { + "epoch": 2.73, + "eval_loss": 1.181983232498169, + "eval_runtime": 221.5124, + "eval_samples_per_second": 451.442, + "eval_steps_per_second": 14.108, + "step": 291000 + }, + { + "epoch": 2.74, + "learning_rate": 2.2645032938570973e-05, + "loss": 1.3612, + "step": 291500 + }, + { + "epoch": 2.74, + "eval_loss": 1.182550311088562, + "eval_runtime": 221.4655, + "eval_samples_per_second": 451.538, + "eval_steps_per_second": 14.111, + "step": 291500 + }, + { + "epoch": 2.74, + "learning_rate": 2.2598111897299226e-05, + "loss": 1.3827, + "step": 292000 + }, + { + "epoch": 2.74, + "eval_loss": 1.1845837831497192, + "eval_runtime": 221.4927, + "eval_samples_per_second": 451.482, + "eval_steps_per_second": 14.109, + "step": 292000 + }, + { + "epoch": 2.74, + "learning_rate": 2.255119085602748e-05, + "loss": 1.3722, + "step": 292500 + }, + { + "epoch": 2.74, + "eval_loss": 1.1796345710754395, + "eval_runtime": 221.4954, + "eval_samples_per_second": 451.477, + "eval_steps_per_second": 14.109, + "step": 292500 + }, + { + "epoch": 2.75, + "learning_rate": 2.250426981475573e-05, + "loss": 1.3608, + "step": 293000 + }, + { + "epoch": 2.75, + "eval_loss": 1.1794308423995972, + "eval_runtime": 221.4654, + "eval_samples_per_second": 451.538, + "eval_steps_per_second": 14.111, + "step": 293000 + }, + { + "epoch": 2.75, + "learning_rate": 2.2457348773483983e-05, + "loss": 1.3601, + "step": 293500 + }, + { + "epoch": 2.75, + "eval_loss": 1.1854252815246582, + "eval_runtime": 221.506, + "eval_samples_per_second": 451.455, + "eval_steps_per_second": 14.108, + "step": 293500 + }, + { + "epoch": 2.76, + "learning_rate": 2.2410427732212233e-05, + "loss": 1.3723, + "step": 294000 + }, + { + "epoch": 2.76, + "eval_loss": 1.1760536432266235, + "eval_runtime": 221.4862, + "eval_samples_per_second": 451.495, + "eval_steps_per_second": 14.109, + "step": 294000 + }, + { + "epoch": 2.76, + "learning_rate": 2.2363506690940486e-05, + "loss": 1.3559, + "step": 294500 + }, + { + "epoch": 2.76, + "eval_loss": 1.1737596988677979, + "eval_runtime": 221.496, + "eval_samples_per_second": 451.475, + "eval_steps_per_second": 14.109, + "step": 294500 + }, + { + "epoch": 2.77, + "learning_rate": 2.231658564966874e-05, + "loss": 1.3596, + "step": 295000 + }, + { + "epoch": 2.77, + "eval_loss": 1.1840267181396484, + "eval_runtime": 221.4671, + "eval_samples_per_second": 451.534, + "eval_steps_per_second": 14.11, + "step": 295000 + }, + { + "epoch": 2.77, + "learning_rate": 2.226966460839699e-05, + "loss": 1.3469, + "step": 295500 + }, + { + "epoch": 2.77, + "eval_loss": 1.1722980737686157, + "eval_runtime": 221.4589, + "eval_samples_per_second": 451.551, + "eval_steps_per_second": 14.111, + "step": 295500 + }, + { + "epoch": 2.78, + "learning_rate": 2.2222743567125242e-05, + "loss": 1.3636, + "step": 296000 + }, + { + "epoch": 2.78, + "eval_loss": 1.1743708848953247, + "eval_runtime": 221.6515, + "eval_samples_per_second": 451.159, + "eval_steps_per_second": 14.099, + "step": 296000 + }, + { + "epoch": 2.78, + "learning_rate": 2.2175822525853496e-05, + "loss": 1.3522, + "step": 296500 + }, + { + "epoch": 2.78, + "eval_loss": 1.1773130893707275, + "eval_runtime": 221.623, + "eval_samples_per_second": 451.217, + "eval_steps_per_second": 14.101, + "step": 296500 + }, + { + "epoch": 2.79, + "learning_rate": 2.2128901484581746e-05, + "loss": 1.3459, + "step": 297000 + }, + { + "epoch": 2.79, + "eval_loss": 1.1674100160598755, + "eval_runtime": 221.5803, + "eval_samples_per_second": 451.304, + "eval_steps_per_second": 14.103, + "step": 297000 + }, + { + "epoch": 2.79, + "learning_rate": 2.208198044331e-05, + "loss": 1.3634, + "step": 297500 + }, + { + "epoch": 2.79, + "eval_loss": 1.1741117238998413, + "eval_runtime": 221.517, + "eval_samples_per_second": 451.433, + "eval_steps_per_second": 14.107, + "step": 297500 + }, + { + "epoch": 2.8, + "learning_rate": 2.2035059402038252e-05, + "loss": 1.3529, + "step": 298000 + }, + { + "epoch": 2.8, + "eval_loss": 1.1743310689926147, + "eval_runtime": 221.3651, + "eval_samples_per_second": 451.742, + "eval_steps_per_second": 14.117, + "step": 298000 + }, + { + "epoch": 2.8, + "learning_rate": 2.1988138360766502e-05, + "loss": 1.3524, + "step": 298500 + }, + { + "epoch": 2.8, + "eval_loss": 1.172131896018982, + "eval_runtime": 221.343, + "eval_samples_per_second": 451.788, + "eval_steps_per_second": 14.118, + "step": 298500 + }, + { + "epoch": 2.81, + "learning_rate": 2.1941217319494755e-05, + "loss": 1.349, + "step": 299000 + }, + { + "epoch": 2.81, + "eval_loss": 1.1744838953018188, + "eval_runtime": 221.3399, + "eval_samples_per_second": 451.794, + "eval_steps_per_second": 14.119, + "step": 299000 + }, + { + "epoch": 2.81, + "learning_rate": 2.1894296278223005e-05, + "loss": 1.3658, + "step": 299500 + }, + { + "epoch": 2.81, + "eval_loss": 1.167363166809082, + "eval_runtime": 221.371, + "eval_samples_per_second": 451.73, + "eval_steps_per_second": 14.117, + "step": 299500 + }, + { + "epoch": 2.82, + "learning_rate": 2.184737523695126e-05, + "loss": 1.3663, + "step": 300000 + }, + { + "epoch": 2.82, + "eval_loss": 1.1738266944885254, + "eval_runtime": 221.3431, + "eval_samples_per_second": 451.787, + "eval_steps_per_second": 14.118, + "step": 300000 + }, + { + "epoch": 2.82, + "learning_rate": 2.1800454195679512e-05, + "loss": 1.3473, + "step": 300500 + }, + { + "epoch": 2.82, + "eval_loss": 1.173740267753601, + "eval_runtime": 221.0442, + "eval_samples_per_second": 452.398, + "eval_steps_per_second": 14.137, + "step": 300500 + }, + { + "epoch": 2.82, + "learning_rate": 2.175353315440776e-05, + "loss": 1.3589, + "step": 301000 + }, + { + "epoch": 2.82, + "eval_loss": 1.1683049201965332, + "eval_runtime": 221.0696, + "eval_samples_per_second": 452.346, + "eval_steps_per_second": 14.136, + "step": 301000 + }, + { + "epoch": 2.83, + "learning_rate": 2.1706612113136015e-05, + "loss": 1.347, + "step": 301500 + }, + { + "epoch": 2.83, + "eval_loss": 1.1638891696929932, + "eval_runtime": 221.0645, + "eval_samples_per_second": 452.357, + "eval_steps_per_second": 14.136, + "step": 301500 + }, + { + "epoch": 2.83, + "learning_rate": 2.1659691071864268e-05, + "loss": 1.3421, + "step": 302000 + }, + { + "epoch": 2.83, + "eval_loss": 1.1762233972549438, + "eval_runtime": 221.1041, + "eval_samples_per_second": 452.276, + "eval_steps_per_second": 14.134, + "step": 302000 + }, + { + "epoch": 2.84, + "learning_rate": 2.1612770030592518e-05, + "loss": 1.3507, + "step": 302500 + }, + { + "epoch": 2.84, + "eval_loss": 1.17283034324646, + "eval_runtime": 221.1012, + "eval_samples_per_second": 452.282, + "eval_steps_per_second": 14.134, + "step": 302500 + }, + { + "epoch": 2.84, + "learning_rate": 2.156584898932077e-05, + "loss": 1.3247, + "step": 303000 + }, + { + "epoch": 2.84, + "eval_loss": 1.1722640991210938, + "eval_runtime": 221.0672, + "eval_samples_per_second": 452.351, + "eval_steps_per_second": 14.136, + "step": 303000 + }, + { + "epoch": 2.85, + "learning_rate": 2.1518927948049025e-05, + "loss": 1.3373, + "step": 303500 + }, + { + "epoch": 2.85, + "eval_loss": 1.1675173044204712, + "eval_runtime": 221.061, + "eval_samples_per_second": 452.364, + "eval_steps_per_second": 14.136, + "step": 303500 + }, + { + "epoch": 2.85, + "learning_rate": 2.1472006906777275e-05, + "loss": 1.347, + "step": 304000 + }, + { + "epoch": 2.85, + "eval_loss": 1.169805645942688, + "eval_runtime": 221.2478, + "eval_samples_per_second": 451.982, + "eval_steps_per_second": 14.124, + "step": 304000 + }, + { + "epoch": 2.86, + "learning_rate": 2.1425085865505528e-05, + "loss": 1.3549, + "step": 304500 + }, + { + "epoch": 2.86, + "eval_loss": 1.1683489084243774, + "eval_runtime": 221.2569, + "eval_samples_per_second": 451.963, + "eval_steps_per_second": 14.124, + "step": 304500 + }, + { + "epoch": 2.86, + "learning_rate": 2.137816482423378e-05, + "loss": 1.3406, + "step": 305000 + }, + { + "epoch": 2.86, + "eval_loss": 1.1658775806427002, + "eval_runtime": 221.1911, + "eval_samples_per_second": 452.098, + "eval_steps_per_second": 14.128, + "step": 305000 + }, + { + "epoch": 2.87, + "learning_rate": 2.133124378296203e-05, + "loss": 1.3497, + "step": 305500 + }, + { + "epoch": 2.87, + "eval_loss": 1.1645755767822266, + "eval_runtime": 221.1253, + "eval_samples_per_second": 452.232, + "eval_steps_per_second": 14.132, + "step": 305500 + }, + { + "epoch": 2.87, + "learning_rate": 2.1284322741690284e-05, + "loss": 1.3615, + "step": 306000 + }, + { + "epoch": 2.87, + "eval_loss": 1.1647412776947021, + "eval_runtime": 220.9938, + "eval_samples_per_second": 452.501, + "eval_steps_per_second": 14.141, + "step": 306000 + }, + { + "epoch": 2.88, + "learning_rate": 2.1237401700418534e-05, + "loss": 1.3281, + "step": 306500 + }, + { + "epoch": 2.88, + "eval_loss": 1.1673572063446045, + "eval_runtime": 220.9974, + "eval_samples_per_second": 452.494, + "eval_steps_per_second": 14.14, + "step": 306500 + }, + { + "epoch": 2.88, + "learning_rate": 2.1190480659146787e-05, + "loss": 1.3447, + "step": 307000 + }, + { + "epoch": 2.88, + "eval_loss": 1.1644020080566406, + "eval_runtime": 221.0059, + "eval_samples_per_second": 452.477, + "eval_steps_per_second": 14.14, + "step": 307000 + }, + { + "epoch": 2.89, + "learning_rate": 2.114355961787504e-05, + "loss": 1.3518, + "step": 307500 + }, + { + "epoch": 2.89, + "eval_loss": 1.1606210470199585, + "eval_runtime": 220.9981, + "eval_samples_per_second": 452.493, + "eval_steps_per_second": 14.14, + "step": 307500 + }, + { + "epoch": 2.89, + "learning_rate": 2.109663857660329e-05, + "loss": 1.3268, + "step": 308000 + }, + { + "epoch": 2.89, + "eval_loss": 1.172353982925415, + "eval_runtime": 220.9928, + "eval_samples_per_second": 452.503, + "eval_steps_per_second": 14.141, + "step": 308000 + }, + { + "epoch": 2.9, + "learning_rate": 2.1049717535331544e-05, + "loss": 1.3281, + "step": 308500 + }, + { + "epoch": 2.9, + "eval_loss": 1.1636762619018555, + "eval_runtime": 221.0284, + "eval_samples_per_second": 452.431, + "eval_steps_per_second": 14.138, + "step": 308500 + }, + { + "epoch": 2.9, + "learning_rate": 2.1002796494059797e-05, + "loss": 1.3361, + "step": 309000 + }, + { + "epoch": 2.9, + "eval_loss": 1.1628670692443848, + "eval_runtime": 221.0125, + "eval_samples_per_second": 452.463, + "eval_steps_per_second": 14.139, + "step": 309000 + }, + { + "epoch": 2.9, + "learning_rate": 2.0955875452788047e-05, + "loss": 1.3297, + "step": 309500 + }, + { + "epoch": 2.9, + "eval_loss": 1.1588451862335205, + "eval_runtime": 221.003, + "eval_samples_per_second": 452.482, + "eval_steps_per_second": 14.14, + "step": 309500 + }, + { + "epoch": 2.91, + "learning_rate": 2.09089544115163e-05, + "loss": 1.3492, + "step": 310000 + }, + { + "epoch": 2.91, + "eval_loss": 1.1633425951004028, + "eval_runtime": 220.9707, + "eval_samples_per_second": 452.549, + "eval_steps_per_second": 14.142, + "step": 310000 + }, + { + "epoch": 2.91, + "learning_rate": 2.0862033370244554e-05, + "loss": 1.3419, + "step": 310500 + }, + { + "epoch": 2.91, + "eval_loss": 1.160988211631775, + "eval_runtime": 220.9533, + "eval_samples_per_second": 452.584, + "eval_steps_per_second": 14.143, + "step": 310500 + }, + { + "epoch": 2.92, + "learning_rate": 2.0815112328972804e-05, + "loss": 1.3277, + "step": 311000 + }, + { + "epoch": 2.92, + "eval_loss": 1.1551365852355957, + "eval_runtime": 220.9421, + "eval_samples_per_second": 452.607, + "eval_steps_per_second": 14.144, + "step": 311000 + }, + { + "epoch": 2.92, + "learning_rate": 2.0768191287701057e-05, + "loss": 1.3402, + "step": 311500 + }, + { + "epoch": 2.92, + "eval_loss": 1.163130283355713, + "eval_runtime": 220.9627, + "eval_samples_per_second": 452.565, + "eval_steps_per_second": 14.143, + "step": 311500 + }, + { + "epoch": 2.93, + "learning_rate": 2.072127024642931e-05, + "loss": 1.329, + "step": 312000 + }, + { + "epoch": 2.93, + "eval_loss": 1.1555798053741455, + "eval_runtime": 221.0607, + "eval_samples_per_second": 452.364, + "eval_steps_per_second": 14.136, + "step": 312000 + }, + { + "epoch": 2.93, + "learning_rate": 2.067434920515756e-05, + "loss": 1.3467, + "step": 312500 + }, + { + "epoch": 2.93, + "eval_loss": 1.1523679494857788, + "eval_runtime": 221.1292, + "eval_samples_per_second": 452.224, + "eval_steps_per_second": 14.132, + "step": 312500 + }, + { + "epoch": 2.94, + "learning_rate": 2.0627428163885813e-05, + "loss": 1.3574, + "step": 313000 + }, + { + "epoch": 2.94, + "eval_loss": 1.159830093383789, + "eval_runtime": 221.0551, + "eval_samples_per_second": 452.376, + "eval_steps_per_second": 14.137, + "step": 313000 + }, + { + "epoch": 2.94, + "learning_rate": 2.0580507122614067e-05, + "loss": 1.343, + "step": 313500 + }, + { + "epoch": 2.94, + "eval_loss": 1.1504855155944824, + "eval_runtime": 221.0384, + "eval_samples_per_second": 452.41, + "eval_steps_per_second": 14.138, + "step": 313500 + }, + { + "epoch": 2.95, + "learning_rate": 2.0533586081342317e-05, + "loss": 1.3234, + "step": 314000 + }, + { + "epoch": 2.95, + "eval_loss": 1.1518688201904297, + "eval_runtime": 220.9554, + "eval_samples_per_second": 452.58, + "eval_steps_per_second": 14.143, + "step": 314000 + }, + { + "epoch": 2.95, + "learning_rate": 2.048666504007057e-05, + "loss": 1.3183, + "step": 314500 + }, + { + "epoch": 2.95, + "eval_loss": 1.159049391746521, + "eval_runtime": 220.9401, + "eval_samples_per_second": 452.611, + "eval_steps_per_second": 14.144, + "step": 314500 + }, + { + "epoch": 2.96, + "learning_rate": 2.0439743998798823e-05, + "loss": 1.3662, + "step": 315000 + }, + { + "epoch": 2.96, + "eval_loss": 1.15578031539917, + "eval_runtime": 220.905, + "eval_samples_per_second": 452.683, + "eval_steps_per_second": 14.146, + "step": 315000 + }, + { + "epoch": 2.96, + "learning_rate": 2.0392822957527073e-05, + "loss": 1.3285, + "step": 315500 + }, + { + "epoch": 2.96, + "eval_loss": 1.1557625532150269, + "eval_runtime": 220.9213, + "eval_samples_per_second": 452.65, + "eval_steps_per_second": 14.145, + "step": 315500 + }, + { + "epoch": 2.97, + "learning_rate": 2.0345901916255326e-05, + "loss": 1.3375, + "step": 316000 + }, + { + "epoch": 2.97, + "eval_loss": 1.153180480003357, + "eval_runtime": 220.9165, + "eval_samples_per_second": 452.66, + "eval_steps_per_second": 14.146, + "step": 316000 + }, + { + "epoch": 2.97, + "learning_rate": 2.029898087498358e-05, + "loss": 1.3208, + "step": 316500 + }, + { + "epoch": 2.97, + "eval_loss": 1.1530534029006958, + "eval_runtime": 220.9013, + "eval_samples_per_second": 452.691, + "eval_steps_per_second": 14.147, + "step": 316500 + }, + { + "epoch": 2.97, + "learning_rate": 2.025205983371183e-05, + "loss": 1.3245, + "step": 317000 + }, + { + "epoch": 2.97, + "eval_loss": 1.152458667755127, + "eval_runtime": 220.9087, + "eval_samples_per_second": 452.676, + "eval_steps_per_second": 14.146, + "step": 317000 + }, + { + "epoch": 2.98, + "learning_rate": 2.0205138792440083e-05, + "loss": 1.3383, + "step": 317500 + }, + { + "epoch": 2.98, + "eval_loss": 1.154115915298462, + "eval_runtime": 220.909, + "eval_samples_per_second": 452.675, + "eval_steps_per_second": 14.146, + "step": 317500 + }, + { + "epoch": 2.98, + "learning_rate": 2.0158217751168336e-05, + "loss": 1.3139, + "step": 318000 + }, + { + "epoch": 2.98, + "eval_loss": 1.1526515483856201, + "eval_runtime": 220.8871, + "eval_samples_per_second": 452.72, + "eval_steps_per_second": 14.147, + "step": 318000 + }, + { + "epoch": 2.99, + "learning_rate": 2.011129670989659e-05, + "loss": 1.318, + "step": 318500 + }, + { + "epoch": 2.99, + "eval_loss": 1.1521937847137451, + "eval_runtime": 220.928, + "eval_samples_per_second": 452.636, + "eval_steps_per_second": 14.145, + "step": 318500 + }, + { + "epoch": 2.99, + "learning_rate": 2.006437566862484e-05, + "loss": 1.3276, + "step": 319000 + }, + { + "epoch": 2.99, + "eval_loss": 1.1527777910232544, + "eval_runtime": 220.9248, + "eval_samples_per_second": 452.643, + "eval_steps_per_second": 14.145, + "step": 319000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0017454627353092e-05, + "loss": 1.3318, + "step": 319500 + }, + { + "epoch": 3.0, + "eval_loss": 1.1514686346054077, + "eval_runtime": 220.8852, + "eval_samples_per_second": 452.724, + "eval_steps_per_second": 14.148, + "step": 319500 + }, + { + "epoch": 3.0, + "learning_rate": 1.9970533586081346e-05, + "loss": 1.3396, + "step": 320000 + }, + { + "epoch": 3.0, + "eval_loss": 1.153663158416748, + "eval_runtime": 220.9135, + "eval_samples_per_second": 452.666, + "eval_steps_per_second": 14.146, + "step": 320000 + }, + { + "epoch": 3.01, + "learning_rate": 1.9923612544809596e-05, + "loss": 1.3293, + "step": 320500 + }, + { + "epoch": 3.01, + "eval_loss": 1.1468321084976196, + "eval_runtime": 220.9867, + "eval_samples_per_second": 452.516, + "eval_steps_per_second": 14.141, + "step": 320500 + }, + { + "epoch": 3.01, + "learning_rate": 1.987669150353785e-05, + "loss": 1.3166, + "step": 321000 + }, + { + "epoch": 3.01, + "eval_loss": 1.1455674171447754, + "eval_runtime": 221.0441, + "eval_samples_per_second": 452.398, + "eval_steps_per_second": 14.137, + "step": 321000 + }, + { + "epoch": 3.02, + "learning_rate": 1.9829770462266102e-05, + "loss": 1.3082, + "step": 321500 + }, + { + "epoch": 3.02, + "eval_loss": 1.1440938711166382, + "eval_runtime": 220.9927, + "eval_samples_per_second": 452.504, + "eval_steps_per_second": 14.141, + "step": 321500 + }, + { + "epoch": 3.02, + "learning_rate": 1.9782849420994352e-05, + "loss": 1.3352, + "step": 322000 + }, + { + "epoch": 3.02, + "eval_loss": 1.1490601301193237, + "eval_runtime": 220.9673, + "eval_samples_per_second": 452.556, + "eval_steps_per_second": 14.142, + "step": 322000 + }, + { + "epoch": 3.03, + "learning_rate": 1.9735928379722605e-05, + "loss": 1.3217, + "step": 322500 + }, + { + "epoch": 3.03, + "eval_loss": 1.1430158615112305, + "eval_runtime": 220.8559, + "eval_samples_per_second": 452.784, + "eval_steps_per_second": 14.149, + "step": 322500 + }, + { + "epoch": 3.03, + "learning_rate": 1.968900733845086e-05, + "loss": 1.2891, + "step": 323000 + }, + { + "epoch": 3.03, + "eval_loss": 1.151960849761963, + "eval_runtime": 220.8197, + "eval_samples_per_second": 452.858, + "eval_steps_per_second": 14.152, + "step": 323000 + }, + { + "epoch": 3.04, + "learning_rate": 1.964208629717911e-05, + "loss": 1.3187, + "step": 323500 + }, + { + "epoch": 3.04, + "eval_loss": 1.1531556844711304, + "eval_runtime": 220.8223, + "eval_samples_per_second": 452.853, + "eval_steps_per_second": 14.152, + "step": 323500 + }, + { + "epoch": 3.04, + "learning_rate": 1.9595165255907362e-05, + "loss": 1.3057, + "step": 324000 + }, + { + "epoch": 3.04, + "eval_loss": 1.1435818672180176, + "eval_runtime": 220.8506, + "eval_samples_per_second": 452.795, + "eval_steps_per_second": 14.15, + "step": 324000 + }, + { + "epoch": 3.05, + "learning_rate": 1.9548244214635612e-05, + "loss": 1.3006, + "step": 324500 + }, + { + "epoch": 3.05, + "eval_loss": 1.1423208713531494, + "eval_runtime": 220.8327, + "eval_samples_per_second": 452.832, + "eval_steps_per_second": 14.151, + "step": 324500 + }, + { + "epoch": 3.05, + "learning_rate": 1.9501323173363865e-05, + "loss": 1.3062, + "step": 325000 + }, + { + "epoch": 3.05, + "eval_loss": 1.1395783424377441, + "eval_runtime": 220.8318, + "eval_samples_per_second": 452.833, + "eval_steps_per_second": 14.151, + "step": 325000 + }, + { + "epoch": 3.05, + "learning_rate": 1.945440213209212e-05, + "loss": 1.2993, + "step": 325500 + }, + { + "epoch": 3.05, + "eval_loss": 1.139084815979004, + "eval_runtime": 220.827, + "eval_samples_per_second": 452.843, + "eval_steps_per_second": 14.151, + "step": 325500 + }, + { + "epoch": 3.06, + "learning_rate": 1.9407481090820368e-05, + "loss": 1.3258, + "step": 326000 + }, + { + "epoch": 3.06, + "eval_loss": 1.1458795070648193, + "eval_runtime": 220.8245, + "eval_samples_per_second": 452.848, + "eval_steps_per_second": 14.152, + "step": 326000 + }, + { + "epoch": 3.06, + "learning_rate": 1.936056004954862e-05, + "loss": 1.3166, + "step": 326500 + }, + { + "epoch": 3.06, + "eval_loss": 1.1415066719055176, + "eval_runtime": 220.8333, + "eval_samples_per_second": 452.83, + "eval_steps_per_second": 14.151, + "step": 326500 + }, + { + "epoch": 3.07, + "learning_rate": 1.9313639008276875e-05, + "loss": 1.3062, + "step": 327000 + }, + { + "epoch": 3.07, + "eval_loss": 1.141654372215271, + "eval_runtime": 220.8651, + "eval_samples_per_second": 452.765, + "eval_steps_per_second": 14.149, + "step": 327000 + }, + { + "epoch": 3.07, + "learning_rate": 1.9266717967005125e-05, + "loss": 1.3091, + "step": 327500 + }, + { + "epoch": 3.07, + "eval_loss": 1.1461706161499023, + "eval_runtime": 220.8785, + "eval_samples_per_second": 452.738, + "eval_steps_per_second": 14.148, + "step": 327500 + }, + { + "epoch": 3.08, + "learning_rate": 1.9219796925733378e-05, + "loss": 1.2958, + "step": 328000 + }, + { + "epoch": 3.08, + "eval_loss": 1.1399269104003906, + "eval_runtime": 220.8706, + "eval_samples_per_second": 452.754, + "eval_steps_per_second": 14.149, + "step": 328000 + }, + { + "epoch": 3.08, + "learning_rate": 1.917287588446163e-05, + "loss": 1.3067, + "step": 328500 + }, + { + "epoch": 3.08, + "eval_loss": 1.144376516342163, + "eval_runtime": 220.8285, + "eval_samples_per_second": 452.84, + "eval_steps_per_second": 14.151, + "step": 328500 + }, + { + "epoch": 3.09, + "learning_rate": 1.912595484318988e-05, + "loss": 1.319, + "step": 329000 + }, + { + "epoch": 3.09, + "eval_loss": 1.1402838230133057, + "eval_runtime": 220.844, + "eval_samples_per_second": 452.808, + "eval_steps_per_second": 14.15, + "step": 329000 + }, + { + "epoch": 3.09, + "learning_rate": 1.9079033801918134e-05, + "loss": 1.3399, + "step": 329500 + }, + { + "epoch": 3.09, + "eval_loss": 1.139337182044983, + "eval_runtime": 220.9399, + "eval_samples_per_second": 452.612, + "eval_steps_per_second": 14.144, + "step": 329500 + }, + { + "epoch": 3.1, + "learning_rate": 1.9032112760646384e-05, + "loss": 1.3077, + "step": 330000 + }, + { + "epoch": 3.1, + "eval_loss": 1.1460225582122803, + "eval_runtime": 220.9227, + "eval_samples_per_second": 452.647, + "eval_steps_per_second": 14.145, + "step": 330000 + }, + { + "epoch": 3.1, + "learning_rate": 1.8985191719374638e-05, + "loss": 1.2905, + "step": 330500 + }, + { + "epoch": 3.1, + "eval_loss": 1.1372867822647095, + "eval_runtime": 220.8826, + "eval_samples_per_second": 452.729, + "eval_steps_per_second": 14.148, + "step": 330500 + }, + { + "epoch": 3.11, + "learning_rate": 1.893827067810289e-05, + "loss": 1.3308, + "step": 331000 + }, + { + "epoch": 3.11, + "eval_loss": 1.1457699537277222, + "eval_runtime": 220.9045, + "eval_samples_per_second": 452.684, + "eval_steps_per_second": 14.146, + "step": 331000 + }, + { + "epoch": 3.11, + "learning_rate": 1.889134963683114e-05, + "loss": 1.311, + "step": 331500 + }, + { + "epoch": 3.11, + "eval_loss": 1.1365880966186523, + "eval_runtime": 220.7213, + "eval_samples_per_second": 453.06, + "eval_steps_per_second": 14.158, + "step": 331500 + }, + { + "epoch": 3.12, + "learning_rate": 1.8844428595559394e-05, + "loss": 1.3123, + "step": 332000 + }, + { + "epoch": 3.12, + "eval_loss": 1.1372052431106567, + "eval_runtime": 220.7014, + "eval_samples_per_second": 453.101, + "eval_steps_per_second": 14.159, + "step": 332000 + }, + { + "epoch": 3.12, + "learning_rate": 1.8797507554287647e-05, + "loss": 1.29, + "step": 332500 + }, + { + "epoch": 3.12, + "eval_loss": 1.1368725299835205, + "eval_runtime": 220.7248, + "eval_samples_per_second": 453.053, + "eval_steps_per_second": 14.158, + "step": 332500 + }, + { + "epoch": 3.12, + "learning_rate": 1.8750586513015897e-05, + "loss": 1.2973, + "step": 333000 + }, + { + "epoch": 3.12, + "eval_loss": 1.1378732919692993, + "eval_runtime": 220.7445, + "eval_samples_per_second": 453.012, + "eval_steps_per_second": 14.157, + "step": 333000 + }, + { + "epoch": 3.13, + "learning_rate": 1.870366547174415e-05, + "loss": 1.294, + "step": 333500 + }, + { + "epoch": 3.13, + "eval_loss": 1.134841799736023, + "eval_runtime": 220.7851, + "eval_samples_per_second": 452.929, + "eval_steps_per_second": 14.154, + "step": 333500 + }, + { + "epoch": 3.13, + "learning_rate": 1.8656744430472404e-05, + "loss": 1.3215, + "step": 334000 + }, + { + "epoch": 3.13, + "eval_loss": 1.135761022567749, + "eval_runtime": 220.7626, + "eval_samples_per_second": 452.975, + "eval_steps_per_second": 14.155, + "step": 334000 + }, + { + "epoch": 3.14, + "learning_rate": 1.8609823389200654e-05, + "loss": 1.3083, + "step": 334500 + }, + { + "epoch": 3.14, + "eval_loss": 1.135439157485962, + "eval_runtime": 220.7917, + "eval_samples_per_second": 452.916, + "eval_steps_per_second": 14.154, + "step": 334500 + }, + { + "epoch": 3.14, + "learning_rate": 1.8562902347928907e-05, + "loss": 1.2956, + "step": 335000 + }, + { + "epoch": 3.14, + "eval_loss": 1.1306705474853516, + "eval_runtime": 220.7752, + "eval_samples_per_second": 452.949, + "eval_steps_per_second": 14.155, + "step": 335000 + }, + { + "epoch": 3.15, + "learning_rate": 1.8515981306657157e-05, + "loss": 1.2909, + "step": 335500 + }, + { + "epoch": 3.15, + "eval_loss": 1.1373645067214966, + "eval_runtime": 220.7747, + "eval_samples_per_second": 452.95, + "eval_steps_per_second": 14.155, + "step": 335500 + }, + { + "epoch": 3.15, + "learning_rate": 1.846906026538541e-05, + "loss": 1.3079, + "step": 336000 + }, + { + "epoch": 3.15, + "eval_loss": 1.1389541625976562, + "eval_runtime": 220.7343, + "eval_samples_per_second": 453.033, + "eval_steps_per_second": 14.157, + "step": 336000 + }, + { + "epoch": 3.16, + "learning_rate": 1.8422139224113664e-05, + "loss": 1.2948, + "step": 336500 + }, + { + "epoch": 3.16, + "eval_loss": 1.13786780834198, + "eval_runtime": 220.7342, + "eval_samples_per_second": 453.034, + "eval_steps_per_second": 14.157, + "step": 336500 + }, + { + "epoch": 3.16, + "learning_rate": 1.8375218182841913e-05, + "loss": 1.287, + "step": 337000 + }, + { + "epoch": 3.16, + "eval_loss": 1.1371397972106934, + "eval_runtime": 220.7289, + "eval_samples_per_second": 453.044, + "eval_steps_per_second": 14.158, + "step": 337000 + }, + { + "epoch": 3.17, + "learning_rate": 1.8328297141570167e-05, + "loss": 1.3254, + "step": 337500 + }, + { + "epoch": 3.17, + "eval_loss": 1.1344914436340332, + "eval_runtime": 220.7461, + "eval_samples_per_second": 453.009, + "eval_steps_per_second": 14.157, + "step": 337500 + }, + { + "epoch": 3.17, + "learning_rate": 1.828137610029842e-05, + "loss": 1.2886, + "step": 338000 + }, + { + "epoch": 3.17, + "eval_loss": 1.1374002695083618, + "eval_runtime": 220.7432, + "eval_samples_per_second": 453.015, + "eval_steps_per_second": 14.157, + "step": 338000 + }, + { + "epoch": 3.18, + "learning_rate": 1.823445505902667e-05, + "loss": 1.3021, + "step": 338500 + }, + { + "epoch": 3.18, + "eval_loss": 1.1403025388717651, + "eval_runtime": 220.7142, + "eval_samples_per_second": 453.075, + "eval_steps_per_second": 14.159, + "step": 338500 + }, + { + "epoch": 3.18, + "learning_rate": 1.8187534017754923e-05, + "loss": 1.2922, + "step": 339000 + }, + { + "epoch": 3.18, + "eval_loss": 1.1281777620315552, + "eval_runtime": 220.889, + "eval_samples_per_second": 452.716, + "eval_steps_per_second": 14.147, + "step": 339000 + }, + { + "epoch": 3.19, + "learning_rate": 1.8140612976483176e-05, + "loss": 1.305, + "step": 339500 + }, + { + "epoch": 3.19, + "eval_loss": 1.1293456554412842, + "eval_runtime": 220.8438, + "eval_samples_per_second": 452.809, + "eval_steps_per_second": 14.15, + "step": 339500 + }, + { + "epoch": 3.19, + "learning_rate": 1.8093691935211426e-05, + "loss": 1.3056, + "step": 340000 + }, + { + "epoch": 3.19, + "eval_loss": 1.1341973543167114, + "eval_runtime": 220.8184, + "eval_samples_per_second": 452.861, + "eval_steps_per_second": 14.152, + "step": 340000 + }, + { + "epoch": 3.2, + "learning_rate": 1.804677089393968e-05, + "loss": 1.2914, + "step": 340500 + }, + { + "epoch": 3.2, + "eval_loss": 1.133408546447754, + "eval_runtime": 220.7019, + "eval_samples_per_second": 453.1, + "eval_steps_per_second": 14.159, + "step": 340500 + }, + { + "epoch": 3.2, + "learning_rate": 1.799984985266793e-05, + "loss": 1.3, + "step": 341000 + }, + { + "epoch": 3.2, + "eval_loss": 1.1342601776123047, + "eval_runtime": 220.5902, + "eval_samples_per_second": 453.329, + "eval_steps_per_second": 14.167, + "step": 341000 + }, + { + "epoch": 3.2, + "learning_rate": 1.7952928811396183e-05, + "loss": 1.2899, + "step": 341500 + }, + { + "epoch": 3.2, + "eval_loss": 1.1248433589935303, + "eval_runtime": 220.5614, + "eval_samples_per_second": 453.388, + "eval_steps_per_second": 14.168, + "step": 341500 + }, + { + "epoch": 3.21, + "learning_rate": 1.7906007770124436e-05, + "loss": 1.315, + "step": 342000 + }, + { + "epoch": 3.21, + "eval_loss": 1.1279655694961548, + "eval_runtime": 220.57, + "eval_samples_per_second": 453.371, + "eval_steps_per_second": 14.168, + "step": 342000 + }, + { + "epoch": 3.21, + "learning_rate": 1.7859086728852686e-05, + "loss": 1.3153, + "step": 342500 + }, + { + "epoch": 3.21, + "eval_loss": 1.125070571899414, + "eval_runtime": 220.6119, + "eval_samples_per_second": 453.285, + "eval_steps_per_second": 14.165, + "step": 342500 + }, + { + "epoch": 3.22, + "learning_rate": 1.781216568758094e-05, + "loss": 1.3028, + "step": 343000 + }, + { + "epoch": 3.22, + "eval_loss": 1.1249561309814453, + "eval_runtime": 220.5892, + "eval_samples_per_second": 453.331, + "eval_steps_per_second": 14.167, + "step": 343000 + }, + { + "epoch": 3.22, + "learning_rate": 1.7765244646309193e-05, + "loss": 1.2917, + "step": 343500 + }, + { + "epoch": 3.22, + "eval_loss": 1.1268163919448853, + "eval_runtime": 220.6095, + "eval_samples_per_second": 453.29, + "eval_steps_per_second": 14.165, + "step": 343500 + }, + { + "epoch": 3.23, + "learning_rate": 1.7718323605037442e-05, + "loss": 1.3129, + "step": 344000 + }, + { + "epoch": 3.23, + "eval_loss": 1.130949854850769, + "eval_runtime": 220.5805, + "eval_samples_per_second": 453.349, + "eval_steps_per_second": 14.167, + "step": 344000 + }, + { + "epoch": 3.23, + "learning_rate": 1.7671402563765696e-05, + "loss": 1.3006, + "step": 344500 + }, + { + "epoch": 3.23, + "eval_loss": 1.130674958229065, + "eval_runtime": 220.5791, + "eval_samples_per_second": 453.352, + "eval_steps_per_second": 14.167, + "step": 344500 + }, + { + "epoch": 3.24, + "learning_rate": 1.762448152249395e-05, + "loss": 1.296, + "step": 345000 + }, + { + "epoch": 3.24, + "eval_loss": 1.1254706382751465, + "eval_runtime": 220.6122, + "eval_samples_per_second": 453.284, + "eval_steps_per_second": 14.165, + "step": 345000 + }, + { + "epoch": 3.24, + "learning_rate": 1.75775604812222e-05, + "loss": 1.2797, + "step": 345500 + }, + { + "epoch": 3.24, + "eval_loss": 1.1241164207458496, + "eval_runtime": 220.5527, + "eval_samples_per_second": 453.406, + "eval_steps_per_second": 14.169, + "step": 345500 + }, + { + "epoch": 3.25, + "learning_rate": 1.7530639439950452e-05, + "loss": 1.2894, + "step": 346000 + }, + { + "epoch": 3.25, + "eval_loss": 1.1250615119934082, + "eval_runtime": 220.5648, + "eval_samples_per_second": 453.381, + "eval_steps_per_second": 14.168, + "step": 346000 + }, + { + "epoch": 3.25, + "learning_rate": 1.7483718398678706e-05, + "loss": 1.2935, + "step": 346500 + }, + { + "epoch": 3.25, + "eval_loss": 1.1238473653793335, + "eval_runtime": 220.5533, + "eval_samples_per_second": 453.405, + "eval_steps_per_second": 14.169, + "step": 346500 + }, + { + "epoch": 3.26, + "learning_rate": 1.7436797357406955e-05, + "loss": 1.2986, + "step": 347000 + }, + { + "epoch": 3.26, + "eval_loss": 1.1256763935089111, + "eval_runtime": 220.5765, + "eval_samples_per_second": 453.358, + "eval_steps_per_second": 14.167, + "step": 347000 + }, + { + "epoch": 3.26, + "learning_rate": 1.738987631613521e-05, + "loss": 1.3051, + "step": 347500 + }, + { + "epoch": 3.26, + "eval_loss": 1.1246867179870605, + "eval_runtime": 220.5788, + "eval_samples_per_second": 453.353, + "eval_steps_per_second": 14.167, + "step": 347500 + }, + { + "epoch": 3.27, + "learning_rate": 1.734295527486346e-05, + "loss": 1.2825, + "step": 348000 + }, + { + "epoch": 3.27, + "eval_loss": 1.1220121383666992, + "eval_runtime": 220.5841, + "eval_samples_per_second": 453.342, + "eval_steps_per_second": 14.167, + "step": 348000 + }, + { + "epoch": 3.27, + "learning_rate": 1.7296034233591712e-05, + "loss": 1.3064, + "step": 348500 + }, + { + "epoch": 3.27, + "eval_loss": 1.125480055809021, + "eval_runtime": 220.6229, + "eval_samples_per_second": 453.262, + "eval_steps_per_second": 14.164, + "step": 348500 + }, + { + "epoch": 3.28, + "learning_rate": 1.7249113192319965e-05, + "loss": 1.3057, + "step": 349000 + }, + { + "epoch": 3.28, + "eval_loss": 1.122577428817749, + "eval_runtime": 220.7794, + "eval_samples_per_second": 452.941, + "eval_steps_per_second": 14.154, + "step": 349000 + }, + { + "epoch": 3.28, + "learning_rate": 1.7202192151048215e-05, + "loss": 1.3045, + "step": 349500 + }, + { + "epoch": 3.28, + "eval_loss": 1.1199777126312256, + "eval_runtime": 220.7547, + "eval_samples_per_second": 452.991, + "eval_steps_per_second": 14.156, + "step": 349500 + }, + { + "epoch": 3.28, + "learning_rate": 1.715527110977647e-05, + "loss": 1.2658, + "step": 350000 + }, + { + "epoch": 3.28, + "eval_loss": 1.1250447034835815, + "eval_runtime": 220.7117, + "eval_samples_per_second": 453.08, + "eval_steps_per_second": 14.159, + "step": 350000 + }, + { + "epoch": 3.29, + "learning_rate": 1.710835006850472e-05, + "loss": 1.2736, + "step": 350500 + }, + { + "epoch": 3.29, + "eval_loss": 1.1251693964004517, + "eval_runtime": 220.6204, + "eval_samples_per_second": 453.267, + "eval_steps_per_second": 14.165, + "step": 350500 + }, + { + "epoch": 3.29, + "learning_rate": 1.706142902723297e-05, + "loss": 1.3004, + "step": 351000 + }, + { + "epoch": 3.29, + "eval_loss": 1.121744155883789, + "eval_runtime": 220.47, + "eval_samples_per_second": 453.576, + "eval_steps_per_second": 14.174, + "step": 351000 + }, + { + "epoch": 3.3, + "learning_rate": 1.7014507985961225e-05, + "loss": 1.3014, + "step": 351500 + }, + { + "epoch": 3.3, + "eval_loss": 1.1183676719665527, + "eval_runtime": 220.4996, + "eval_samples_per_second": 453.516, + "eval_steps_per_second": 14.172, + "step": 351500 + }, + { + "epoch": 3.3, + "learning_rate": 1.6967586944689478e-05, + "loss": 1.2903, + "step": 352000 + }, + { + "epoch": 3.3, + "eval_loss": 1.128160834312439, + "eval_runtime": 220.4904, + "eval_samples_per_second": 453.534, + "eval_steps_per_second": 14.173, + "step": 352000 + }, + { + "epoch": 3.31, + "learning_rate": 1.6920665903417728e-05, + "loss": 1.2826, + "step": 352500 + }, + { + "epoch": 3.31, + "eval_loss": 1.1223293542861938, + "eval_runtime": 220.5271, + "eval_samples_per_second": 453.459, + "eval_steps_per_second": 14.171, + "step": 352500 + }, + { + "epoch": 3.31, + "learning_rate": 1.687374486214598e-05, + "loss": 1.2968, + "step": 353000 + }, + { + "epoch": 3.31, + "eval_loss": 1.119235873222351, + "eval_runtime": 220.499, + "eval_samples_per_second": 453.517, + "eval_steps_per_second": 14.172, + "step": 353000 + }, + { + "epoch": 3.32, + "learning_rate": 1.682682382087423e-05, + "loss": 1.2715, + "step": 353500 + }, + { + "epoch": 3.32, + "eval_loss": 1.1275289058685303, + "eval_runtime": 220.5053, + "eval_samples_per_second": 453.504, + "eval_steps_per_second": 14.172, + "step": 353500 + }, + { + "epoch": 3.32, + "learning_rate": 1.6779902779602484e-05, + "loss": 1.2998, + "step": 354000 + }, + { + "epoch": 3.32, + "eval_loss": 1.1185322999954224, + "eval_runtime": 220.5197, + "eval_samples_per_second": 453.474, + "eval_steps_per_second": 14.171, + "step": 354000 + }, + { + "epoch": 3.33, + "learning_rate": 1.6732981738330738e-05, + "loss": 1.2875, + "step": 354500 + }, + { + "epoch": 3.33, + "eval_loss": 1.116925597190857, + "eval_runtime": 220.5237, + "eval_samples_per_second": 453.466, + "eval_steps_per_second": 14.171, + "step": 354500 + }, + { + "epoch": 3.33, + "learning_rate": 1.6686060697058988e-05, + "loss": 1.277, + "step": 355000 + }, + { + "epoch": 3.33, + "eval_loss": 1.116443157196045, + "eval_runtime": 220.4994, + "eval_samples_per_second": 453.516, + "eval_steps_per_second": 14.172, + "step": 355000 + }, + { + "epoch": 3.34, + "learning_rate": 1.663913965578724e-05, + "loss": 1.2764, + "step": 355500 + }, + { + "epoch": 3.34, + "eval_loss": 1.1129635572433472, + "eval_runtime": 220.5171, + "eval_samples_per_second": 453.48, + "eval_steps_per_second": 14.171, + "step": 355500 + }, + { + "epoch": 3.34, + "learning_rate": 1.6592218614515494e-05, + "loss": 1.2677, + "step": 356000 + }, + { + "epoch": 3.34, + "eval_loss": 1.1151468753814697, + "eval_runtime": 220.4847, + "eval_samples_per_second": 453.546, + "eval_steps_per_second": 14.173, + "step": 356000 + }, + { + "epoch": 3.35, + "learning_rate": 1.6545297573243744e-05, + "loss": 1.284, + "step": 356500 + }, + { + "epoch": 3.35, + "eval_loss": 1.1204290390014648, + "eval_runtime": 220.4567, + "eval_samples_per_second": 453.604, + "eval_steps_per_second": 14.175, + "step": 356500 + }, + { + "epoch": 3.35, + "learning_rate": 1.6498376531971997e-05, + "loss": 1.2776, + "step": 357000 + }, + { + "epoch": 3.35, + "eval_loss": 1.1218979358673096, + "eval_runtime": 220.4675, + "eval_samples_per_second": 453.582, + "eval_steps_per_second": 14.174, + "step": 357000 + }, + { + "epoch": 3.35, + "learning_rate": 1.645145549070025e-05, + "loss": 1.2877, + "step": 357500 + }, + { + "epoch": 3.35, + "eval_loss": 1.1146986484527588, + "eval_runtime": 220.4757, + "eval_samples_per_second": 453.565, + "eval_steps_per_second": 14.174, + "step": 357500 + }, + { + "epoch": 3.36, + "learning_rate": 1.64045344494285e-05, + "loss": 1.2732, + "step": 358000 + }, + { + "epoch": 3.36, + "eval_loss": 1.1204952001571655, + "eval_runtime": 220.4932, + "eval_samples_per_second": 453.529, + "eval_steps_per_second": 14.173, + "step": 358000 + }, + { + "epoch": 3.36, + "learning_rate": 1.6357613408156754e-05, + "loss": 1.2887, + "step": 358500 + }, + { + "epoch": 3.36, + "eval_loss": 1.113034963607788, + "eval_runtime": 220.5465, + "eval_samples_per_second": 453.419, + "eval_steps_per_second": 14.169, + "step": 358500 + }, + { + "epoch": 3.37, + "learning_rate": 1.6310692366885004e-05, + "loss": 1.273, + "step": 359000 + }, + { + "epoch": 3.37, + "eval_loss": 1.1138362884521484, + "eval_runtime": 220.5129, + "eval_samples_per_second": 453.488, + "eval_steps_per_second": 14.172, + "step": 359000 + }, + { + "epoch": 3.37, + "learning_rate": 1.6263771325613257e-05, + "loss": 1.2645, + "step": 359500 + }, + { + "epoch": 3.37, + "eval_loss": 1.1159985065460205, + "eval_runtime": 220.5661, + "eval_samples_per_second": 453.379, + "eval_steps_per_second": 14.168, + "step": 359500 + }, + { + "epoch": 3.38, + "learning_rate": 1.621685028434151e-05, + "loss": 1.2909, + "step": 360000 + }, + { + "epoch": 3.38, + "eval_loss": 1.1123580932617188, + "eval_runtime": 220.6926, + "eval_samples_per_second": 453.119, + "eval_steps_per_second": 14.16, + "step": 360000 + }, + { + "epoch": 3.38, + "learning_rate": 1.616992924306976e-05, + "loss": 1.2871, + "step": 360500 + }, + { + "epoch": 3.38, + "eval_loss": 1.112741231918335, + "eval_runtime": 220.5711, + "eval_samples_per_second": 453.369, + "eval_steps_per_second": 14.168, + "step": 360500 + }, + { + "epoch": 3.39, + "learning_rate": 1.6123008201798014e-05, + "loss": 1.2848, + "step": 361000 + }, + { + "epoch": 3.39, + "eval_loss": 1.1092541217803955, + "eval_runtime": 220.558, + "eval_samples_per_second": 453.396, + "eval_steps_per_second": 14.169, + "step": 361000 + }, + { + "epoch": 3.39, + "learning_rate": 1.6076087160526267e-05, + "loss": 1.2805, + "step": 361500 + }, + { + "epoch": 3.39, + "eval_loss": 1.1068326234817505, + "eval_runtime": 220.4445, + "eval_samples_per_second": 453.629, + "eval_steps_per_second": 14.176, + "step": 361500 + }, + { + "epoch": 3.4, + "learning_rate": 1.602916611925452e-05, + "loss": 1.2747, + "step": 362000 + }, + { + "epoch": 3.4, + "eval_loss": 1.108471393585205, + "eval_runtime": 220.3945, + "eval_samples_per_second": 453.732, + "eval_steps_per_second": 14.179, + "step": 362000 + }, + { + "epoch": 3.4, + "learning_rate": 1.598224507798277e-05, + "loss": 1.286, + "step": 362500 + }, + { + "epoch": 3.4, + "eval_loss": 1.116951584815979, + "eval_runtime": 220.4488, + "eval_samples_per_second": 453.62, + "eval_steps_per_second": 14.176, + "step": 362500 + }, + { + "epoch": 3.41, + "learning_rate": 1.5935324036711023e-05, + "loss": 1.2923, + "step": 363000 + }, + { + "epoch": 3.41, + "eval_loss": 1.106821060180664, + "eval_runtime": 220.4737, + "eval_samples_per_second": 453.569, + "eval_steps_per_second": 14.174, + "step": 363000 + }, + { + "epoch": 3.41, + "learning_rate": 1.5888402995439277e-05, + "loss": 1.2452, + "step": 363500 + }, + { + "epoch": 3.41, + "eval_loss": 1.1107667684555054, + "eval_runtime": 220.4687, + "eval_samples_per_second": 453.579, + "eval_steps_per_second": 14.174, + "step": 363500 + }, + { + "epoch": 3.42, + "learning_rate": 1.5841481954167526e-05, + "loss": 1.301, + "step": 364000 + }, + { + "epoch": 3.42, + "eval_loss": 1.103507161140442, + "eval_runtime": 220.5105, + "eval_samples_per_second": 453.493, + "eval_steps_per_second": 14.172, + "step": 364000 + }, + { + "epoch": 3.42, + "learning_rate": 1.579456091289578e-05, + "loss": 1.2933, + "step": 364500 + }, + { + "epoch": 3.42, + "eval_loss": 1.1143946647644043, + "eval_runtime": 220.552, + "eval_samples_per_second": 453.408, + "eval_steps_per_second": 14.169, + "step": 364500 + }, + { + "epoch": 3.43, + "learning_rate": 1.5747639871624033e-05, + "loss": 1.2676, + "step": 365000 + }, + { + "epoch": 3.43, + "eval_loss": 1.1107447147369385, + "eval_runtime": 220.5769, + "eval_samples_per_second": 453.357, + "eval_steps_per_second": 14.167, + "step": 365000 + }, + { + "epoch": 3.43, + "learning_rate": 1.5700718830352283e-05, + "loss": 1.2726, + "step": 365500 + }, + { + "epoch": 3.43, + "eval_loss": 1.109850525856018, + "eval_runtime": 220.6362, + "eval_samples_per_second": 453.235, + "eval_steps_per_second": 14.164, + "step": 365500 + }, + { + "epoch": 3.43, + "learning_rate": 1.5653797789080536e-05, + "loss": 1.2798, + "step": 366000 + }, + { + "epoch": 3.43, + "eval_loss": 1.1126877069473267, + "eval_runtime": 220.6282, + "eval_samples_per_second": 453.251, + "eval_steps_per_second": 14.164, + "step": 366000 + }, + { + "epoch": 3.44, + "learning_rate": 1.560687674780879e-05, + "loss": 1.297, + "step": 366500 + }, + { + "epoch": 3.44, + "eval_loss": 1.1066131591796875, + "eval_runtime": 220.6565, + "eval_samples_per_second": 453.193, + "eval_steps_per_second": 14.162, + "step": 366500 + }, + { + "epoch": 3.44, + "learning_rate": 1.555995570653704e-05, + "loss": 1.2886, + "step": 367000 + }, + { + "epoch": 3.44, + "eval_loss": 1.1080513000488281, + "eval_runtime": 220.6411, + "eval_samples_per_second": 453.225, + "eval_steps_per_second": 14.163, + "step": 367000 + }, + { + "epoch": 3.45, + "learning_rate": 1.5513034665265293e-05, + "loss": 1.294, + "step": 367500 + }, + { + "epoch": 3.45, + "eval_loss": 1.109612226486206, + "eval_runtime": 220.6484, + "eval_samples_per_second": 453.21, + "eval_steps_per_second": 14.163, + "step": 367500 + }, + { + "epoch": 3.45, + "learning_rate": 1.5466113623993546e-05, + "loss": 1.2738, + "step": 368000 + }, + { + "epoch": 3.45, + "eval_loss": 1.099606990814209, + "eval_runtime": 220.7045, + "eval_samples_per_second": 453.095, + "eval_steps_per_second": 14.159, + "step": 368000 + }, + { + "epoch": 3.46, + "learning_rate": 1.54191925827218e-05, + "loss": 1.2877, + "step": 368500 + }, + { + "epoch": 3.46, + "eval_loss": 1.106638789176941, + "eval_runtime": 220.7202, + "eval_samples_per_second": 453.062, + "eval_steps_per_second": 14.158, + "step": 368500 + }, + { + "epoch": 3.46, + "learning_rate": 1.537227154145005e-05, + "loss": 1.276, + "step": 369000 + }, + { + "epoch": 3.46, + "eval_loss": 1.1104212999343872, + "eval_runtime": 220.7131, + "eval_samples_per_second": 453.077, + "eval_steps_per_second": 14.159, + "step": 369000 + }, + { + "epoch": 3.47, + "learning_rate": 1.5325350500178302e-05, + "loss": 1.2741, + "step": 369500 + }, + { + "epoch": 3.47, + "eval_loss": 1.1078869104385376, + "eval_runtime": 220.7191, + "eval_samples_per_second": 453.065, + "eval_steps_per_second": 14.158, + "step": 369500 + }, + { + "epoch": 3.47, + "learning_rate": 1.5278429458906556e-05, + "loss": 1.2641, + "step": 370000 + }, + { + "epoch": 3.47, + "eval_loss": 1.1033143997192383, + "eval_runtime": 220.9613, + "eval_samples_per_second": 452.568, + "eval_steps_per_second": 14.143, + "step": 370000 + }, + { + "epoch": 3.48, + "learning_rate": 1.5231508417634804e-05, + "loss": 1.2774, + "step": 370500 + }, + { + "epoch": 3.48, + "eval_loss": 1.1071778535842896, + "eval_runtime": 220.9875, + "eval_samples_per_second": 452.514, + "eval_steps_per_second": 14.141, + "step": 370500 + }, + { + "epoch": 3.48, + "learning_rate": 1.5184587376363057e-05, + "loss": 1.2555, + "step": 371000 + }, + { + "epoch": 3.48, + "eval_loss": 1.1127955913543701, + "eval_runtime": 220.9818, + "eval_samples_per_second": 452.526, + "eval_steps_per_second": 14.141, + "step": 371000 + }, + { + "epoch": 3.49, + "learning_rate": 1.5137666335091307e-05, + "loss": 1.2704, + "step": 371500 + }, + { + "epoch": 3.49, + "eval_loss": 1.1075456142425537, + "eval_runtime": 220.945, + "eval_samples_per_second": 452.601, + "eval_steps_per_second": 14.144, + "step": 371500 + }, + { + "epoch": 3.49, + "learning_rate": 1.509074529381956e-05, + "loss": 1.273, + "step": 372000 + }, + { + "epoch": 3.49, + "eval_loss": 1.0989867448806763, + "eval_runtime": 220.9281, + "eval_samples_per_second": 452.636, + "eval_steps_per_second": 14.145, + "step": 372000 + }, + { + "epoch": 3.5, + "learning_rate": 1.5043824252547814e-05, + "loss": 1.2788, + "step": 372500 + }, + { + "epoch": 3.5, + "eval_loss": 1.107438087463379, + "eval_runtime": 220.898, + "eval_samples_per_second": 452.698, + "eval_steps_per_second": 14.147, + "step": 372500 + }, + { + "epoch": 3.5, + "learning_rate": 1.4996903211276064e-05, + "loss": 1.2729, + "step": 373000 + }, + { + "epoch": 3.5, + "eval_loss": 1.104393720626831, + "eval_runtime": 220.8922, + "eval_samples_per_second": 452.71, + "eval_steps_per_second": 14.147, + "step": 373000 + }, + { + "epoch": 3.51, + "learning_rate": 1.4949982170004317e-05, + "loss": 1.261, + "step": 373500 + }, + { + "epoch": 3.51, + "eval_loss": 1.1048295497894287, + "eval_runtime": 220.9264, + "eval_samples_per_second": 452.639, + "eval_steps_per_second": 14.145, + "step": 373500 + }, + { + "epoch": 3.51, + "learning_rate": 1.490306112873257e-05, + "loss": 1.2739, + "step": 374000 + }, + { + "epoch": 3.51, + "eval_loss": 1.1010278463363647, + "eval_runtime": 220.921, + "eval_samples_per_second": 452.651, + "eval_steps_per_second": 14.145, + "step": 374000 + }, + { + "epoch": 3.51, + "learning_rate": 1.4856140087460822e-05, + "loss": 1.288, + "step": 374500 + }, + { + "epoch": 3.51, + "eval_loss": 1.1028172969818115, + "eval_runtime": 220.9673, + "eval_samples_per_second": 452.556, + "eval_steps_per_second": 14.142, + "step": 374500 + }, + { + "epoch": 3.52, + "learning_rate": 1.4809219046189073e-05, + "loss": 1.2669, + "step": 375000 + }, + { + "epoch": 3.52, + "eval_loss": 1.1051478385925293, + "eval_runtime": 221.0246, + "eval_samples_per_second": 452.438, + "eval_steps_per_second": 14.139, + "step": 375000 + }, + { + "epoch": 3.52, + "learning_rate": 1.4762298004917327e-05, + "loss": 1.2715, + "step": 375500 + }, + { + "epoch": 3.52, + "eval_loss": 1.1000804901123047, + "eval_runtime": 221.0676, + "eval_samples_per_second": 452.35, + "eval_steps_per_second": 14.136, + "step": 375500 + }, + { + "epoch": 3.53, + "learning_rate": 1.4715376963645578e-05, + "loss": 1.2728, + "step": 376000 + }, + { + "epoch": 3.53, + "eval_loss": 1.1016603708267212, + "eval_runtime": 221.0626, + "eval_samples_per_second": 452.361, + "eval_steps_per_second": 14.136, + "step": 376000 + }, + { + "epoch": 3.53, + "learning_rate": 1.466845592237383e-05, + "loss": 1.2625, + "step": 376500 + }, + { + "epoch": 3.53, + "eval_loss": 1.0998036861419678, + "eval_runtime": 221.0903, + "eval_samples_per_second": 452.304, + "eval_steps_per_second": 14.134, + "step": 376500 + }, + { + "epoch": 3.54, + "learning_rate": 1.4621534881102081e-05, + "loss": 1.2652, + "step": 377000 + }, + { + "epoch": 3.54, + "eval_loss": 1.1029491424560547, + "eval_runtime": 221.0807, + "eval_samples_per_second": 452.324, + "eval_steps_per_second": 14.135, + "step": 377000 + }, + { + "epoch": 3.54, + "learning_rate": 1.4574613839830335e-05, + "loss": 1.2623, + "step": 377500 + }, + { + "epoch": 3.54, + "eval_loss": 1.103481411933899, + "eval_runtime": 221.1214, + "eval_samples_per_second": 452.24, + "eval_steps_per_second": 14.133, + "step": 377500 + }, + { + "epoch": 3.55, + "learning_rate": 1.4527692798558586e-05, + "loss": 1.2741, + "step": 378000 + }, + { + "epoch": 3.55, + "eval_loss": 1.099875569343567, + "eval_runtime": 221.1227, + "eval_samples_per_second": 452.238, + "eval_steps_per_second": 14.132, + "step": 378000 + }, + { + "epoch": 3.55, + "learning_rate": 1.4480771757286838e-05, + "loss": 1.2711, + "step": 378500 + }, + { + "epoch": 3.55, + "eval_loss": 1.1044048070907593, + "eval_runtime": 221.595, + "eval_samples_per_second": 451.274, + "eval_steps_per_second": 14.102, + "step": 378500 + }, + { + "epoch": 3.56, + "learning_rate": 1.4433850716015091e-05, + "loss": 1.2755, + "step": 379000 + }, + { + "epoch": 3.56, + "eval_loss": 1.099456548690796, + "eval_runtime": 221.7605, + "eval_samples_per_second": 450.937, + "eval_steps_per_second": 14.092, + "step": 379000 + }, + { + "epoch": 3.56, + "learning_rate": 1.4386929674743343e-05, + "loss": 1.277, + "step": 379500 + }, + { + "epoch": 3.56, + "eval_loss": 1.0986175537109375, + "eval_runtime": 221.6625, + "eval_samples_per_second": 451.136, + "eval_steps_per_second": 14.098, + "step": 379500 + }, + { + "epoch": 3.57, + "learning_rate": 1.4340008633471594e-05, + "loss": 1.2927, + "step": 380000 + }, + { + "epoch": 3.57, + "eval_loss": 1.0979562997817993, + "eval_runtime": 221.6264, + "eval_samples_per_second": 451.21, + "eval_steps_per_second": 14.1, + "step": 380000 + }, + { + "epoch": 3.57, + "learning_rate": 1.4293087592199848e-05, + "loss": 1.2719, + "step": 380500 + }, + { + "epoch": 3.57, + "eval_loss": 1.0883769989013672, + "eval_runtime": 221.2672, + "eval_samples_per_second": 451.942, + "eval_steps_per_second": 14.123, + "step": 380500 + }, + { + "epoch": 3.58, + "learning_rate": 1.4246166550928101e-05, + "loss": 1.2681, + "step": 381000 + }, + { + "epoch": 3.58, + "eval_loss": 1.0984450578689575, + "eval_runtime": 221.3116, + "eval_samples_per_second": 451.852, + "eval_steps_per_second": 14.12, + "step": 381000 + }, + { + "epoch": 3.58, + "learning_rate": 1.419924550965635e-05, + "loss": 1.2686, + "step": 381500 + }, + { + "epoch": 3.58, + "eval_loss": 1.0925695896148682, + "eval_runtime": 221.3252, + "eval_samples_per_second": 451.824, + "eval_steps_per_second": 14.119, + "step": 381500 + }, + { + "epoch": 3.58, + "learning_rate": 1.4152324468384604e-05, + "loss": 1.2669, + "step": 382000 + }, + { + "epoch": 3.58, + "eval_loss": 1.1006863117218018, + "eval_runtime": 221.3717, + "eval_samples_per_second": 451.729, + "eval_steps_per_second": 14.117, + "step": 382000 + }, + { + "epoch": 3.59, + "learning_rate": 1.4105403427112854e-05, + "loss": 1.2682, + "step": 382500 + }, + { + "epoch": 3.59, + "eval_loss": 1.0927406549453735, + "eval_runtime": 221.3896, + "eval_samples_per_second": 451.692, + "eval_steps_per_second": 14.115, + "step": 382500 + }, + { + "epoch": 3.59, + "learning_rate": 1.4058482385841107e-05, + "loss": 1.2713, + "step": 383000 + }, + { + "epoch": 3.59, + "eval_loss": 1.0991029739379883, + "eval_runtime": 221.3668, + "eval_samples_per_second": 451.739, + "eval_steps_per_second": 14.117, + "step": 383000 + }, + { + "epoch": 3.6, + "learning_rate": 1.401156134456936e-05, + "loss": 1.2405, + "step": 383500 + }, + { + "epoch": 3.6, + "eval_loss": 1.1023343801498413, + "eval_runtime": 221.3768, + "eval_samples_per_second": 451.718, + "eval_steps_per_second": 14.116, + "step": 383500 + }, + { + "epoch": 3.6, + "learning_rate": 1.396464030329761e-05, + "loss": 1.2564, + "step": 384000 + }, + { + "epoch": 3.6, + "eval_loss": 1.0970436334609985, + "eval_runtime": 221.4221, + "eval_samples_per_second": 451.626, + "eval_steps_per_second": 14.113, + "step": 384000 + }, + { + "epoch": 3.61, + "learning_rate": 1.3917719262025864e-05, + "loss": 1.2774, + "step": 384500 + }, + { + "epoch": 3.61, + "eval_loss": 1.0976711511611938, + "eval_runtime": 221.4388, + "eval_samples_per_second": 451.592, + "eval_steps_per_second": 14.112, + "step": 384500 + }, + { + "epoch": 3.61, + "learning_rate": 1.3870798220754117e-05, + "loss": 1.2732, + "step": 385000 + }, + { + "epoch": 3.61, + "eval_loss": 1.091884732246399, + "eval_runtime": 221.434, + "eval_samples_per_second": 451.602, + "eval_steps_per_second": 14.113, + "step": 385000 + }, + { + "epoch": 3.62, + "learning_rate": 1.3823877179482367e-05, + "loss": 1.2868, + "step": 385500 + }, + { + "epoch": 3.62, + "eval_loss": 1.0924078226089478, + "eval_runtime": 221.4826, + "eval_samples_per_second": 451.503, + "eval_steps_per_second": 14.109, + "step": 385500 + }, + { + "epoch": 3.62, + "learning_rate": 1.377695613821062e-05, + "loss": 1.2648, + "step": 386000 + }, + { + "epoch": 3.62, + "eval_loss": 1.0871402025222778, + "eval_runtime": 221.6939, + "eval_samples_per_second": 451.072, + "eval_steps_per_second": 14.096, + "step": 386000 + }, + { + "epoch": 3.63, + "learning_rate": 1.3730035096938873e-05, + "loss": 1.238, + "step": 386500 + }, + { + "epoch": 3.63, + "eval_loss": 1.0820244550704956, + "eval_runtime": 221.7091, + "eval_samples_per_second": 451.042, + "eval_steps_per_second": 14.095, + "step": 386500 + }, + { + "epoch": 3.63, + "learning_rate": 1.3683114055667123e-05, + "loss": 1.261, + "step": 387000 + }, + { + "epoch": 3.63, + "eval_loss": 1.0901474952697754, + "eval_runtime": 221.6939, + "eval_samples_per_second": 451.072, + "eval_steps_per_second": 14.096, + "step": 387000 + }, + { + "epoch": 3.64, + "learning_rate": 1.3636193014395377e-05, + "loss": 1.2589, + "step": 387500 + }, + { + "epoch": 3.64, + "eval_loss": 1.0906635522842407, + "eval_runtime": 221.6797, + "eval_samples_per_second": 451.101, + "eval_steps_per_second": 14.097, + "step": 387500 + }, + { + "epoch": 3.64, + "learning_rate": 1.358927197312363e-05, + "loss": 1.271, + "step": 388000 + }, + { + "epoch": 3.64, + "eval_loss": 1.0888662338256836, + "eval_runtime": 221.5473, + "eval_samples_per_second": 451.371, + "eval_steps_per_second": 14.105, + "step": 388000 + }, + { + "epoch": 3.65, + "learning_rate": 1.354235093185188e-05, + "loss": 1.261, + "step": 388500 + }, + { + "epoch": 3.65, + "eval_loss": 1.0955654382705688, + "eval_runtime": 221.4943, + "eval_samples_per_second": 451.479, + "eval_steps_per_second": 14.109, + "step": 388500 + }, + { + "epoch": 3.65, + "learning_rate": 1.3495429890580133e-05, + "loss": 1.2545, + "step": 389000 + }, + { + "epoch": 3.65, + "eval_loss": 1.096941590309143, + "eval_runtime": 221.5197, + "eval_samples_per_second": 451.427, + "eval_steps_per_second": 14.107, + "step": 389000 + }, + { + "epoch": 3.66, + "learning_rate": 1.3448508849308383e-05, + "loss": 1.2533, + "step": 389500 + }, + { + "epoch": 3.66, + "eval_loss": 1.0853080749511719, + "eval_runtime": 221.5678, + "eval_samples_per_second": 451.329, + "eval_steps_per_second": 14.104, + "step": 389500 + }, + { + "epoch": 3.66, + "learning_rate": 1.3401587808036636e-05, + "loss": 1.2683, + "step": 390000 + }, + { + "epoch": 3.66, + "eval_loss": 1.0878088474273682, + "eval_runtime": 221.5397, + "eval_samples_per_second": 451.386, + "eval_steps_per_second": 14.106, + "step": 390000 + }, + { + "epoch": 3.66, + "learning_rate": 1.335466676676489e-05, + "loss": 1.2485, + "step": 390500 + }, + { + "epoch": 3.66, + "eval_loss": 1.0916895866394043, + "eval_runtime": 221.4993, + "eval_samples_per_second": 451.469, + "eval_steps_per_second": 14.108, + "step": 390500 + }, + { + "epoch": 3.67, + "learning_rate": 1.330774572549314e-05, + "loss": 1.2733, + "step": 391000 + }, + { + "epoch": 3.67, + "eval_loss": 1.085897445678711, + "eval_runtime": 221.5237, + "eval_samples_per_second": 451.419, + "eval_steps_per_second": 14.107, + "step": 391000 + }, + { + "epoch": 3.67, + "learning_rate": 1.3260824684221393e-05, + "loss": 1.269, + "step": 391500 + }, + { + "epoch": 3.67, + "eval_loss": 1.0844569206237793, + "eval_runtime": 221.5194, + "eval_samples_per_second": 451.428, + "eval_steps_per_second": 14.107, + "step": 391500 + }, + { + "epoch": 3.68, + "learning_rate": 1.3213903642949646e-05, + "loss": 1.2515, + "step": 392000 + }, + { + "epoch": 3.68, + "eval_loss": 1.0949499607086182, + "eval_runtime": 221.5302, + "eval_samples_per_second": 451.406, + "eval_steps_per_second": 14.106, + "step": 392000 + }, + { + "epoch": 3.68, + "learning_rate": 1.3166982601677896e-05, + "loss": 1.2674, + "step": 392500 + }, + { + "epoch": 3.68, + "eval_loss": 1.0817222595214844, + "eval_runtime": 221.5421, + "eval_samples_per_second": 451.382, + "eval_steps_per_second": 14.106, + "step": 392500 + }, + { + "epoch": 3.69, + "learning_rate": 1.312006156040615e-05, + "loss": 1.2747, + "step": 393000 + }, + { + "epoch": 3.69, + "eval_loss": 1.0874230861663818, + "eval_runtime": 221.4987, + "eval_samples_per_second": 451.47, + "eval_steps_per_second": 14.108, + "step": 393000 + }, + { + "epoch": 3.69, + "learning_rate": 1.3073140519134402e-05, + "loss": 1.2679, + "step": 393500 + }, + { + "epoch": 3.69, + "eval_loss": 1.0829836130142212, + "eval_runtime": 221.764, + "eval_samples_per_second": 450.93, + "eval_steps_per_second": 14.092, + "step": 393500 + }, + { + "epoch": 3.7, + "learning_rate": 1.3026219477862652e-05, + "loss": 1.2376, + "step": 394000 + }, + { + "epoch": 3.7, + "eval_loss": 1.0813868045806885, + "eval_runtime": 221.788, + "eval_samples_per_second": 450.881, + "eval_steps_per_second": 14.09, + "step": 394000 + }, + { + "epoch": 3.7, + "learning_rate": 1.2979298436590906e-05, + "loss": 1.2527, + "step": 394500 + }, + { + "epoch": 3.7, + "eval_loss": 1.081449270248413, + "eval_runtime": 221.7504, + "eval_samples_per_second": 450.957, + "eval_steps_per_second": 14.092, + "step": 394500 + }, + { + "epoch": 3.71, + "learning_rate": 1.2932377395319156e-05, + "loss": 1.2559, + "step": 395000 + }, + { + "epoch": 3.71, + "eval_loss": 1.0823462009429932, + "eval_runtime": 221.6871, + "eval_samples_per_second": 451.086, + "eval_steps_per_second": 14.096, + "step": 395000 + }, + { + "epoch": 3.71, + "learning_rate": 1.2885456354047409e-05, + "loss": 1.2265, + "step": 395500 + }, + { + "epoch": 3.71, + "eval_loss": 1.085395097732544, + "eval_runtime": 221.4734, + "eval_samples_per_second": 451.522, + "eval_steps_per_second": 14.11, + "step": 395500 + }, + { + "epoch": 3.72, + "learning_rate": 1.2838535312775662e-05, + "loss": 1.2799, + "step": 396000 + }, + { + "epoch": 3.72, + "eval_loss": 1.0858019590377808, + "eval_runtime": 221.5008, + "eval_samples_per_second": 451.466, + "eval_steps_per_second": 14.108, + "step": 396000 + }, + { + "epoch": 3.72, + "learning_rate": 1.2791614271503912e-05, + "loss": 1.2449, + "step": 396500 + }, + { + "epoch": 3.72, + "eval_loss": 1.0877788066864014, + "eval_runtime": 221.5121, + "eval_samples_per_second": 451.443, + "eval_steps_per_second": 14.108, + "step": 396500 + }, + { + "epoch": 3.73, + "learning_rate": 1.2744693230232165e-05, + "loss": 1.2451, + "step": 397000 + }, + { + "epoch": 3.73, + "eval_loss": 1.0856256484985352, + "eval_runtime": 221.5418, + "eval_samples_per_second": 451.382, + "eval_steps_per_second": 14.106, + "step": 397000 + }, + { + "epoch": 3.73, + "learning_rate": 1.2697772188960419e-05, + "loss": 1.2292, + "step": 397500 + }, + { + "epoch": 3.73, + "eval_loss": 1.080007791519165, + "eval_runtime": 221.5442, + "eval_samples_per_second": 451.377, + "eval_steps_per_second": 14.106, + "step": 397500 + }, + { + "epoch": 3.73, + "learning_rate": 1.2650851147688668e-05, + "loss": 1.2452, + "step": 398000 + }, + { + "epoch": 3.73, + "eval_loss": 1.0801745653152466, + "eval_runtime": 221.5032, + "eval_samples_per_second": 451.461, + "eval_steps_per_second": 14.108, + "step": 398000 + }, + { + "epoch": 3.74, + "learning_rate": 1.2603930106416922e-05, + "loss": 1.2385, + "step": 398500 + }, + { + "epoch": 3.74, + "eval_loss": 1.079880714416504, + "eval_runtime": 221.5406, + "eval_samples_per_second": 451.385, + "eval_steps_per_second": 14.106, + "step": 398500 + }, + { + "epoch": 3.74, + "learning_rate": 1.2557009065145175e-05, + "loss": 1.2576, + "step": 399000 + }, + { + "epoch": 3.74, + "eval_loss": 1.0792741775512695, + "eval_runtime": 221.5278, + "eval_samples_per_second": 451.411, + "eval_steps_per_second": 14.107, + "step": 399000 + }, + { + "epoch": 3.75, + "learning_rate": 1.2510088023873427e-05, + "loss": 1.2567, + "step": 399500 + }, + { + "epoch": 3.75, + "eval_loss": 1.0734611749649048, + "eval_runtime": 221.555, + "eval_samples_per_second": 451.355, + "eval_steps_per_second": 14.105, + "step": 399500 + }, + { + "epoch": 3.75, + "learning_rate": 1.2463166982601678e-05, + "loss": 1.2384, + "step": 400000 + }, + { + "epoch": 3.75, + "eval_loss": 1.0803865194320679, + "eval_runtime": 221.5706, + "eval_samples_per_second": 451.324, + "eval_steps_per_second": 14.104, + "step": 400000 + }, + { + "epoch": 3.76, + "learning_rate": 1.241624594132993e-05, + "loss": 1.2782, + "step": 400500 + }, + { + "epoch": 3.76, + "eval_loss": 1.085302472114563, + "eval_runtime": 221.6737, + "eval_samples_per_second": 451.113, + "eval_steps_per_second": 14.097, + "step": 400500 + }, + { + "epoch": 3.76, + "learning_rate": 1.2369324900058183e-05, + "loss": 1.2418, + "step": 401000 + }, + { + "epoch": 3.76, + "eval_loss": 1.0813859701156616, + "eval_runtime": 221.7597, + "eval_samples_per_second": 450.938, + "eval_steps_per_second": 14.092, + "step": 401000 + }, + { + "epoch": 3.77, + "learning_rate": 1.2322403858786435e-05, + "loss": 1.2467, + "step": 401500 + }, + { + "epoch": 3.77, + "eval_loss": 1.0765790939331055, + "eval_runtime": 221.6965, + "eval_samples_per_second": 451.067, + "eval_steps_per_second": 14.096, + "step": 401500 + }, + { + "epoch": 3.77, + "learning_rate": 1.2275482817514688e-05, + "loss": 1.2557, + "step": 402000 + }, + { + "epoch": 3.77, + "eval_loss": 1.074906587600708, + "eval_runtime": 221.6682, + "eval_samples_per_second": 451.125, + "eval_steps_per_second": 14.098, + "step": 402000 + }, + { + "epoch": 3.78, + "learning_rate": 1.222856177624294e-05, + "loss": 1.2417, + "step": 402500 + }, + { + "epoch": 3.78, + "eval_loss": 1.0793219804763794, + "eval_runtime": 221.5416, + "eval_samples_per_second": 451.383, + "eval_steps_per_second": 14.106, + "step": 402500 + }, + { + "epoch": 3.78, + "learning_rate": 1.2181640734971191e-05, + "loss": 1.2544, + "step": 403000 + }, + { + "epoch": 3.78, + "eval_loss": 1.0767314434051514, + "eval_runtime": 221.4973, + "eval_samples_per_second": 451.473, + "eval_steps_per_second": 14.109, + "step": 403000 + }, + { + "epoch": 3.79, + "learning_rate": 1.2134719693699444e-05, + "loss": 1.2453, + "step": 403500 + }, + { + "epoch": 3.79, + "eval_loss": 1.0776728391647339, + "eval_runtime": 221.5112, + "eval_samples_per_second": 451.444, + "eval_steps_per_second": 14.108, + "step": 403500 + }, + { + "epoch": 3.79, + "learning_rate": 1.2087798652427696e-05, + "loss": 1.2508, + "step": 404000 + }, + { + "epoch": 3.79, + "eval_loss": 1.0749331712722778, + "eval_runtime": 221.5304, + "eval_samples_per_second": 451.405, + "eval_steps_per_second": 14.106, + "step": 404000 + }, + { + "epoch": 3.8, + "learning_rate": 1.2040877611155948e-05, + "loss": 1.2388, + "step": 404500 + }, + { + "epoch": 3.8, + "eval_loss": 1.079124093055725, + "eval_runtime": 221.5211, + "eval_samples_per_second": 451.424, + "eval_steps_per_second": 14.107, + "step": 404500 + }, + { + "epoch": 3.8, + "learning_rate": 1.19939565698842e-05, + "loss": 1.2614, + "step": 405000 + }, + { + "epoch": 3.8, + "eval_loss": 1.0767608880996704, + "eval_runtime": 221.5647, + "eval_samples_per_second": 451.335, + "eval_steps_per_second": 14.104, + "step": 405000 + }, + { + "epoch": 3.81, + "learning_rate": 1.1947035528612453e-05, + "loss": 1.2355, + "step": 405500 + }, + { + "epoch": 3.81, + "eval_loss": 1.0779807567596436, + "eval_runtime": 221.5326, + "eval_samples_per_second": 451.401, + "eval_steps_per_second": 14.106, + "step": 405500 + }, + { + "epoch": 3.81, + "learning_rate": 1.1900114487340704e-05, + "loss": 1.2481, + "step": 406000 + }, + { + "epoch": 3.81, + "eval_loss": 1.076469898223877, + "eval_runtime": 221.5584, + "eval_samples_per_second": 451.348, + "eval_steps_per_second": 14.105, + "step": 406000 + }, + { + "epoch": 3.81, + "learning_rate": 1.1853193446068956e-05, + "loss": 1.2382, + "step": 406500 + }, + { + "epoch": 3.81, + "eval_loss": 1.0811249017715454, + "eval_runtime": 221.5716, + "eval_samples_per_second": 451.321, + "eval_steps_per_second": 14.104, + "step": 406500 + }, + { + "epoch": 3.82, + "learning_rate": 1.1806272404797209e-05, + "loss": 1.2331, + "step": 407000 + }, + { + "epoch": 3.82, + "eval_loss": 1.0766884088516235, + "eval_runtime": 221.5762, + "eval_samples_per_second": 451.312, + "eval_steps_per_second": 14.104, + "step": 407000 + }, + { + "epoch": 3.82, + "learning_rate": 1.175935136352546e-05, + "loss": 1.235, + "step": 407500 + }, + { + "epoch": 3.82, + "eval_loss": 1.0687439441680908, + "eval_runtime": 221.5531, + "eval_samples_per_second": 451.359, + "eval_steps_per_second": 14.105, + "step": 407500 + }, + { + "epoch": 3.83, + "learning_rate": 1.1712430322253712e-05, + "loss": 1.2506, + "step": 408000 + }, + { + "epoch": 3.83, + "eval_loss": 1.0767359733581543, + "eval_runtime": 221.7729, + "eval_samples_per_second": 450.912, + "eval_steps_per_second": 14.091, + "step": 408000 + }, + { + "epoch": 3.83, + "learning_rate": 1.1665509280981964e-05, + "loss": 1.2455, + "step": 408500 + }, + { + "epoch": 3.83, + "eval_loss": 1.0734050273895264, + "eval_runtime": 221.7206, + "eval_samples_per_second": 451.018, + "eval_steps_per_second": 14.094, + "step": 408500 + }, + { + "epoch": 3.84, + "learning_rate": 1.1618588239710217e-05, + "loss": 1.235, + "step": 409000 + }, + { + "epoch": 3.84, + "eval_loss": 1.076144814491272, + "eval_runtime": 221.6417, + "eval_samples_per_second": 451.179, + "eval_steps_per_second": 14.099, + "step": 409000 + }, + { + "epoch": 3.84, + "learning_rate": 1.1571667198438469e-05, + "loss": 1.227, + "step": 409500 + }, + { + "epoch": 3.84, + "eval_loss": 1.072109341621399, + "eval_runtime": 221.5766, + "eval_samples_per_second": 451.311, + "eval_steps_per_second": 14.103, + "step": 409500 + }, + { + "epoch": 3.85, + "learning_rate": 1.152474615716672e-05, + "loss": 1.2566, + "step": 410000 + }, + { + "epoch": 3.85, + "eval_loss": 1.0735670328140259, + "eval_runtime": 221.4239, + "eval_samples_per_second": 451.623, + "eval_steps_per_second": 14.113, + "step": 410000 + }, + { + "epoch": 3.85, + "learning_rate": 1.1477825115894974e-05, + "loss": 1.2523, + "step": 410500 + }, + { + "epoch": 3.85, + "eval_loss": 1.0737024545669556, + "eval_runtime": 221.4176, + "eval_samples_per_second": 451.635, + "eval_steps_per_second": 14.114, + "step": 410500 + }, + { + "epoch": 3.86, + "learning_rate": 1.1430904074623225e-05, + "loss": 1.239, + "step": 411000 + }, + { + "epoch": 3.86, + "eval_loss": 1.075139045715332, + "eval_runtime": 221.4292, + "eval_samples_per_second": 451.612, + "eval_steps_per_second": 14.113, + "step": 411000 + }, + { + "epoch": 3.86, + "learning_rate": 1.1383983033351477e-05, + "loss": 1.2632, + "step": 411500 + }, + { + "epoch": 3.86, + "eval_loss": 1.0686564445495605, + "eval_runtime": 221.4503, + "eval_samples_per_second": 451.569, + "eval_steps_per_second": 14.112, + "step": 411500 + }, + { + "epoch": 3.87, + "learning_rate": 1.1337061992079728e-05, + "loss": 1.2299, + "step": 412000 + }, + { + "epoch": 3.87, + "eval_loss": 1.0703197717666626, + "eval_runtime": 221.4314, + "eval_samples_per_second": 451.607, + "eval_steps_per_second": 14.113, + "step": 412000 + }, + { + "epoch": 3.87, + "learning_rate": 1.1290140950807982e-05, + "loss": 1.2451, + "step": 412500 + }, + { + "epoch": 3.87, + "eval_loss": 1.0710246562957764, + "eval_runtime": 221.4462, + "eval_samples_per_second": 451.577, + "eval_steps_per_second": 14.112, + "step": 412500 + }, + { + "epoch": 3.88, + "learning_rate": 1.1243219909536233e-05, + "loss": 1.2372, + "step": 413000 + }, + { + "epoch": 3.88, + "eval_loss": 1.0705777406692505, + "eval_runtime": 221.433, + "eval_samples_per_second": 451.604, + "eval_steps_per_second": 14.113, + "step": 413000 + }, + { + "epoch": 3.88, + "learning_rate": 1.1196298868264485e-05, + "loss": 1.2282, + "step": 413500 + }, + { + "epoch": 3.88, + "eval_loss": 1.0712275505065918, + "eval_runtime": 221.4443, + "eval_samples_per_second": 451.581, + "eval_steps_per_second": 14.112, + "step": 413500 + }, + { + "epoch": 3.89, + "learning_rate": 1.1149377826992736e-05, + "loss": 1.2334, + "step": 414000 + }, + { + "epoch": 3.89, + "eval_loss": 1.072120189666748, + "eval_runtime": 221.422, + "eval_samples_per_second": 451.626, + "eval_steps_per_second": 14.113, + "step": 414000 + }, + { + "epoch": 3.89, + "learning_rate": 1.110245678572099e-05, + "loss": 1.23, + "step": 414500 + }, + { + "epoch": 3.89, + "eval_loss": 1.0694152116775513, + "eval_runtime": 221.4396, + "eval_samples_per_second": 451.59, + "eval_steps_per_second": 14.112, + "step": 414500 + }, + { + "epoch": 3.89, + "learning_rate": 1.1055535744449241e-05, + "loss": 1.2466, + "step": 415000 + }, + { + "epoch": 3.89, + "eval_loss": 1.0689135789871216, + "eval_runtime": 221.4928, + "eval_samples_per_second": 451.482, + "eval_steps_per_second": 14.109, + "step": 415000 + }, + { + "epoch": 3.9, + "learning_rate": 1.1008614703177493e-05, + "loss": 1.2166, + "step": 415500 + }, + { + "epoch": 3.9, + "eval_loss": 1.0654420852661133, + "eval_runtime": 221.667, + "eval_samples_per_second": 451.127, + "eval_steps_per_second": 14.098, + "step": 415500 + }, + { + "epoch": 3.9, + "learning_rate": 1.0961693661905746e-05, + "loss": 1.242, + "step": 416000 + }, + { + "epoch": 3.9, + "eval_loss": 1.060294270515442, + "eval_runtime": 221.5802, + "eval_samples_per_second": 451.304, + "eval_steps_per_second": 14.103, + "step": 416000 + }, + { + "epoch": 3.91, + "learning_rate": 1.0914772620633998e-05, + "loss": 1.2328, + "step": 416500 + }, + { + "epoch": 3.91, + "eval_loss": 1.0750603675842285, + "eval_runtime": 221.529, + "eval_samples_per_second": 451.408, + "eval_steps_per_second": 14.107, + "step": 416500 + }, + { + "epoch": 3.91, + "learning_rate": 1.086785157936225e-05, + "loss": 1.2305, + "step": 417000 + }, + { + "epoch": 3.91, + "eval_loss": 1.0709779262542725, + "eval_runtime": 221.4321, + "eval_samples_per_second": 451.606, + "eval_steps_per_second": 14.113, + "step": 417000 + }, + { + "epoch": 3.92, + "learning_rate": 1.0820930538090501e-05, + "loss": 1.2583, + "step": 417500 + }, + { + "epoch": 3.92, + "eval_loss": 1.069564700126648, + "eval_runtime": 221.3517, + "eval_samples_per_second": 451.77, + "eval_steps_per_second": 14.118, + "step": 417500 + }, + { + "epoch": 3.92, + "learning_rate": 1.0774009496818754e-05, + "loss": 1.2281, + "step": 418000 + }, + { + "epoch": 3.92, + "eval_loss": 1.068252444267273, + "eval_runtime": 221.3357, + "eval_samples_per_second": 451.802, + "eval_steps_per_second": 14.119, + "step": 418000 + }, + { + "epoch": 3.93, + "learning_rate": 1.0727088455547006e-05, + "loss": 1.2311, + "step": 418500 + }, + { + "epoch": 3.93, + "eval_loss": 1.0643572807312012, + "eval_runtime": 221.2957, + "eval_samples_per_second": 451.884, + "eval_steps_per_second": 14.121, + "step": 418500 + }, + { + "epoch": 3.93, + "learning_rate": 1.0680167414275257e-05, + "loss": 1.2295, + "step": 419000 + }, + { + "epoch": 3.93, + "eval_loss": 1.0733916759490967, + "eval_runtime": 221.3151, + "eval_samples_per_second": 451.844, + "eval_steps_per_second": 14.12, + "step": 419000 + }, + { + "epoch": 3.94, + "learning_rate": 1.063324637300351e-05, + "loss": 1.2443, + "step": 419500 + }, + { + "epoch": 3.94, + "eval_loss": 1.068743109703064, + "eval_runtime": 221.3228, + "eval_samples_per_second": 451.829, + "eval_steps_per_second": 14.12, + "step": 419500 + }, + { + "epoch": 3.94, + "learning_rate": 1.0586325331731762e-05, + "loss": 1.2457, + "step": 420000 + }, + { + "epoch": 3.94, + "eval_loss": 1.0654057264328003, + "eval_runtime": 221.3124, + "eval_samples_per_second": 451.85, + "eval_steps_per_second": 14.12, + "step": 420000 + }, + { + "epoch": 3.95, + "learning_rate": 1.0539404290460014e-05, + "loss": 1.2431, + "step": 420500 + }, + { + "epoch": 3.95, + "eval_loss": 1.0636610984802246, + "eval_runtime": 221.147, + "eval_samples_per_second": 452.188, + "eval_steps_per_second": 14.131, + "step": 420500 + }, + { + "epoch": 3.95, + "learning_rate": 1.0492483249188265e-05, + "loss": 1.2248, + "step": 421000 + }, + { + "epoch": 3.95, + "eval_loss": 1.0696020126342773, + "eval_runtime": 221.1217, + "eval_samples_per_second": 452.24, + "eval_steps_per_second": 14.132, + "step": 421000 + }, + { + "epoch": 3.96, + "learning_rate": 1.0445562207916519e-05, + "loss": 1.2361, + "step": 421500 + }, + { + "epoch": 3.96, + "eval_loss": 1.0683060884475708, + "eval_runtime": 221.1561, + "eval_samples_per_second": 452.169, + "eval_steps_per_second": 14.13, + "step": 421500 + }, + { + "epoch": 3.96, + "learning_rate": 1.039864116664477e-05, + "loss": 1.2502, + "step": 422000 + }, + { + "epoch": 3.96, + "eval_loss": 1.0591408014297485, + "eval_runtime": 221.1321, + "eval_samples_per_second": 452.218, + "eval_steps_per_second": 14.132, + "step": 422000 + }, + { + "epoch": 3.96, + "learning_rate": 1.0351720125373022e-05, + "loss": 1.2188, + "step": 422500 + }, + { + "epoch": 3.96, + "eval_loss": 1.0668104887008667, + "eval_runtime": 221.1228, + "eval_samples_per_second": 452.237, + "eval_steps_per_second": 14.132, + "step": 422500 + }, + { + "epoch": 3.97, + "learning_rate": 1.0304799084101273e-05, + "loss": 1.2439, + "step": 423000 + }, + { + "epoch": 3.97, + "eval_loss": 1.06576406955719, + "eval_runtime": 221.2975, + "eval_samples_per_second": 451.88, + "eval_steps_per_second": 14.121, + "step": 423000 + }, + { + "epoch": 3.97, + "learning_rate": 1.0257878042829527e-05, + "loss": 1.242, + "step": 423500 + }, + { + "epoch": 3.97, + "eval_loss": 1.0613994598388672, + "eval_runtime": 221.2789, + "eval_samples_per_second": 451.918, + "eval_steps_per_second": 14.122, + "step": 423500 + }, + { + "epoch": 3.98, + "learning_rate": 1.0210957001557778e-05, + "loss": 1.2245, + "step": 424000 + }, + { + "epoch": 3.98, + "eval_loss": 1.0643956661224365, + "eval_runtime": 221.2185, + "eval_samples_per_second": 452.042, + "eval_steps_per_second": 14.126, + "step": 424000 + }, + { + "epoch": 3.98, + "learning_rate": 1.0164035960286032e-05, + "loss": 1.2288, + "step": 424500 + }, + { + "epoch": 3.98, + "eval_loss": 1.0604496002197266, + "eval_runtime": 221.1793, + "eval_samples_per_second": 452.122, + "eval_steps_per_second": 14.129, + "step": 424500 + }, + { + "epoch": 3.99, + "learning_rate": 1.0117114919014283e-05, + "loss": 1.2308, + "step": 425000 + }, + { + "epoch": 3.99, + "eval_loss": 1.0600295066833496, + "eval_runtime": 221.044, + "eval_samples_per_second": 452.399, + "eval_steps_per_second": 14.137, + "step": 425000 + }, + { + "epoch": 3.99, + "learning_rate": 1.0070193877742535e-05, + "loss": 1.2294, + "step": 425500 + }, + { + "epoch": 3.99, + "eval_loss": 1.066137433052063, + "eval_runtime": 220.9979, + "eval_samples_per_second": 452.493, + "eval_steps_per_second": 14.14, + "step": 425500 + }, + { + "epoch": 4.0, + "learning_rate": 1.0023272836470788e-05, + "loss": 1.2377, + "step": 426000 + }, + { + "epoch": 4.0, + "eval_loss": 1.0587615966796875, + "eval_runtime": 220.9922, + "eval_samples_per_second": 452.505, + "eval_steps_per_second": 14.141, + "step": 426000 + }, + { + "epoch": 4.0, + "learning_rate": 9.97635179519904e-06, + "loss": 1.219, + "step": 426500 + }, + { + "epoch": 4.0, + "eval_loss": 1.0649616718292236, + "eval_runtime": 221.0888, + "eval_samples_per_second": 452.307, + "eval_steps_per_second": 14.135, + "step": 426500 + }, + { + "epoch": 4.01, + "learning_rate": 9.929430753927293e-06, + "loss": 1.2135, + "step": 427000 + }, + { + "epoch": 4.01, + "eval_loss": 1.0579185485839844, + "eval_runtime": 221.0924, + "eval_samples_per_second": 452.3, + "eval_steps_per_second": 14.134, + "step": 427000 + }, + { + "epoch": 4.01, + "learning_rate": 9.882509712655545e-06, + "loss": 1.2194, + "step": 427500 + }, + { + "epoch": 4.01, + "eval_loss": 1.0547699928283691, + "eval_runtime": 221.1208, + "eval_samples_per_second": 452.241, + "eval_steps_per_second": 14.133, + "step": 427500 + }, + { + "epoch": 4.02, + "learning_rate": 9.835588671383796e-06, + "loss": 1.2386, + "step": 428000 + }, + { + "epoch": 4.02, + "eval_loss": 1.0614423751831055, + "eval_runtime": 221.0918, + "eval_samples_per_second": 452.301, + "eval_steps_per_second": 14.134, + "step": 428000 + }, + { + "epoch": 4.02, + "learning_rate": 9.788667630112048e-06, + "loss": 1.2259, + "step": 428500 + }, + { + "epoch": 4.02, + "eval_loss": 1.0620990991592407, + "eval_runtime": 221.0818, + "eval_samples_per_second": 452.321, + "eval_steps_per_second": 14.135, + "step": 428500 + }, + { + "epoch": 4.03, + "learning_rate": 9.741746588840301e-06, + "loss": 1.2263, + "step": 429000 + }, + { + "epoch": 4.03, + "eval_loss": 1.0643956661224365, + "eval_runtime": 221.0828, + "eval_samples_per_second": 452.319, + "eval_steps_per_second": 14.135, + "step": 429000 + }, + { + "epoch": 4.03, + "learning_rate": 9.694825547568553e-06, + "loss": 1.2305, + "step": 429500 + }, + { + "epoch": 4.03, + "eval_loss": 1.0544147491455078, + "eval_runtime": 221.1024, + "eval_samples_per_second": 452.279, + "eval_steps_per_second": 14.134, + "step": 429500 + }, + { + "epoch": 4.04, + "learning_rate": 9.647904506296804e-06, + "loss": 1.2101, + "step": 430000 + }, + { + "epoch": 4.04, + "eval_loss": 1.0544720888137817, + "eval_runtime": 221.1133, + "eval_samples_per_second": 452.257, + "eval_steps_per_second": 14.133, + "step": 430000 + }, + { + "epoch": 4.04, + "learning_rate": 9.600983465025057e-06, + "loss": 1.2334, + "step": 430500 + }, + { + "epoch": 4.04, + "eval_loss": 1.0552457571029663, + "eval_runtime": 220.9889, + "eval_samples_per_second": 452.511, + "eval_steps_per_second": 14.141, + "step": 430500 + }, + { + "epoch": 4.04, + "learning_rate": 9.554062423753309e-06, + "loss": 1.2204, + "step": 431000 + }, + { + "epoch": 4.04, + "eval_loss": 1.0604522228240967, + "eval_runtime": 221.1288, + "eval_samples_per_second": 452.225, + "eval_steps_per_second": 14.132, + "step": 431000 + }, + { + "epoch": 4.05, + "learning_rate": 9.50714138248156e-06, + "loss": 1.2375, + "step": 431500 + }, + { + "epoch": 4.05, + "eval_loss": 1.0569729804992676, + "eval_runtime": 221.1398, + "eval_samples_per_second": 452.203, + "eval_steps_per_second": 14.131, + "step": 431500 + }, + { + "epoch": 4.05, + "learning_rate": 9.460220341209812e-06, + "loss": 1.2491, + "step": 432000 + }, + { + "epoch": 4.05, + "eval_loss": 1.052681803703308, + "eval_runtime": 221.0809, + "eval_samples_per_second": 452.323, + "eval_steps_per_second": 14.135, + "step": 432000 + }, + { + "epoch": 4.06, + "learning_rate": 9.413299299938066e-06, + "loss": 1.2265, + "step": 432500 + }, + { + "epoch": 4.06, + "eval_loss": 1.057190179824829, + "eval_runtime": 221.0473, + "eval_samples_per_second": 452.392, + "eval_steps_per_second": 14.137, + "step": 432500 + }, + { + "epoch": 4.06, + "learning_rate": 9.366378258666317e-06, + "loss": 1.2213, + "step": 433000 + }, + { + "epoch": 4.06, + "eval_loss": 1.0613055229187012, + "eval_runtime": 220.9425, + "eval_samples_per_second": 452.606, + "eval_steps_per_second": 14.144, + "step": 433000 + }, + { + "epoch": 4.07, + "learning_rate": 9.319457217394569e-06, + "loss": 1.2242, + "step": 433500 + }, + { + "epoch": 4.07, + "eval_loss": 1.054632306098938, + "eval_runtime": 220.92, + "eval_samples_per_second": 452.652, + "eval_steps_per_second": 14.145, + "step": 433500 + }, + { + "epoch": 4.07, + "learning_rate": 9.272536176122822e-06, + "loss": 1.2335, + "step": 434000 + }, + { + "epoch": 4.07, + "eval_loss": 1.0627985000610352, + "eval_runtime": 220.9221, + "eval_samples_per_second": 452.648, + "eval_steps_per_second": 14.145, + "step": 434000 + }, + { + "epoch": 4.08, + "learning_rate": 9.225615134851074e-06, + "loss": 1.2306, + "step": 434500 + }, + { + "epoch": 4.08, + "eval_loss": 1.058109164237976, + "eval_runtime": 220.938, + "eval_samples_per_second": 452.616, + "eval_steps_per_second": 14.144, + "step": 434500 + }, + { + "epoch": 4.08, + "learning_rate": 9.178694093579325e-06, + "loss": 1.2317, + "step": 435000 + }, + { + "epoch": 4.08, + "eval_loss": 1.060577392578125, + "eval_runtime": 220.957, + "eval_samples_per_second": 452.577, + "eval_steps_per_second": 14.143, + "step": 435000 + }, + { + "epoch": 4.09, + "learning_rate": 9.131773052307577e-06, + "loss": 1.2207, + "step": 435500 + }, + { + "epoch": 4.09, + "eval_loss": 1.0583726167678833, + "eval_runtime": 220.9147, + "eval_samples_per_second": 452.663, + "eval_steps_per_second": 14.146, + "step": 435500 + }, + { + "epoch": 4.09, + "learning_rate": 9.08485201103583e-06, + "loss": 1.2118, + "step": 436000 + }, + { + "epoch": 4.09, + "eval_loss": 1.0572487115859985, + "eval_runtime": 220.9401, + "eval_samples_per_second": 452.611, + "eval_steps_per_second": 14.144, + "step": 436000 + }, + { + "epoch": 4.1, + "learning_rate": 9.037930969764082e-06, + "loss": 1.2033, + "step": 436500 + }, + { + "epoch": 4.1, + "eval_loss": 1.0609925985336304, + "eval_runtime": 220.9828, + "eval_samples_per_second": 452.524, + "eval_steps_per_second": 14.141, + "step": 436500 + }, + { + "epoch": 4.1, + "learning_rate": 8.991009928492333e-06, + "loss": 1.2224, + "step": 437000 + }, + { + "epoch": 4.1, + "eval_loss": 1.0533952713012695, + "eval_runtime": 220.933, + "eval_samples_per_second": 452.626, + "eval_steps_per_second": 14.145, + "step": 437000 + }, + { + "epoch": 4.11, + "learning_rate": 8.944088887220585e-06, + "loss": 1.223, + "step": 437500 + }, + { + "epoch": 4.11, + "eval_loss": 1.0515124797821045, + "eval_runtime": 220.9716, + "eval_samples_per_second": 452.547, + "eval_steps_per_second": 14.142, + "step": 437500 + }, + { + "epoch": 4.11, + "learning_rate": 8.897167845948838e-06, + "loss": 1.2215, + "step": 438000 + }, + { + "epoch": 4.11, + "eval_loss": 1.0595310926437378, + "eval_runtime": 220.9271, + "eval_samples_per_second": 452.638, + "eval_steps_per_second": 14.145, + "step": 438000 + }, + { + "epoch": 4.11, + "learning_rate": 8.85024680467709e-06, + "loss": 1.2241, + "step": 438500 + }, + { + "epoch": 4.11, + "eval_loss": 1.055729627609253, + "eval_runtime": 220.9211, + "eval_samples_per_second": 452.65, + "eval_steps_per_second": 14.145, + "step": 438500 + }, + { + "epoch": 4.12, + "learning_rate": 8.803325763405341e-06, + "loss": 1.1953, + "step": 439000 + }, + { + "epoch": 4.12, + "eval_loss": 1.0478371381759644, + "eval_runtime": 220.9454, + "eval_samples_per_second": 452.6, + "eval_steps_per_second": 14.144, + "step": 439000 + }, + { + "epoch": 4.12, + "learning_rate": 8.756404722133595e-06, + "loss": 1.2235, + "step": 439500 + }, + { + "epoch": 4.12, + "eval_loss": 1.05348801612854, + "eval_runtime": 221.0593, + "eval_samples_per_second": 452.367, + "eval_steps_per_second": 14.136, + "step": 439500 + }, + { + "epoch": 4.13, + "learning_rate": 8.709483680861846e-06, + "loss": 1.211, + "step": 440000 + }, + { + "epoch": 4.13, + "eval_loss": 1.0495777130126953, + "eval_runtime": 221.0334, + "eval_samples_per_second": 452.42, + "eval_steps_per_second": 14.138, + "step": 440000 + }, + { + "epoch": 4.13, + "learning_rate": 8.662562639590098e-06, + "loss": 1.2257, + "step": 440500 + }, + { + "epoch": 4.13, + "eval_loss": 1.056633472442627, + "eval_runtime": 220.9965, + "eval_samples_per_second": 452.496, + "eval_steps_per_second": 14.14, + "step": 440500 + }, + { + "epoch": 4.14, + "learning_rate": 8.61564159831835e-06, + "loss": 1.2171, + "step": 441000 + }, + { + "epoch": 4.14, + "eval_loss": 1.0503853559494019, + "eval_runtime": 220.9748, + "eval_samples_per_second": 452.54, + "eval_steps_per_second": 14.142, + "step": 441000 + }, + { + "epoch": 4.14, + "learning_rate": 8.568720557046603e-06, + "loss": 1.2105, + "step": 441500 + }, + { + "epoch": 4.14, + "eval_loss": 1.0520578622817993, + "eval_runtime": 220.8941, + "eval_samples_per_second": 452.706, + "eval_steps_per_second": 14.147, + "step": 441500 + }, + { + "epoch": 4.15, + "learning_rate": 8.521799515774854e-06, + "loss": 1.2045, + "step": 442000 + }, + { + "epoch": 4.15, + "eval_loss": 1.053121566772461, + "eval_runtime": 220.8544, + "eval_samples_per_second": 452.787, + "eval_steps_per_second": 14.15, + "step": 442000 + }, + { + "epoch": 4.15, + "learning_rate": 8.474878474503106e-06, + "loss": 1.205, + "step": 442500 + }, + { + "epoch": 4.15, + "eval_loss": 1.0605696439743042, + "eval_runtime": 220.8587, + "eval_samples_per_second": 452.778, + "eval_steps_per_second": 14.149, + "step": 442500 + }, + { + "epoch": 4.16, + "learning_rate": 8.427957433231359e-06, + "loss": 1.2145, + "step": 443000 + }, + { + "epoch": 4.16, + "eval_loss": 1.0511351823806763, + "eval_runtime": 220.8862, + "eval_samples_per_second": 452.722, + "eval_steps_per_second": 14.148, + "step": 443000 + }, + { + "epoch": 4.16, + "learning_rate": 8.38103639195961e-06, + "loss": 1.1973, + "step": 443500 + }, + { + "epoch": 4.16, + "eval_loss": 1.0576379299163818, + "eval_runtime": 220.8821, + "eval_samples_per_second": 452.73, + "eval_steps_per_second": 14.148, + "step": 443500 + }, + { + "epoch": 4.17, + "learning_rate": 8.334115350687862e-06, + "loss": 1.2061, + "step": 444000 + }, + { + "epoch": 4.17, + "eval_loss": 1.0536248683929443, + "eval_runtime": 220.8693, + "eval_samples_per_second": 452.756, + "eval_steps_per_second": 14.149, + "step": 444000 + }, + { + "epoch": 4.17, + "learning_rate": 8.287194309416114e-06, + "loss": 1.2241, + "step": 444500 + }, + { + "epoch": 4.17, + "eval_loss": 1.0456198453903198, + "eval_runtime": 220.876, + "eval_samples_per_second": 452.743, + "eval_steps_per_second": 14.148, + "step": 444500 + }, + { + "epoch": 4.18, + "learning_rate": 8.240273268144367e-06, + "loss": 1.1978, + "step": 445000 + }, + { + "epoch": 4.18, + "eval_loss": 1.0541229248046875, + "eval_runtime": 220.8941, + "eval_samples_per_second": 452.706, + "eval_steps_per_second": 14.147, + "step": 445000 + }, + { + "epoch": 4.18, + "learning_rate": 8.193352226872619e-06, + "loss": 1.2131, + "step": 445500 + }, + { + "epoch": 4.18, + "eval_loss": 1.050695538520813, + "eval_runtime": 220.8941, + "eval_samples_per_second": 452.706, + "eval_steps_per_second": 14.147, + "step": 445500 + }, + { + "epoch": 4.19, + "learning_rate": 8.14643118560087e-06, + "loss": 1.1941, + "step": 446000 + }, + { + "epoch": 4.19, + "eval_loss": 1.0486228466033936, + "eval_runtime": 220.8943, + "eval_samples_per_second": 452.705, + "eval_steps_per_second": 14.147, + "step": 446000 + }, + { + "epoch": 4.19, + "learning_rate": 8.099510144329122e-06, + "loss": 1.2137, + "step": 446500 + }, + { + "epoch": 4.19, + "eval_loss": 1.050898790359497, + "eval_runtime": 220.9099, + "eval_samples_per_second": 452.673, + "eval_steps_per_second": 14.146, + "step": 446500 + }, + { + "epoch": 4.19, + "learning_rate": 8.052589103057375e-06, + "loss": 1.212, + "step": 447000 + }, + { + "epoch": 4.19, + "eval_loss": 1.052153468132019, + "eval_runtime": 220.9042, + "eval_samples_per_second": 452.685, + "eval_steps_per_second": 14.146, + "step": 447000 + }, + { + "epoch": 4.2, + "learning_rate": 8.005668061785627e-06, + "loss": 1.2141, + "step": 447500 + }, + { + "epoch": 4.2, + "eval_loss": 1.0504639148712158, + "eval_runtime": 220.8838, + "eval_samples_per_second": 452.727, + "eval_steps_per_second": 14.148, + "step": 447500 + }, + { + "epoch": 4.2, + "learning_rate": 7.958747020513878e-06, + "loss": 1.2035, + "step": 448000 + }, + { + "epoch": 4.2, + "eval_loss": 1.0515419244766235, + "eval_runtime": 220.9254, + "eval_samples_per_second": 452.642, + "eval_steps_per_second": 14.145, + "step": 448000 + }, + { + "epoch": 4.21, + "learning_rate": 7.911825979242132e-06, + "loss": 1.2162, + "step": 448500 + }, + { + "epoch": 4.21, + "eval_loss": 1.045407772064209, + "eval_runtime": 221.0538, + "eval_samples_per_second": 452.379, + "eval_steps_per_second": 14.137, + "step": 448500 + }, + { + "epoch": 4.21, + "learning_rate": 7.864904937970383e-06, + "loss": 1.2062, + "step": 449000 + }, + { + "epoch": 4.21, + "eval_loss": 1.0419821739196777, + "eval_runtime": 221.0256, + "eval_samples_per_second": 452.436, + "eval_steps_per_second": 14.139, + "step": 449000 + }, + { + "epoch": 4.22, + "learning_rate": 7.817983896698637e-06, + "loss": 1.1996, + "step": 449500 + }, + { + "epoch": 4.22, + "eval_loss": 1.0498727560043335, + "eval_runtime": 220.9307, + "eval_samples_per_second": 452.631, + "eval_steps_per_second": 14.145, + "step": 449500 + }, + { + "epoch": 4.22, + "learning_rate": 7.771062855426888e-06, + "loss": 1.2194, + "step": 450000 + }, + { + "epoch": 4.22, + "eval_loss": 1.0517551898956299, + "eval_runtime": 220.9312, + "eval_samples_per_second": 452.63, + "eval_steps_per_second": 14.145, + "step": 450000 + }, + { + "epoch": 4.23, + "learning_rate": 7.72414181415514e-06, + "loss": 1.2062, + "step": 450500 + }, + { + "epoch": 4.23, + "eval_loss": 1.0490150451660156, + "eval_runtime": 220.7475, + "eval_samples_per_second": 453.006, + "eval_steps_per_second": 14.156, + "step": 450500 + }, + { + "epoch": 4.23, + "learning_rate": 7.677220772883393e-06, + "loss": 1.2046, + "step": 451000 + }, + { + "epoch": 4.23, + "eval_loss": 1.0536123514175415, + "eval_runtime": 220.7673, + "eval_samples_per_second": 452.966, + "eval_steps_per_second": 14.155, + "step": 451000 + }, + { + "epoch": 4.24, + "learning_rate": 7.630299731611645e-06, + "loss": 1.2012, + "step": 451500 + }, + { + "epoch": 4.24, + "eval_loss": 1.0506072044372559, + "eval_runtime": 220.7213, + "eval_samples_per_second": 453.06, + "eval_steps_per_second": 14.158, + "step": 451500 + }, + { + "epoch": 4.24, + "learning_rate": 7.583378690339897e-06, + "loss": 1.2161, + "step": 452000 + }, + { + "epoch": 4.24, + "eval_loss": 1.0469845533370972, + "eval_runtime": 220.7698, + "eval_samples_per_second": 452.96, + "eval_steps_per_second": 14.155, + "step": 452000 + }, + { + "epoch": 4.25, + "learning_rate": 7.536457649068149e-06, + "loss": 1.2054, + "step": 452500 + }, + { + "epoch": 4.25, + "eval_loss": 1.0446325540542603, + "eval_runtime": 220.7759, + "eval_samples_per_second": 452.948, + "eval_steps_per_second": 14.155, + "step": 452500 + }, + { + "epoch": 4.25, + "learning_rate": 7.4895366077964e-06, + "loss": 1.208, + "step": 453000 + }, + { + "epoch": 4.25, + "eval_loss": 1.0471805334091187, + "eval_runtime": 220.7642, + "eval_samples_per_second": 452.972, + "eval_steps_per_second": 14.155, + "step": 453000 + }, + { + "epoch": 4.26, + "learning_rate": 7.442615566524652e-06, + "loss": 1.1995, + "step": 453500 + }, + { + "epoch": 4.26, + "eval_loss": 1.0422724485397339, + "eval_runtime": 220.7863, + "eval_samples_per_second": 452.927, + "eval_steps_per_second": 14.154, + "step": 453500 + }, + { + "epoch": 4.26, + "learning_rate": 7.395694525252905e-06, + "loss": 1.2153, + "step": 454000 + }, + { + "epoch": 4.26, + "eval_loss": 1.046839714050293, + "eval_runtime": 220.7559, + "eval_samples_per_second": 452.989, + "eval_steps_per_second": 14.156, + "step": 454000 + }, + { + "epoch": 4.27, + "learning_rate": 7.348773483981157e-06, + "loss": 1.1999, + "step": 454500 + }, + { + "epoch": 4.27, + "eval_loss": 1.0423718690872192, + "eval_runtime": 220.7317, + "eval_samples_per_second": 453.039, + "eval_steps_per_second": 14.157, + "step": 454500 + }, + { + "epoch": 4.27, + "learning_rate": 7.301852442709409e-06, + "loss": 1.1919, + "step": 455000 + }, + { + "epoch": 4.27, + "eval_loss": 1.040187120437622, + "eval_runtime": 220.7387, + "eval_samples_per_second": 453.024, + "eval_steps_per_second": 14.157, + "step": 455000 + }, + { + "epoch": 4.27, + "learning_rate": 7.254931401437661e-06, + "loss": 1.1848, + "step": 455500 + }, + { + "epoch": 4.27, + "eval_loss": 1.0394939184188843, + "eval_runtime": 220.7554, + "eval_samples_per_second": 452.99, + "eval_steps_per_second": 14.156, + "step": 455500 + }, + { + "epoch": 4.28, + "learning_rate": 7.208010360165913e-06, + "loss": 1.2012, + "step": 456000 + }, + { + "epoch": 4.28, + "eval_loss": 1.0482308864593506, + "eval_runtime": 220.7304, + "eval_samples_per_second": 453.041, + "eval_steps_per_second": 14.158, + "step": 456000 + }, + { + "epoch": 4.28, + "learning_rate": 7.161089318894166e-06, + "loss": 1.2028, + "step": 456500 + }, + { + "epoch": 4.28, + "eval_loss": 1.0419962406158447, + "eval_runtime": 220.7448, + "eval_samples_per_second": 453.012, + "eval_steps_per_second": 14.157, + "step": 456500 + }, + { + "epoch": 4.29, + "learning_rate": 7.114168277622417e-06, + "loss": 1.212, + "step": 457000 + }, + { + "epoch": 4.29, + "eval_loss": 1.0430151224136353, + "eval_runtime": 220.7318, + "eval_samples_per_second": 453.038, + "eval_steps_per_second": 14.157, + "step": 457000 + }, + { + "epoch": 4.29, + "learning_rate": 7.06724723635067e-06, + "loss": 1.1949, + "step": 457500 + }, + { + "epoch": 4.29, + "eval_loss": 1.047107219696045, + "eval_runtime": 220.7582, + "eval_samples_per_second": 452.984, + "eval_steps_per_second": 14.156, + "step": 457500 + }, + { + "epoch": 4.3, + "learning_rate": 7.020326195078922e-06, + "loss": 1.2095, + "step": 458000 + }, + { + "epoch": 4.3, + "eval_loss": 1.0399595499038696, + "eval_runtime": 220.8799, + "eval_samples_per_second": 452.735, + "eval_steps_per_second": 14.148, + "step": 458000 + }, + { + "epoch": 4.3, + "learning_rate": 6.973405153807174e-06, + "loss": 1.2085, + "step": 458500 + }, + { + "epoch": 4.3, + "eval_loss": 1.0479100942611694, + "eval_runtime": 220.8879, + "eval_samples_per_second": 452.718, + "eval_steps_per_second": 14.147, + "step": 458500 + }, + { + "epoch": 4.31, + "learning_rate": 6.926484112535425e-06, + "loss": 1.1901, + "step": 459000 + }, + { + "epoch": 4.31, + "eval_loss": 1.0417808294296265, + "eval_runtime": 220.8257, + "eval_samples_per_second": 452.846, + "eval_steps_per_second": 14.151, + "step": 459000 + }, + { + "epoch": 4.31, + "learning_rate": 6.8795630712636785e-06, + "loss": 1.2034, + "step": 459500 + }, + { + "epoch": 4.31, + "eval_loss": 1.0389989614486694, + "eval_runtime": 220.7906, + "eval_samples_per_second": 452.918, + "eval_steps_per_second": 14.154, + "step": 459500 + }, + { + "epoch": 4.32, + "learning_rate": 6.83264202999193e-06, + "loss": 1.225, + "step": 460000 + }, + { + "epoch": 4.32, + "eval_loss": 1.0430644750595093, + "eval_runtime": 220.6771, + "eval_samples_per_second": 453.151, + "eval_steps_per_second": 14.161, + "step": 460000 + }, + { + "epoch": 4.32, + "learning_rate": 6.785720988720182e-06, + "loss": 1.1967, + "step": 460500 + }, + { + "epoch": 4.32, + "eval_loss": 1.0424212217330933, + "eval_runtime": 220.4736, + "eval_samples_per_second": 453.569, + "eval_steps_per_second": 14.174, + "step": 460500 + }, + { + "epoch": 4.33, + "learning_rate": 6.738799947448435e-06, + "loss": 1.2059, + "step": 461000 + }, + { + "epoch": 4.33, + "eval_loss": 1.0408573150634766, + "eval_runtime": 220.4712, + "eval_samples_per_second": 453.574, + "eval_steps_per_second": 14.174, + "step": 461000 + }, + { + "epoch": 4.33, + "learning_rate": 6.691878906176687e-06, + "loss": 1.1939, + "step": 461500 + }, + { + "epoch": 4.33, + "eval_loss": 1.0424449443817139, + "eval_runtime": 220.5212, + "eval_samples_per_second": 453.471, + "eval_steps_per_second": 14.171, + "step": 461500 + }, + { + "epoch": 4.34, + "learning_rate": 6.644957864904938e-06, + "loss": 1.2027, + "step": 462000 + }, + { + "epoch": 4.34, + "eval_loss": 1.04022216796875, + "eval_runtime": 220.5052, + "eval_samples_per_second": 453.504, + "eval_steps_per_second": 14.172, + "step": 462000 + }, + { + "epoch": 4.34, + "learning_rate": 6.59803682363319e-06, + "loss": 1.1745, + "step": 462500 + }, + { + "epoch": 4.34, + "eval_loss": 1.0380291938781738, + "eval_runtime": 220.5027, + "eval_samples_per_second": 453.509, + "eval_steps_per_second": 14.172, + "step": 462500 + }, + { + "epoch": 4.34, + "learning_rate": 6.551115782361443e-06, + "loss": 1.2155, + "step": 463000 + }, + { + "epoch": 4.34, + "eval_loss": 1.0391091108322144, + "eval_runtime": 220.4856, + "eval_samples_per_second": 453.544, + "eval_steps_per_second": 14.173, + "step": 463000 + }, + { + "epoch": 4.35, + "learning_rate": 6.504194741089695e-06, + "loss": 1.2033, + "step": 463500 + }, + { + "epoch": 4.35, + "eval_loss": 1.042190670967102, + "eval_runtime": 220.4876, + "eval_samples_per_second": 453.54, + "eval_steps_per_second": 14.173, + "step": 463500 + }, + { + "epoch": 4.35, + "learning_rate": 6.457273699817946e-06, + "loss": 1.2064, + "step": 464000 + }, + { + "epoch": 4.35, + "eval_loss": 1.040285348892212, + "eval_runtime": 220.4809, + "eval_samples_per_second": 453.554, + "eval_steps_per_second": 14.174, + "step": 464000 + }, + { + "epoch": 4.36, + "learning_rate": 6.410352658546198e-06, + "loss": 1.1943, + "step": 464500 + }, + { + "epoch": 4.36, + "eval_loss": 1.0418440103530884, + "eval_runtime": 220.482, + "eval_samples_per_second": 453.552, + "eval_steps_per_second": 14.173, + "step": 464500 + }, + { + "epoch": 4.36, + "learning_rate": 6.363431617274451e-06, + "loss": 1.1875, + "step": 465000 + }, + { + "epoch": 4.36, + "eval_loss": 1.0404266119003296, + "eval_runtime": 220.484, + "eval_samples_per_second": 453.548, + "eval_steps_per_second": 14.173, + "step": 465000 + }, + { + "epoch": 4.37, + "learning_rate": 6.316510576002703e-06, + "loss": 1.1978, + "step": 465500 + }, + { + "epoch": 4.37, + "eval_loss": 1.0388667583465576, + "eval_runtime": 220.5052, + "eval_samples_per_second": 453.504, + "eval_steps_per_second": 14.172, + "step": 465500 + }, + { + "epoch": 4.37, + "learning_rate": 6.269589534730954e-06, + "loss": 1.1963, + "step": 466000 + }, + { + "epoch": 4.37, + "eval_loss": 1.039662480354309, + "eval_runtime": 220.4721, + "eval_samples_per_second": 453.572, + "eval_steps_per_second": 14.174, + "step": 466000 + }, + { + "epoch": 4.38, + "learning_rate": 6.222668493459207e-06, + "loss": 1.1991, + "step": 466500 + }, + { + "epoch": 4.38, + "eval_loss": 1.0323457717895508, + "eval_runtime": 220.4655, + "eval_samples_per_second": 453.586, + "eval_steps_per_second": 14.175, + "step": 466500 + }, + { + "epoch": 4.38, + "learning_rate": 6.175747452187459e-06, + "loss": 1.2045, + "step": 467000 + }, + { + "epoch": 4.38, + "eval_loss": 1.0490866899490356, + "eval_runtime": 220.4714, + "eval_samples_per_second": 453.574, + "eval_steps_per_second": 14.174, + "step": 467000 + }, + { + "epoch": 4.39, + "learning_rate": 6.128826410915712e-06, + "loss": 1.1782, + "step": 467500 + }, + { + "epoch": 4.39, + "eval_loss": 1.0345863103866577, + "eval_runtime": 220.4724, + "eval_samples_per_second": 453.571, + "eval_steps_per_second": 14.174, + "step": 467500 + }, + { + "epoch": 4.39, + "learning_rate": 6.081905369643963e-06, + "loss": 1.1832, + "step": 468000 + }, + { + "epoch": 4.39, + "eval_loss": 1.0386844873428345, + "eval_runtime": 220.5673, + "eval_samples_per_second": 453.376, + "eval_steps_per_second": 14.168, + "step": 468000 + }, + { + "epoch": 4.4, + "learning_rate": 6.034984328372216e-06, + "loss": 1.2035, + "step": 468500 + }, + { + "epoch": 4.4, + "eval_loss": 1.0390254259109497, + "eval_runtime": 220.603, + "eval_samples_per_second": 453.303, + "eval_steps_per_second": 14.166, + "step": 468500 + }, + { + "epoch": 4.4, + "learning_rate": 5.988063287100468e-06, + "loss": 1.2036, + "step": 469000 + }, + { + "epoch": 4.4, + "eval_loss": 1.0352883338928223, + "eval_runtime": 220.4977, + "eval_samples_per_second": 453.52, + "eval_steps_per_second": 14.172, + "step": 469000 + }, + { + "epoch": 4.41, + "learning_rate": 5.94114224582872e-06, + "loss": 1.2, + "step": 469500 + }, + { + "epoch": 4.41, + "eval_loss": 1.03839111328125, + "eval_runtime": 220.4659, + "eval_samples_per_second": 453.585, + "eval_steps_per_second": 14.175, + "step": 469500 + }, + { + "epoch": 4.41, + "learning_rate": 5.894221204556972e-06, + "loss": 1.1869, + "step": 470000 + }, + { + "epoch": 4.41, + "eval_loss": 1.0423153638839722, + "eval_runtime": 220.3637, + "eval_samples_per_second": 453.795, + "eval_steps_per_second": 14.181, + "step": 470000 + }, + { + "epoch": 4.42, + "learning_rate": 5.8473001632852245e-06, + "loss": 1.2012, + "step": 470500 + }, + { + "epoch": 4.42, + "eval_loss": 1.0392744541168213, + "eval_runtime": 220.3412, + "eval_samples_per_second": 453.842, + "eval_steps_per_second": 14.183, + "step": 470500 + }, + { + "epoch": 4.42, + "learning_rate": 5.800379122013476e-06, + "loss": 1.1912, + "step": 471000 + }, + { + "epoch": 4.42, + "eval_loss": 1.0390135049819946, + "eval_runtime": 220.3713, + "eval_samples_per_second": 453.78, + "eval_steps_per_second": 14.181, + "step": 471000 + }, + { + "epoch": 4.42, + "learning_rate": 5.7534580807417286e-06, + "loss": 1.2062, + "step": 471500 + }, + { + "epoch": 4.42, + "eval_loss": 1.0349584817886353, + "eval_runtime": 220.3752, + "eval_samples_per_second": 453.771, + "eval_steps_per_second": 14.18, + "step": 471500 + }, + { + "epoch": 4.43, + "learning_rate": 5.70653703946998e-06, + "loss": 1.2079, + "step": 472000 + }, + { + "epoch": 4.43, + "eval_loss": 1.0324441194534302, + "eval_runtime": 220.391, + "eval_samples_per_second": 453.739, + "eval_steps_per_second": 14.179, + "step": 472000 + }, + { + "epoch": 4.43, + "learning_rate": 5.659615998198233e-06, + "loss": 1.1967, + "step": 472500 + }, + { + "epoch": 4.43, + "eval_loss": 1.0354297161102295, + "eval_runtime": 220.4046, + "eval_samples_per_second": 453.711, + "eval_steps_per_second": 14.178, + "step": 472500 + }, + { + "epoch": 4.44, + "learning_rate": 5.612694956926484e-06, + "loss": 1.194, + "step": 473000 + }, + { + "epoch": 4.44, + "eval_loss": 1.0334423780441284, + "eval_runtime": 220.39, + "eval_samples_per_second": 453.741, + "eval_steps_per_second": 14.179, + "step": 473000 + }, + { + "epoch": 4.44, + "learning_rate": 5.565773915654737e-06, + "loss": 1.1962, + "step": 473500 + }, + { + "epoch": 4.44, + "eval_loss": 1.0328459739685059, + "eval_runtime": 220.3907, + "eval_samples_per_second": 453.74, + "eval_steps_per_second": 14.179, + "step": 473500 + }, + { + "epoch": 4.45, + "learning_rate": 5.518852874382988e-06, + "loss": 1.209, + "step": 474000 + }, + { + "epoch": 4.45, + "eval_loss": 1.0326672792434692, + "eval_runtime": 220.4416, + "eval_samples_per_second": 453.635, + "eval_steps_per_second": 14.176, + "step": 474000 + }, + { + "epoch": 4.45, + "learning_rate": 5.471931833111241e-06, + "loss": 1.1964, + "step": 474500 + }, + { + "epoch": 4.45, + "eval_loss": 1.0362826585769653, + "eval_runtime": 220.4411, + "eval_samples_per_second": 453.636, + "eval_steps_per_second": 14.176, + "step": 474500 + }, + { + "epoch": 4.46, + "learning_rate": 5.425010791839493e-06, + "loss": 1.1993, + "step": 475000 + }, + { + "epoch": 4.46, + "eval_loss": 1.0309995412826538, + "eval_runtime": 220.4439, + "eval_samples_per_second": 453.63, + "eval_steps_per_second": 14.176, + "step": 475000 + }, + { + "epoch": 4.46, + "learning_rate": 5.378089750567745e-06, + "loss": 1.2089, + "step": 475500 + }, + { + "epoch": 4.46, + "eval_loss": 1.0346201658248901, + "eval_runtime": 220.3965, + "eval_samples_per_second": 453.728, + "eval_steps_per_second": 14.179, + "step": 475500 + }, + { + "epoch": 4.47, + "learning_rate": 5.331168709295997e-06, + "loss": 1.2001, + "step": 476000 + }, + { + "epoch": 4.47, + "eval_loss": 1.0281124114990234, + "eval_runtime": 220.429, + "eval_samples_per_second": 453.661, + "eval_steps_per_second": 14.177, + "step": 476000 + }, + { + "epoch": 4.47, + "learning_rate": 5.284247668024249e-06, + "loss": 1.2331, + "step": 476500 + }, + { + "epoch": 4.47, + "eval_loss": 1.0332673788070679, + "eval_runtime": 220.4002, + "eval_samples_per_second": 453.72, + "eval_steps_per_second": 14.179, + "step": 476500 + }, + { + "epoch": 4.48, + "learning_rate": 5.237326626752501e-06, + "loss": 1.2059, + "step": 477000 + }, + { + "epoch": 4.48, + "eval_loss": 1.033066987991333, + "eval_runtime": 220.4364, + "eval_samples_per_second": 453.646, + "eval_steps_per_second": 14.176, + "step": 477000 + }, + { + "epoch": 4.48, + "learning_rate": 5.190405585480753e-06, + "loss": 1.1919, + "step": 477500 + }, + { + "epoch": 4.48, + "eval_loss": 1.035326600074768, + "eval_runtime": 220.4207, + "eval_samples_per_second": 453.678, + "eval_steps_per_second": 14.177, + "step": 477500 + }, + { + "epoch": 4.49, + "learning_rate": 5.143484544209005e-06, + "loss": 1.2028, + "step": 478000 + }, + { + "epoch": 4.49, + "eval_loss": 1.0330538749694824, + "eval_runtime": 220.3873, + "eval_samples_per_second": 453.747, + "eval_steps_per_second": 14.18, + "step": 478000 + }, + { + "epoch": 4.49, + "learning_rate": 5.096563502937257e-06, + "loss": 1.1815, + "step": 478500 + }, + { + "epoch": 4.49, + "eval_loss": 1.0284647941589355, + "eval_runtime": 220.3832, + "eval_samples_per_second": 453.755, + "eval_steps_per_second": 14.18, + "step": 478500 + }, + { + "epoch": 4.5, + "learning_rate": 5.049642461665509e-06, + "loss": 1.2108, + "step": 479000 + }, + { + "epoch": 4.5, + "eval_loss": 1.0274449586868286, + "eval_runtime": 220.5409, + "eval_samples_per_second": 453.431, + "eval_steps_per_second": 14.17, + "step": 479000 + }, + { + "epoch": 4.5, + "learning_rate": 5.002721420393762e-06, + "loss": 1.1973, + "step": 479500 + }, + { + "epoch": 4.5, + "eval_loss": 1.030910849571228, + "eval_runtime": 220.517, + "eval_samples_per_second": 453.48, + "eval_steps_per_second": 14.171, + "step": 479500 + }, + { + "epoch": 4.5, + "learning_rate": 4.955800379122014e-06, + "loss": 1.2209, + "step": 480000 + }, + { + "epoch": 4.5, + "eval_loss": 1.0277793407440186, + "eval_runtime": 220.48, + "eval_samples_per_second": 453.556, + "eval_steps_per_second": 14.174, + "step": 480000 + }, + { + "epoch": 4.51, + "learning_rate": 4.908879337850266e-06, + "loss": 1.1701, + "step": 480500 + }, + { + "epoch": 4.51, + "eval_loss": 1.0242763757705688, + "eval_runtime": 220.3969, + "eval_samples_per_second": 453.727, + "eval_steps_per_second": 14.179, + "step": 480500 + }, + { + "epoch": 4.51, + "learning_rate": 4.861958296578518e-06, + "loss": 1.1876, + "step": 481000 + }, + { + "epoch": 4.51, + "eval_loss": 1.0288127660751343, + "eval_runtime": 220.2841, + "eval_samples_per_second": 453.959, + "eval_steps_per_second": 14.186, + "step": 481000 + }, + { + "epoch": 4.52, + "learning_rate": 4.8150372553067706e-06, + "loss": 1.1949, + "step": 481500 + }, + { + "epoch": 4.52, + "eval_loss": 1.0324766635894775, + "eval_runtime": 220.2548, + "eval_samples_per_second": 454.02, + "eval_steps_per_second": 14.188, + "step": 481500 + }, + { + "epoch": 4.52, + "learning_rate": 4.768116214035022e-06, + "loss": 1.1933, + "step": 482000 + }, + { + "epoch": 4.52, + "eval_loss": 1.0230944156646729, + "eval_runtime": 220.3139, + "eval_samples_per_second": 453.898, + "eval_steps_per_second": 14.184, + "step": 482000 + }, + { + "epoch": 4.53, + "learning_rate": 4.721195172763275e-06, + "loss": 1.1811, + "step": 482500 + }, + { + "epoch": 4.53, + "eval_loss": 1.029863953590393, + "eval_runtime": 220.2726, + "eval_samples_per_second": 453.983, + "eval_steps_per_second": 14.187, + "step": 482500 + }, + { + "epoch": 4.53, + "learning_rate": 4.674274131491526e-06, + "loss": 1.2033, + "step": 483000 + }, + { + "epoch": 4.53, + "eval_loss": 1.0319043397903442, + "eval_runtime": 220.2655, + "eval_samples_per_second": 453.998, + "eval_steps_per_second": 14.187, + "step": 483000 + }, + { + "epoch": 4.54, + "learning_rate": 4.627353090219779e-06, + "loss": 1.1919, + "step": 483500 + }, + { + "epoch": 4.54, + "eval_loss": 1.0290050506591797, + "eval_runtime": 220.268, + "eval_samples_per_second": 453.992, + "eval_steps_per_second": 14.187, + "step": 483500 + }, + { + "epoch": 4.54, + "learning_rate": 4.58043204894803e-06, + "loss": 1.195, + "step": 484000 + }, + { + "epoch": 4.54, + "eval_loss": 1.0286130905151367, + "eval_runtime": 220.2866, + "eval_samples_per_second": 453.954, + "eval_steps_per_second": 14.186, + "step": 484000 + }, + { + "epoch": 4.55, + "learning_rate": 4.533511007676283e-06, + "loss": 1.1795, + "step": 484500 + }, + { + "epoch": 4.55, + "eval_loss": 1.0310746431350708, + "eval_runtime": 220.3028, + "eval_samples_per_second": 453.921, + "eval_steps_per_second": 14.185, + "step": 484500 + }, + { + "epoch": 4.55, + "learning_rate": 4.486589966404535e-06, + "loss": 1.1981, + "step": 485000 + }, + { + "epoch": 4.55, + "eval_loss": 1.0278400182724, + "eval_runtime": 220.2765, + "eval_samples_per_second": 453.975, + "eval_steps_per_second": 14.187, + "step": 485000 + }, + { + "epoch": 4.56, + "learning_rate": 4.439668925132787e-06, + "loss": 1.1935, + "step": 485500 + }, + { + "epoch": 4.56, + "eval_loss": 1.0325404405593872, + "eval_runtime": 220.258, + "eval_samples_per_second": 454.013, + "eval_steps_per_second": 14.188, + "step": 485500 + }, + { + "epoch": 4.56, + "learning_rate": 4.392747883861039e-06, + "loss": 1.1901, + "step": 486000 + }, + { + "epoch": 4.56, + "eval_loss": 1.029636263847351, + "eval_runtime": 220.2457, + "eval_samples_per_second": 454.038, + "eval_steps_per_second": 14.189, + "step": 486000 + }, + { + "epoch": 4.57, + "learning_rate": 4.345826842589291e-06, + "loss": 1.1998, + "step": 486500 + }, + { + "epoch": 4.57, + "eval_loss": 1.0245474576950073, + "eval_runtime": 220.2767, + "eval_samples_per_second": 453.975, + "eval_steps_per_second": 14.187, + "step": 486500 + }, + { + "epoch": 4.57, + "learning_rate": 4.298905801317543e-06, + "loss": 1.1818, + "step": 487000 + }, + { + "epoch": 4.57, + "eval_loss": 1.0290453433990479, + "eval_runtime": 220.2654, + "eval_samples_per_second": 453.998, + "eval_steps_per_second": 14.187, + "step": 487000 + }, + { + "epoch": 4.57, + "learning_rate": 4.251984760045795e-06, + "loss": 1.1713, + "step": 487500 + }, + { + "epoch": 4.57, + "eval_loss": 1.028717279434204, + "eval_runtime": 220.2563, + "eval_samples_per_second": 454.016, + "eval_steps_per_second": 14.188, + "step": 487500 + }, + { + "epoch": 4.58, + "learning_rate": 4.205063718774047e-06, + "loss": 1.1841, + "step": 488000 + }, + { + "epoch": 4.58, + "eval_loss": 1.0243215560913086, + "eval_runtime": 220.2412, + "eval_samples_per_second": 454.048, + "eval_steps_per_second": 14.189, + "step": 488000 + }, + { + "epoch": 4.58, + "learning_rate": 4.158142677502299e-06, + "loss": 1.184, + "step": 488500 + }, + { + "epoch": 4.58, + "eval_loss": 1.0320249795913696, + "eval_runtime": 220.2122, + "eval_samples_per_second": 454.107, + "eval_steps_per_second": 14.191, + "step": 488500 + }, + { + "epoch": 4.59, + "learning_rate": 4.111221636230551e-06, + "loss": 1.1945, + "step": 489000 + }, + { + "epoch": 4.59, + "eval_loss": 1.0234476327896118, + "eval_runtime": 220.2507, + "eval_samples_per_second": 454.028, + "eval_steps_per_second": 14.188, + "step": 489000 + }, + { + "epoch": 4.59, + "learning_rate": 4.064300594958804e-06, + "loss": 1.1826, + "step": 489500 + }, + { + "epoch": 4.59, + "eval_loss": 1.027974009513855, + "eval_runtime": 220.2473, + "eval_samples_per_second": 454.035, + "eval_steps_per_second": 14.189, + "step": 489500 + }, + { + "epoch": 4.6, + "learning_rate": 4.017379553687055e-06, + "loss": 1.188, + "step": 490000 + }, + { + "epoch": 4.6, + "eval_loss": 1.0277783870697021, + "eval_runtime": 220.2581, + "eval_samples_per_second": 454.013, + "eval_steps_per_second": 14.188, + "step": 490000 + }, + { + "epoch": 4.6, + "learning_rate": 3.970458512415308e-06, + "loss": 1.1939, + "step": 490500 + }, + { + "epoch": 4.6, + "eval_loss": 1.0183277130126953, + "eval_runtime": 220.2638, + "eval_samples_per_second": 454.001, + "eval_steps_per_second": 14.188, + "step": 490500 + }, + { + "epoch": 4.61, + "learning_rate": 3.923537471143559e-06, + "loss": 1.1657, + "step": 491000 + }, + { + "epoch": 4.61, + "eval_loss": 1.029819369316101, + "eval_runtime": 220.2579, + "eval_samples_per_second": 454.013, + "eval_steps_per_second": 14.188, + "step": 491000 + }, + { + "epoch": 4.61, + "learning_rate": 3.876616429871812e-06, + "loss": 1.1917, + "step": 491500 + }, + { + "epoch": 4.61, + "eval_loss": 1.0290553569793701, + "eval_runtime": 220.4728, + "eval_samples_per_second": 453.571, + "eval_steps_per_second": 14.174, + "step": 491500 + }, + { + "epoch": 4.62, + "learning_rate": 3.829695388600064e-06, + "loss": 1.1945, + "step": 492000 + }, + { + "epoch": 4.62, + "eval_loss": 1.031397819519043, + "eval_runtime": 220.4389, + "eval_samples_per_second": 453.64, + "eval_steps_per_second": 14.176, + "step": 492000 + }, + { + "epoch": 4.62, + "learning_rate": 3.782774347328316e-06, + "loss": 1.2071, + "step": 492500 + }, + { + "epoch": 4.62, + "eval_loss": 1.027860164642334, + "eval_runtime": 220.3723, + "eval_samples_per_second": 453.778, + "eval_steps_per_second": 14.181, + "step": 492500 + }, + { + "epoch": 4.63, + "learning_rate": 3.7358533060565677e-06, + "loss": 1.2013, + "step": 493000 + }, + { + "epoch": 4.63, + "eval_loss": 1.020934820175171, + "eval_runtime": 220.3406, + "eval_samples_per_second": 453.843, + "eval_steps_per_second": 14.183, + "step": 493000 + }, + { + "epoch": 4.63, + "learning_rate": 3.68893226478482e-06, + "loss": 1.1762, + "step": 493500 + }, + { + "epoch": 4.63, + "eval_loss": 1.0232270956039429, + "eval_runtime": 220.2361, + "eval_samples_per_second": 454.058, + "eval_steps_per_second": 14.189, + "step": 493500 + }, + { + "epoch": 4.64, + "learning_rate": 3.6420112235130726e-06, + "loss": 1.2014, + "step": 494000 + }, + { + "epoch": 4.64, + "eval_loss": 1.0220469236373901, + "eval_runtime": 220.2877, + "eval_samples_per_second": 453.952, + "eval_steps_per_second": 14.186, + "step": 494000 + }, + { + "epoch": 4.64, + "learning_rate": 3.595090182241324e-06, + "loss": 1.1817, + "step": 494500 + }, + { + "epoch": 4.64, + "eval_loss": 1.024511456489563, + "eval_runtime": 220.3417, + "eval_samples_per_second": 453.841, + "eval_steps_per_second": 14.183, + "step": 494500 + }, + { + "epoch": 4.65, + "learning_rate": 3.5481691409695766e-06, + "loss": 1.1694, + "step": 495000 + }, + { + "epoch": 4.65, + "eval_loss": 1.0245423316955566, + "eval_runtime": 220.3569, + "eval_samples_per_second": 453.809, + "eval_steps_per_second": 14.182, + "step": 495000 + }, + { + "epoch": 4.65, + "learning_rate": 3.5012480996978287e-06, + "loss": 1.1864, + "step": 495500 + }, + { + "epoch": 4.65, + "eval_loss": 1.0235341787338257, + "eval_runtime": 220.3891, + "eval_samples_per_second": 453.743, + "eval_steps_per_second": 14.179, + "step": 495500 + }, + { + "epoch": 4.65, + "learning_rate": 3.454327058426081e-06, + "loss": 1.2, + "step": 496000 + }, + { + "epoch": 4.65, + "eval_loss": 1.0256927013397217, + "eval_runtime": 220.4025, + "eval_samples_per_second": 453.715, + "eval_steps_per_second": 14.179, + "step": 496000 + }, + { + "epoch": 4.66, + "learning_rate": 3.4074060171543327e-06, + "loss": 1.1897, + "step": 496500 + }, + { + "epoch": 4.66, + "eval_loss": 1.0257229804992676, + "eval_runtime": 220.4604, + "eval_samples_per_second": 453.596, + "eval_steps_per_second": 14.175, + "step": 496500 + }, + { + "epoch": 4.66, + "learning_rate": 3.360484975882585e-06, + "loss": 1.1929, + "step": 497000 + }, + { + "epoch": 4.66, + "eval_loss": 1.023755431175232, + "eval_runtime": 220.4521, + "eval_samples_per_second": 453.613, + "eval_steps_per_second": 14.175, + "step": 497000 + }, + { + "epoch": 4.67, + "learning_rate": 3.3135639346108367e-06, + "loss": 1.1853, + "step": 497500 + }, + { + "epoch": 4.67, + "eval_loss": 1.0197463035583496, + "eval_runtime": 220.4946, + "eval_samples_per_second": 453.526, + "eval_steps_per_second": 14.173, + "step": 497500 + }, + { + "epoch": 4.67, + "learning_rate": 3.266642893339089e-06, + "loss": 1.1909, + "step": 498000 + }, + { + "epoch": 4.67, + "eval_loss": 1.0215179920196533, + "eval_runtime": 220.5236, + "eval_samples_per_second": 453.466, + "eval_steps_per_second": 14.171, + "step": 498000 + }, + { + "epoch": 4.68, + "learning_rate": 3.2197218520673416e-06, + "loss": 1.1784, + "step": 498500 + }, + { + "epoch": 4.68, + "eval_loss": 1.0229579210281372, + "eval_runtime": 220.8462, + "eval_samples_per_second": 452.804, + "eval_steps_per_second": 14.15, + "step": 498500 + }, + { + "epoch": 4.68, + "learning_rate": 3.172800810795593e-06, + "loss": 1.1955, + "step": 499000 + }, + { + "epoch": 4.68, + "eval_loss": 1.019065499305725, + "eval_runtime": 220.55, + "eval_samples_per_second": 453.412, + "eval_steps_per_second": 14.169, + "step": 499000 + }, + { + "epoch": 4.69, + "learning_rate": 3.1258797695238456e-06, + "loss": 1.177, + "step": 499500 + }, + { + "epoch": 4.69, + "eval_loss": 1.021791696548462, + "eval_runtime": 220.5844, + "eval_samples_per_second": 453.341, + "eval_steps_per_second": 14.167, + "step": 499500 + }, + { + "epoch": 4.69, + "learning_rate": 3.078958728252097e-06, + "loss": 1.1883, + "step": 500000 + }, + { + "epoch": 4.69, + "eval_loss": 1.0202616453170776, + "eval_runtime": 220.5578, + "eval_samples_per_second": 453.396, + "eval_steps_per_second": 14.169, + "step": 500000 + }, + { + "epoch": 4.7, + "learning_rate": 3.0320376869803496e-06, + "loss": 1.1911, + "step": 500500 + }, + { + "epoch": 4.7, + "eval_loss": 1.0237817764282227, + "eval_runtime": 220.6247, + "eval_samples_per_second": 453.258, + "eval_steps_per_second": 14.164, + "step": 500500 + }, + { + "epoch": 4.7, + "learning_rate": 2.9851166457086017e-06, + "loss": 1.1923, + "step": 501000 + }, + { + "epoch": 4.7, + "eval_loss": 1.0245826244354248, + "eval_runtime": 220.6502, + "eval_samples_per_second": 453.206, + "eval_steps_per_second": 14.163, + "step": 501000 + }, + { + "epoch": 4.71, + "learning_rate": 2.9381956044368537e-06, + "loss": 1.1754, + "step": 501500 + }, + { + "epoch": 4.71, + "eval_loss": 1.0203845500946045, + "eval_runtime": 220.7296, + "eval_samples_per_second": 453.043, + "eval_steps_per_second": 14.158, + "step": 501500 + }, + { + "epoch": 4.71, + "learning_rate": 2.891274563165106e-06, + "loss": 1.1848, + "step": 502000 + }, + { + "epoch": 4.71, + "eval_loss": 1.0270147323608398, + "eval_runtime": 221.2106, + "eval_samples_per_second": 452.058, + "eval_steps_per_second": 14.127, + "step": 502000 + }, + { + "epoch": 4.72, + "learning_rate": 2.844353521893358e-06, + "loss": 1.1739, + "step": 502500 + }, + { + "epoch": 4.72, + "eval_loss": 1.0190746784210205, + "eval_runtime": 221.5269, + "eval_samples_per_second": 451.412, + "eval_steps_per_second": 14.107, + "step": 502500 + }, + { + "epoch": 4.72, + "learning_rate": 2.79743248062161e-06, + "loss": 1.1925, + "step": 503000 + }, + { + "epoch": 4.72, + "eval_loss": 1.0178605318069458, + "eval_runtime": 221.4578, + "eval_samples_per_second": 451.553, + "eval_steps_per_second": 14.111, + "step": 503000 + }, + { + "epoch": 4.72, + "learning_rate": 2.750511439349862e-06, + "loss": 1.1799, + "step": 503500 + }, + { + "epoch": 4.72, + "eval_loss": 1.0173200368881226, + "eval_runtime": 221.4615, + "eval_samples_per_second": 451.546, + "eval_steps_per_second": 14.111, + "step": 503500 + }, + { + "epoch": 4.73, + "learning_rate": 2.703590398078114e-06, + "loss": 1.1773, + "step": 504000 + }, + { + "epoch": 4.73, + "eval_loss": 1.0217922925949097, + "eval_runtime": 222.476, + "eval_samples_per_second": 449.487, + "eval_steps_per_second": 14.046, + "step": 504000 + }, + { + "epoch": 4.73, + "learning_rate": 2.656669356806366e-06, + "loss": 1.1675, + "step": 504500 + }, + { + "epoch": 4.73, + "eval_loss": 1.0150542259216309, + "eval_runtime": 242.0486, + "eval_samples_per_second": 413.14, + "eval_steps_per_second": 12.911, + "step": 504500 + }, + { + "epoch": 4.74, + "learning_rate": 2.6097483155346186e-06, + "loss": 1.1813, + "step": 505000 + }, + { + "epoch": 4.74, + "eval_loss": 1.0178701877593994, + "eval_runtime": 242.0285, + "eval_samples_per_second": 413.175, + "eval_steps_per_second": 12.912, + "step": 505000 + }, + { + "epoch": 4.74, + "learning_rate": 2.5628272742628706e-06, + "loss": 1.1726, + "step": 505500 + }, + { + "epoch": 4.74, + "eval_loss": 1.0183017253875732, + "eval_runtime": 242.2866, + "eval_samples_per_second": 412.734, + "eval_steps_per_second": 12.898, + "step": 505500 + }, + { + "epoch": 4.75, + "learning_rate": 2.5159062329911226e-06, + "loss": 1.1944, + "step": 506000 + }, + { + "epoch": 4.75, + "eval_loss": 1.011798620223999, + "eval_runtime": 242.0251, + "eval_samples_per_second": 413.18, + "eval_steps_per_second": 12.912, + "step": 506000 + }, + { + "epoch": 4.75, + "learning_rate": 2.4689851917193747e-06, + "loss": 1.1972, + "step": 506500 + }, + { + "epoch": 4.75, + "eval_loss": 1.021371841430664, + "eval_runtime": 242.3781, + "eval_samples_per_second": 412.579, + "eval_steps_per_second": 12.893, + "step": 506500 + }, + { + "epoch": 4.76, + "learning_rate": 2.4220641504476267e-06, + "loss": 1.1835, + "step": 507000 + }, + { + "epoch": 4.76, + "eval_loss": 1.024190902709961, + "eval_runtime": 243.2084, + "eval_samples_per_second": 411.17, + "eval_steps_per_second": 12.849, + "step": 507000 + }, + { + "epoch": 4.76, + "learning_rate": 2.375143109175879e-06, + "loss": 1.1807, + "step": 507500 + }, + { + "epoch": 4.76, + "eval_loss": 1.0209712982177734, + "eval_runtime": 247.1697, + "eval_samples_per_second": 404.58, + "eval_steps_per_second": 12.643, + "step": 507500 + }, + { + "epoch": 4.77, + "learning_rate": 2.328222067904131e-06, + "loss": 1.1801, + "step": 508000 + }, + { + "epoch": 4.77, + "eval_loss": 1.0197240114212036, + "eval_runtime": 245.9329, + "eval_samples_per_second": 406.615, + "eval_steps_per_second": 12.707, + "step": 508000 + }, + { + "epoch": 4.77, + "learning_rate": 2.281301026632383e-06, + "loss": 1.1707, + "step": 508500 + }, + { + "epoch": 4.77, + "eval_loss": 1.017072319984436, + "eval_runtime": 246.0676, + "eval_samples_per_second": 406.392, + "eval_steps_per_second": 12.7, + "step": 508500 + }, + { + "epoch": 4.78, + "learning_rate": 2.234379985360635e-06, + "loss": 1.1831, + "step": 509000 + }, + { + "epoch": 4.78, + "eval_loss": 1.0189136266708374, + "eval_runtime": 241.7862, + "eval_samples_per_second": 413.589, + "eval_steps_per_second": 12.925, + "step": 509000 + }, + { + "epoch": 4.78, + "learning_rate": 2.1874589440888876e-06, + "loss": 1.182, + "step": 509500 + }, + { + "epoch": 4.78, + "eval_loss": 1.0172502994537354, + "eval_runtime": 233.7568, + "eval_samples_per_second": 427.795, + "eval_steps_per_second": 13.369, + "step": 509500 + }, + { + "epoch": 4.79, + "learning_rate": 2.1405379028171396e-06, + "loss": 1.1946, + "step": 510000 + }, + { + "epoch": 4.79, + "eval_loss": 1.0199698209762573, + "eval_runtime": 233.9329, + "eval_samples_per_second": 427.473, + "eval_steps_per_second": 13.359, + "step": 510000 + }, + { + "epoch": 4.79, + "learning_rate": 2.0936168615453916e-06, + "loss": 1.1738, + "step": 510500 + }, + { + "epoch": 4.79, + "eval_loss": 1.016062617301941, + "eval_runtime": 235.9837, + "eval_samples_per_second": 423.758, + "eval_steps_per_second": 13.242, + "step": 510500 + }, + { + "epoch": 4.8, + "learning_rate": 2.0466958202736436e-06, + "loss": 1.1735, + "step": 511000 + }, + { + "epoch": 4.8, + "eval_loss": 1.0233088731765747, + "eval_runtime": 236.8841, + "eval_samples_per_second": 422.147, + "eval_steps_per_second": 13.192, + "step": 511000 + }, + { + "epoch": 4.8, + "learning_rate": 1.9997747790018957e-06, + "loss": 1.1771, + "step": 511500 + }, + { + "epoch": 4.8, + "eval_loss": 1.0144411325454712, + "eval_runtime": 236.4685, + "eval_samples_per_second": 422.889, + "eval_steps_per_second": 13.215, + "step": 511500 + }, + { + "epoch": 4.8, + "learning_rate": 1.9528537377301477e-06, + "loss": 1.165, + "step": 512000 + }, + { + "epoch": 4.8, + "eval_loss": 1.0135992765426636, + "eval_runtime": 233.2323, + "eval_samples_per_second": 428.757, + "eval_steps_per_second": 13.399, + "step": 512000 + }, + { + "epoch": 4.81, + "learning_rate": 1.9059326964583999e-06, + "loss": 1.1783, + "step": 512500 + }, + { + "epoch": 4.81, + "eval_loss": 1.0253207683563232, + "eval_runtime": 231.2367, + "eval_samples_per_second": 432.457, + "eval_steps_per_second": 13.514, + "step": 512500 + }, + { + "epoch": 4.81, + "learning_rate": 1.859011655186652e-06, + "loss": 1.1783, + "step": 513000 + }, + { + "epoch": 4.81, + "eval_loss": 1.0126628875732422, + "eval_runtime": 233.9645, + "eval_samples_per_second": 427.415, + "eval_steps_per_second": 13.357, + "step": 513000 + }, + { + "epoch": 4.82, + "learning_rate": 1.812090613914904e-06, + "loss": 1.1824, + "step": 513500 + }, + { + "epoch": 4.82, + "eval_loss": 1.021931767463684, + "eval_runtime": 232.7488, + "eval_samples_per_second": 429.648, + "eval_steps_per_second": 13.426, + "step": 513500 + }, + { + "epoch": 4.82, + "learning_rate": 1.765169572643156e-06, + "loss": 1.1553, + "step": 514000 + }, + { + "epoch": 4.82, + "eval_loss": 1.015001654624939, + "eval_runtime": 232.0708, + "eval_samples_per_second": 430.903, + "eval_steps_per_second": 13.466, + "step": 514000 + }, + { + "epoch": 4.83, + "learning_rate": 1.7182485313714084e-06, + "loss": 1.1799, + "step": 514500 + }, + { + "epoch": 4.83, + "eval_loss": 1.0170458555221558, + "eval_runtime": 231.7543, + "eval_samples_per_second": 431.491, + "eval_steps_per_second": 13.484, + "step": 514500 + }, + { + "epoch": 4.83, + "learning_rate": 1.6713274900996604e-06, + "loss": 1.1958, + "step": 515000 + }, + { + "epoch": 4.83, + "eval_loss": 1.0164183378219604, + "eval_runtime": 230.7078, + "eval_samples_per_second": 433.449, + "eval_steps_per_second": 13.545, + "step": 515000 + }, + { + "epoch": 4.84, + "learning_rate": 1.6244064488279126e-06, + "loss": 1.1946, + "step": 515500 + }, + { + "epoch": 4.84, + "eval_loss": 1.016738772392273, + "eval_runtime": 231.8187, + "eval_samples_per_second": 431.372, + "eval_steps_per_second": 13.48, + "step": 515500 + }, + { + "epoch": 4.84, + "learning_rate": 1.5774854075561646e-06, + "loss": 1.1803, + "step": 516000 + }, + { + "epoch": 4.84, + "eval_loss": 1.0190505981445312, + "eval_runtime": 229.9801, + "eval_samples_per_second": 434.82, + "eval_steps_per_second": 13.588, + "step": 516000 + }, + { + "epoch": 4.85, + "learning_rate": 1.5305643662844166e-06, + "loss": 1.1723, + "step": 516500 + }, + { + "epoch": 4.85, + "eval_loss": 1.017173409461975, + "eval_runtime": 230.1856, + "eval_samples_per_second": 434.432, + "eval_steps_per_second": 13.576, + "step": 516500 + }, + { + "epoch": 4.85, + "learning_rate": 1.4836433250126687e-06, + "loss": 1.1627, + "step": 517000 + }, + { + "epoch": 4.85, + "eval_loss": 1.013454556465149, + "eval_runtime": 229.3387, + "eval_samples_per_second": 436.036, + "eval_steps_per_second": 13.626, + "step": 517000 + }, + { + "epoch": 4.86, + "learning_rate": 1.4367222837409209e-06, + "loss": 1.1677, + "step": 517500 + }, + { + "epoch": 4.86, + "eval_loss": 1.0167146921157837, + "eval_runtime": 229.9528, + "eval_samples_per_second": 434.872, + "eval_steps_per_second": 13.59, + "step": 517500 + }, + { + "epoch": 4.86, + "learning_rate": 1.389801242469173e-06, + "loss": 1.1923, + "step": 518000 + }, + { + "epoch": 4.86, + "eval_loss": 1.0137449502944946, + "eval_runtime": 233.5878, + "eval_samples_per_second": 428.105, + "eval_steps_per_second": 13.378, + "step": 518000 + }, + { + "epoch": 4.87, + "learning_rate": 1.3428802011974251e-06, + "loss": 1.1911, + "step": 518500 + }, + { + "epoch": 4.87, + "eval_loss": 1.0253547430038452, + "eval_runtime": 227.612, + "eval_samples_per_second": 439.344, + "eval_steps_per_second": 13.73, + "step": 518500 + }, + { + "epoch": 4.87, + "learning_rate": 1.2959591599256771e-06, + "loss": 1.1708, + "step": 519000 + }, + { + "epoch": 4.87, + "eval_loss": 1.018088936805725, + "eval_runtime": 229.1156, + "eval_samples_per_second": 436.461, + "eval_steps_per_second": 13.639, + "step": 519000 + }, + { + "epoch": 4.88, + "learning_rate": 1.2490381186539292e-06, + "loss": 1.1737, + "step": 519500 + }, + { + "epoch": 4.88, + "eval_loss": 1.0122038125991821, + "eval_runtime": 234.5523, + "eval_samples_per_second": 426.344, + "eval_steps_per_second": 13.323, + "step": 519500 + }, + { + "epoch": 4.88, + "learning_rate": 1.2021170773821814e-06, + "loss": 1.1872, + "step": 520000 + }, + { + "epoch": 4.88, + "eval_loss": 1.0127085447311401, + "eval_runtime": 229.3109, + "eval_samples_per_second": 436.089, + "eval_steps_per_second": 13.628, + "step": 520000 + }, + { + "epoch": 4.88, + "learning_rate": 1.1551960361104334e-06, + "loss": 1.1734, + "step": 520500 + }, + { + "epoch": 4.88, + "eval_loss": 1.0173479318618774, + "eval_runtime": 228.8989, + "eval_samples_per_second": 436.874, + "eval_steps_per_second": 13.652, + "step": 520500 + }, + { + "epoch": 4.89, + "learning_rate": 1.1082749948386854e-06, + "loss": 1.179, + "step": 521000 + }, + { + "epoch": 4.89, + "eval_loss": 1.0175164937973022, + "eval_runtime": 223.9589, + "eval_samples_per_second": 446.51, + "eval_steps_per_second": 13.953, + "step": 521000 + }, + { + "epoch": 4.89, + "learning_rate": 1.0613539535669376e-06, + "loss": 1.1778, + "step": 521500 + }, + { + "epoch": 4.89, + "eval_loss": 1.0166733264923096, + "eval_runtime": 229.4526, + "eval_samples_per_second": 435.82, + "eval_steps_per_second": 13.619, + "step": 521500 + }, + { + "epoch": 4.9, + "learning_rate": 1.0144329122951896e-06, + "loss": 1.1732, + "step": 522000 + }, + { + "epoch": 4.9, + "eval_loss": 1.0115753412246704, + "eval_runtime": 229.4827, + "eval_samples_per_second": 435.763, + "eval_steps_per_second": 13.618, + "step": 522000 + }, + { + "epoch": 4.9, + "learning_rate": 9.675118710234419e-07, + "loss": 1.1635, + "step": 522500 + }, + { + "epoch": 4.9, + "eval_loss": 1.0163120031356812, + "eval_runtime": 227.8263, + "eval_samples_per_second": 438.931, + "eval_steps_per_second": 13.717, + "step": 522500 + }, + { + "epoch": 4.91, + "learning_rate": 9.205908297516939e-07, + "loss": 1.1677, + "step": 523000 + }, + { + "epoch": 4.91, + "eval_loss": 1.0148788690567017, + "eval_runtime": 222.1269, + "eval_samples_per_second": 450.193, + "eval_steps_per_second": 14.069, + "step": 523000 + }, + { + "epoch": 4.91, + "learning_rate": 8.73669788479946e-07, + "loss": 1.1967, + "step": 523500 + }, + { + "epoch": 4.91, + "eval_loss": 1.010750412940979, + "eval_runtime": 221.9772, + "eval_samples_per_second": 450.497, + "eval_steps_per_second": 14.078, + "step": 523500 + }, + { + "epoch": 4.92, + "learning_rate": 8.26748747208198e-07, + "loss": 1.1772, + "step": 524000 + }, + { + "epoch": 4.92, + "eval_loss": 1.0138213634490967, + "eval_runtime": 221.8206, + "eval_samples_per_second": 450.815, + "eval_steps_per_second": 14.088, + "step": 524000 + }, + { + "epoch": 4.92, + "learning_rate": 7.798277059364501e-07, + "loss": 1.1817, + "step": 524500 + }, + { + "epoch": 4.92, + "eval_loss": 1.0085700750350952, + "eval_runtime": 221.8601, + "eval_samples_per_second": 450.735, + "eval_steps_per_second": 14.085, + "step": 524500 + }, + { + "epoch": 4.93, + "learning_rate": 7.329066646647023e-07, + "loss": 1.1755, + "step": 525000 + }, + { + "epoch": 4.93, + "eval_loss": 1.0190365314483643, + "eval_runtime": 221.8179, + "eval_samples_per_second": 450.82, + "eval_steps_per_second": 14.088, + "step": 525000 + }, + { + "epoch": 4.93, + "learning_rate": 6.859856233929544e-07, + "loss": 1.1866, + "step": 525500 + }, + { + "epoch": 4.93, + "eval_loss": 1.0140148401260376, + "eval_runtime": 221.8188, + "eval_samples_per_second": 450.818, + "eval_steps_per_second": 14.088, + "step": 525500 + }, + { + "epoch": 4.94, + "learning_rate": 6.390645821212064e-07, + "loss": 1.1818, + "step": 526000 + }, + { + "epoch": 4.94, + "eval_loss": 1.0161159038543701, + "eval_runtime": 221.8238, + "eval_samples_per_second": 450.808, + "eval_steps_per_second": 14.088, + "step": 526000 + }, + { + "epoch": 4.94, + "learning_rate": 5.921435408494586e-07, + "loss": 1.1818, + "step": 526500 + }, + { + "epoch": 4.94, + "eval_loss": 1.0107953548431396, + "eval_runtime": 221.8183, + "eval_samples_per_second": 450.819, + "eval_steps_per_second": 14.088, + "step": 526500 + }, + { + "epoch": 4.95, + "learning_rate": 5.452224995777106e-07, + "loss": 1.1613, + "step": 527000 + }, + { + "epoch": 4.95, + "eval_loss": 1.0149364471435547, + "eval_runtime": 221.7834, + "eval_samples_per_second": 450.89, + "eval_steps_per_second": 14.09, + "step": 527000 + }, + { + "epoch": 4.95, + "learning_rate": 4.983014583059628e-07, + "loss": 1.1645, + "step": 527500 + }, + { + "epoch": 4.95, + "eval_loss": 1.0124648809432983, + "eval_runtime": 221.8169, + "eval_samples_per_second": 450.822, + "eval_steps_per_second": 14.088, + "step": 527500 + }, + { + "epoch": 4.95, + "learning_rate": 4.513804170342148e-07, + "loss": 1.193, + "step": 528000 + }, + { + "epoch": 4.95, + "eval_loss": 1.009380578994751, + "eval_runtime": 221.7526, + "eval_samples_per_second": 450.953, + "eval_steps_per_second": 14.092, + "step": 528000 + }, + { + "epoch": 4.96, + "learning_rate": 4.04459375762467e-07, + "loss": 1.1537, + "step": 528500 + }, + { + "epoch": 4.96, + "eval_loss": 1.0124224424362183, + "eval_runtime": 221.8595, + "eval_samples_per_second": 450.736, + "eval_steps_per_second": 14.085, + "step": 528500 + }, + { + "epoch": 4.96, + "learning_rate": 3.57538334490719e-07, + "loss": 1.1691, + "step": 529000 + }, + { + "epoch": 4.96, + "eval_loss": 1.0114690065383911, + "eval_runtime": 222.0421, + "eval_samples_per_second": 450.365, + "eval_steps_per_second": 14.074, + "step": 529000 + }, + { + "epoch": 4.97, + "learning_rate": 3.1061729321897113e-07, + "loss": 1.1643, + "step": 529500 + }, + { + "epoch": 4.97, + "eval_loss": 1.0129092931747437, + "eval_runtime": 227.2564, + "eval_samples_per_second": 440.032, + "eval_steps_per_second": 13.751, + "step": 529500 + }, + { + "epoch": 4.97, + "learning_rate": 2.6369625194722325e-07, + "loss": 1.1592, + "step": 530000 + }, + { + "epoch": 4.97, + "eval_loss": 1.0124961137771606, + "eval_runtime": 228.0531, + "eval_samples_per_second": 438.494, + "eval_steps_per_second": 13.703, + "step": 530000 + } + ], + "max_steps": 532810, + "num_train_epochs": 5, + "total_flos": 5.5361381292640083e+17, + "trial_name": null, + "trial_params": null +}