diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12816 @@ +{ + "best_metric": 0.7345585823059082, + "best_model_checkpoint": "/home/khalid/Documents/github_rep/MyProjects/CodeBase/training/output/checkpoint-1000000", + "epoch": 1.0723469406083477, + "global_step": 1000000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3e-05, + "loss": 1.7484, + "step": 500 + }, + { + "epoch": 0.0, + "learning_rate": 2.9969969969969972e-05, + "loss": 1.6173, + "step": 1000 + }, + { + "epoch": 0.0, + "learning_rate": 2.9939939939939944e-05, + "loss": 1.5574, + "step": 1500 + }, + { + "epoch": 0.0, + "learning_rate": 2.9909909909909908e-05, + "loss": 1.5441, + "step": 2000 + }, + { + "epoch": 0.0, + "learning_rate": 2.987987987987988e-05, + "loss": 1.5047, + "step": 2500 + }, + { + "epoch": 0.0, + "learning_rate": 2.984984984984985e-05, + "loss": 1.4768, + "step": 3000 + }, + { + "epoch": 0.0, + "learning_rate": 2.9819819819819822e-05, + "loss": 1.4624, + "step": 3500 + }, + { + "epoch": 0.0, + "learning_rate": 2.978978978978979e-05, + "loss": 1.4313, + "step": 4000 + }, + { + "epoch": 0.0, + "learning_rate": 2.975975975975976e-05, + "loss": 1.4138, + "step": 4500 + }, + { + "epoch": 0.01, + "learning_rate": 2.972972972972973e-05, + "loss": 1.4181, + "step": 5000 + }, + { + "epoch": 0.01, + "learning_rate": 2.96996996996997e-05, + "loss": 1.3907, + "step": 5500 + }, + { + "epoch": 0.01, + "learning_rate": 2.9669669669669673e-05, + "loss": 1.377, + "step": 6000 + }, + { + "epoch": 0.01, + "learning_rate": 2.9639699699699702e-05, + "loss": 1.383, + "step": 6500 + }, + { + "epoch": 0.01, + "learning_rate": 2.960972972972973e-05, + "loss": 1.3738, + "step": 7000 + }, + { + "epoch": 0.01, + "learning_rate": 2.95796996996997e-05, + "loss": 1.3614, + "step": 7500 + }, + { + "epoch": 0.01, + "learning_rate": 2.954966966966967e-05, + "loss": 1.344, + "step": 8000 + }, + { + "epoch": 0.01, + "learning_rate": 2.9519639639639642e-05, + "loss": 1.3477, + "step": 8500 + }, + { + "epoch": 0.01, + "learning_rate": 2.948960960960961e-05, + "loss": 1.3436, + "step": 9000 + }, + { + "epoch": 0.01, + "learning_rate": 2.9459639639639642e-05, + "loss": 1.334, + "step": 9500 + }, + { + "epoch": 0.01, + "learning_rate": 2.942966966966967e-05, + "loss": 1.3165, + "step": 10000 + }, + { + "epoch": 0.01, + "eval_loss": 1.2129223346710205, + "eval_runtime": 577.4417, + "eval_samples_per_second": 173.178, + "eval_steps_per_second": 43.294, + "step": 10000 + }, + { + "epoch": 0.01, + "learning_rate": 2.9399639639639643e-05, + "loss": 1.3218, + "step": 10500 + }, + { + "epoch": 0.01, + "learning_rate": 2.9369609609609608e-05, + "loss": 1.3243, + "step": 11000 + }, + { + "epoch": 0.01, + "learning_rate": 2.933957957957958e-05, + "loss": 1.3016, + "step": 11500 + }, + { + "epoch": 0.01, + "learning_rate": 2.930954954954955e-05, + "loss": 1.3057, + "step": 12000 + }, + { + "epoch": 0.01, + "learning_rate": 2.927951951951952e-05, + "loss": 1.2973, + "step": 12500 + }, + { + "epoch": 0.01, + "learning_rate": 2.9249489489489493e-05, + "loss": 1.2911, + "step": 13000 + }, + { + "epoch": 0.01, + "learning_rate": 2.9219459459459458e-05, + "loss": 1.294, + "step": 13500 + }, + { + "epoch": 0.02, + "learning_rate": 2.918942942942943e-05, + "loss": 1.291, + "step": 14000 + }, + { + "epoch": 0.02, + "learning_rate": 2.91593993993994e-05, + "loss": 1.2678, + "step": 14500 + }, + { + "epoch": 0.02, + "learning_rate": 2.9129369369369372e-05, + "loss": 1.27, + "step": 15000 + }, + { + "epoch": 0.02, + "learning_rate": 2.909933933933934e-05, + "loss": 1.2418, + "step": 15500 + }, + { + "epoch": 0.02, + "learning_rate": 2.906936936936937e-05, + "loss": 1.2373, + "step": 16000 + }, + { + "epoch": 0.02, + "learning_rate": 2.903933933933934e-05, + "loss": 1.2567, + "step": 16500 + }, + { + "epoch": 0.02, + "learning_rate": 2.900930930930931e-05, + "loss": 1.2449, + "step": 17000 + }, + { + "epoch": 0.02, + "learning_rate": 2.897927927927928e-05, + "loss": 1.2554, + "step": 17500 + }, + { + "epoch": 0.02, + "learning_rate": 2.894924924924925e-05, + "loss": 1.2579, + "step": 18000 + }, + { + "epoch": 0.02, + "learning_rate": 2.891927927927928e-05, + "loss": 1.2351, + "step": 18500 + }, + { + "epoch": 0.02, + "learning_rate": 2.888924924924925e-05, + "loss": 1.2227, + "step": 19000 + }, + { + "epoch": 0.02, + "learning_rate": 2.885921921921922e-05, + "loss": 1.2436, + "step": 19500 + }, + { + "epoch": 0.02, + "learning_rate": 2.882924924924925e-05, + "loss": 1.2256, + "step": 20000 + }, + { + "epoch": 0.02, + "eval_loss": 1.1297707557678223, + "eval_runtime": 613.186, + "eval_samples_per_second": 163.083, + "eval_steps_per_second": 40.771, + "step": 20000 + }, + { + "epoch": 0.02, + "learning_rate": 2.879921921921922e-05, + "loss": 1.2092, + "step": 20500 + }, + { + "epoch": 0.02, + "learning_rate": 2.8769189189189192e-05, + "loss": 1.2248, + "step": 21000 + }, + { + "epoch": 0.02, + "learning_rate": 2.873915915915916e-05, + "loss": 1.2378, + "step": 21500 + }, + { + "epoch": 0.02, + "learning_rate": 2.8709129129129128e-05, + "loss": 1.2182, + "step": 22000 + }, + { + "epoch": 0.02, + "learning_rate": 2.8679159159159157e-05, + "loss": 1.2111, + "step": 22500 + }, + { + "epoch": 0.02, + "learning_rate": 2.864912912912913e-05, + "loss": 1.2088, + "step": 23000 + }, + { + "epoch": 0.03, + "learning_rate": 2.86190990990991e-05, + "loss": 1.2104, + "step": 23500 + }, + { + "epoch": 0.03, + "learning_rate": 2.858906906906907e-05, + "loss": 1.1984, + "step": 24000 + }, + { + "epoch": 0.03, + "learning_rate": 2.8559039039039043e-05, + "loss": 1.206, + "step": 24500 + }, + { + "epoch": 0.03, + "learning_rate": 2.8529009009009007e-05, + "loss": 1.1827, + "step": 25000 + }, + { + "epoch": 0.03, + "learning_rate": 2.849897897897898e-05, + "loss": 1.1953, + "step": 25500 + }, + { + "epoch": 0.03, + "learning_rate": 2.8469009009009008e-05, + "loss": 1.1964, + "step": 26000 + }, + { + "epoch": 0.03, + "learning_rate": 2.843903903903904e-05, + "loss": 1.2048, + "step": 26500 + }, + { + "epoch": 0.03, + "learning_rate": 2.840900900900901e-05, + "loss": 1.1847, + "step": 27000 + }, + { + "epoch": 0.03, + "learning_rate": 2.837897897897898e-05, + "loss": 1.1808, + "step": 27500 + }, + { + "epoch": 0.03, + "learning_rate": 2.834894894894895e-05, + "loss": 1.1685, + "step": 28000 + }, + { + "epoch": 0.03, + "learning_rate": 2.831891891891892e-05, + "loss": 1.1838, + "step": 28500 + }, + { + "epoch": 0.03, + "learning_rate": 2.828888888888889e-05, + "loss": 1.1876, + "step": 29000 + }, + { + "epoch": 0.03, + "learning_rate": 2.825885885885886e-05, + "loss": 1.1851, + "step": 29500 + }, + { + "epoch": 0.03, + "learning_rate": 2.822882882882883e-05, + "loss": 1.1683, + "step": 30000 + }, + { + "epoch": 0.03, + "eval_loss": 1.090136170387268, + "eval_runtime": 615.6857, + "eval_samples_per_second": 162.421, + "eval_steps_per_second": 40.605, + "step": 30000 + }, + { + "epoch": 0.03, + "learning_rate": 2.8198798798798798e-05, + "loss": 1.1725, + "step": 30500 + }, + { + "epoch": 0.03, + "learning_rate": 2.816876876876877e-05, + "loss": 1.1879, + "step": 31000 + }, + { + "epoch": 0.03, + "learning_rate": 2.813873873873874e-05, + "loss": 1.1635, + "step": 31500 + }, + { + "epoch": 0.03, + "learning_rate": 2.810870870870871e-05, + "loss": 1.1892, + "step": 32000 + }, + { + "epoch": 0.03, + "learning_rate": 2.807873873873874e-05, + "loss": 1.1675, + "step": 32500 + }, + { + "epoch": 0.04, + "learning_rate": 2.804870870870871e-05, + "loss": 1.1766, + "step": 33000 + }, + { + "epoch": 0.04, + "learning_rate": 2.8018678678678677e-05, + "loss": 1.1699, + "step": 33500 + }, + { + "epoch": 0.04, + "learning_rate": 2.798864864864865e-05, + "loss": 1.1573, + "step": 34000 + }, + { + "epoch": 0.04, + "learning_rate": 2.7958678678678678e-05, + "loss": 1.1695, + "step": 34500 + }, + { + "epoch": 0.04, + "learning_rate": 2.792864864864865e-05, + "loss": 1.1419, + "step": 35000 + }, + { + "epoch": 0.04, + "learning_rate": 2.789861861861862e-05, + "loss": 1.1523, + "step": 35500 + }, + { + "epoch": 0.04, + "learning_rate": 2.7868588588588592e-05, + "loss": 1.1544, + "step": 36000 + }, + { + "epoch": 0.04, + "learning_rate": 2.783861861861862e-05, + "loss": 1.1501, + "step": 36500 + }, + { + "epoch": 0.04, + "learning_rate": 2.780864864864865e-05, + "loss": 1.1583, + "step": 37000 + }, + { + "epoch": 0.04, + "learning_rate": 2.777861861861862e-05, + "loss": 1.1455, + "step": 37500 + }, + { + "epoch": 0.04, + "learning_rate": 2.774858858858859e-05, + "loss": 1.1276, + "step": 38000 + }, + { + "epoch": 0.04, + "learning_rate": 2.7718558558558558e-05, + "loss": 1.139, + "step": 38500 + }, + { + "epoch": 0.04, + "learning_rate": 2.768852852852853e-05, + "loss": 1.1363, + "step": 39000 + }, + { + "epoch": 0.04, + "learning_rate": 2.7658498498498497e-05, + "loss": 1.1287, + "step": 39500 + }, + { + "epoch": 0.04, + "learning_rate": 2.762846846846847e-05, + "loss": 1.1429, + "step": 40000 + }, + { + "epoch": 0.04, + "eval_loss": 1.0513352155685425, + "eval_runtime": 613.186, + "eval_samples_per_second": 163.083, + "eval_steps_per_second": 40.771, + "step": 40000 + }, + { + "epoch": 0.04, + "learning_rate": 2.759843843843844e-05, + "loss": 1.1433, + "step": 40500 + }, + { + "epoch": 0.04, + "learning_rate": 2.756846846846847e-05, + "loss": 1.1378, + "step": 41000 + }, + { + "epoch": 0.04, + "learning_rate": 2.753843843843844e-05, + "loss": 1.1398, + "step": 41500 + }, + { + "epoch": 0.05, + "learning_rate": 2.750846846846847e-05, + "loss": 1.1185, + "step": 42000 + }, + { + "epoch": 0.05, + "learning_rate": 2.74784984984985e-05, + "loss": 1.1246, + "step": 42500 + }, + { + "epoch": 0.05, + "learning_rate": 2.744846846846847e-05, + "loss": 1.128, + "step": 43000 + }, + { + "epoch": 0.05, + "learning_rate": 2.741843843843844e-05, + "loss": 1.1205, + "step": 43500 + }, + { + "epoch": 0.05, + "learning_rate": 2.738840840840841e-05, + "loss": 1.1289, + "step": 44000 + }, + { + "epoch": 0.05, + "learning_rate": 2.7358378378378378e-05, + "loss": 1.1206, + "step": 44500 + }, + { + "epoch": 0.05, + "learning_rate": 2.732834834834835e-05, + "loss": 1.1348, + "step": 45000 + }, + { + "epoch": 0.05, + "learning_rate": 2.729831831831832e-05, + "loss": 1.1181, + "step": 45500 + }, + { + "epoch": 0.05, + "learning_rate": 2.726828828828829e-05, + "loss": 1.1186, + "step": 46000 + }, + { + "epoch": 0.05, + "learning_rate": 2.7238258258258257e-05, + "loss": 1.1241, + "step": 46500 + }, + { + "epoch": 0.05, + "learning_rate": 2.7208228228228228e-05, + "loss": 1.1175, + "step": 47000 + }, + { + "epoch": 0.05, + "learning_rate": 2.717825825825826e-05, + "loss": 1.1179, + "step": 47500 + }, + { + "epoch": 0.05, + "learning_rate": 2.714822822822823e-05, + "loss": 1.1185, + "step": 48000 + }, + { + "epoch": 0.05, + "learning_rate": 2.71181981981982e-05, + "loss": 1.1051, + "step": 48500 + }, + { + "epoch": 0.05, + "learning_rate": 2.7088168168168168e-05, + "loss": 1.1127, + "step": 49000 + }, + { + "epoch": 0.05, + "learning_rate": 2.705813813813814e-05, + "loss": 1.1079, + "step": 49500 + }, + { + "epoch": 0.05, + "learning_rate": 2.7028108108108107e-05, + "loss": 1.09, + "step": 50000 + }, + { + "epoch": 0.05, + "eval_loss": 1.0235236883163452, + "eval_runtime": 630.7694, + "eval_samples_per_second": 158.537, + "eval_steps_per_second": 39.634, + "step": 50000 + }, + { + "epoch": 0.05, + "learning_rate": 2.699807807807808e-05, + "loss": 1.1017, + "step": 50500 + }, + { + "epoch": 0.05, + "learning_rate": 2.6968048048048047e-05, + "loss": 1.1153, + "step": 51000 + }, + { + "epoch": 0.06, + "learning_rate": 2.693807807807808e-05, + "loss": 1.1244, + "step": 51500 + }, + { + "epoch": 0.06, + "learning_rate": 2.6908108108108112e-05, + "loss": 1.1129, + "step": 52000 + }, + { + "epoch": 0.06, + "learning_rate": 2.6878078078078077e-05, + "loss": 1.1032, + "step": 52500 + }, + { + "epoch": 0.06, + "learning_rate": 2.6848048048048048e-05, + "loss": 1.1024, + "step": 53000 + }, + { + "epoch": 0.06, + "learning_rate": 2.681801801801802e-05, + "loss": 1.1119, + "step": 53500 + }, + { + "epoch": 0.06, + "learning_rate": 2.678798798798799e-05, + "loss": 1.0847, + "step": 54000 + }, + { + "epoch": 0.06, + "learning_rate": 2.675795795795796e-05, + "loss": 1.1061, + "step": 54500 + }, + { + "epoch": 0.06, + "learning_rate": 2.6727927927927927e-05, + "loss": 1.0983, + "step": 55000 + }, + { + "epoch": 0.06, + "learning_rate": 2.6697897897897898e-05, + "loss": 1.0909, + "step": 55500 + }, + { + "epoch": 0.06, + "learning_rate": 2.666786786786787e-05, + "loss": 1.1001, + "step": 56000 + }, + { + "epoch": 0.06, + "learning_rate": 2.66378978978979e-05, + "loss": 1.0911, + "step": 56500 + }, + { + "epoch": 0.06, + "learning_rate": 2.660786786786787e-05, + "loss": 1.0993, + "step": 57000 + }, + { + "epoch": 0.06, + "learning_rate": 2.6577837837837838e-05, + "loss": 1.09, + "step": 57500 + }, + { + "epoch": 0.06, + "learning_rate": 2.654780780780781e-05, + "loss": 1.0906, + "step": 58000 + }, + { + "epoch": 0.06, + "learning_rate": 2.6517897897897897e-05, + "loss": 1.089, + "step": 58500 + }, + { + "epoch": 0.06, + "learning_rate": 2.6487867867867868e-05, + "loss": 1.097, + "step": 59000 + }, + { + "epoch": 0.06, + "learning_rate": 2.645783783783784e-05, + "loss": 1.0785, + "step": 59500 + }, + { + "epoch": 0.06, + "learning_rate": 2.642780780780781e-05, + "loss": 1.0867, + "step": 60000 + }, + { + "epoch": 0.06, + "eval_loss": 1.0042845010757446, + "eval_runtime": 612.5588, + "eval_samples_per_second": 163.25, + "eval_steps_per_second": 40.812, + "step": 60000 + }, + { + "epoch": 0.06, + "learning_rate": 2.639777777777778e-05, + "loss": 1.0828, + "step": 60500 + }, + { + "epoch": 0.07, + "learning_rate": 2.6367747747747747e-05, + "loss": 1.0736, + "step": 61000 + }, + { + "epoch": 0.07, + "learning_rate": 2.6337717717717718e-05, + "loss": 1.0702, + "step": 61500 + }, + { + "epoch": 0.07, + "learning_rate": 2.630768768768769e-05, + "loss": 1.0765, + "step": 62000 + }, + { + "epoch": 0.07, + "learning_rate": 2.627771771771772e-05, + "loss": 1.0801, + "step": 62500 + }, + { + "epoch": 0.07, + "learning_rate": 2.624768768768769e-05, + "loss": 1.0709, + "step": 63000 + }, + { + "epoch": 0.07, + "learning_rate": 2.621771771771772e-05, + "loss": 1.0813, + "step": 63500 + }, + { + "epoch": 0.07, + "learning_rate": 2.618768768768769e-05, + "loss": 1.072, + "step": 64000 + }, + { + "epoch": 0.07, + "learning_rate": 2.615765765765766e-05, + "loss": 1.0789, + "step": 64500 + }, + { + "epoch": 0.07, + "learning_rate": 2.6127627627627627e-05, + "loss": 1.0926, + "step": 65000 + }, + { + "epoch": 0.07, + "learning_rate": 2.6097597597597598e-05, + "loss": 1.0765, + "step": 65500 + }, + { + "epoch": 0.07, + "learning_rate": 2.606756756756757e-05, + "loss": 1.0673, + "step": 66000 + }, + { + "epoch": 0.07, + "learning_rate": 2.60375975975976e-05, + "loss": 1.075, + "step": 66500 + }, + { + "epoch": 0.07, + "learning_rate": 2.6007567567567567e-05, + "loss": 1.0678, + "step": 67000 + }, + { + "epoch": 0.07, + "learning_rate": 2.5977537537537538e-05, + "loss": 1.0645, + "step": 67500 + }, + { + "epoch": 0.07, + "learning_rate": 2.594750750750751e-05, + "loss": 1.0708, + "step": 68000 + }, + { + "epoch": 0.07, + "learning_rate": 2.591753753753754e-05, + "loss": 1.0746, + "step": 68500 + }, + { + "epoch": 0.07, + "learning_rate": 2.588750750750751e-05, + "loss": 1.0644, + "step": 69000 + }, + { + "epoch": 0.07, + "learning_rate": 2.5857477477477478e-05, + "loss": 1.0572, + "step": 69500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5827447447447446e-05, + "loss": 1.0777, + "step": 70000 + }, + { + "epoch": 0.08, + "eval_loss": 0.989311158657074, + "eval_runtime": 626.1693, + "eval_samples_per_second": 159.701, + "eval_steps_per_second": 39.925, + "step": 70000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5797477477477475e-05, + "loss": 1.0623, + "step": 70500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5767447447447447e-05, + "loss": 1.0469, + "step": 71000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5737417417417418e-05, + "loss": 1.0564, + "step": 71500 + }, + { + "epoch": 0.08, + "learning_rate": 2.570738738738739e-05, + "loss": 1.0434, + "step": 72000 + }, + { + "epoch": 0.08, + "learning_rate": 2.567735735735736e-05, + "loss": 1.0593, + "step": 72500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5647327327327325e-05, + "loss": 1.0587, + "step": 73000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5617297297297297e-05, + "loss": 1.0507, + "step": 73500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5587267267267268e-05, + "loss": 1.057, + "step": 74000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5557297297297297e-05, + "loss": 1.0607, + "step": 74500 + }, + { + "epoch": 0.08, + "learning_rate": 2.552726726726727e-05, + "loss": 1.0645, + "step": 75000 + }, + { + "epoch": 0.08, + "learning_rate": 2.549723723723724e-05, + "loss": 1.0603, + "step": 75500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5467207207207208e-05, + "loss": 1.0407, + "step": 76000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5437177177177176e-05, + "loss": 1.0573, + "step": 76500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5407147147147148e-05, + "loss": 1.0331, + "step": 77000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5377177177177177e-05, + "loss": 1.0456, + "step": 77500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5347147147147148e-05, + "loss": 1.0586, + "step": 78000 + }, + { + "epoch": 0.08, + "learning_rate": 2.5317117117117116e-05, + "loss": 1.0591, + "step": 78500 + }, + { + "epoch": 0.08, + "learning_rate": 2.5287087087087088e-05, + "loss": 1.0519, + "step": 79000 + }, + { + "epoch": 0.09, + "learning_rate": 2.525705705705706e-05, + "loss": 1.0549, + "step": 79500 + }, + { + "epoch": 0.09, + "learning_rate": 2.5227027027027027e-05, + "loss": 1.0588, + "step": 80000 + }, + { + "epoch": 0.09, + "eval_loss": 0.9711708426475525, + "eval_runtime": 621.1495, + "eval_samples_per_second": 160.992, + "eval_steps_per_second": 40.248, + "step": 80000 + }, + { + "epoch": 0.09, + "learning_rate": 2.5196996996997e-05, + "loss": 1.0534, + "step": 80500 + }, + { + "epoch": 0.09, + "learning_rate": 2.5166966966966966e-05, + "loss": 1.0529, + "step": 81000 + }, + { + "epoch": 0.09, + "learning_rate": 2.5136996996996996e-05, + "loss": 1.0605, + "step": 81500 + }, + { + "epoch": 0.09, + "learning_rate": 2.5106966966966967e-05, + "loss": 1.0639, + "step": 82000 + }, + { + "epoch": 0.09, + "learning_rate": 2.507693693693694e-05, + "loss": 1.0455, + "step": 82500 + }, + { + "epoch": 0.09, + "learning_rate": 2.504690690690691e-05, + "loss": 1.042, + "step": 83000 + }, + { + "epoch": 0.09, + "learning_rate": 2.501693693693694e-05, + "loss": 1.0267, + "step": 83500 + }, + { + "epoch": 0.09, + "learning_rate": 2.498690690690691e-05, + "loss": 1.0534, + "step": 84000 + }, + { + "epoch": 0.09, + "learning_rate": 2.4956876876876875e-05, + "loss": 1.0345, + "step": 84500 + }, + { + "epoch": 0.09, + "learning_rate": 2.4926906906906908e-05, + "loss": 1.0255, + "step": 85000 + }, + { + "epoch": 0.09, + "learning_rate": 2.4896876876876876e-05, + "loss": 1.0277, + "step": 85500 + }, + { + "epoch": 0.09, + "learning_rate": 2.4866846846846847e-05, + "loss": 1.0501, + "step": 86000 + }, + { + "epoch": 0.09, + "learning_rate": 2.483681681681682e-05, + "loss": 1.0417, + "step": 86500 + }, + { + "epoch": 0.09, + "learning_rate": 2.4806846846846848e-05, + "loss": 1.0324, + "step": 87000 + }, + { + "epoch": 0.09, + "learning_rate": 2.477681681681682e-05, + "loss": 1.0552, + "step": 87500 + }, + { + "epoch": 0.09, + "learning_rate": 2.4746786786786787e-05, + "loss": 1.038, + "step": 88000 + }, + { + "epoch": 0.09, + "learning_rate": 2.471675675675676e-05, + "loss": 1.0434, + "step": 88500 + }, + { + "epoch": 0.1, + "learning_rate": 2.4686726726726726e-05, + "loss": 1.0349, + "step": 89000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4656696696696698e-05, + "loss": 1.0166, + "step": 89500 + }, + { + "epoch": 0.1, + "learning_rate": 2.4626666666666666e-05, + "loss": 1.0389, + "step": 90000 + }, + { + "epoch": 0.1, + "eval_loss": 0.9617297649383545, + "eval_runtime": 593.47, + "eval_samples_per_second": 168.501, + "eval_steps_per_second": 42.125, + "step": 90000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4596636636636637e-05, + "loss": 1.0382, + "step": 90500 + }, + { + "epoch": 0.1, + "learning_rate": 2.456660660660661e-05, + "loss": 1.0389, + "step": 91000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4536576576576577e-05, + "loss": 1.0397, + "step": 91500 + }, + { + "epoch": 0.1, + "learning_rate": 2.4506546546546548e-05, + "loss": 1.028, + "step": 92000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4476516516516516e-05, + "loss": 1.0317, + "step": 92500 + }, + { + "epoch": 0.1, + "learning_rate": 2.4446546546546545e-05, + "loss": 1.0281, + "step": 93000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4416516516516517e-05, + "loss": 1.0385, + "step": 93500 + }, + { + "epoch": 0.1, + "learning_rate": 2.4386486486486488e-05, + "loss": 1.0183, + "step": 94000 + }, + { + "epoch": 0.1, + "learning_rate": 2.435645645645646e-05, + "loss": 1.0204, + "step": 94500 + }, + { + "epoch": 0.1, + "learning_rate": 2.432648648648649e-05, + "loss": 1.044, + "step": 95000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4296516516516518e-05, + "loss": 1.0402, + "step": 95500 + }, + { + "epoch": 0.1, + "learning_rate": 2.426648648648649e-05, + "loss": 1.0346, + "step": 96000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4236456456456457e-05, + "loss": 1.0156, + "step": 96500 + }, + { + "epoch": 0.1, + "learning_rate": 2.4206426426426425e-05, + "loss": 1.0054, + "step": 97000 + }, + { + "epoch": 0.1, + "learning_rate": 2.4176396396396397e-05, + "loss": 1.0276, + "step": 97500 + }, + { + "epoch": 0.11, + "learning_rate": 2.4146366366366368e-05, + "loss": 1.022, + "step": 98000 + }, + { + "epoch": 0.11, + "learning_rate": 2.4116336336336336e-05, + "loss": 1.0351, + "step": 98500 + }, + { + "epoch": 0.11, + "learning_rate": 2.4086306306306307e-05, + "loss": 1.0306, + "step": 99000 + }, + { + "epoch": 0.11, + "learning_rate": 2.4056276276276275e-05, + "loss": 1.0183, + "step": 99500 + }, + { + "epoch": 0.11, + "learning_rate": 2.4026246246246247e-05, + "loss": 1.0245, + "step": 100000 + }, + { + "epoch": 0.11, + "eval_loss": 0.9445381760597229, + "eval_runtime": 618.5347, + "eval_samples_per_second": 161.672, + "eval_steps_per_second": 40.418, + "step": 100000 + }, + { + "epoch": 0.11, + "learning_rate": 2.3996216216216218e-05, + "loss": 1.0274, + "step": 100500 + }, + { + "epoch": 0.11, + "learning_rate": 2.3966186186186186e-05, + "loss": 1.007, + "step": 101000 + }, + { + "epoch": 0.11, + "learning_rate": 2.3936276276276276e-05, + "loss": 1.021, + "step": 101500 + }, + { + "epoch": 0.11, + "learning_rate": 2.3906246246246245e-05, + "loss": 1.0176, + "step": 102000 + }, + { + "epoch": 0.11, + "learning_rate": 2.3876216216216216e-05, + "loss": 1.0279, + "step": 102500 + }, + { + "epoch": 0.11, + "learning_rate": 2.3846186186186187e-05, + "loss": 1.0051, + "step": 103000 + }, + { + "epoch": 0.11, + "learning_rate": 2.3816216216216216e-05, + "loss": 1.0184, + "step": 103500 + }, + { + "epoch": 0.11, + "learning_rate": 2.3786186186186188e-05, + "loss": 1.0125, + "step": 104000 + }, + { + "epoch": 0.11, + "learning_rate": 2.375615615615616e-05, + "loss": 1.0202, + "step": 104500 + }, + { + "epoch": 0.11, + "learning_rate": 2.3726126126126124e-05, + "loss": 1.0255, + "step": 105000 + }, + { + "epoch": 0.11, + "learning_rate": 2.3696096096096095e-05, + "loss": 0.9965, + "step": 105500 + }, + { + "epoch": 0.11, + "learning_rate": 2.3666066066066067e-05, + "loss": 1.0157, + "step": 106000 + }, + { + "epoch": 0.11, + "learning_rate": 2.3636036036036038e-05, + "loss": 1.0168, + "step": 106500 + }, + { + "epoch": 0.11, + "learning_rate": 2.360600600600601e-05, + "loss": 1.012, + "step": 107000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3575975975975974e-05, + "loss": 1.0102, + "step": 107500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3545945945945945e-05, + "loss": 1.0161, + "step": 108000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3515915915915917e-05, + "loss": 0.9957, + "step": 108500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3485945945945946e-05, + "loss": 1.0009, + "step": 109000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3455915915915917e-05, + "loss": 1.0096, + "step": 109500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3425885885885885e-05, + "loss": 1.0146, + "step": 110000 + }, + { + "epoch": 0.12, + "eval_loss": 0.934283435344696, + "eval_runtime": 601.7192, + "eval_samples_per_second": 166.19, + "eval_steps_per_second": 41.548, + "step": 110000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3395975975975976e-05, + "loss": 1.0183, + "step": 110500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3365945945945947e-05, + "loss": 1.0182, + "step": 111000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3335915915915915e-05, + "loss": 1.0091, + "step": 111500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3305885885885887e-05, + "loss": 1.0111, + "step": 112000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3275855855855858e-05, + "loss": 1.0015, + "step": 112500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3245825825825826e-05, + "loss": 1.0082, + "step": 113000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3215795795795794e-05, + "loss": 0.9918, + "step": 113500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3185765765765765e-05, + "loss": 1.0009, + "step": 114000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3155735735735737e-05, + "loss": 1.0195, + "step": 114500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3125705705705708e-05, + "loss": 1.0189, + "step": 115000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3095675675675676e-05, + "loss": 1.0038, + "step": 115500 + }, + { + "epoch": 0.12, + "learning_rate": 2.3065645645645644e-05, + "loss": 0.9963, + "step": 116000 + }, + { + "epoch": 0.12, + "learning_rate": 2.3035615615615616e-05, + "loss": 1.0125, + "step": 116500 + }, + { + "epoch": 0.13, + "learning_rate": 2.3005585585585587e-05, + "loss": 0.988, + "step": 117000 + }, + { + "epoch": 0.13, + "learning_rate": 2.297555555555556e-05, + "loss": 0.9997, + "step": 117500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2945525525525523e-05, + "loss": 0.9983, + "step": 118000 + }, + { + "epoch": 0.13, + "learning_rate": 2.291555555555556e-05, + "loss": 1.0094, + "step": 118500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2885585585585588e-05, + "loss": 0.9988, + "step": 119000 + }, + { + "epoch": 0.13, + "learning_rate": 2.2855555555555556e-05, + "loss": 0.9928, + "step": 119500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2825525525525524e-05, + "loss": 0.9951, + "step": 120000 + }, + { + "epoch": 0.13, + "eval_loss": 0.9260998964309692, + "eval_runtime": 605.8053, + "eval_samples_per_second": 165.07, + "eval_steps_per_second": 41.267, + "step": 120000 + }, + { + "epoch": 0.13, + "learning_rate": 2.2795495495495496e-05, + "loss": 0.9923, + "step": 120500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2765465465465467e-05, + "loss": 1.0081, + "step": 121000 + }, + { + "epoch": 0.13, + "learning_rate": 2.2735495495495496e-05, + "loss": 1.0009, + "step": 121500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2705465465465464e-05, + "loss": 0.9929, + "step": 122000 + }, + { + "epoch": 0.13, + "learning_rate": 2.2675435435435436e-05, + "loss": 0.9956, + "step": 122500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2645405405405407e-05, + "loss": 1.0, + "step": 123000 + }, + { + "epoch": 0.13, + "learning_rate": 2.2615435435435436e-05, + "loss": 0.9927, + "step": 123500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2585405405405408e-05, + "loss": 0.9882, + "step": 124000 + }, + { + "epoch": 0.13, + "learning_rate": 2.2555375375375376e-05, + "loss": 0.9915, + "step": 124500 + }, + { + "epoch": 0.13, + "learning_rate": 2.2525345345345344e-05, + "loss": 0.9897, + "step": 125000 + }, + { + "epoch": 0.13, + "learning_rate": 2.2495315315315315e-05, + "loss": 1.0072, + "step": 125500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2465285285285286e-05, + "loss": 0.9896, + "step": 126000 + }, + { + "epoch": 0.14, + "learning_rate": 2.2435315315315316e-05, + "loss": 0.9879, + "step": 126500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2405285285285287e-05, + "loss": 0.9952, + "step": 127000 + }, + { + "epoch": 0.14, + "learning_rate": 2.237525525525526e-05, + "loss": 0.9748, + "step": 127500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2345225225225223e-05, + "loss": 0.9967, + "step": 128000 + }, + { + "epoch": 0.14, + "learning_rate": 2.2315195195195194e-05, + "loss": 1.0016, + "step": 128500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2285165165165166e-05, + "loss": 0.9801, + "step": 129000 + }, + { + "epoch": 0.14, + "learning_rate": 2.2255135135135137e-05, + "loss": 0.9876, + "step": 129500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2225165165165166e-05, + "loss": 0.978, + "step": 130000 + }, + { + "epoch": 0.14, + "eval_loss": 0.9173310399055481, + "eval_runtime": 602.5766, + "eval_samples_per_second": 165.954, + "eval_steps_per_second": 41.489, + "step": 130000 + }, + { + "epoch": 0.14, + "learning_rate": 2.2195135135135138e-05, + "loss": 0.9867, + "step": 130500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2165105105105106e-05, + "loss": 0.9824, + "step": 131000 + }, + { + "epoch": 0.14, + "learning_rate": 2.2135075075075074e-05, + "loss": 0.9877, + "step": 131500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2105045045045045e-05, + "loss": 0.9828, + "step": 132000 + }, + { + "epoch": 0.14, + "learning_rate": 2.2075015015015017e-05, + "loss": 0.9772, + "step": 132500 + }, + { + "epoch": 0.14, + "learning_rate": 2.2044984984984985e-05, + "loss": 0.9906, + "step": 133000 + }, + { + "epoch": 0.14, + "learning_rate": 2.2015015015015014e-05, + "loss": 0.9847, + "step": 133500 + }, + { + "epoch": 0.14, + "learning_rate": 2.1984984984984985e-05, + "loss": 0.9826, + "step": 134000 + }, + { + "epoch": 0.14, + "learning_rate": 2.195507507507508e-05, + "loss": 0.9903, + "step": 134500 + }, + { + "epoch": 0.14, + "learning_rate": 2.1925045045045043e-05, + "loss": 0.9907, + "step": 135000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1895015015015015e-05, + "loss": 0.9909, + "step": 135500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1864984984984986e-05, + "loss": 0.9906, + "step": 136000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1834954954954958e-05, + "loss": 0.9933, + "step": 136500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1804924924924922e-05, + "loss": 0.9754, + "step": 137000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1774894894894894e-05, + "loss": 0.9774, + "step": 137500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1744864864864865e-05, + "loss": 0.9889, + "step": 138000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1714834834834836e-05, + "loss": 0.9972, + "step": 138500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1684804804804808e-05, + "loss": 0.9852, + "step": 139000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1654834834834837e-05, + "loss": 0.9615, + "step": 139500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1624804804804805e-05, + "loss": 0.9865, + "step": 140000 + }, + { + "epoch": 0.15, + "eval_loss": 0.909109354019165, + "eval_runtime": 589.6128, + "eval_samples_per_second": 169.603, + "eval_steps_per_second": 42.401, + "step": 140000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1594774774774773e-05, + "loss": 0.9732, + "step": 140500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1564744744744744e-05, + "loss": 0.9657, + "step": 141000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1534714714714716e-05, + "loss": 0.9651, + "step": 141500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1504684684684687e-05, + "loss": 0.9913, + "step": 142000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1474654654654655e-05, + "loss": 0.9685, + "step": 142500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1444624624624623e-05, + "loss": 0.9663, + "step": 143000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1414594594594595e-05, + "loss": 0.9743, + "step": 143500 + }, + { + "epoch": 0.15, + "learning_rate": 2.1384564564564566e-05, + "loss": 0.9723, + "step": 144000 + }, + { + "epoch": 0.15, + "learning_rate": 2.1354534534534534e-05, + "loss": 0.9815, + "step": 144500 + }, + { + "epoch": 0.16, + "learning_rate": 2.1324564564564563e-05, + "loss": 0.9714, + "step": 145000 + }, + { + "epoch": 0.16, + "learning_rate": 2.1294534534534535e-05, + "loss": 0.9734, + "step": 145500 + }, + { + "epoch": 0.16, + "learning_rate": 2.1264504504504506e-05, + "loss": 0.9679, + "step": 146000 + }, + { + "epoch": 0.16, + "learning_rate": 2.1234474474474474e-05, + "loss": 0.9697, + "step": 146500 + }, + { + "epoch": 0.16, + "learning_rate": 2.1204444444444445e-05, + "loss": 0.9941, + "step": 147000 + }, + { + "epoch": 0.16, + "learning_rate": 2.1174474474474475e-05, + "loss": 0.9786, + "step": 147500 + }, + { + "epoch": 0.16, + "learning_rate": 2.1144444444444443e-05, + "loss": 0.97, + "step": 148000 + }, + { + "epoch": 0.16, + "learning_rate": 2.1114414414414414e-05, + "loss": 0.9672, + "step": 148500 + }, + { + "epoch": 0.16, + "learning_rate": 2.1084384384384385e-05, + "loss": 0.9661, + "step": 149000 + }, + { + "epoch": 0.16, + "learning_rate": 2.1054354354354357e-05, + "loss": 0.9867, + "step": 149500 + }, + { + "epoch": 0.16, + "learning_rate": 2.1024324324324325e-05, + "loss": 0.972, + "step": 150000 + }, + { + "epoch": 0.16, + "eval_loss": 0.8991417288780212, + "eval_runtime": 602.234, + "eval_samples_per_second": 166.048, + "eval_steps_per_second": 41.512, + "step": 150000 + }, + { + "epoch": 0.16, + "learning_rate": 2.0994294294294293e-05, + "loss": 0.9681, + "step": 150500 + }, + { + "epoch": 0.16, + "learning_rate": 2.0964264264264264e-05, + "loss": 0.9601, + "step": 151000 + }, + { + "epoch": 0.16, + "learning_rate": 2.0934294294294293e-05, + "loss": 0.955, + "step": 151500 + }, + { + "epoch": 0.16, + "learning_rate": 2.0904264264264265e-05, + "loss": 0.9703, + "step": 152000 + }, + { + "epoch": 0.16, + "learning_rate": 2.0874234234234236e-05, + "loss": 0.9535, + "step": 152500 + }, + { + "epoch": 0.16, + "learning_rate": 2.0844204204204204e-05, + "loss": 0.968, + "step": 153000 + }, + { + "epoch": 0.16, + "learning_rate": 2.0814174174174172e-05, + "loss": 0.9737, + "step": 153500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0784204204204205e-05, + "loss": 0.9714, + "step": 154000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0754174174174173e-05, + "loss": 0.9737, + "step": 154500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0724144144144144e-05, + "loss": 0.955, + "step": 155000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0694114114114116e-05, + "loss": 0.9677, + "step": 155500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0664084084084084e-05, + "loss": 0.964, + "step": 156000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0634054054054055e-05, + "loss": 0.9675, + "step": 156500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0604024024024023e-05, + "loss": 0.9705, + "step": 157000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0573993993993994e-05, + "loss": 0.9634, + "step": 157500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0544024024024024e-05, + "loss": 0.9583, + "step": 158000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0513993993993995e-05, + "loss": 0.9614, + "step": 158500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0483963963963963e-05, + "loss": 0.9707, + "step": 159000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0453933933933934e-05, + "loss": 0.95, + "step": 159500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0423903903903906e-05, + "loss": 0.9673, + "step": 160000 + }, + { + "epoch": 0.17, + "eval_loss": 0.8937882781028748, + "eval_runtime": 585.5551, + "eval_samples_per_second": 170.778, + "eval_steps_per_second": 42.695, + "step": 160000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0393873873873874e-05, + "loss": 0.9509, + "step": 160500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0363843843843842e-05, + "loss": 0.9537, + "step": 161000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0333813813813813e-05, + "loss": 0.9528, + "step": 161500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0303903903903907e-05, + "loss": 0.958, + "step": 162000 + }, + { + "epoch": 0.17, + "learning_rate": 2.0273873873873875e-05, + "loss": 0.9773, + "step": 162500 + }, + { + "epoch": 0.17, + "learning_rate": 2.0243843843843843e-05, + "loss": 0.963, + "step": 163000 + }, + { + "epoch": 0.18, + "learning_rate": 2.0213813813813814e-05, + "loss": 0.9617, + "step": 163500 + }, + { + "epoch": 0.18, + "learning_rate": 2.0183843843843844e-05, + "loss": 0.9572, + "step": 164000 + }, + { + "epoch": 0.18, + "learning_rate": 2.0153813813813815e-05, + "loss": 0.9535, + "step": 164500 + }, + { + "epoch": 0.18, + "learning_rate": 2.0123783783783783e-05, + "loss": 0.9599, + "step": 165000 + }, + { + "epoch": 0.18, + "learning_rate": 2.0093813813813816e-05, + "loss": 0.9612, + "step": 165500 + }, + { + "epoch": 0.18, + "learning_rate": 2.0063783783783784e-05, + "loss": 0.9474, + "step": 166000 + }, + { + "epoch": 0.18, + "learning_rate": 2.0033753753753755e-05, + "loss": 0.9534, + "step": 166500 + }, + { + "epoch": 0.18, + "learning_rate": 2.0003723723723726e-05, + "loss": 0.9509, + "step": 167000 + }, + { + "epoch": 0.18, + "learning_rate": 1.9973693693693694e-05, + "loss": 0.955, + "step": 167500 + }, + { + "epoch": 0.18, + "learning_rate": 1.9943663663663662e-05, + "loss": 0.9719, + "step": 168000 + }, + { + "epoch": 0.18, + "learning_rate": 1.9913633633633634e-05, + "loss": 0.9597, + "step": 168500 + }, + { + "epoch": 0.18, + "learning_rate": 1.9883603603603605e-05, + "loss": 0.9484, + "step": 169000 + }, + { + "epoch": 0.18, + "learning_rate": 1.9853573573573577e-05, + "loss": 0.9539, + "step": 169500 + }, + { + "epoch": 0.18, + "learning_rate": 1.9823543543543545e-05, + "loss": 0.9493, + "step": 170000 + }, + { + "epoch": 0.18, + "eval_loss": 0.8906692266464233, + "eval_runtime": 620.1154, + "eval_samples_per_second": 161.26, + "eval_steps_per_second": 40.315, + "step": 170000 + }, + { + "epoch": 0.18, + "learning_rate": 1.9793513513513513e-05, + "loss": 0.9523, + "step": 170500 + }, + { + "epoch": 0.18, + "learning_rate": 1.9763483483483484e-05, + "loss": 0.9462, + "step": 171000 + }, + { + "epoch": 0.18, + "learning_rate": 1.9733453453453455e-05, + "loss": 0.9438, + "step": 171500 + }, + { + "epoch": 0.18, + "learning_rate": 1.9703423423423423e-05, + "loss": 0.9407, + "step": 172000 + }, + { + "epoch": 0.18, + "learning_rate": 1.967339339339339e-05, + "loss": 0.9508, + "step": 172500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9643363363363363e-05, + "loss": 0.9544, + "step": 173000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9613393393393392e-05, + "loss": 0.9576, + "step": 173500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9583363363363363e-05, + "loss": 0.9528, + "step": 174000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9553333333333335e-05, + "loss": 0.9481, + "step": 174500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9523363363363364e-05, + "loss": 0.9393, + "step": 175000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9493333333333335e-05, + "loss": 0.9377, + "step": 175500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9463303303303303e-05, + "loss": 0.9342, + "step": 176000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9433273273273275e-05, + "loss": 0.9309, + "step": 176500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9403243243243243e-05, + "loss": 0.9607, + "step": 177000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9373213213213214e-05, + "loss": 0.9508, + "step": 177500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9343183183183186e-05, + "loss": 0.9361, + "step": 178000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9313153153153154e-05, + "loss": 0.9312, + "step": 178500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9283183183183183e-05, + "loss": 0.9503, + "step": 179000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9253153153153154e-05, + "loss": 0.9509, + "step": 179500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9223183183183183e-05, + "loss": 0.9311, + "step": 180000 + }, + { + "epoch": 0.19, + "eval_loss": 0.8821685910224915, + "eval_runtime": 603.0834, + "eval_samples_per_second": 165.815, + "eval_steps_per_second": 41.454, + "step": 180000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9193153153153155e-05, + "loss": 0.95, + "step": 180500 + }, + { + "epoch": 0.19, + "learning_rate": 1.9163123123123126e-05, + "loss": 0.9439, + "step": 181000 + }, + { + "epoch": 0.19, + "learning_rate": 1.9133093093093094e-05, + "loss": 0.9397, + "step": 181500 + }, + { + "epoch": 0.2, + "learning_rate": 1.9103063063063062e-05, + "loss": 0.9353, + "step": 182000 + }, + { + "epoch": 0.2, + "learning_rate": 1.9073033033033033e-05, + "loss": 0.9464, + "step": 182500 + }, + { + "epoch": 0.2, + "learning_rate": 1.9043003003003005e-05, + "loss": 0.9346, + "step": 183000 + }, + { + "epoch": 0.2, + "learning_rate": 1.9012972972972976e-05, + "loss": 0.9596, + "step": 183500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8983003003003005e-05, + "loss": 0.9653, + "step": 184000 + }, + { + "epoch": 0.2, + "learning_rate": 1.8952972972972973e-05, + "loss": 0.9459, + "step": 184500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8923003003003006e-05, + "loss": 0.9249, + "step": 185000 + }, + { + "epoch": 0.2, + "learning_rate": 1.8892972972972974e-05, + "loss": 0.9413, + "step": 185500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8862942942942942e-05, + "loss": 0.943, + "step": 186000 + }, + { + "epoch": 0.2, + "learning_rate": 1.8832912912912913e-05, + "loss": 0.947, + "step": 186500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8802882882882885e-05, + "loss": 0.9567, + "step": 187000 + }, + { + "epoch": 0.2, + "learning_rate": 1.8772912912912914e-05, + "loss": 0.9314, + "step": 187500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8742882882882882e-05, + "loss": 0.9512, + "step": 188000 + }, + { + "epoch": 0.2, + "learning_rate": 1.8712852852852853e-05, + "loss": 0.9505, + "step": 188500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8682822822822825e-05, + "loss": 0.9338, + "step": 189000 + }, + { + "epoch": 0.2, + "learning_rate": 1.8652792792792793e-05, + "loss": 0.9395, + "step": 189500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8622762762762764e-05, + "loss": 0.9462, + "step": 190000 + }, + { + "epoch": 0.2, + "eval_loss": 0.8739633560180664, + "eval_runtime": 600.0958, + "eval_samples_per_second": 166.64, + "eval_steps_per_second": 41.66, + "step": 190000 + }, + { + "epoch": 0.2, + "learning_rate": 1.8592732732732732e-05, + "loss": 0.9298, + "step": 190500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8562702702702704e-05, + "loss": 0.9386, + "step": 191000 + }, + { + "epoch": 0.21, + "learning_rate": 1.8532732732732733e-05, + "loss": 0.937, + "step": 191500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8502702702702704e-05, + "loss": 0.9436, + "step": 192000 + }, + { + "epoch": 0.21, + "learning_rate": 1.8472672672672676e-05, + "loss": 0.9416, + "step": 192500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8442642642642644e-05, + "loss": 0.9432, + "step": 193000 + }, + { + "epoch": 0.21, + "learning_rate": 1.841261261261261e-05, + "loss": 0.9358, + "step": 193500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8382582582582583e-05, + "loss": 0.9322, + "step": 194000 + }, + { + "epoch": 0.21, + "learning_rate": 1.8352552552552554e-05, + "loss": 0.9325, + "step": 194500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8322522522522526e-05, + "loss": 0.9309, + "step": 195000 + }, + { + "epoch": 0.21, + "learning_rate": 1.8292552552552555e-05, + "loss": 0.9439, + "step": 195500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8262522522522523e-05, + "loss": 0.9186, + "step": 196000 + }, + { + "epoch": 0.21, + "learning_rate": 1.823249249249249e-05, + "loss": 0.9252, + "step": 196500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8202462462462462e-05, + "loss": 0.9458, + "step": 197000 + }, + { + "epoch": 0.21, + "learning_rate": 1.817249249249249e-05, + "loss": 0.9415, + "step": 197500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8142462462462463e-05, + "loss": 0.9387, + "step": 198000 + }, + { + "epoch": 0.21, + "learning_rate": 1.8112432432432434e-05, + "loss": 0.922, + "step": 198500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8082462462462464e-05, + "loss": 0.9299, + "step": 199000 + }, + { + "epoch": 0.21, + "learning_rate": 1.805243243243243e-05, + "loss": 0.9296, + "step": 199500 + }, + { + "epoch": 0.21, + "learning_rate": 1.8022402402402403e-05, + "loss": 0.9302, + "step": 200000 + }, + { + "epoch": 0.21, + "eval_loss": 0.868446946144104, + "eval_runtime": 599.5829, + "eval_samples_per_second": 166.783, + "eval_steps_per_second": 41.696, + "step": 200000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7992372372372374e-05, + "loss": 0.9236, + "step": 200500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7962342342342342e-05, + "loss": 0.9376, + "step": 201000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7932312312312314e-05, + "loss": 0.9373, + "step": 201500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7902342342342343e-05, + "loss": 0.9363, + "step": 202000 + }, + { + "epoch": 0.22, + "learning_rate": 1.787231231231231e-05, + "loss": 0.9386, + "step": 202500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7842282282282282e-05, + "loss": 0.9139, + "step": 203000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7812252252252254e-05, + "loss": 0.9403, + "step": 203500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7782282282282283e-05, + "loss": 0.9184, + "step": 204000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7752252252252254e-05, + "loss": 0.9281, + "step": 204500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7722222222222226e-05, + "loss": 0.9358, + "step": 205000 + }, + { + "epoch": 0.22, + "learning_rate": 1.769219219219219e-05, + "loss": 0.9337, + "step": 205500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7662162162162162e-05, + "loss": 0.9295, + "step": 206000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7632132132132133e-05, + "loss": 0.931, + "step": 206500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7602102102102105e-05, + "loss": 0.9348, + "step": 207000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7572072072072073e-05, + "loss": 0.9153, + "step": 207500 + }, + { + "epoch": 0.22, + "learning_rate": 1.7542102102102102e-05, + "loss": 0.9248, + "step": 208000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7512072072072073e-05, + "loss": 0.9298, + "step": 208500 + }, + { + "epoch": 0.22, + "learning_rate": 1.748204204204204e-05, + "loss": 0.929, + "step": 209000 + }, + { + "epoch": 0.22, + "learning_rate": 1.7452012012012013e-05, + "loss": 0.9327, + "step": 209500 + }, + { + "epoch": 0.23, + "learning_rate": 1.7421981981981984e-05, + "loss": 0.921, + "step": 210000 + }, + { + "epoch": 0.23, + "eval_loss": 0.8622388243675232, + "eval_runtime": 575.3524, + "eval_samples_per_second": 173.807, + "eval_steps_per_second": 43.452, + "step": 210000 + }, + { + "epoch": 0.23, + "learning_rate": 1.7392012012012013e-05, + "loss": 0.9248, + "step": 210500 + }, + { + "epoch": 0.23, + "learning_rate": 1.736198198198198e-05, + "loss": 0.9119, + "step": 211000 + }, + { + "epoch": 0.23, + "learning_rate": 1.7331951951951952e-05, + "loss": 0.9253, + "step": 211500 + }, + { + "epoch": 0.23, + "learning_rate": 1.7301921921921924e-05, + "loss": 0.9215, + "step": 212000 + }, + { + "epoch": 0.23, + "learning_rate": 1.7271891891891892e-05, + "loss": 0.947, + "step": 212500 + }, + { + "epoch": 0.23, + "learning_rate": 1.7241861861861863e-05, + "loss": 0.9279, + "step": 213000 + }, + { + "epoch": 0.23, + "learning_rate": 1.721183183183183e-05, + "loss": 0.9202, + "step": 213500 + }, + { + "epoch": 0.23, + "learning_rate": 1.7181801801801803e-05, + "loss": 0.9283, + "step": 214000 + }, + { + "epoch": 0.23, + "learning_rate": 1.7151831831831832e-05, + "loss": 0.917, + "step": 214500 + }, + { + "epoch": 0.23, + "learning_rate": 1.712186186186186e-05, + "loss": 0.9394, + "step": 215000 + }, + { + "epoch": 0.23, + "learning_rate": 1.7091831831831832e-05, + "loss": 0.9431, + "step": 215500 + }, + { + "epoch": 0.23, + "learning_rate": 1.7061801801801804e-05, + "loss": 0.9241, + "step": 216000 + }, + { + "epoch": 0.23, + "learning_rate": 1.7031771771771775e-05, + "loss": 0.9305, + "step": 216500 + }, + { + "epoch": 0.23, + "learning_rate": 1.700174174174174e-05, + "loss": 0.9231, + "step": 217000 + }, + { + "epoch": 0.23, + "learning_rate": 1.6971771771771772e-05, + "loss": 0.9238, + "step": 217500 + }, + { + "epoch": 0.23, + "learning_rate": 1.694174174174174e-05, + "loss": 0.9168, + "step": 218000 + }, + { + "epoch": 0.23, + "learning_rate": 1.6911771771771773e-05, + "loss": 0.9201, + "step": 218500 + }, + { + "epoch": 0.23, + "learning_rate": 1.688174174174174e-05, + "loss": 0.9141, + "step": 219000 + }, + { + "epoch": 0.24, + "learning_rate": 1.6851771771771774e-05, + "loss": 0.9076, + "step": 219500 + }, + { + "epoch": 0.24, + "learning_rate": 1.682174174174174e-05, + "loss": 0.9198, + "step": 220000 + }, + { + "epoch": 0.24, + "eval_loss": 0.85796719789505, + "eval_runtime": 609.8643, + "eval_samples_per_second": 163.971, + "eval_steps_per_second": 40.993, + "step": 220000 + }, + { + "epoch": 0.24, + "learning_rate": 1.6791711711711713e-05, + "loss": 0.9256, + "step": 220500 + }, + { + "epoch": 0.24, + "learning_rate": 1.676168168168168e-05, + "loss": 0.9195, + "step": 221000 + }, + { + "epoch": 0.24, + "learning_rate": 1.6731651651651652e-05, + "loss": 0.916, + "step": 221500 + }, + { + "epoch": 0.24, + "learning_rate": 1.6701621621621624e-05, + "loss": 0.9283, + "step": 222000 + }, + { + "epoch": 0.24, + "learning_rate": 1.6671591591591592e-05, + "loss": 0.912, + "step": 222500 + }, + { + "epoch": 0.24, + "learning_rate": 1.664156156156156e-05, + "loss": 0.9138, + "step": 223000 + }, + { + "epoch": 0.24, + "learning_rate": 1.661153153153153e-05, + "loss": 0.924, + "step": 223500 + }, + { + "epoch": 0.24, + "learning_rate": 1.6581501501501503e-05, + "loss": 0.9147, + "step": 224000 + }, + { + "epoch": 0.24, + "learning_rate": 1.6551471471471474e-05, + "loss": 0.9228, + "step": 224500 + }, + { + "epoch": 0.24, + "learning_rate": 1.6521441441441442e-05, + "loss": 0.9203, + "step": 225000 + }, + { + "epoch": 0.24, + "learning_rate": 1.649141141141141e-05, + "loss": 0.9118, + "step": 225500 + }, + { + "epoch": 0.24, + "learning_rate": 1.646138138138138e-05, + "loss": 0.9151, + "step": 226000 + }, + { + "epoch": 0.24, + "learning_rate": 1.6431351351351353e-05, + "loss": 0.9149, + "step": 226500 + }, + { + "epoch": 0.24, + "learning_rate": 1.6401321321321324e-05, + "loss": 0.9162, + "step": 227000 + }, + { + "epoch": 0.24, + "learning_rate": 1.6371471471471472e-05, + "loss": 0.9191, + "step": 227500 + }, + { + "epoch": 0.24, + "learning_rate": 1.63415015015015e-05, + "loss": 0.9146, + "step": 228000 + }, + { + "epoch": 0.25, + "learning_rate": 1.6311471471471473e-05, + "loss": 0.9198, + "step": 228500 + }, + { + "epoch": 0.25, + "learning_rate": 1.628144144144144e-05, + "loss": 0.9175, + "step": 229000 + }, + { + "epoch": 0.25, + "learning_rate": 1.6251411411411412e-05, + "loss": 0.9145, + "step": 229500 + }, + { + "epoch": 0.25, + "learning_rate": 1.622138138138138e-05, + "loss": 0.9115, + "step": 230000 + }, + { + "epoch": 0.25, + "eval_loss": 0.8521190881729126, + "eval_runtime": 576.8956, + "eval_samples_per_second": 173.342, + "eval_steps_per_second": 43.335, + "step": 230000 + }, + { + "epoch": 0.25, + "learning_rate": 1.619141141141141e-05, + "loss": 0.9222, + "step": 230500 + }, + { + "epoch": 0.25, + "learning_rate": 1.616138138138138e-05, + "loss": 0.9218, + "step": 231000 + }, + { + "epoch": 0.25, + "learning_rate": 1.6131351351351352e-05, + "loss": 0.9173, + "step": 231500 + }, + { + "epoch": 0.25, + "learning_rate": 1.6101321321321324e-05, + "loss": 0.9215, + "step": 232000 + }, + { + "epoch": 0.25, + "learning_rate": 1.6071291291291292e-05, + "loss": 0.905, + "step": 232500 + }, + { + "epoch": 0.25, + "learning_rate": 1.604126126126126e-05, + "loss": 0.9172, + "step": 233000 + }, + { + "epoch": 0.25, + "learning_rate": 1.601123123123123e-05, + "loss": 0.9092, + "step": 233500 + }, + { + "epoch": 0.25, + "learning_rate": 1.5981201201201203e-05, + "loss": 0.9137, + "step": 234000 + }, + { + "epoch": 0.25, + "learning_rate": 1.5951171171171174e-05, + "loss": 0.9082, + "step": 234500 + }, + { + "epoch": 0.25, + "learning_rate": 1.592114114114114e-05, + "loss": 0.9221, + "step": 235000 + }, + { + "epoch": 0.25, + "learning_rate": 1.589111111111111e-05, + "loss": 0.9057, + "step": 235500 + }, + { + "epoch": 0.25, + "learning_rate": 1.586114114114114e-05, + "loss": 0.9145, + "step": 236000 + }, + { + "epoch": 0.25, + "learning_rate": 1.583111111111111e-05, + "loss": 0.9046, + "step": 236500 + }, + { + "epoch": 0.25, + "learning_rate": 1.5801081081081082e-05, + "loss": 0.8992, + "step": 237000 + }, + { + "epoch": 0.25, + "learning_rate": 1.5771051051051053e-05, + "loss": 0.9011, + "step": 237500 + }, + { + "epoch": 0.26, + "learning_rate": 1.574102102102102e-05, + "loss": 0.9066, + "step": 238000 + }, + { + "epoch": 0.26, + "learning_rate": 1.571099099099099e-05, + "loss": 0.9076, + "step": 238500 + }, + { + "epoch": 0.26, + "learning_rate": 1.568096096096096e-05, + "loss": 0.8947, + "step": 239000 + }, + { + "epoch": 0.26, + "learning_rate": 1.5650930930930932e-05, + "loss": 0.8981, + "step": 239500 + }, + { + "epoch": 0.26, + "learning_rate": 1.5620900900900904e-05, + "loss": 0.9177, + "step": 240000 + }, + { + "epoch": 0.26, + "eval_loss": 0.8470381498336792, + "eval_runtime": 609.4393, + "eval_samples_per_second": 164.085, + "eval_steps_per_second": 41.021, + "step": 240000 + }, + { + "epoch": 0.26, + "learning_rate": 1.559087087087087e-05, + "loss": 0.9091, + "step": 240500 + }, + { + "epoch": 0.26, + "learning_rate": 1.556084084084084e-05, + "loss": 0.9154, + "step": 241000 + }, + { + "epoch": 0.26, + "learning_rate": 1.553081081081081e-05, + "loss": 0.9091, + "step": 241500 + }, + { + "epoch": 0.26, + "learning_rate": 1.5500780780780782e-05, + "loss": 0.9216, + "step": 242000 + }, + { + "epoch": 0.26, + "learning_rate": 1.547075075075075e-05, + "loss": 0.9147, + "step": 242500 + }, + { + "epoch": 0.26, + "learning_rate": 1.5440720720720722e-05, + "loss": 0.9052, + "step": 243000 + }, + { + "epoch": 0.26, + "learning_rate": 1.541069069069069e-05, + "loss": 0.9056, + "step": 243500 + }, + { + "epoch": 0.26, + "learning_rate": 1.5380720720720722e-05, + "loss": 0.9149, + "step": 244000 + }, + { + "epoch": 0.26, + "learning_rate": 1.535069069069069e-05, + "loss": 0.9082, + "step": 244500 + }, + { + "epoch": 0.26, + "learning_rate": 1.5320660660660662e-05, + "loss": 0.9136, + "step": 245000 + }, + { + "epoch": 0.26, + "learning_rate": 1.529069069069069e-05, + "loss": 0.9214, + "step": 245500 + }, + { + "epoch": 0.26, + "learning_rate": 1.526066066066066e-05, + "loss": 0.9158, + "step": 246000 + }, + { + "epoch": 0.26, + "learning_rate": 1.523063063063063e-05, + "loss": 0.8907, + "step": 246500 + }, + { + "epoch": 0.26, + "learning_rate": 1.520066066066066e-05, + "loss": 0.8975, + "step": 247000 + }, + { + "epoch": 0.27, + "learning_rate": 1.5170630630630631e-05, + "loss": 0.8965, + "step": 247500 + }, + { + "epoch": 0.27, + "learning_rate": 1.5140600600600602e-05, + "loss": 0.8937, + "step": 248000 + }, + { + "epoch": 0.27, + "learning_rate": 1.5110570570570572e-05, + "loss": 0.8949, + "step": 248500 + }, + { + "epoch": 0.27, + "learning_rate": 1.508054054054054e-05, + "loss": 0.9091, + "step": 249000 + }, + { + "epoch": 0.27, + "learning_rate": 1.505051051051051e-05, + "loss": 0.9114, + "step": 249500 + }, + { + "epoch": 0.27, + "learning_rate": 1.5020480480480481e-05, + "loss": 0.8959, + "step": 250000 + }, + { + "epoch": 0.27, + "eval_loss": 0.8428720235824585, + "eval_runtime": 586.0492, + "eval_samples_per_second": 170.634, + "eval_steps_per_second": 42.659, + "step": 250000 + }, + { + "epoch": 0.27, + "learning_rate": 1.4990510510510512e-05, + "loss": 0.904, + "step": 250500 + }, + { + "epoch": 0.27, + "learning_rate": 1.496048048048048e-05, + "loss": 0.9161, + "step": 251000 + }, + { + "epoch": 0.27, + "learning_rate": 1.4930450450450451e-05, + "loss": 0.8939, + "step": 251500 + }, + { + "epoch": 0.27, + "learning_rate": 1.4900420420420421e-05, + "loss": 0.9136, + "step": 252000 + }, + { + "epoch": 0.27, + "learning_rate": 1.487039039039039e-05, + "loss": 0.9042, + "step": 252500 + }, + { + "epoch": 0.27, + "learning_rate": 1.484036036036036e-05, + "loss": 0.9086, + "step": 253000 + }, + { + "epoch": 0.27, + "learning_rate": 1.481033033033033e-05, + "loss": 0.9049, + "step": 253500 + }, + { + "epoch": 0.27, + "learning_rate": 1.4780300300300302e-05, + "loss": 0.9066, + "step": 254000 + }, + { + "epoch": 0.27, + "learning_rate": 1.475027027027027e-05, + "loss": 0.8934, + "step": 254500 + }, + { + "epoch": 0.27, + "learning_rate": 1.4720240240240241e-05, + "loss": 0.909, + "step": 255000 + }, + { + "epoch": 0.27, + "learning_rate": 1.469021021021021e-05, + "loss": 0.8941, + "step": 255500 + }, + { + "epoch": 0.27, + "learning_rate": 1.4660240240240242e-05, + "loss": 0.9059, + "step": 256000 + }, + { + "epoch": 0.28, + "learning_rate": 1.463021021021021e-05, + "loss": 0.9114, + "step": 256500 + }, + { + "epoch": 0.28, + "learning_rate": 1.4600180180180181e-05, + "loss": 0.8954, + "step": 257000 + }, + { + "epoch": 0.28, + "learning_rate": 1.457015015015015e-05, + "loss": 0.9018, + "step": 257500 + }, + { + "epoch": 0.28, + "learning_rate": 1.454018018018018e-05, + "loss": 0.9021, + "step": 258000 + }, + { + "epoch": 0.28, + "learning_rate": 1.4510150150150151e-05, + "loss": 0.8995, + "step": 258500 + }, + { + "epoch": 0.28, + "learning_rate": 1.448012012012012e-05, + "loss": 0.909, + "step": 259000 + }, + { + "epoch": 0.28, + "learning_rate": 1.4450150150150152e-05, + "loss": 0.8923, + "step": 259500 + }, + { + "epoch": 0.28, + "learning_rate": 1.442012012012012e-05, + "loss": 0.9074, + "step": 260000 + }, + { + "epoch": 0.28, + "eval_loss": 0.8390381932258606, + "eval_runtime": 573.4535, + "eval_samples_per_second": 174.382, + "eval_steps_per_second": 43.596, + "step": 260000 + }, + { + "epoch": 0.28, + "learning_rate": 1.4390090090090091e-05, + "loss": 0.9006, + "step": 260500 + }, + { + "epoch": 0.28, + "learning_rate": 1.4360060060060061e-05, + "loss": 0.9005, + "step": 261000 + }, + { + "epoch": 0.28, + "learning_rate": 1.433003003003003e-05, + "loss": 0.9141, + "step": 261500 + }, + { + "epoch": 0.28, + "learning_rate": 1.43e-05, + "loss": 0.9145, + "step": 262000 + }, + { + "epoch": 0.28, + "learning_rate": 1.426996996996997e-05, + "loss": 0.899, + "step": 262500 + }, + { + "epoch": 0.28, + "learning_rate": 1.423993993993994e-05, + "loss": 0.9111, + "step": 263000 + }, + { + "epoch": 0.28, + "learning_rate": 1.420996996996997e-05, + "loss": 0.906, + "step": 263500 + }, + { + "epoch": 0.28, + "learning_rate": 1.417993993993994e-05, + "loss": 0.8911, + "step": 264000 + }, + { + "epoch": 0.28, + "learning_rate": 1.4149909909909912e-05, + "loss": 0.8914, + "step": 264500 + }, + { + "epoch": 0.28, + "learning_rate": 1.411987987987988e-05, + "loss": 0.8978, + "step": 265000 + }, + { + "epoch": 0.28, + "learning_rate": 1.4089849849849851e-05, + "loss": 0.8921, + "step": 265500 + }, + { + "epoch": 0.29, + "learning_rate": 1.4059819819819819e-05, + "loss": 0.8938, + "step": 266000 + }, + { + "epoch": 0.29, + "learning_rate": 1.402978978978979e-05, + "loss": 0.9009, + "step": 266500 + }, + { + "epoch": 0.29, + "learning_rate": 1.3999759759759759e-05, + "loss": 0.8957, + "step": 267000 + }, + { + "epoch": 0.29, + "learning_rate": 1.396984984984985e-05, + "loss": 0.8944, + "step": 267500 + }, + { + "epoch": 0.29, + "learning_rate": 1.393981981981982e-05, + "loss": 0.8903, + "step": 268000 + }, + { + "epoch": 0.29, + "learning_rate": 1.390978978978979e-05, + "loss": 0.8932, + "step": 268500 + }, + { + "epoch": 0.29, + "learning_rate": 1.3879759759759761e-05, + "loss": 0.8853, + "step": 269000 + }, + { + "epoch": 0.29, + "learning_rate": 1.384972972972973e-05, + "loss": 0.8908, + "step": 269500 + }, + { + "epoch": 0.29, + "learning_rate": 1.38196996996997e-05, + "loss": 0.8888, + "step": 270000 + }, + { + "epoch": 0.29, + "eval_loss": 0.8343602418899536, + "eval_runtime": 589.1233, + "eval_samples_per_second": 169.744, + "eval_steps_per_second": 42.436, + "step": 270000 + }, + { + "epoch": 0.29, + "learning_rate": 1.3789669669669669e-05, + "loss": 0.8947, + "step": 270500 + }, + { + "epoch": 0.29, + "learning_rate": 1.375963963963964e-05, + "loss": 0.8939, + "step": 271000 + }, + { + "epoch": 0.29, + "learning_rate": 1.3729609609609612e-05, + "loss": 0.9018, + "step": 271500 + }, + { + "epoch": 0.29, + "learning_rate": 1.369957957957958e-05, + "loss": 0.9036, + "step": 272000 + }, + { + "epoch": 0.29, + "learning_rate": 1.3669549549549551e-05, + "loss": 0.8973, + "step": 272500 + }, + { + "epoch": 0.29, + "learning_rate": 1.3639519519519519e-05, + "loss": 0.885, + "step": 273000 + }, + { + "epoch": 0.29, + "learning_rate": 1.360954954954955e-05, + "loss": 0.8845, + "step": 273500 + }, + { + "epoch": 0.29, + "learning_rate": 1.357951951951952e-05, + "loss": 0.8789, + "step": 274000 + }, + { + "epoch": 0.29, + "learning_rate": 1.354948948948949e-05, + "loss": 0.8986, + "step": 274500 + }, + { + "epoch": 0.29, + "learning_rate": 1.351945945945946e-05, + "loss": 0.8909, + "step": 275000 + }, + { + "epoch": 0.3, + "learning_rate": 1.348942942942943e-05, + "loss": 0.9002, + "step": 275500 + }, + { + "epoch": 0.3, + "learning_rate": 1.3459459459459461e-05, + "loss": 0.8946, + "step": 276000 + }, + { + "epoch": 0.3, + "learning_rate": 1.342942942942943e-05, + "loss": 0.8911, + "step": 276500 + }, + { + "epoch": 0.3, + "learning_rate": 1.33993993993994e-05, + "loss": 0.891, + "step": 277000 + }, + { + "epoch": 0.3, + "learning_rate": 1.3369369369369369e-05, + "loss": 0.8833, + "step": 277500 + }, + { + "epoch": 0.3, + "learning_rate": 1.33393993993994e-05, + "loss": 0.8863, + "step": 278000 + }, + { + "epoch": 0.3, + "learning_rate": 1.330936936936937e-05, + "loss": 0.8765, + "step": 278500 + }, + { + "epoch": 0.3, + "learning_rate": 1.32793993993994e-05, + "loss": 0.8945, + "step": 279000 + }, + { + "epoch": 0.3, + "learning_rate": 1.324936936936937e-05, + "loss": 0.8936, + "step": 279500 + }, + { + "epoch": 0.3, + "learning_rate": 1.321933933933934e-05, + "loss": 0.9012, + "step": 280000 + }, + { + "epoch": 0.3, + "eval_loss": 0.8308248519897461, + "eval_runtime": 569.4338, + "eval_samples_per_second": 175.613, + "eval_steps_per_second": 43.903, + "step": 280000 + }, + { + "epoch": 0.3, + "learning_rate": 1.3189309309309311e-05, + "loss": 0.8835, + "step": 280500 + }, + { + "epoch": 0.3, + "learning_rate": 1.3159279279279279e-05, + "loss": 0.8866, + "step": 281000 + }, + { + "epoch": 0.3, + "learning_rate": 1.312924924924925e-05, + "loss": 0.8876, + "step": 281500 + }, + { + "epoch": 0.3, + "learning_rate": 1.309927927927928e-05, + "loss": 0.8846, + "step": 282000 + }, + { + "epoch": 0.3, + "learning_rate": 1.306924924924925e-05, + "loss": 0.8833, + "step": 282500 + }, + { + "epoch": 0.3, + "learning_rate": 1.3039219219219219e-05, + "loss": 0.8993, + "step": 283000 + }, + { + "epoch": 0.3, + "learning_rate": 1.300918918918919e-05, + "loss": 0.8802, + "step": 283500 + }, + { + "epoch": 0.3, + "learning_rate": 1.297915915915916e-05, + "loss": 0.8787, + "step": 284000 + }, + { + "epoch": 0.31, + "learning_rate": 1.294912912912913e-05, + "loss": 0.8809, + "step": 284500 + }, + { + "epoch": 0.31, + "learning_rate": 1.291915915915916e-05, + "loss": 0.8848, + "step": 285000 + }, + { + "epoch": 0.31, + "learning_rate": 1.2889129129129129e-05, + "loss": 0.8847, + "step": 285500 + }, + { + "epoch": 0.31, + "learning_rate": 1.28590990990991e-05, + "loss": 0.891, + "step": 286000 + }, + { + "epoch": 0.31, + "learning_rate": 1.2829069069069068e-05, + "loss": 0.8789, + "step": 286500 + }, + { + "epoch": 0.31, + "learning_rate": 1.279903903903904e-05, + "loss": 0.8962, + "step": 287000 + }, + { + "epoch": 0.31, + "learning_rate": 1.276900900900901e-05, + "loss": 0.8931, + "step": 287500 + }, + { + "epoch": 0.31, + "learning_rate": 1.2738978978978979e-05, + "loss": 0.8789, + "step": 288000 + }, + { + "epoch": 0.31, + "learning_rate": 1.270900900900901e-05, + "loss": 0.8914, + "step": 288500 + }, + { + "epoch": 0.31, + "learning_rate": 1.267897897897898e-05, + "loss": 0.8902, + "step": 289000 + }, + { + "epoch": 0.31, + "learning_rate": 1.2648948948948949e-05, + "loss": 0.8773, + "step": 289500 + }, + { + "epoch": 0.31, + "learning_rate": 1.2618918918918919e-05, + "loss": 0.8996, + "step": 290000 + }, + { + "epoch": 0.31, + "eval_loss": 0.8253816366195679, + "eval_runtime": 603.3986, + "eval_samples_per_second": 165.728, + "eval_steps_per_second": 41.432, + "step": 290000 + }, + { + "epoch": 0.31, + "learning_rate": 1.2588888888888888e-05, + "loss": 0.8824, + "step": 290500 + }, + { + "epoch": 0.31, + "learning_rate": 1.255885885885886e-05, + "loss": 0.8916, + "step": 291000 + }, + { + "epoch": 0.31, + "learning_rate": 1.252882882882883e-05, + "loss": 0.887, + "step": 291500 + }, + { + "epoch": 0.31, + "learning_rate": 1.24987987987988e-05, + "loss": 0.8808, + "step": 292000 + }, + { + "epoch": 0.31, + "learning_rate": 1.2468828828828828e-05, + "loss": 0.8912, + "step": 292500 + }, + { + "epoch": 0.31, + "learning_rate": 1.24387987987988e-05, + "loss": 0.8746, + "step": 293000 + }, + { + "epoch": 0.31, + "learning_rate": 1.2408768768768768e-05, + "loss": 0.8738, + "step": 293500 + }, + { + "epoch": 0.32, + "learning_rate": 1.237873873873874e-05, + "loss": 0.8837, + "step": 294000 + }, + { + "epoch": 0.32, + "learning_rate": 1.2348768768768768e-05, + "loss": 0.8826, + "step": 294500 + }, + { + "epoch": 0.32, + "learning_rate": 1.231873873873874e-05, + "loss": 0.875, + "step": 295000 + }, + { + "epoch": 0.32, + "learning_rate": 1.228870870870871e-05, + "loss": 0.8842, + "step": 295500 + }, + { + "epoch": 0.32, + "learning_rate": 1.225867867867868e-05, + "loss": 0.8778, + "step": 296000 + }, + { + "epoch": 0.32, + "learning_rate": 1.222870870870871e-05, + "loss": 0.8771, + "step": 296500 + }, + { + "epoch": 0.32, + "learning_rate": 1.2198678678678678e-05, + "loss": 0.8934, + "step": 297000 + }, + { + "epoch": 0.32, + "learning_rate": 1.216864864864865e-05, + "loss": 0.881, + "step": 297500 + }, + { + "epoch": 0.32, + "learning_rate": 1.2138618618618617e-05, + "loss": 0.8993, + "step": 298000 + }, + { + "epoch": 0.32, + "learning_rate": 1.2108648648648648e-05, + "loss": 0.8738, + "step": 298500 + }, + { + "epoch": 0.32, + "learning_rate": 1.2078618618618618e-05, + "loss": 0.8851, + "step": 299000 + }, + { + "epoch": 0.32, + "learning_rate": 1.204858858858859e-05, + "loss": 0.8802, + "step": 299500 + }, + { + "epoch": 0.32, + "learning_rate": 1.201855855855856e-05, + "loss": 0.8783, + "step": 300000 + }, + { + "epoch": 0.32, + "eval_loss": 0.8227924108505249, + "eval_runtime": 588.0109, + "eval_samples_per_second": 170.065, + "eval_steps_per_second": 42.516, + "step": 300000 + }, + { + "epoch": 0.32, + "learning_rate": 1.198864864864865e-05, + "loss": 0.8749, + "step": 300500 + }, + { + "epoch": 0.32, + "learning_rate": 1.195861861861862e-05, + "loss": 0.8893, + "step": 301000 + }, + { + "epoch": 0.32, + "learning_rate": 1.1928588588588589e-05, + "loss": 0.8715, + "step": 301500 + }, + { + "epoch": 0.32, + "learning_rate": 1.1898558558558559e-05, + "loss": 0.8894, + "step": 302000 + }, + { + "epoch": 0.32, + "learning_rate": 1.1868588588588588e-05, + "loss": 0.875, + "step": 302500 + }, + { + "epoch": 0.32, + "learning_rate": 1.183855855855856e-05, + "loss": 0.8829, + "step": 303000 + }, + { + "epoch": 0.33, + "learning_rate": 1.1808528528528529e-05, + "loss": 0.8872, + "step": 303500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1778498498498499e-05, + "loss": 0.8888, + "step": 304000 + }, + { + "epoch": 0.33, + "learning_rate": 1.174846846846847e-05, + "loss": 0.8716, + "step": 304500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1718438438438438e-05, + "loss": 0.8725, + "step": 305000 + }, + { + "epoch": 0.33, + "learning_rate": 1.168840840840841e-05, + "loss": 0.8749, + "step": 305500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1658378378378377e-05, + "loss": 0.8823, + "step": 306000 + }, + { + "epoch": 0.33, + "learning_rate": 1.1628348348348349e-05, + "loss": 0.8788, + "step": 306500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1598378378378378e-05, + "loss": 0.8827, + "step": 307000 + }, + { + "epoch": 0.33, + "learning_rate": 1.156834834834835e-05, + "loss": 0.8728, + "step": 307500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1538318318318319e-05, + "loss": 0.8875, + "step": 308000 + }, + { + "epoch": 0.33, + "learning_rate": 1.1508288288288289e-05, + "loss": 0.8812, + "step": 308500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1478378378378377e-05, + "loss": 0.8671, + "step": 309000 + }, + { + "epoch": 0.33, + "learning_rate": 1.1448348348348349e-05, + "loss": 0.8723, + "step": 309500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1418318318318319e-05, + "loss": 0.8796, + "step": 310000 + }, + { + "epoch": 0.33, + "eval_loss": 0.8185199499130249, + "eval_runtime": 589.4368, + "eval_samples_per_second": 169.653, + "eval_steps_per_second": 42.413, + "step": 310000 + }, + { + "epoch": 0.33, + "learning_rate": 1.1388288288288288e-05, + "loss": 0.8775, + "step": 310500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1358318318318319e-05, + "loss": 0.875, + "step": 311000 + }, + { + "epoch": 0.33, + "learning_rate": 1.1328288288288289e-05, + "loss": 0.8852, + "step": 311500 + }, + { + "epoch": 0.33, + "learning_rate": 1.1298258258258259e-05, + "loss": 0.877, + "step": 312000 + }, + { + "epoch": 0.34, + "learning_rate": 1.1268228228228228e-05, + "loss": 0.8655, + "step": 312500 + }, + { + "epoch": 0.34, + "learning_rate": 1.1238198198198198e-05, + "loss": 0.8779, + "step": 313000 + }, + { + "epoch": 0.34, + "learning_rate": 1.1208228228228227e-05, + "loss": 0.8751, + "step": 313500 + }, + { + "epoch": 0.34, + "learning_rate": 1.1178198198198199e-05, + "loss": 0.8678, + "step": 314000 + }, + { + "epoch": 0.34, + "learning_rate": 1.114816816816817e-05, + "loss": 0.8676, + "step": 314500 + }, + { + "epoch": 0.34, + "learning_rate": 1.1118138138138138e-05, + "loss": 0.8695, + "step": 315000 + }, + { + "epoch": 0.34, + "learning_rate": 1.108810810810811e-05, + "loss": 0.8735, + "step": 315500 + }, + { + "epoch": 0.34, + "learning_rate": 1.1058078078078077e-05, + "loss": 0.8809, + "step": 316000 + }, + { + "epoch": 0.34, + "learning_rate": 1.1028048048048049e-05, + "loss": 0.8762, + "step": 316500 + }, + { + "epoch": 0.34, + "learning_rate": 1.0998018018018018e-05, + "loss": 0.871, + "step": 317000 + }, + { + "epoch": 0.34, + "learning_rate": 1.0968108108108109e-05, + "loss": 0.8726, + "step": 317500 + }, + { + "epoch": 0.34, + "learning_rate": 1.0938078078078077e-05, + "loss": 0.8649, + "step": 318000 + }, + { + "epoch": 0.34, + "learning_rate": 1.0908048048048048e-05, + "loss": 0.8849, + "step": 318500 + }, + { + "epoch": 0.34, + "learning_rate": 1.087801801801802e-05, + "loss": 0.8812, + "step": 319000 + }, + { + "epoch": 0.34, + "learning_rate": 1.0848048048048049e-05, + "loss": 0.862, + "step": 319500 + }, + { + "epoch": 0.34, + "learning_rate": 1.0818018018018018e-05, + "loss": 0.8739, + "step": 320000 + }, + { + "epoch": 0.34, + "eval_loss": 0.8156814575195312, + "eval_runtime": 622.3771, + "eval_samples_per_second": 160.674, + "eval_steps_per_second": 40.169, + "step": 320000 + }, + { + "epoch": 0.34, + "learning_rate": 1.0787987987987988e-05, + "loss": 0.8759, + "step": 320500 + }, + { + "epoch": 0.34, + "learning_rate": 1.0757957957957958e-05, + "loss": 0.8817, + "step": 321000 + }, + { + "epoch": 0.34, + "learning_rate": 1.0727927927927928e-05, + "loss": 0.8834, + "step": 321500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0697897897897897e-05, + "loss": 0.8693, + "step": 322000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0667867867867869e-05, + "loss": 0.8618, + "step": 322500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0637837837837838e-05, + "loss": 0.8678, + "step": 323000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0607807807807808e-05, + "loss": 0.8805, + "step": 323500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0577777777777778e-05, + "loss": 0.8654, + "step": 324000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0547747747747747e-05, + "loss": 0.8551, + "step": 324500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0517717717717719e-05, + "loss": 0.8831, + "step": 325000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0487747747747748e-05, + "loss": 0.8771, + "step": 325500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0457717717717718e-05, + "loss": 0.8717, + "step": 326000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0427747747747749e-05, + "loss": 0.8588, + "step": 326500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0397717717717718e-05, + "loss": 0.882, + "step": 327000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0367687687687688e-05, + "loss": 0.8688, + "step": 327500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0337657657657658e-05, + "loss": 0.8675, + "step": 328000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0307627627627627e-05, + "loss": 0.8583, + "step": 328500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0277597597597597e-05, + "loss": 0.8745, + "step": 329000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0247567567567569e-05, + "loss": 0.8631, + "step": 329500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0217537537537537e-05, + "loss": 0.8637, + "step": 330000 + }, + { + "epoch": 0.35, + "eval_loss": 0.8101323246955872, + "eval_runtime": 593.644, + "eval_samples_per_second": 168.451, + "eval_steps_per_second": 42.113, + "step": 330000 + }, + { + "epoch": 0.35, + "learning_rate": 1.0187567567567569e-05, + "loss": 0.8626, + "step": 330500 + }, + { + "epoch": 0.35, + "learning_rate": 1.0157597597597598e-05, + "loss": 0.8651, + "step": 331000 + }, + { + "epoch": 0.36, + "learning_rate": 1.012762762762763e-05, + "loss": 0.8706, + "step": 331500 + }, + { + "epoch": 0.36, + "learning_rate": 1.0097597597597597e-05, + "loss": 0.8766, + "step": 332000 + }, + { + "epoch": 0.36, + "learning_rate": 1.0067567567567569e-05, + "loss": 0.8753, + "step": 332500 + }, + { + "epoch": 0.36, + "learning_rate": 1.0037537537537537e-05, + "loss": 0.8735, + "step": 333000 + }, + { + "epoch": 0.36, + "learning_rate": 1.0007567567567567e-05, + "loss": 0.8747, + "step": 333500 + }, + { + "epoch": 0.36, + "learning_rate": 9.977537537537537e-06, + "loss": 0.8596, + "step": 334000 + }, + { + "epoch": 0.36, + "learning_rate": 9.947507507507509e-06, + "loss": 0.868, + "step": 334500 + }, + { + "epoch": 0.36, + "learning_rate": 9.917477477477478e-06, + "loss": 0.8683, + "step": 335000 + }, + { + "epoch": 0.36, + "learning_rate": 9.887447447447448e-06, + "loss": 0.8763, + "step": 335500 + }, + { + "epoch": 0.36, + "learning_rate": 9.857417417417418e-06, + "loss": 0.8638, + "step": 336000 + }, + { + "epoch": 0.36, + "learning_rate": 9.827387387387387e-06, + "loss": 0.8677, + "step": 336500 + }, + { + "epoch": 0.36, + "learning_rate": 9.797357357357357e-06, + "loss": 0.8753, + "step": 337000 + }, + { + "epoch": 0.36, + "learning_rate": 9.767327327327328e-06, + "loss": 0.8708, + "step": 337500 + }, + { + "epoch": 0.36, + "learning_rate": 9.737297297297298e-06, + "loss": 0.8722, + "step": 338000 + }, + { + "epoch": 0.36, + "learning_rate": 9.707327327327329e-06, + "loss": 0.877, + "step": 338500 + }, + { + "epoch": 0.36, + "learning_rate": 9.677357357357358e-06, + "loss": 0.8641, + "step": 339000 + }, + { + "epoch": 0.36, + "learning_rate": 9.647327327327328e-06, + "loss": 0.8656, + "step": 339500 + }, + { + "epoch": 0.36, + "learning_rate": 9.617297297297298e-06, + "loss": 0.87, + "step": 340000 + }, + { + "epoch": 0.36, + "eval_loss": 0.8081399202346802, + "eval_runtime": 594.2224, + "eval_samples_per_second": 168.287, + "eval_steps_per_second": 42.072, + "step": 340000 + }, + { + "epoch": 0.37, + "learning_rate": 9.587267267267267e-06, + "loss": 0.8514, + "step": 340500 + }, + { + "epoch": 0.37, + "learning_rate": 9.557237237237237e-06, + "loss": 0.8726, + "step": 341000 + }, + { + "epoch": 0.37, + "learning_rate": 9.527207207207207e-06, + "loss": 0.8674, + "step": 341500 + }, + { + "epoch": 0.37, + "learning_rate": 9.497177177177178e-06, + "loss": 0.8656, + "step": 342000 + }, + { + "epoch": 0.37, + "learning_rate": 9.467147147147148e-06, + "loss": 0.8688, + "step": 342500 + }, + { + "epoch": 0.37, + "learning_rate": 9.437177177177179e-06, + "loss": 0.8619, + "step": 343000 + }, + { + "epoch": 0.37, + "learning_rate": 9.407147147147147e-06, + "loss": 0.8576, + "step": 343500 + }, + { + "epoch": 0.37, + "learning_rate": 9.377117117117118e-06, + "loss": 0.861, + "step": 344000 + }, + { + "epoch": 0.37, + "learning_rate": 9.347087087087086e-06, + "loss": 0.8655, + "step": 344500 + }, + { + "epoch": 0.37, + "learning_rate": 9.317057057057058e-06, + "loss": 0.8534, + "step": 345000 + }, + { + "epoch": 0.37, + "learning_rate": 9.287087087087087e-06, + "loss": 0.8768, + "step": 345500 + }, + { + "epoch": 0.37, + "learning_rate": 9.257057057057058e-06, + "loss": 0.8584, + "step": 346000 + }, + { + "epoch": 0.37, + "learning_rate": 9.227027027027028e-06, + "loss": 0.8716, + "step": 346500 + }, + { + "epoch": 0.37, + "learning_rate": 9.196996996996997e-06, + "loss": 0.8568, + "step": 347000 + }, + { + "epoch": 0.37, + "learning_rate": 9.166966966966967e-06, + "loss": 0.8503, + "step": 347500 + }, + { + "epoch": 0.37, + "learning_rate": 9.136936936936937e-06, + "loss": 0.8573, + "step": 348000 + }, + { + "epoch": 0.37, + "learning_rate": 9.106906906906907e-06, + "loss": 0.8619, + "step": 348500 + }, + { + "epoch": 0.37, + "learning_rate": 9.076936936936936e-06, + "loss": 0.8588, + "step": 349000 + }, + { + "epoch": 0.37, + "learning_rate": 9.046906906906907e-06, + "loss": 0.8632, + "step": 349500 + }, + { + "epoch": 0.38, + "learning_rate": 9.016876876876879e-06, + "loss": 0.8517, + "step": 350000 + }, + { + "epoch": 0.38, + "eval_loss": 0.8035141229629517, + "eval_runtime": 567.0359, + "eval_samples_per_second": 176.356, + "eval_steps_per_second": 44.089, + "step": 350000 + }, + { + "epoch": 0.38, + "learning_rate": 8.986846846846847e-06, + "loss": 0.8759, + "step": 350500 + }, + { + "epoch": 0.38, + "learning_rate": 8.956816816816818e-06, + "loss": 0.8636, + "step": 351000 + }, + { + "epoch": 0.38, + "learning_rate": 8.926906906906907e-06, + "loss": 0.8523, + "step": 351500 + }, + { + "epoch": 0.38, + "learning_rate": 8.896876876876878e-06, + "loss": 0.8508, + "step": 352000 + }, + { + "epoch": 0.38, + "learning_rate": 8.866906906906907e-06, + "loss": 0.86, + "step": 352500 + }, + { + "epoch": 0.38, + "learning_rate": 8.836876876876877e-06, + "loss": 0.8693, + "step": 353000 + }, + { + "epoch": 0.38, + "learning_rate": 8.806846846846847e-06, + "loss": 0.8597, + "step": 353500 + }, + { + "epoch": 0.38, + "learning_rate": 8.776816816816818e-06, + "loss": 0.8509, + "step": 354000 + }, + { + "epoch": 0.38, + "learning_rate": 8.746786786786786e-06, + "loss": 0.8521, + "step": 354500 + }, + { + "epoch": 0.38, + "learning_rate": 8.716756756756757e-06, + "loss": 0.847, + "step": 355000 + }, + { + "epoch": 0.38, + "learning_rate": 8.686726726726727e-06, + "loss": 0.8739, + "step": 355500 + }, + { + "epoch": 0.38, + "learning_rate": 8.656696696696697e-06, + "loss": 0.8625, + "step": 356000 + }, + { + "epoch": 0.38, + "learning_rate": 8.626666666666667e-06, + "loss": 0.866, + "step": 356500 + }, + { + "epoch": 0.38, + "learning_rate": 8.596636636636636e-06, + "loss": 0.8592, + "step": 357000 + }, + { + "epoch": 0.38, + "learning_rate": 8.566606606606608e-06, + "loss": 0.8673, + "step": 357500 + }, + { + "epoch": 0.38, + "learning_rate": 8.536576576576577e-06, + "loss": 0.8658, + "step": 358000 + }, + { + "epoch": 0.38, + "learning_rate": 8.506546546546547e-06, + "loss": 0.8646, + "step": 358500 + }, + { + "epoch": 0.38, + "learning_rate": 8.476576576576578e-06, + "loss": 0.8572, + "step": 359000 + }, + { + "epoch": 0.39, + "learning_rate": 8.446546546546546e-06, + "loss": 0.8656, + "step": 359500 + }, + { + "epoch": 0.39, + "learning_rate": 8.416516516516517e-06, + "loss": 0.8523, + "step": 360000 + }, + { + "epoch": 0.39, + "eval_loss": 0.802210807800293, + "eval_runtime": 630.0985, + "eval_samples_per_second": 158.705, + "eval_steps_per_second": 39.676, + "step": 360000 + }, + { + "epoch": 0.39, + "learning_rate": 8.386486486486485e-06, + "loss": 0.8687, + "step": 360500 + }, + { + "epoch": 0.39, + "learning_rate": 8.356456456456457e-06, + "loss": 0.8481, + "step": 361000 + }, + { + "epoch": 0.39, + "learning_rate": 8.326426426426428e-06, + "loss": 0.87, + "step": 361500 + }, + { + "epoch": 0.39, + "learning_rate": 8.296456456456457e-06, + "loss": 0.8499, + "step": 362000 + }, + { + "epoch": 0.39, + "learning_rate": 8.266486486486488e-06, + "loss": 0.8482, + "step": 362500 + }, + { + "epoch": 0.39, + "learning_rate": 8.236456456456456e-06, + "loss": 0.8452, + "step": 363000 + }, + { + "epoch": 0.39, + "learning_rate": 8.206426426426428e-06, + "loss": 0.8575, + "step": 363500 + }, + { + "epoch": 0.39, + "learning_rate": 8.176456456456457e-06, + "loss": 0.8567, + "step": 364000 + }, + { + "epoch": 0.39, + "learning_rate": 8.146426426426426e-06, + "loss": 0.8449, + "step": 364500 + }, + { + "epoch": 0.39, + "learning_rate": 8.116396396396396e-06, + "loss": 0.8634, + "step": 365000 + }, + { + "epoch": 0.39, + "learning_rate": 8.086366366366368e-06, + "loss": 0.8552, + "step": 365500 + }, + { + "epoch": 0.39, + "learning_rate": 8.056336336336337e-06, + "loss": 0.8593, + "step": 366000 + }, + { + "epoch": 0.39, + "learning_rate": 8.026306306306307e-06, + "loss": 0.846, + "step": 366500 + }, + { + "epoch": 0.39, + "learning_rate": 7.996276276276277e-06, + "loss": 0.8575, + "step": 367000 + }, + { + "epoch": 0.39, + "learning_rate": 7.966246246246246e-06, + "loss": 0.8659, + "step": 367500 + }, + { + "epoch": 0.39, + "learning_rate": 7.936216216216216e-06, + "loss": 0.8543, + "step": 368000 + }, + { + "epoch": 0.4, + "learning_rate": 7.906186186186186e-06, + "loss": 0.8501, + "step": 368500 + }, + { + "epoch": 0.4, + "learning_rate": 7.876156156156155e-06, + "loss": 0.8605, + "step": 369000 + }, + { + "epoch": 0.4, + "learning_rate": 7.846126126126127e-06, + "loss": 0.8541, + "step": 369500 + }, + { + "epoch": 0.4, + "learning_rate": 7.816096096096097e-06, + "loss": 0.85, + "step": 370000 + }, + { + "epoch": 0.4, + "eval_loss": 0.798128604888916, + "eval_runtime": 622.5637, + "eval_samples_per_second": 160.626, + "eval_steps_per_second": 40.157, + "step": 370000 + }, + { + "epoch": 0.4, + "learning_rate": 7.786126126126127e-06, + "loss": 0.8495, + "step": 370500 + }, + { + "epoch": 0.4, + "learning_rate": 7.756096096096095e-06, + "loss": 0.8448, + "step": 371000 + }, + { + "epoch": 0.4, + "learning_rate": 7.726066066066067e-06, + "loss": 0.8508, + "step": 371500 + }, + { + "epoch": 0.4, + "learning_rate": 7.696036036036035e-06, + "loss": 0.8564, + "step": 372000 + }, + { + "epoch": 0.4, + "learning_rate": 7.666006006006006e-06, + "loss": 0.8434, + "step": 372500 + }, + { + "epoch": 0.4, + "learning_rate": 7.636036036036037e-06, + "loss": 0.853, + "step": 373000 + }, + { + "epoch": 0.4, + "learning_rate": 7.606006006006006e-06, + "loss": 0.8536, + "step": 373500 + }, + { + "epoch": 0.4, + "learning_rate": 7.5759759759759765e-06, + "loss": 0.8416, + "step": 374000 + }, + { + "epoch": 0.4, + "learning_rate": 7.545945945945945e-06, + "loss": 0.8509, + "step": 374500 + }, + { + "epoch": 0.4, + "learning_rate": 7.515915915915916e-06, + "loss": 0.852, + "step": 375000 + }, + { + "epoch": 0.4, + "learning_rate": 7.4858858858858865e-06, + "loss": 0.8481, + "step": 375500 + }, + { + "epoch": 0.4, + "learning_rate": 7.455855855855856e-06, + "loss": 0.8505, + "step": 376000 + }, + { + "epoch": 0.4, + "learning_rate": 7.425825825825826e-06, + "loss": 0.8425, + "step": 376500 + }, + { + "epoch": 0.4, + "learning_rate": 7.395855855855856e-06, + "loss": 0.8514, + "step": 377000 + }, + { + "epoch": 0.4, + "learning_rate": 7.365825825825826e-06, + "loss": 0.8478, + "step": 377500 + }, + { + "epoch": 0.41, + "learning_rate": 7.335795795795796e-06, + "loss": 0.8468, + "step": 378000 + }, + { + "epoch": 0.41, + "learning_rate": 7.305765765765766e-06, + "loss": 0.8621, + "step": 378500 + }, + { + "epoch": 0.41, + "learning_rate": 7.275795795795796e-06, + "loss": 0.8486, + "step": 379000 + }, + { + "epoch": 0.41, + "learning_rate": 7.245825825825827e-06, + "loss": 0.8491, + "step": 379500 + }, + { + "epoch": 0.41, + "learning_rate": 7.2157957957957965e-06, + "loss": 0.8329, + "step": 380000 + }, + { + "epoch": 0.41, + "eval_loss": 0.795859694480896, + "eval_runtime": 607.5667, + "eval_samples_per_second": 164.591, + "eval_steps_per_second": 41.148, + "step": 380000 + }, + { + "epoch": 0.41, + "learning_rate": 7.185765765765766e-06, + "loss": 0.852, + "step": 380500 + }, + { + "epoch": 0.41, + "learning_rate": 7.155735735735736e-06, + "loss": 0.8493, + "step": 381000 + }, + { + "epoch": 0.41, + "learning_rate": 7.125765765765766e-06, + "loss": 0.8506, + "step": 381500 + }, + { + "epoch": 0.41, + "learning_rate": 7.095735735735736e-06, + "loss": 0.8537, + "step": 382000 + }, + { + "epoch": 0.41, + "learning_rate": 7.065765765765766e-06, + "loss": 0.8387, + "step": 382500 + }, + { + "epoch": 0.41, + "learning_rate": 7.035735735735736e-06, + "loss": 0.844, + "step": 383000 + }, + { + "epoch": 0.41, + "learning_rate": 7.005705705705706e-06, + "loss": 0.8467, + "step": 383500 + }, + { + "epoch": 0.41, + "learning_rate": 6.975675675675676e-06, + "loss": 0.8499, + "step": 384000 + }, + { + "epoch": 0.41, + "learning_rate": 6.945645645645646e-06, + "loss": 0.8463, + "step": 384500 + }, + { + "epoch": 0.41, + "learning_rate": 6.915615615615616e-06, + "loss": 0.8336, + "step": 385000 + }, + { + "epoch": 0.41, + "learning_rate": 6.8855855855855855e-06, + "loss": 0.8423, + "step": 385500 + }, + { + "epoch": 0.41, + "learning_rate": 6.855555555555555e-06, + "loss": 0.8323, + "step": 386000 + }, + { + "epoch": 0.41, + "learning_rate": 6.825525525525526e-06, + "loss": 0.8372, + "step": 386500 + }, + { + "epoch": 0.41, + "learning_rate": 6.795495495495496e-06, + "loss": 0.8564, + "step": 387000 + }, + { + "epoch": 0.42, + "learning_rate": 6.765465465465466e-06, + "loss": 0.8527, + "step": 387500 + }, + { + "epoch": 0.42, + "learning_rate": 6.735435435435436e-06, + "loss": 0.8376, + "step": 388000 + }, + { + "epoch": 0.42, + "learning_rate": 6.7054054054054054e-06, + "loss": 0.835, + "step": 388500 + }, + { + "epoch": 0.42, + "learning_rate": 6.675375375375375e-06, + "loss": 0.8449, + "step": 389000 + }, + { + "epoch": 0.42, + "learning_rate": 6.645345345345346e-06, + "loss": 0.8472, + "step": 389500 + }, + { + "epoch": 0.42, + "learning_rate": 6.615315315315315e-06, + "loss": 0.8473, + "step": 390000 + }, + { + "epoch": 0.42, + "eval_loss": 0.7919498682022095, + "eval_runtime": 570.4158, + "eval_samples_per_second": 175.311, + "eval_steps_per_second": 43.828, + "step": 390000 + }, + { + "epoch": 0.42, + "learning_rate": 6.585285285285285e-06, + "loss": 0.8466, + "step": 390500 + }, + { + "epoch": 0.42, + "learning_rate": 6.555315315315316e-06, + "loss": 0.8593, + "step": 391000 + }, + { + "epoch": 0.42, + "learning_rate": 6.525285285285286e-06, + "loss": 0.8513, + "step": 391500 + }, + { + "epoch": 0.42, + "learning_rate": 6.495255255255255e-06, + "loss": 0.8398, + "step": 392000 + }, + { + "epoch": 0.42, + "learning_rate": 6.465225225225225e-06, + "loss": 0.8447, + "step": 392500 + }, + { + "epoch": 0.42, + "learning_rate": 6.4353153153153154e-06, + "loss": 0.8523, + "step": 393000 + }, + { + "epoch": 0.42, + "learning_rate": 6.405285285285285e-06, + "loss": 0.8435, + "step": 393500 + }, + { + "epoch": 0.42, + "learning_rate": 6.375255255255255e-06, + "loss": 0.8606, + "step": 394000 + }, + { + "epoch": 0.42, + "learning_rate": 6.345225225225225e-06, + "loss": 0.8538, + "step": 394500 + }, + { + "epoch": 0.42, + "learning_rate": 6.315195195195196e-06, + "loss": 0.8616, + "step": 395000 + }, + { + "epoch": 0.42, + "learning_rate": 6.285165165165166e-06, + "loss": 0.8418, + "step": 395500 + }, + { + "epoch": 0.42, + "learning_rate": 6.255135135135135e-06, + "loss": 0.8449, + "step": 396000 + }, + { + "epoch": 0.43, + "learning_rate": 6.225105105105105e-06, + "loss": 0.8382, + "step": 396500 + }, + { + "epoch": 0.43, + "learning_rate": 6.195135135135135e-06, + "loss": 0.8516, + "step": 397000 + }, + { + "epoch": 0.43, + "learning_rate": 6.165105105105105e-06, + "loss": 0.8472, + "step": 397500 + }, + { + "epoch": 0.43, + "learning_rate": 6.135135135135135e-06, + "loss": 0.8354, + "step": 398000 + }, + { + "epoch": 0.43, + "learning_rate": 6.1051051051051045e-06, + "loss": 0.856, + "step": 398500 + }, + { + "epoch": 0.43, + "learning_rate": 6.075075075075076e-06, + "loss": 0.8292, + "step": 399000 + }, + { + "epoch": 0.43, + "learning_rate": 6.045045045045046e-06, + "loss": 0.8403, + "step": 399500 + }, + { + "epoch": 0.43, + "learning_rate": 6.015015015015015e-06, + "loss": 0.8495, + "step": 400000 + }, + { + "epoch": 0.43, + "eval_loss": 0.7893297672271729, + "eval_runtime": 586.3311, + "eval_samples_per_second": 170.552, + "eval_steps_per_second": 42.638, + "step": 400000 + }, + { + "epoch": 0.43, + "learning_rate": 5.985045045045045e-06, + "loss": 0.8557, + "step": 400500 + }, + { + "epoch": 0.43, + "learning_rate": 5.955015015015015e-06, + "loss": 0.8496, + "step": 401000 + }, + { + "epoch": 0.43, + "learning_rate": 5.925045045045045e-06, + "loss": 0.8406, + "step": 401500 + }, + { + "epoch": 0.43, + "learning_rate": 5.895015015015015e-06, + "loss": 0.8466, + "step": 402000 + }, + { + "epoch": 0.43, + "learning_rate": 5.8649849849849845e-06, + "loss": 0.8401, + "step": 402500 + }, + { + "epoch": 0.43, + "learning_rate": 5.834954954954956e-06, + "loss": 0.8327, + "step": 403000 + }, + { + "epoch": 0.43, + "learning_rate": 5.8049249249249256e-06, + "loss": 0.8292, + "step": 403500 + }, + { + "epoch": 0.43, + "learning_rate": 5.774894894894895e-06, + "loss": 0.8479, + "step": 404000 + }, + { + "epoch": 0.43, + "learning_rate": 5.744864864864865e-06, + "loss": 0.8425, + "step": 404500 + }, + { + "epoch": 0.43, + "learning_rate": 5.714834834834835e-06, + "loss": 0.8512, + "step": 405000 + }, + { + "epoch": 0.43, + "learning_rate": 5.684804804804804e-06, + "loss": 0.8414, + "step": 405500 + }, + { + "epoch": 0.44, + "learning_rate": 5.654774774774775e-06, + "loss": 0.8307, + "step": 406000 + }, + { + "epoch": 0.44, + "learning_rate": 5.624744744744745e-06, + "loss": 0.829, + "step": 406500 + }, + { + "epoch": 0.44, + "learning_rate": 5.5947747747747755e-06, + "loss": 0.8441, + "step": 407000 + }, + { + "epoch": 0.44, + "learning_rate": 5.564744744744745e-06, + "loss": 0.8467, + "step": 407500 + }, + { + "epoch": 0.44, + "learning_rate": 5.534714714714715e-06, + "loss": 0.8505, + "step": 408000 + }, + { + "epoch": 0.44, + "learning_rate": 5.504684684684685e-06, + "loss": 0.8398, + "step": 408500 + }, + { + "epoch": 0.44, + "learning_rate": 5.474714714714715e-06, + "loss": 0.8352, + "step": 409000 + }, + { + "epoch": 0.44, + "learning_rate": 5.444684684684684e-06, + "loss": 0.8452, + "step": 409500 + }, + { + "epoch": 0.44, + "learning_rate": 5.414654654654655e-06, + "loss": 0.8468, + "step": 410000 + }, + { + "epoch": 0.44, + "eval_loss": 0.7882465124130249, + "eval_runtime": 564.1275, + "eval_samples_per_second": 177.265, + "eval_steps_per_second": 44.316, + "step": 410000 + }, + { + "epoch": 0.44, + "learning_rate": 5.384684684684685e-06, + "loss": 0.8428, + "step": 410500 + }, + { + "epoch": 0.44, + "learning_rate": 5.3546546546546555e-06, + "loss": 0.8524, + "step": 411000 + }, + { + "epoch": 0.44, + "learning_rate": 5.324624624624625e-06, + "loss": 0.8399, + "step": 411500 + }, + { + "epoch": 0.44, + "learning_rate": 5.294594594594595e-06, + "loss": 0.8295, + "step": 412000 + }, + { + "epoch": 0.44, + "learning_rate": 5.2645645645645646e-06, + "loss": 0.8332, + "step": 412500 + }, + { + "epoch": 0.44, + "learning_rate": 5.234534534534534e-06, + "loss": 0.8444, + "step": 413000 + }, + { + "epoch": 0.44, + "learning_rate": 5.204504504504505e-06, + "loss": 0.8372, + "step": 413500 + }, + { + "epoch": 0.44, + "learning_rate": 5.1744744744744745e-06, + "loss": 0.8402, + "step": 414000 + }, + { + "epoch": 0.44, + "learning_rate": 5.144444444444445e-06, + "loss": 0.8521, + "step": 414500 + }, + { + "epoch": 0.45, + "learning_rate": 5.114414414414415e-06, + "loss": 0.8418, + "step": 415000 + }, + { + "epoch": 0.45, + "learning_rate": 5.0843843843843845e-06, + "loss": 0.8454, + "step": 415500 + }, + { + "epoch": 0.45, + "learning_rate": 5.0544144144144145e-06, + "loss": 0.8176, + "step": 416000 + }, + { + "epoch": 0.45, + "learning_rate": 5.024384384384384e-06, + "loss": 0.8556, + "step": 416500 + }, + { + "epoch": 0.45, + "learning_rate": 4.994354354354355e-06, + "loss": 0.8245, + "step": 417000 + }, + { + "epoch": 0.45, + "learning_rate": 4.9643243243243245e-06, + "loss": 0.8418, + "step": 417500 + }, + { + "epoch": 0.45, + "learning_rate": 4.934294294294294e-06, + "loss": 0.8351, + "step": 418000 + }, + { + "epoch": 0.45, + "learning_rate": 4.904264264264265e-06, + "loss": 0.841, + "step": 418500 + }, + { + "epoch": 0.45, + "learning_rate": 4.874234234234234e-06, + "loss": 0.8335, + "step": 419000 + }, + { + "epoch": 0.45, + "learning_rate": 4.844204204204204e-06, + "loss": 0.8299, + "step": 419500 + }, + { + "epoch": 0.45, + "learning_rate": 4.814234234234234e-06, + "loss": 0.8355, + "step": 420000 + }, + { + "epoch": 0.45, + "eval_loss": 0.7847426533699036, + "eval_runtime": 565.6865, + "eval_samples_per_second": 176.776, + "eval_steps_per_second": 44.194, + "step": 420000 + }, + { + "epoch": 0.45, + "learning_rate": 4.784204204204205e-06, + "loss": 0.8216, + "step": 420500 + }, + { + "epoch": 0.45, + "learning_rate": 4.754174174174174e-06, + "loss": 0.8486, + "step": 421000 + }, + { + "epoch": 0.45, + "learning_rate": 4.724144144144144e-06, + "loss": 0.8411, + "step": 421500 + }, + { + "epoch": 0.45, + "learning_rate": 4.694114114114114e-06, + "loss": 0.8447, + "step": 422000 + }, + { + "epoch": 0.45, + "learning_rate": 4.664144144144145e-06, + "loss": 0.8363, + "step": 422500 + }, + { + "epoch": 0.45, + "learning_rate": 4.634114114114114e-06, + "loss": 0.83, + "step": 423000 + }, + { + "epoch": 0.45, + "learning_rate": 4.604084084084084e-06, + "loss": 0.8398, + "step": 423500 + }, + { + "epoch": 0.45, + "learning_rate": 4.574114114114114e-06, + "loss": 0.8441, + "step": 424000 + }, + { + "epoch": 0.46, + "learning_rate": 4.544084084084084e-06, + "loss": 0.8373, + "step": 424500 + }, + { + "epoch": 0.46, + "learning_rate": 4.514054054054054e-06, + "loss": 0.8403, + "step": 425000 + }, + { + "epoch": 0.46, + "learning_rate": 4.484024024024024e-06, + "loss": 0.8391, + "step": 425500 + }, + { + "epoch": 0.46, + "learning_rate": 4.453993993993994e-06, + "loss": 0.832, + "step": 426000 + }, + { + "epoch": 0.46, + "learning_rate": 4.423963963963964e-06, + "loss": 0.8524, + "step": 426500 + }, + { + "epoch": 0.46, + "learning_rate": 4.393933933933934e-06, + "loss": 0.8396, + "step": 427000 + }, + { + "epoch": 0.46, + "learning_rate": 4.3639039039039046e-06, + "loss": 0.8363, + "step": 427500 + }, + { + "epoch": 0.46, + "learning_rate": 4.333993993993994e-06, + "loss": 0.834, + "step": 428000 + }, + { + "epoch": 0.46, + "learning_rate": 4.303963963963964e-06, + "loss": 0.8476, + "step": 428500 + }, + { + "epoch": 0.46, + "learning_rate": 4.2739339339339335e-06, + "loss": 0.8509, + "step": 429000 + }, + { + "epoch": 0.46, + "learning_rate": 4.243903903903904e-06, + "loss": 0.8368, + "step": 429500 + }, + { + "epoch": 0.46, + "learning_rate": 4.213873873873874e-06, + "loss": 0.8366, + "step": 430000 + }, + { + "epoch": 0.46, + "eval_loss": 0.7822093963623047, + "eval_runtime": 571.4633, + "eval_samples_per_second": 174.989, + "eval_steps_per_second": 43.747, + "step": 430000 + }, + { + "epoch": 0.46, + "learning_rate": 4.183843843843844e-06, + "loss": 0.8383, + "step": 430500 + }, + { + "epoch": 0.46, + "learning_rate": 4.153813813813814e-06, + "loss": 0.8404, + "step": 431000 + }, + { + "epoch": 0.46, + "learning_rate": 4.123783783783784e-06, + "loss": 0.8348, + "step": 431500 + }, + { + "epoch": 0.46, + "learning_rate": 4.093813813813814e-06, + "loss": 0.8332, + "step": 432000 + }, + { + "epoch": 0.46, + "learning_rate": 4.063783783783783e-06, + "loss": 0.8323, + "step": 432500 + }, + { + "epoch": 0.46, + "learning_rate": 4.033753753753754e-06, + "loss": 0.8299, + "step": 433000 + }, + { + "epoch": 0.46, + "learning_rate": 4.003783783783784e-06, + "loss": 0.831, + "step": 433500 + }, + { + "epoch": 0.47, + "learning_rate": 3.973813813813814e-06, + "loss": 0.837, + "step": 434000 + }, + { + "epoch": 0.47, + "learning_rate": 3.9437837837837846e-06, + "loss": 0.8455, + "step": 434500 + }, + { + "epoch": 0.47, + "learning_rate": 3.913753753753754e-06, + "loss": 0.8317, + "step": 435000 + }, + { + "epoch": 0.47, + "learning_rate": 3.883723723723724e-06, + "loss": 0.817, + "step": 435500 + }, + { + "epoch": 0.47, + "learning_rate": 3.853693693693694e-06, + "loss": 0.8346, + "step": 436000 + }, + { + "epoch": 0.47, + "learning_rate": 3.823663663663663e-06, + "loss": 0.8277, + "step": 436500 + }, + { + "epoch": 0.47, + "learning_rate": 3.7936336336336343e-06, + "loss": 0.8357, + "step": 437000 + }, + { + "epoch": 0.47, + "learning_rate": 3.763603603603604e-06, + "loss": 0.8382, + "step": 437500 + }, + { + "epoch": 0.47, + "learning_rate": 3.7335735735735737e-06, + "loss": 0.8501, + "step": 438000 + }, + { + "epoch": 0.47, + "learning_rate": 3.7035435435435434e-06, + "loss": 0.8233, + "step": 438500 + }, + { + "epoch": 0.47, + "learning_rate": 3.673513513513514e-06, + "loss": 0.8342, + "step": 439000 + }, + { + "epoch": 0.47, + "learning_rate": 3.6434834834834837e-06, + "loss": 0.8502, + "step": 439500 + }, + { + "epoch": 0.47, + "learning_rate": 3.6135135135135137e-06, + "loss": 0.8418, + "step": 440000 + }, + { + "epoch": 0.47, + "eval_loss": 0.7793955206871033, + "eval_runtime": 566.9399, + "eval_samples_per_second": 176.386, + "eval_steps_per_second": 44.096, + "step": 440000 + }, + { + "epoch": 0.47, + "learning_rate": 3.5834834834834834e-06, + "loss": 0.8153, + "step": 440500 + }, + { + "epoch": 0.47, + "learning_rate": 3.5534534534534536e-06, + "loss": 0.8435, + "step": 441000 + }, + { + "epoch": 0.47, + "learning_rate": 3.5234234234234237e-06, + "loss": 0.8288, + "step": 441500 + }, + { + "epoch": 0.47, + "learning_rate": 3.4933933933933934e-06, + "loss": 0.8185, + "step": 442000 + }, + { + "epoch": 0.47, + "learning_rate": 3.4633633633633635e-06, + "loss": 0.819, + "step": 442500 + }, + { + "epoch": 0.48, + "learning_rate": 3.4333333333333336e-06, + "loss": 0.8296, + "step": 443000 + }, + { + "epoch": 0.48, + "learning_rate": 3.4033033033033033e-06, + "loss": 0.815, + "step": 443500 + }, + { + "epoch": 0.48, + "learning_rate": 3.3733933933933933e-06, + "loss": 0.8266, + "step": 444000 + }, + { + "epoch": 0.48, + "learning_rate": 3.3433633633633634e-06, + "loss": 0.8252, + "step": 444500 + }, + { + "epoch": 0.48, + "learning_rate": 3.3133333333333335e-06, + "loss": 0.8336, + "step": 445000 + }, + { + "epoch": 0.48, + "learning_rate": 3.2833033033033036e-06, + "loss": 0.8363, + "step": 445500 + }, + { + "epoch": 0.48, + "learning_rate": 3.2532732732732733e-06, + "loss": 0.8297, + "step": 446000 + }, + { + "epoch": 0.48, + "learning_rate": 3.2233033033033034e-06, + "loss": 0.8247, + "step": 446500 + }, + { + "epoch": 0.48, + "learning_rate": 3.1932732732732735e-06, + "loss": 0.8205, + "step": 447000 + }, + { + "epoch": 0.48, + "learning_rate": 3.163243243243243e-06, + "loss": 0.8311, + "step": 447500 + }, + { + "epoch": 0.48, + "learning_rate": 3.1332132132132133e-06, + "loss": 0.8398, + "step": 448000 + }, + { + "epoch": 0.48, + "learning_rate": 3.103183183183183e-06, + "loss": 0.8344, + "step": 448500 + }, + { + "epoch": 0.48, + "learning_rate": 3.0731531531531536e-06, + "loss": 0.8124, + "step": 449000 + }, + { + "epoch": 0.48, + "learning_rate": 3.043183183183183e-06, + "loss": 0.8206, + "step": 449500 + }, + { + "epoch": 0.48, + "learning_rate": 3.0131531531531533e-06, + "loss": 0.8271, + "step": 450000 + }, + { + "epoch": 0.48, + "eval_loss": 0.7778518199920654, + "eval_runtime": 574.7064, + "eval_samples_per_second": 174.002, + "eval_steps_per_second": 43.5, + "step": 450000 + }, + { + "epoch": 0.48, + "learning_rate": 2.983123123123123e-06, + "loss": 0.8326, + "step": 450500 + }, + { + "epoch": 0.48, + "learning_rate": 2.953093093093093e-06, + "loss": 0.8222, + "step": 451000 + }, + { + "epoch": 0.48, + "learning_rate": 2.9230630630630633e-06, + "loss": 0.8365, + "step": 451500 + }, + { + "epoch": 0.48, + "learning_rate": 2.893033033033033e-06, + "loss": 0.8341, + "step": 452000 + }, + { + "epoch": 0.49, + "learning_rate": 2.863003003003003e-06, + "loss": 0.8402, + "step": 452500 + }, + { + "epoch": 0.49, + "learning_rate": 2.8329729729729732e-06, + "loss": 0.8316, + "step": 453000 + }, + { + "epoch": 0.49, + "learning_rate": 2.8030030030030032e-06, + "loss": 0.8329, + "step": 453500 + }, + { + "epoch": 0.49, + "learning_rate": 2.772972972972973e-06, + "loss": 0.8108, + "step": 454000 + }, + { + "epoch": 0.49, + "learning_rate": 2.7429429429429426e-06, + "loss": 0.815, + "step": 454500 + }, + { + "epoch": 0.49, + "learning_rate": 2.712912912912913e-06, + "loss": 0.834, + "step": 455000 + }, + { + "epoch": 0.49, + "learning_rate": 2.683003003003003e-06, + "loss": 0.8178, + "step": 455500 + }, + { + "epoch": 0.49, + "learning_rate": 2.652972972972973e-06, + "loss": 0.8206, + "step": 456000 + }, + { + "epoch": 0.49, + "learning_rate": 2.622942942942943e-06, + "loss": 0.8338, + "step": 456500 + }, + { + "epoch": 0.49, + "learning_rate": 2.592912912912913e-06, + "loss": 0.825, + "step": 457000 + }, + { + "epoch": 0.49, + "learning_rate": 2.5628828828828828e-06, + "loss": 0.818, + "step": 457500 + }, + { + "epoch": 0.49, + "learning_rate": 2.532852852852853e-06, + "loss": 0.8246, + "step": 458000 + }, + { + "epoch": 0.49, + "learning_rate": 2.502822822822823e-06, + "loss": 0.8359, + "step": 458500 + }, + { + "epoch": 0.49, + "learning_rate": 2.472852852852853e-06, + "loss": 0.8253, + "step": 459000 + }, + { + "epoch": 0.49, + "learning_rate": 2.4428228228228228e-06, + "loss": 0.8227, + "step": 459500 + }, + { + "epoch": 0.49, + "learning_rate": 2.412792792792793e-06, + "loss": 0.8394, + "step": 460000 + }, + { + "epoch": 0.49, + "eval_loss": 0.7761884331703186, + "eval_runtime": 598.2224, + "eval_samples_per_second": 167.162, + "eval_steps_per_second": 41.79, + "step": 460000 + }, + { + "epoch": 0.49, + "learning_rate": 2.382762762762763e-06, + "loss": 0.8264, + "step": 460500 + }, + { + "epoch": 0.49, + "learning_rate": 2.3527327327327327e-06, + "loss": 0.8397, + "step": 461000 + }, + { + "epoch": 0.49, + "learning_rate": 2.3227627627627627e-06, + "loss": 0.8178, + "step": 461500 + }, + { + "epoch": 0.5, + "learning_rate": 2.292732732732733e-06, + "loss": 0.8224, + "step": 462000 + }, + { + "epoch": 0.5, + "learning_rate": 2.262702702702703e-06, + "loss": 0.8338, + "step": 462500 + }, + { + "epoch": 0.5, + "learning_rate": 2.2326726726726727e-06, + "loss": 0.8179, + "step": 463000 + }, + { + "epoch": 0.5, + "learning_rate": 2.2027027027027027e-06, + "loss": 0.8332, + "step": 463500 + }, + { + "epoch": 0.5, + "learning_rate": 2.1726726726726724e-06, + "loss": 0.8318, + "step": 464000 + }, + { + "epoch": 0.5, + "learning_rate": 2.142642642642643e-06, + "loss": 0.8298, + "step": 464500 + }, + { + "epoch": 0.5, + "learning_rate": 2.1126126126126127e-06, + "loss": 0.8298, + "step": 465000 + }, + { + "epoch": 0.5, + "learning_rate": 2.082582582582583e-06, + "loss": 0.818, + "step": 465500 + }, + { + "epoch": 0.5, + "learning_rate": 2.0525525525525525e-06, + "loss": 0.8202, + "step": 466000 + }, + { + "epoch": 0.5, + "learning_rate": 2.0225225225225226e-06, + "loss": 0.8174, + "step": 466500 + }, + { + "epoch": 0.5, + "learning_rate": 1.9924924924924928e-06, + "loss": 0.8165, + "step": 467000 + }, + { + "epoch": 0.5, + "learning_rate": 1.9625225225225224e-06, + "loss": 0.8184, + "step": 467500 + }, + { + "epoch": 0.5, + "learning_rate": 1.9324924924924925e-06, + "loss": 0.823, + "step": 468000 + }, + { + "epoch": 0.5, + "learning_rate": 1.9024624624624624e-06, + "loss": 0.8224, + "step": 468500 + }, + { + "epoch": 0.5, + "learning_rate": 1.8724324324324325e-06, + "loss": 0.8266, + "step": 469000 + }, + { + "epoch": 0.5, + "learning_rate": 1.8424024024024024e-06, + "loss": 0.8199, + "step": 469500 + }, + { + "epoch": 0.5, + "learning_rate": 1.8123723723723726e-06, + "loss": 0.8319, + "step": 470000 + }, + { + "epoch": 0.5, + "eval_loss": 0.7738833427429199, + "eval_runtime": 565.2199, + "eval_samples_per_second": 176.922, + "eval_steps_per_second": 44.231, + "step": 470000 + }, + { + "epoch": 0.5, + "learning_rate": 1.7823423423423423e-06, + "loss": 0.834, + "step": 470500 + }, + { + "epoch": 0.51, + "learning_rate": 1.7523123123123124e-06, + "loss": 0.8208, + "step": 471000 + }, + { + "epoch": 0.51, + "learning_rate": 1.7223423423423424e-06, + "loss": 0.8136, + "step": 471500 + }, + { + "epoch": 0.51, + "learning_rate": 1.6923123123123123e-06, + "loss": 0.8286, + "step": 472000 + }, + { + "epoch": 0.51, + "learning_rate": 1.6622822822822823e-06, + "loss": 0.8284, + "step": 472500 + }, + { + "epoch": 0.51, + "learning_rate": 1.6323123123123125e-06, + "loss": 0.8276, + "step": 473000 + }, + { + "epoch": 0.51, + "learning_rate": 1.6022822822822822e-06, + "loss": 0.8387, + "step": 473500 + }, + { + "epoch": 0.51, + "learning_rate": 1.5722522522522523e-06, + "loss": 0.825, + "step": 474000 + }, + { + "epoch": 0.51, + "learning_rate": 1.5422222222222222e-06, + "loss": 0.8339, + "step": 474500 + }, + { + "epoch": 0.51, + "learning_rate": 1.5121921921921924e-06, + "loss": 0.8279, + "step": 475000 + }, + { + "epoch": 0.51, + "learning_rate": 1.482162162162162e-06, + "loss": 0.8162, + "step": 475500 + }, + { + "epoch": 0.51, + "learning_rate": 1.4521321321321322e-06, + "loss": 0.8312, + "step": 476000 + }, + { + "epoch": 0.51, + "learning_rate": 1.422102102102102e-06, + "loss": 0.826, + "step": 476500 + }, + { + "epoch": 0.51, + "learning_rate": 1.3921321321321321e-06, + "loss": 0.8415, + "step": 477000 + }, + { + "epoch": 0.51, + "learning_rate": 1.362102102102102e-06, + "loss": 0.8175, + "step": 477500 + }, + { + "epoch": 0.51, + "learning_rate": 1.3320720720720722e-06, + "loss": 0.8104, + "step": 478000 + }, + { + "epoch": 0.51, + "learning_rate": 1.302042042042042e-06, + "loss": 0.8368, + "step": 478500 + }, + { + "epoch": 0.51, + "learning_rate": 1.272012012012012e-06, + "loss": 0.8226, + "step": 479000 + }, + { + "epoch": 0.51, + "learning_rate": 1.242042042042042e-06, + "loss": 0.819, + "step": 479500 + }, + { + "epoch": 0.51, + "learning_rate": 1.2120120120120121e-06, + "loss": 0.8125, + "step": 480000 + }, + { + "epoch": 0.51, + "eval_loss": 0.7732232809066772, + "eval_runtime": 573.449, + "eval_samples_per_second": 174.383, + "eval_steps_per_second": 43.596, + "step": 480000 + }, + { + "epoch": 0.52, + "learning_rate": 1.1819819819819819e-06, + "loss": 0.8312, + "step": 480500 + }, + { + "epoch": 0.52, + "learning_rate": 1.151951951951952e-06, + "loss": 0.8299, + "step": 481000 + }, + { + "epoch": 0.52, + "learning_rate": 1.121981981981982e-06, + "loss": 0.8081, + "step": 481500 + }, + { + "epoch": 0.52, + "learning_rate": 1.091951951951952e-06, + "loss": 0.8267, + "step": 482000 + }, + { + "epoch": 0.52, + "learning_rate": 1.061981981981982e-06, + "loss": 0.8213, + "step": 482500 + }, + { + "epoch": 0.52, + "learning_rate": 1.031951951951952e-06, + "loss": 0.8024, + "step": 483000 + }, + { + "epoch": 0.52, + "learning_rate": 1.0019219219219218e-06, + "loss": 0.8223, + "step": 483500 + }, + { + "epoch": 0.52, + "learning_rate": 9.71891891891892e-07, + "loss": 0.8254, + "step": 484000 + }, + { + "epoch": 0.52, + "learning_rate": 9.418618618618619e-07, + "loss": 0.8258, + "step": 484500 + }, + { + "epoch": 0.52, + "learning_rate": 9.118318318318318e-07, + "loss": 0.8303, + "step": 485000 + }, + { + "epoch": 0.52, + "learning_rate": 8.818018018018019e-07, + "loss": 0.8197, + "step": 485500 + }, + { + "epoch": 0.52, + "learning_rate": 8.517717717717718e-07, + "loss": 0.824, + "step": 486000 + }, + { + "epoch": 0.52, + "learning_rate": 8.218018018018018e-07, + "loss": 0.8255, + "step": 486500 + }, + { + "epoch": 0.52, + "learning_rate": 7.917717717717718e-07, + "loss": 0.8235, + "step": 487000 + }, + { + "epoch": 0.52, + "learning_rate": 7.618018018018018e-07, + "loss": 0.8036, + "step": 487500 + }, + { + "epoch": 0.52, + "learning_rate": 7.317717717717718e-07, + "loss": 0.818, + "step": 488000 + }, + { + "epoch": 0.52, + "learning_rate": 7.017417417417418e-07, + "loss": 0.8402, + "step": 488500 + }, + { + "epoch": 0.52, + "learning_rate": 6.717117117117117e-07, + "loss": 0.8439, + "step": 489000 + }, + { + "epoch": 0.52, + "learning_rate": 6.416816816816817e-07, + "loss": 0.8121, + "step": 489500 + }, + { + "epoch": 0.53, + "learning_rate": 6.116516516516516e-07, + "loss": 0.8065, + "step": 490000 + }, + { + "epoch": 0.53, + "eval_loss": 0.7721747756004333, + "eval_runtime": 582.9907, + "eval_samples_per_second": 171.529, + "eval_steps_per_second": 42.882, + "step": 490000 + }, + { + "epoch": 0.53, + "learning_rate": 5.816216216216216e-07, + "loss": 0.827, + "step": 490500 + }, + { + "epoch": 0.53, + "learning_rate": 5.515915915915916e-07, + "loss": 0.8138, + "step": 491000 + }, + { + "epoch": 0.53, + "learning_rate": 5.216216216216216e-07, + "loss": 0.8212, + "step": 491500 + }, + { + "epoch": 0.53, + "learning_rate": 4.915915915915916e-07, + "loss": 0.8166, + "step": 492000 + }, + { + "epoch": 0.53, + "learning_rate": 4.6156156156156157e-07, + "loss": 0.8246, + "step": 492500 + }, + { + "epoch": 0.53, + "learning_rate": 4.3153153153153154e-07, + "loss": 0.821, + "step": 493000 + }, + { + "epoch": 0.53, + "learning_rate": 4.015015015015015e-07, + "loss": 0.8265, + "step": 493500 + }, + { + "epoch": 0.53, + "learning_rate": 3.7153153153153153e-07, + "loss": 0.8297, + "step": 494000 + }, + { + "epoch": 0.53, + "learning_rate": 3.415015015015015e-07, + "loss": 0.8053, + "step": 494500 + }, + { + "epoch": 0.53, + "learning_rate": 3.1147147147147147e-07, + "loss": 0.8171, + "step": 495000 + }, + { + "epoch": 0.53, + "learning_rate": 2.8144144144144143e-07, + "loss": 0.8261, + "step": 495500 + }, + { + "epoch": 0.53, + "learning_rate": 2.5147147147147146e-07, + "loss": 0.8216, + "step": 496000 + }, + { + "epoch": 0.53, + "learning_rate": 2.2144144144144145e-07, + "loss": 0.8195, + "step": 496500 + }, + { + "epoch": 0.53, + "learning_rate": 1.9141141141141142e-07, + "loss": 0.8303, + "step": 497000 + }, + { + "epoch": 0.53, + "learning_rate": 1.613813813813814e-07, + "loss": 0.836, + "step": 497500 + }, + { + "epoch": 0.53, + "learning_rate": 1.3135135135135136e-07, + "loss": 0.833, + "step": 498000 + }, + { + "epoch": 0.53, + "learning_rate": 1.0132132132132131e-07, + "loss": 0.8264, + "step": 498500 + }, + { + "epoch": 0.54, + "learning_rate": 7.129129129129129e-08, + "loss": 0.8173, + "step": 499000 + }, + { + "epoch": 0.54, + "learning_rate": 4.1261261261261266e-08, + "loss": 0.8153, + "step": 499500 + }, + { + "epoch": 0.54, + "learning_rate": 1.1291291291291292e-08, + "loss": 0.8383, + "step": 500000 + }, + { + "epoch": 0.54, + "eval_loss": 0.7712512016296387, + "eval_runtime": 603.0742, + "eval_samples_per_second": 165.817, + "eval_steps_per_second": 41.454, + "step": 500000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4998139069534767e-05, + "loss": 0.8465, + "step": 500500 + }, + { + "epoch": 0.54, + "learning_rate": 1.4983131565782893e-05, + "loss": 0.8468, + "step": 501000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4968124062031015e-05, + "loss": 0.8479, + "step": 501500 + }, + { + "epoch": 0.54, + "learning_rate": 1.495311655827914e-05, + "loss": 0.8614, + "step": 502000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4938139069534766e-05, + "loss": 0.8439, + "step": 502500 + }, + { + "epoch": 0.54, + "learning_rate": 1.4923131565782892e-05, + "loss": 0.8538, + "step": 503000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4908124062031016e-05, + "loss": 0.8375, + "step": 503500 + }, + { + "epoch": 0.54, + "learning_rate": 1.4893116558279141e-05, + "loss": 0.8551, + "step": 504000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4878109054527264e-05, + "loss": 0.8507, + "step": 504500 + }, + { + "epoch": 0.54, + "learning_rate": 1.4863101550775387e-05, + "loss": 0.8539, + "step": 505000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4848094047023513e-05, + "loss": 0.8515, + "step": 505500 + }, + { + "epoch": 0.54, + "learning_rate": 1.483311655827914e-05, + "loss": 0.8389, + "step": 506000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4818109054527264e-05, + "loss": 0.8546, + "step": 506500 + }, + { + "epoch": 0.54, + "learning_rate": 1.4803101550775388e-05, + "loss": 0.844, + "step": 507000 + }, + { + "epoch": 0.54, + "learning_rate": 1.4788094047023512e-05, + "loss": 0.8389, + "step": 507500 + }, + { + "epoch": 0.54, + "learning_rate": 1.4773086543271636e-05, + "loss": 0.8542, + "step": 508000 + }, + { + "epoch": 0.55, + "learning_rate": 1.475807903951976e-05, + "loss": 0.8595, + "step": 508500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4743071535767885e-05, + "loss": 0.8535, + "step": 509000 + }, + { + "epoch": 0.55, + "learning_rate": 1.4728064032016008e-05, + "loss": 0.8618, + "step": 509500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4713086543271636e-05, + "loss": 0.8531, + "step": 510000 + }, + { + "epoch": 0.55, + "eval_loss": 0.7975139021873474, + "eval_runtime": 604.9535, + "eval_samples_per_second": 165.302, + "eval_steps_per_second": 41.325, + "step": 510000 + }, + { + "epoch": 0.55, + "learning_rate": 1.4698079039519759e-05, + "loss": 0.844, + "step": 510500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4683071535767884e-05, + "loss": 0.8603, + "step": 511000 + }, + { + "epoch": 0.55, + "learning_rate": 1.4668064032016008e-05, + "loss": 0.8658, + "step": 511500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4653056528264134e-05, + "loss": 0.8574, + "step": 512000 + }, + { + "epoch": 0.55, + "learning_rate": 1.463807903951976e-05, + "loss": 0.8587, + "step": 512500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4623071535767885e-05, + "loss": 0.8496, + "step": 513000 + }, + { + "epoch": 0.55, + "learning_rate": 1.4608064032016009e-05, + "loss": 0.8558, + "step": 513500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4593056528264133e-05, + "loss": 0.8604, + "step": 514000 + }, + { + "epoch": 0.55, + "learning_rate": 1.4578079039519762e-05, + "loss": 0.8619, + "step": 514500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4563101550775387e-05, + "loss": 0.8637, + "step": 515000 + }, + { + "epoch": 0.55, + "learning_rate": 1.4548094047023513e-05, + "loss": 0.8512, + "step": 515500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4533086543271637e-05, + "loss": 0.8522, + "step": 516000 + }, + { + "epoch": 0.55, + "learning_rate": 1.451807903951976e-05, + "loss": 0.8531, + "step": 516500 + }, + { + "epoch": 0.55, + "learning_rate": 1.4503071535767884e-05, + "loss": 0.859, + "step": 517000 + }, + { + "epoch": 0.55, + "learning_rate": 1.4488064032016008e-05, + "loss": 0.8399, + "step": 517500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4473056528264132e-05, + "loss": 0.8597, + "step": 518000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4458049024512256e-05, + "loss": 0.8739, + "step": 518500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4443071535767885e-05, + "loss": 0.8528, + "step": 519000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4428064032016007e-05, + "loss": 0.8613, + "step": 519500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4413056528264133e-05, + "loss": 0.8647, + "step": 520000 + }, + { + "epoch": 0.56, + "eval_loss": 0.8006455302238464, + "eval_runtime": 614.931, + "eval_samples_per_second": 162.62, + "eval_steps_per_second": 40.655, + "step": 520000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4398049024512257e-05, + "loss": 0.8663, + "step": 520500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4383071535767884e-05, + "loss": 0.8476, + "step": 521000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4368064032016008e-05, + "loss": 0.8491, + "step": 521500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4353056528264133e-05, + "loss": 0.8642, + "step": 522000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4338049024512256e-05, + "loss": 0.8615, + "step": 522500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4323071535767884e-05, + "loss": 0.8509, + "step": 523000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4308064032016008e-05, + "loss": 0.8413, + "step": 523500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4293056528264132e-05, + "loss": 0.8668, + "step": 524000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4278049024512256e-05, + "loss": 0.8579, + "step": 524500 + }, + { + "epoch": 0.56, + "learning_rate": 1.426304152076038e-05, + "loss": 0.8592, + "step": 525000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4248034017008506e-05, + "loss": 0.8657, + "step": 525500 + }, + { + "epoch": 0.56, + "learning_rate": 1.4233026513256628e-05, + "loss": 0.8439, + "step": 526000 + }, + { + "epoch": 0.56, + "learning_rate": 1.4218019009504754e-05, + "loss": 0.8613, + "step": 526500 + }, + { + "epoch": 0.57, + "learning_rate": 1.4203041520760379e-05, + "loss": 0.8532, + "step": 527000 + }, + { + "epoch": 0.57, + "learning_rate": 1.4188034017008505e-05, + "loss": 0.8533, + "step": 527500 + }, + { + "epoch": 0.57, + "learning_rate": 1.4173026513256629e-05, + "loss": 0.8421, + "step": 528000 + }, + { + "epoch": 0.57, + "learning_rate": 1.4158019009504754e-05, + "loss": 0.8463, + "step": 528500 + }, + { + "epoch": 0.57, + "learning_rate": 1.4143041520760381e-05, + "loss": 0.8644, + "step": 529000 + }, + { + "epoch": 0.57, + "learning_rate": 1.4128034017008505e-05, + "loss": 0.8666, + "step": 529500 + }, + { + "epoch": 0.57, + "learning_rate": 1.4113056528264132e-05, + "loss": 0.8528, + "step": 530000 + }, + { + "epoch": 0.57, + "eval_loss": 0.7983748316764832, + "eval_runtime": 645.3764, + "eval_samples_per_second": 154.948, + "eval_steps_per_second": 38.737, + "step": 530000 + }, + { + "epoch": 0.57, + "learning_rate": 1.4098049024512256e-05, + "loss": 0.8359, + "step": 530500 + }, + { + "epoch": 0.57, + "learning_rate": 1.4083041520760382e-05, + "loss": 0.8603, + "step": 531000 + }, + { + "epoch": 0.57, + "learning_rate": 1.4068034017008504e-05, + "loss": 0.8479, + "step": 531500 + }, + { + "epoch": 0.57, + "learning_rate": 1.4053026513256628e-05, + "loss": 0.8581, + "step": 532000 + }, + { + "epoch": 0.57, + "learning_rate": 1.4038019009504754e-05, + "loss": 0.8553, + "step": 532500 + }, + { + "epoch": 0.57, + "learning_rate": 1.402304152076038e-05, + "loss": 0.8564, + "step": 533000 + }, + { + "epoch": 0.57, + "learning_rate": 1.4008034017008505e-05, + "loss": 0.855, + "step": 533500 + }, + { + "epoch": 0.57, + "learning_rate": 1.3993026513256629e-05, + "loss": 0.8573, + "step": 534000 + }, + { + "epoch": 0.57, + "learning_rate": 1.3978019009504753e-05, + "loss": 0.8689, + "step": 534500 + }, + { + "epoch": 0.57, + "learning_rate": 1.3963041520760381e-05, + "loss": 0.8597, + "step": 535000 + }, + { + "epoch": 0.57, + "learning_rate": 1.3948034017008505e-05, + "loss": 0.8429, + "step": 535500 + }, + { + "epoch": 0.57, + "learning_rate": 1.3933026513256627e-05, + "loss": 0.8561, + "step": 536000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3918019009504753e-05, + "loss": 0.8651, + "step": 536500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3903011505752877e-05, + "loss": 0.8675, + "step": 537000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3888004002001001e-05, + "loss": 0.8466, + "step": 537500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3872996498249125e-05, + "loss": 0.8629, + "step": 538000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3857988994497249e-05, + "loss": 0.8458, + "step": 538500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3843011505752876e-05, + "loss": 0.8502, + "step": 539000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3828004002001e-05, + "loss": 0.8528, + "step": 539500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3812996498249125e-05, + "loss": 0.8557, + "step": 540000 + }, + { + "epoch": 0.58, + "eval_loss": 0.7993029952049255, + "eval_runtime": 619.1901, + "eval_samples_per_second": 161.501, + "eval_steps_per_second": 40.375, + "step": 540000 + }, + { + "epoch": 0.58, + "learning_rate": 1.379798899449725e-05, + "loss": 0.8428, + "step": 540500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3783011505752876e-05, + "loss": 0.8375, + "step": 541000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3768004002001e-05, + "loss": 0.8569, + "step": 541500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3752996498249126e-05, + "loss": 0.8465, + "step": 542000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3738019009504753e-05, + "loss": 0.8545, + "step": 542500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3723011505752877e-05, + "loss": 0.8584, + "step": 543000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3708004002001001e-05, + "loss": 0.855, + "step": 543500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3692996498249125e-05, + "loss": 0.8386, + "step": 544000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3677988994497249e-05, + "loss": 0.8451, + "step": 544500 + }, + { + "epoch": 0.58, + "learning_rate": 1.3663011505752876e-05, + "loss": 0.8503, + "step": 545000 + }, + { + "epoch": 0.58, + "learning_rate": 1.3648004002001002e-05, + "loss": 0.8647, + "step": 545500 + }, + { + "epoch": 0.59, + "learning_rate": 1.3632996498249125e-05, + "loss": 0.8711, + "step": 546000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3617988994497248e-05, + "loss": 0.8475, + "step": 546500 + }, + { + "epoch": 0.59, + "learning_rate": 1.3602981490745373e-05, + "loss": 0.8646, + "step": 547000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3587973986993497e-05, + "loss": 0.8595, + "step": 547500 + }, + { + "epoch": 0.59, + "learning_rate": 1.3572966483241621e-05, + "loss": 0.8466, + "step": 548000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3557958979489745e-05, + "loss": 0.8538, + "step": 548500 + }, + { + "epoch": 0.59, + "learning_rate": 1.3542981490745374e-05, + "loss": 0.849, + "step": 549000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3527973986993498e-05, + "loss": 0.8513, + "step": 549500 + }, + { + "epoch": 0.59, + "learning_rate": 1.351296648324162e-05, + "loss": 0.8447, + "step": 550000 + }, + { + "epoch": 0.59, + "eval_loss": 0.7979006171226501, + "eval_runtime": 598.1411, + "eval_samples_per_second": 167.185, + "eval_steps_per_second": 41.796, + "step": 550000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3497958979489746e-05, + "loss": 0.839, + "step": 550500 + }, + { + "epoch": 0.59, + "learning_rate": 1.3482981490745373e-05, + "loss": 0.8704, + "step": 551000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3467973986993497e-05, + "loss": 0.8562, + "step": 551500 + }, + { + "epoch": 0.59, + "learning_rate": 1.345296648324162e-05, + "loss": 0.8529, + "step": 552000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3437958979489746e-05, + "loss": 0.8441, + "step": 552500 + }, + { + "epoch": 0.59, + "learning_rate": 1.3422981490745373e-05, + "loss": 0.856, + "step": 553000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3407973986993497e-05, + "loss": 0.849, + "step": 553500 + }, + { + "epoch": 0.59, + "learning_rate": 1.3392966483241621e-05, + "loss": 0.8478, + "step": 554000 + }, + { + "epoch": 0.59, + "learning_rate": 1.3377958979489745e-05, + "loss": 0.8524, + "step": 554500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3362951475737869e-05, + "loss": 0.8477, + "step": 555000 + }, + { + "epoch": 0.6, + "learning_rate": 1.3347943971985995e-05, + "loss": 0.8488, + "step": 555500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3332996498249125e-05, + "loss": 0.8426, + "step": 556000 + }, + { + "epoch": 0.6, + "learning_rate": 1.3317988994497249e-05, + "loss": 0.8702, + "step": 556500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3302981490745373e-05, + "loss": 0.854, + "step": 557000 + }, + { + "epoch": 0.6, + "learning_rate": 1.3287973986993497e-05, + "loss": 0.8539, + "step": 557500 + }, + { + "epoch": 0.6, + "learning_rate": 1.327296648324162e-05, + "loss": 0.8529, + "step": 558000 + }, + { + "epoch": 0.6, + "learning_rate": 1.3257958979489745e-05, + "loss": 0.8548, + "step": 558500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3242951475737869e-05, + "loss": 0.8587, + "step": 559000 + }, + { + "epoch": 0.6, + "learning_rate": 1.3227943971985994e-05, + "loss": 0.8469, + "step": 559500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3212936468234118e-05, + "loss": 0.8529, + "step": 560000 + }, + { + "epoch": 0.6, + "eval_loss": 0.7950595021247864, + "eval_runtime": 602.8951, + "eval_samples_per_second": 165.866, + "eval_steps_per_second": 41.467, + "step": 560000 + }, + { + "epoch": 0.6, + "learning_rate": 1.319792896448224e-05, + "loss": 0.8347, + "step": 560500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3182921460730366e-05, + "loss": 0.8425, + "step": 561000 + }, + { + "epoch": 0.6, + "learning_rate": 1.316791395697849e-05, + "loss": 0.8658, + "step": 561500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3152936468234117e-05, + "loss": 0.8605, + "step": 562000 + }, + { + "epoch": 0.6, + "learning_rate": 1.3137928964482241e-05, + "loss": 0.8643, + "step": 562500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3122921460730367e-05, + "loss": 0.8543, + "step": 563000 + }, + { + "epoch": 0.6, + "learning_rate": 1.3107913956978489e-05, + "loss": 0.8436, + "step": 563500 + }, + { + "epoch": 0.6, + "learning_rate": 1.3092906453226613e-05, + "loss": 0.8369, + "step": 564000 + }, + { + "epoch": 0.61, + "learning_rate": 1.3077928964482241e-05, + "loss": 0.8563, + "step": 564500 + }, + { + "epoch": 0.61, + "learning_rate": 1.3062921460730365e-05, + "loss": 0.8431, + "step": 565000 + }, + { + "epoch": 0.61, + "learning_rate": 1.304791395697849e-05, + "loss": 0.8434, + "step": 565500 + }, + { + "epoch": 0.61, + "learning_rate": 1.3032906453226613e-05, + "loss": 0.8527, + "step": 566000 + }, + { + "epoch": 0.61, + "learning_rate": 1.3017898949474737e-05, + "loss": 0.8626, + "step": 566500 + }, + { + "epoch": 0.61, + "learning_rate": 1.3002891445722861e-05, + "loss": 0.8644, + "step": 567000 + }, + { + "epoch": 0.61, + "learning_rate": 1.2987883941970987e-05, + "loss": 0.8478, + "step": 567500 + }, + { + "epoch": 0.61, + "learning_rate": 1.297287643821911e-05, + "loss": 0.8429, + "step": 568000 + }, + { + "epoch": 0.61, + "learning_rate": 1.2957898949474738e-05, + "loss": 0.8492, + "step": 568500 + }, + { + "epoch": 0.61, + "learning_rate": 1.2942891445722862e-05, + "loss": 0.8498, + "step": 569000 + }, + { + "epoch": 0.61, + "learning_rate": 1.2927883941970987e-05, + "loss": 0.8469, + "step": 569500 + }, + { + "epoch": 0.61, + "learning_rate": 1.2912906453226614e-05, + "loss": 0.8568, + "step": 570000 + }, + { + "epoch": 0.61, + "eval_loss": 0.7931195497512817, + "eval_runtime": 601.1433, + "eval_samples_per_second": 166.35, + "eval_steps_per_second": 41.587, + "step": 570000 + }, + { + "epoch": 0.61, + "learning_rate": 1.2897898949474738e-05, + "loss": 0.8439, + "step": 570500 + }, + { + "epoch": 0.61, + "learning_rate": 1.288289144572286e-05, + "loss": 0.8329, + "step": 571000 + }, + { + "epoch": 0.61, + "learning_rate": 1.2867883941970986e-05, + "loss": 0.864, + "step": 571500 + }, + { + "epoch": 0.61, + "learning_rate": 1.285287643821911e-05, + "loss": 0.8517, + "step": 572000 + }, + { + "epoch": 0.61, + "learning_rate": 1.2837898949474737e-05, + "loss": 0.8559, + "step": 572500 + }, + { + "epoch": 0.61, + "learning_rate": 1.2822891445722861e-05, + "loss": 0.8606, + "step": 573000 + }, + { + "epoch": 0.61, + "learning_rate": 1.280791395697849e-05, + "loss": 0.8449, + "step": 573500 + }, + { + "epoch": 0.62, + "learning_rate": 1.2792906453226614e-05, + "loss": 0.8377, + "step": 574000 + }, + { + "epoch": 0.62, + "learning_rate": 1.2777898949474738e-05, + "loss": 0.8525, + "step": 574500 + }, + { + "epoch": 0.62, + "learning_rate": 1.2762891445722862e-05, + "loss": 0.8573, + "step": 575000 + }, + { + "epoch": 0.62, + "learning_rate": 1.2747883941970986e-05, + "loss": 0.851, + "step": 575500 + }, + { + "epoch": 0.62, + "learning_rate": 1.273287643821911e-05, + "loss": 0.8596, + "step": 576000 + }, + { + "epoch": 0.62, + "learning_rate": 1.2717898949474738e-05, + "loss": 0.8489, + "step": 576500 + }, + { + "epoch": 0.62, + "learning_rate": 1.270289144572286e-05, + "loss": 0.8385, + "step": 577000 + }, + { + "epoch": 0.62, + "learning_rate": 1.2687883941970986e-05, + "loss": 0.839, + "step": 577500 + }, + { + "epoch": 0.62, + "learning_rate": 1.267287643821911e-05, + "loss": 0.8542, + "step": 578000 + }, + { + "epoch": 0.62, + "learning_rate": 1.2657868934467232e-05, + "loss": 0.8386, + "step": 578500 + }, + { + "epoch": 0.62, + "learning_rate": 1.2642861430715358e-05, + "loss": 0.8556, + "step": 579000 + }, + { + "epoch": 0.62, + "learning_rate": 1.2627853926963482e-05, + "loss": 0.8598, + "step": 579500 + }, + { + "epoch": 0.62, + "learning_rate": 1.2612846423211606e-05, + "loss": 0.858, + "step": 580000 + }, + { + "epoch": 0.62, + "eval_loss": 0.7923389077186584, + "eval_runtime": 603.6948, + "eval_samples_per_second": 165.647, + "eval_steps_per_second": 41.412, + "step": 580000 + }, + { + "epoch": 0.62, + "learning_rate": 1.259783891945973e-05, + "loss": 0.8411, + "step": 580500 + }, + { + "epoch": 0.62, + "learning_rate": 1.2582831415707854e-05, + "loss": 0.8443, + "step": 581000 + }, + { + "epoch": 0.62, + "learning_rate": 1.2567853926963483e-05, + "loss": 0.8564, + "step": 581500 + }, + { + "epoch": 0.62, + "learning_rate": 1.2552846423211606e-05, + "loss": 0.8485, + "step": 582000 + }, + { + "epoch": 0.62, + "learning_rate": 1.253783891945973e-05, + "loss": 0.8542, + "step": 582500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2522831415707854e-05, + "loss": 0.846, + "step": 583000 + }, + { + "epoch": 0.63, + "learning_rate": 1.2507823911955978e-05, + "loss": 0.8492, + "step": 583500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2492816408204102e-05, + "loss": 0.8476, + "step": 584000 + }, + { + "epoch": 0.63, + "learning_rate": 1.2477808904452226e-05, + "loss": 0.85, + "step": 584500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2462801400700352e-05, + "loss": 0.8449, + "step": 585000 + }, + { + "epoch": 0.63, + "learning_rate": 1.2447853926963482e-05, + "loss": 0.8547, + "step": 585500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2432846423211608e-05, + "loss": 0.835, + "step": 586000 + }, + { + "epoch": 0.63, + "learning_rate": 1.241783891945973e-05, + "loss": 0.8502, + "step": 586500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2402831415707854e-05, + "loss": 0.8596, + "step": 587000 + }, + { + "epoch": 0.63, + "learning_rate": 1.238782391195598e-05, + "loss": 0.8586, + "step": 587500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2372816408204102e-05, + "loss": 0.8478, + "step": 588000 + }, + { + "epoch": 0.63, + "learning_rate": 1.2357808904452226e-05, + "loss": 0.8472, + "step": 588500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2342831415707854e-05, + "loss": 0.8523, + "step": 589000 + }, + { + "epoch": 0.63, + "learning_rate": 1.2327823911955978e-05, + "loss": 0.8395, + "step": 589500 + }, + { + "epoch": 0.63, + "learning_rate": 1.2312816408204102e-05, + "loss": 0.8573, + "step": 590000 + }, + { + "epoch": 0.63, + "eval_loss": 0.7895151376724243, + "eval_runtime": 593.5374, + "eval_samples_per_second": 168.481, + "eval_steps_per_second": 42.12, + "step": 590000 + }, + { + "epoch": 0.63, + "learning_rate": 1.2297808904452226e-05, + "loss": 0.8431, + "step": 590500 + }, + { + "epoch": 0.63, + "learning_rate": 1.228280140070035e-05, + "loss": 0.8416, + "step": 591000 + }, + { + "epoch": 0.63, + "learning_rate": 1.2267793896948474e-05, + "loss": 0.8563, + "step": 591500 + }, + { + "epoch": 0.63, + "learning_rate": 1.22527863931966e-05, + "loss": 0.8352, + "step": 592000 + }, + { + "epoch": 0.64, + "learning_rate": 1.2237778889444724e-05, + "loss": 0.8399, + "step": 592500 + }, + { + "epoch": 0.64, + "learning_rate": 1.222280140070035e-05, + "loss": 0.841, + "step": 593000 + }, + { + "epoch": 0.64, + "learning_rate": 1.220782391195598e-05, + "loss": 0.8455, + "step": 593500 + }, + { + "epoch": 0.64, + "learning_rate": 1.2192816408204102e-05, + "loss": 0.8493, + "step": 594000 + }, + { + "epoch": 0.64, + "learning_rate": 1.2177808904452226e-05, + "loss": 0.8465, + "step": 594500 + }, + { + "epoch": 0.64, + "learning_rate": 1.2162801400700351e-05, + "loss": 0.8478, + "step": 595000 + }, + { + "epoch": 0.64, + "learning_rate": 1.2147793896948473e-05, + "loss": 0.8447, + "step": 595500 + }, + { + "epoch": 0.64, + "learning_rate": 1.2132786393196599e-05, + "loss": 0.8333, + "step": 596000 + }, + { + "epoch": 0.64, + "learning_rate": 1.2117778889444723e-05, + "loss": 0.8496, + "step": 596500 + }, + { + "epoch": 0.64, + "learning_rate": 1.2102771385692847e-05, + "loss": 0.8357, + "step": 597000 + }, + { + "epoch": 0.64, + "learning_rate": 1.2087793896948474e-05, + "loss": 0.8534, + "step": 597500 + }, + { + "epoch": 0.64, + "learning_rate": 1.20727863931966e-05, + "loss": 0.8443, + "step": 598000 + }, + { + "epoch": 0.64, + "learning_rate": 1.2057778889444722e-05, + "loss": 0.8482, + "step": 598500 + }, + { + "epoch": 0.64, + "learning_rate": 1.2042771385692846e-05, + "loss": 0.8462, + "step": 599000 + }, + { + "epoch": 0.64, + "learning_rate": 1.2027823911955978e-05, + "loss": 0.8487, + "step": 599500 + }, + { + "epoch": 0.64, + "learning_rate": 1.2012816408204102e-05, + "loss": 0.8527, + "step": 600000 + }, + { + "epoch": 0.64, + "eval_loss": 0.7874695062637329, + "eval_runtime": 597.2742, + "eval_samples_per_second": 167.427, + "eval_steps_per_second": 41.857, + "step": 600000 + }, + { + "epoch": 0.64, + "learning_rate": 1.1997808904452227e-05, + "loss": 0.8327, + "step": 600500 + }, + { + "epoch": 0.64, + "learning_rate": 1.1982801400700351e-05, + "loss": 0.8275, + "step": 601000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1967793896948473e-05, + "loss": 0.8332, + "step": 601500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1952786393196599e-05, + "loss": 0.8389, + "step": 602000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1937778889444723e-05, + "loss": 0.8557, + "step": 602500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1922771385692845e-05, + "loss": 0.8465, + "step": 603000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1907793896948474e-05, + "loss": 0.8416, + "step": 603500 + }, + { + "epoch": 0.65, + "learning_rate": 1.18927863931966e-05, + "loss": 0.8341, + "step": 604000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1877778889444722e-05, + "loss": 0.8451, + "step": 604500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1862771385692846e-05, + "loss": 0.8517, + "step": 605000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1847763881940971e-05, + "loss": 0.8417, + "step": 605500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1832756378189095e-05, + "loss": 0.8435, + "step": 606000 + }, + { + "epoch": 0.65, + "learning_rate": 1.181774887443722e-05, + "loss": 0.855, + "step": 606500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1802741370685343e-05, + "loss": 0.8568, + "step": 607000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1787793896948474e-05, + "loss": 0.8465, + "step": 607500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1772786393196599e-05, + "loss": 0.8427, + "step": 608000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1757778889444723e-05, + "loss": 0.8507, + "step": 608500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1742771385692845e-05, + "loss": 0.8348, + "step": 609000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1727763881940971e-05, + "loss": 0.8327, + "step": 609500 + }, + { + "epoch": 0.65, + "learning_rate": 1.1712756378189095e-05, + "loss": 0.8321, + "step": 610000 + }, + { + "epoch": 0.65, + "eval_loss": 0.7858200073242188, + "eval_runtime": 639.6935, + "eval_samples_per_second": 156.325, + "eval_steps_per_second": 39.081, + "step": 610000 + }, + { + "epoch": 0.65, + "learning_rate": 1.1697748874437219e-05, + "loss": 0.8321, + "step": 610500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1682741370685343e-05, + "loss": 0.8484, + "step": 611000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1667733866933467e-05, + "loss": 0.8498, + "step": 611500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1652726363181592e-05, + "loss": 0.8476, + "step": 612000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1637718859429715e-05, + "loss": 0.8507, + "step": 612500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1622711355677838e-05, + "loss": 0.8308, + "step": 613000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1607733866933467e-05, + "loss": 0.8443, + "step": 613500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1592726363181591e-05, + "loss": 0.8424, + "step": 614000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1577718859429715e-05, + "loss": 0.8445, + "step": 614500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1562711355677839e-05, + "loss": 0.8503, + "step": 615000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1547703851925963e-05, + "loss": 0.8312, + "step": 615500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1532696348174087e-05, + "loss": 0.8474, + "step": 616000 + }, + { + "epoch": 0.66, + "learning_rate": 1.151768884442221e-05, + "loss": 0.8477, + "step": 616500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1502711355677838e-05, + "loss": 0.8313, + "step": 617000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1487703851925964e-05, + "loss": 0.8423, + "step": 617500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1472696348174087e-05, + "loss": 0.8389, + "step": 618000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1457688844422211e-05, + "loss": 0.8361, + "step": 618500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1442711355677838e-05, + "loss": 0.847, + "step": 619000 + }, + { + "epoch": 0.66, + "learning_rate": 1.1427703851925964e-05, + "loss": 0.8392, + "step": 619500 + }, + { + "epoch": 0.66, + "learning_rate": 1.1412696348174086e-05, + "loss": 0.828, + "step": 620000 + }, + { + "epoch": 0.66, + "eval_loss": 0.7836601734161377, + "eval_runtime": 599.8827, + "eval_samples_per_second": 166.699, + "eval_steps_per_second": 41.675, + "step": 620000 + }, + { + "epoch": 0.67, + "learning_rate": 1.1397688844422212e-05, + "loss": 0.8334, + "step": 620500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1382681340670336e-05, + "loss": 0.8389, + "step": 621000 + }, + { + "epoch": 0.67, + "learning_rate": 1.136767383691846e-05, + "loss": 0.8306, + "step": 621500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1352666333166584e-05, + "loss": 0.8488, + "step": 622000 + }, + { + "epoch": 0.67, + "learning_rate": 1.1337658829414708e-05, + "loss": 0.8244, + "step": 622500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1322681340670335e-05, + "loss": 0.8449, + "step": 623000 + }, + { + "epoch": 0.67, + "learning_rate": 1.1307673836918459e-05, + "loss": 0.8417, + "step": 623500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1292666333166584e-05, + "loss": 0.8319, + "step": 624000 + }, + { + "epoch": 0.67, + "learning_rate": 1.1277658829414708e-05, + "loss": 0.8309, + "step": 624500 + }, + { + "epoch": 0.67, + "learning_rate": 1.126265132566283e-05, + "loss": 0.8326, + "step": 625000 + }, + { + "epoch": 0.67, + "learning_rate": 1.124767383691846e-05, + "loss": 0.8243, + "step": 625500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1232696348174086e-05, + "loss": 0.834, + "step": 626000 + }, + { + "epoch": 0.67, + "learning_rate": 1.1217688844422212e-05, + "loss": 0.8355, + "step": 626500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1202711355677839e-05, + "loss": 0.8326, + "step": 627000 + }, + { + "epoch": 0.67, + "learning_rate": 1.1187703851925963e-05, + "loss": 0.8316, + "step": 627500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1172696348174087e-05, + "loss": 0.8426, + "step": 628000 + }, + { + "epoch": 0.67, + "learning_rate": 1.1157688844422213e-05, + "loss": 0.8407, + "step": 628500 + }, + { + "epoch": 0.67, + "learning_rate": 1.1142681340670335e-05, + "loss": 0.8292, + "step": 629000 + }, + { + "epoch": 0.68, + "learning_rate": 1.1127673836918459e-05, + "loss": 0.8412, + "step": 629500 + }, + { + "epoch": 0.68, + "learning_rate": 1.1112666333166584e-05, + "loss": 0.8378, + "step": 630000 + }, + { + "epoch": 0.68, + "eval_loss": 0.7810372114181519, + "eval_runtime": 608.8449, + "eval_samples_per_second": 164.245, + "eval_steps_per_second": 41.061, + "step": 630000 + }, + { + "epoch": 0.68, + "learning_rate": 1.1097658829414707e-05, + "loss": 0.832, + "step": 630500 + }, + { + "epoch": 0.68, + "learning_rate": 1.108265132566283e-05, + "loss": 0.8393, + "step": 631000 + }, + { + "epoch": 0.68, + "learning_rate": 1.106767383691846e-05, + "loss": 0.8387, + "step": 631500 + }, + { + "epoch": 0.68, + "learning_rate": 1.1052666333166583e-05, + "loss": 0.8366, + "step": 632000 + }, + { + "epoch": 0.68, + "learning_rate": 1.1037688844422212e-05, + "loss": 0.8145, + "step": 632500 + }, + { + "epoch": 0.68, + "learning_rate": 1.1022681340670336e-05, + "loss": 0.8275, + "step": 633000 + }, + { + "epoch": 0.68, + "learning_rate": 1.1007673836918458e-05, + "loss": 0.8372, + "step": 633500 + }, + { + "epoch": 0.68, + "learning_rate": 1.0992666333166584e-05, + "loss": 0.8224, + "step": 634000 + }, + { + "epoch": 0.68, + "learning_rate": 1.0977658829414708e-05, + "loss": 0.8247, + "step": 634500 + }, + { + "epoch": 0.68, + "learning_rate": 1.0962651325662832e-05, + "loss": 0.8397, + "step": 635000 + }, + { + "epoch": 0.68, + "learning_rate": 1.0947643821910956e-05, + "loss": 0.8302, + "step": 635500 + }, + { + "epoch": 0.68, + "learning_rate": 1.0932666333166584e-05, + "loss": 0.8334, + "step": 636000 + }, + { + "epoch": 0.68, + "learning_rate": 1.0917658829414707e-05, + "loss": 0.8377, + "step": 636500 + }, + { + "epoch": 0.68, + "learning_rate": 1.0902651325662832e-05, + "loss": 0.816, + "step": 637000 + }, + { + "epoch": 0.68, + "learning_rate": 1.0887643821910956e-05, + "loss": 0.8293, + "step": 637500 + }, + { + "epoch": 0.68, + "learning_rate": 1.087263631815908e-05, + "loss": 0.8414, + "step": 638000 + }, + { + "epoch": 0.68, + "learning_rate": 1.0857658829414707e-05, + "loss": 0.8348, + "step": 638500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0842651325662833e-05, + "loss": 0.8368, + "step": 639000 + }, + { + "epoch": 0.69, + "learning_rate": 1.0827643821910955e-05, + "loss": 0.8547, + "step": 639500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0812636318159079e-05, + "loss": 0.842, + "step": 640000 + }, + { + "epoch": 0.69, + "eval_loss": 0.7787752151489258, + "eval_runtime": 597.0129, + "eval_samples_per_second": 167.501, + "eval_steps_per_second": 41.875, + "step": 640000 + }, + { + "epoch": 0.69, + "learning_rate": 1.0797628814407205e-05, + "loss": 0.8327, + "step": 640500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0782621310655329e-05, + "loss": 0.8286, + "step": 641000 + }, + { + "epoch": 0.69, + "learning_rate": 1.076761380690345e-05, + "loss": 0.8293, + "step": 641500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0752606303151576e-05, + "loss": 0.8377, + "step": 642000 + }, + { + "epoch": 0.69, + "learning_rate": 1.07375987993997e-05, + "loss": 0.8163, + "step": 642500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0722591295647823e-05, + "loss": 0.8428, + "step": 643000 + }, + { + "epoch": 0.69, + "learning_rate": 1.0707583791895948e-05, + "loss": 0.8245, + "step": 643500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0692576288144072e-05, + "loss": 0.8317, + "step": 644000 + }, + { + "epoch": 0.69, + "learning_rate": 1.06775987993997e-05, + "loss": 0.8223, + "step": 644500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0662591295647825e-05, + "loss": 0.8298, + "step": 645000 + }, + { + "epoch": 0.69, + "learning_rate": 1.0647583791895949e-05, + "loss": 0.8304, + "step": 645500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0632576288144073e-05, + "loss": 0.8417, + "step": 646000 + }, + { + "epoch": 0.69, + "learning_rate": 1.06175987993997e-05, + "loss": 0.8283, + "step": 646500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0602591295647825e-05, + "loss": 0.8296, + "step": 647000 + }, + { + "epoch": 0.69, + "learning_rate": 1.0587583791895948e-05, + "loss": 0.825, + "step": 647500 + }, + { + "epoch": 0.69, + "learning_rate": 1.0572576288144072e-05, + "loss": 0.8312, + "step": 648000 + }, + { + "epoch": 0.7, + "learning_rate": 1.0557568784392197e-05, + "loss": 0.8393, + "step": 648500 + }, + { + "epoch": 0.7, + "learning_rate": 1.0542561280640321e-05, + "loss": 0.838, + "step": 649000 + }, + { + "epoch": 0.7, + "learning_rate": 1.0527553776888443e-05, + "loss": 0.8271, + "step": 649500 + }, + { + "epoch": 0.7, + "learning_rate": 1.0512576288144072e-05, + "loss": 0.8348, + "step": 650000 + }, + { + "epoch": 0.7, + "eval_loss": 0.7790135145187378, + "eval_runtime": 606.4656, + "eval_samples_per_second": 164.89, + "eval_steps_per_second": 41.222, + "step": 650000 + }, + { + "epoch": 0.7, + "learning_rate": 1.0497568784392196e-05, + "loss": 0.8234, + "step": 650500 + }, + { + "epoch": 0.7, + "learning_rate": 1.048256128064032e-05, + "loss": 0.8434, + "step": 651000 + }, + { + "epoch": 0.7, + "learning_rate": 1.0467553776888444e-05, + "loss": 0.8387, + "step": 651500 + }, + { + "epoch": 0.7, + "learning_rate": 1.045254627313657e-05, + "loss": 0.8255, + "step": 652000 + }, + { + "epoch": 0.7, + "learning_rate": 1.0437538769384692e-05, + "loss": 0.8211, + "step": 652500 + }, + { + "epoch": 0.7, + "learning_rate": 1.0422531265632816e-05, + "loss": 0.8452, + "step": 653000 + }, + { + "epoch": 0.7, + "learning_rate": 1.0407553776888445e-05, + "loss": 0.8278, + "step": 653500 + }, + { + "epoch": 0.7, + "learning_rate": 1.0392546273136568e-05, + "loss": 0.8275, + "step": 654000 + }, + { + "epoch": 0.7, + "learning_rate": 1.0377538769384692e-05, + "loss": 0.8328, + "step": 654500 + }, + { + "epoch": 0.7, + "learning_rate": 1.0362531265632818e-05, + "loss": 0.8127, + "step": 655000 + }, + { + "epoch": 0.7, + "learning_rate": 1.034752376188094e-05, + "loss": 0.8171, + "step": 655500 + }, + { + "epoch": 0.7, + "learning_rate": 1.0332516258129064e-05, + "loss": 0.8297, + "step": 656000 + }, + { + "epoch": 0.7, + "learning_rate": 1.031750875437719e-05, + "loss": 0.8299, + "step": 656500 + }, + { + "epoch": 0.7, + "learning_rate": 1.0302501250625314e-05, + "loss": 0.8343, + "step": 657000 + }, + { + "epoch": 0.71, + "learning_rate": 1.028752376188094e-05, + "loss": 0.8344, + "step": 657500 + }, + { + "epoch": 0.71, + "learning_rate": 1.027254627313657e-05, + "loss": 0.8232, + "step": 658000 + }, + { + "epoch": 0.71, + "learning_rate": 1.0257538769384692e-05, + "loss": 0.83, + "step": 658500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0242531265632817e-05, + "loss": 0.8321, + "step": 659000 + }, + { + "epoch": 0.71, + "learning_rate": 1.0227523761880941e-05, + "loss": 0.8327, + "step": 659500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0212516258129064e-05, + "loss": 0.8217, + "step": 660000 + }, + { + "epoch": 0.71, + "eval_loss": 0.7772210836410522, + "eval_runtime": 623.462, + "eval_samples_per_second": 160.395, + "eval_steps_per_second": 40.099, + "step": 660000 + }, + { + "epoch": 0.71, + "learning_rate": 1.019750875437719e-05, + "loss": 0.8316, + "step": 660500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0182501250625313e-05, + "loss": 0.8346, + "step": 661000 + }, + { + "epoch": 0.71, + "learning_rate": 1.0167493746873437e-05, + "loss": 0.8353, + "step": 661500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0152516258129064e-05, + "loss": 0.8295, + "step": 662000 + }, + { + "epoch": 0.71, + "learning_rate": 1.013750875437719e-05, + "loss": 0.8361, + "step": 662500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0122501250625312e-05, + "loss": 0.8363, + "step": 663000 + }, + { + "epoch": 0.71, + "learning_rate": 1.0107493746873436e-05, + "loss": 0.8393, + "step": 663500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0092516258129065e-05, + "loss": 0.837, + "step": 664000 + }, + { + "epoch": 0.71, + "learning_rate": 1.0077538769384692e-05, + "loss": 0.8273, + "step": 664500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0062531265632817e-05, + "loss": 0.8308, + "step": 665000 + }, + { + "epoch": 0.71, + "learning_rate": 1.0047523761880941e-05, + "loss": 0.8322, + "step": 665500 + }, + { + "epoch": 0.71, + "learning_rate": 1.0032516258129064e-05, + "loss": 0.8109, + "step": 666000 + }, + { + "epoch": 0.71, + "learning_rate": 1.0017538769384692e-05, + "loss": 0.8194, + "step": 666500 + }, + { + "epoch": 0.72, + "learning_rate": 1.0002531265632816e-05, + "loss": 0.8341, + "step": 667000 + }, + { + "epoch": 0.72, + "learning_rate": 9.98752376188094e-06, + "loss": 0.8371, + "step": 667500 + }, + { + "epoch": 0.72, + "learning_rate": 9.972516258129064e-06, + "loss": 0.8252, + "step": 668000 + }, + { + "epoch": 0.72, + "learning_rate": 9.95750875437719e-06, + "loss": 0.8276, + "step": 668500 + }, + { + "epoch": 0.72, + "learning_rate": 9.942501250625312e-06, + "loss": 0.8333, + "step": 669000 + }, + { + "epoch": 0.72, + "learning_rate": 9.927493746873436e-06, + "loss": 0.8287, + "step": 669500 + }, + { + "epoch": 0.72, + "learning_rate": 9.912516258129065e-06, + "loss": 0.8309, + "step": 670000 + }, + { + "epoch": 0.72, + "eval_loss": 0.7751027941703796, + "eval_runtime": 657.5511, + "eval_samples_per_second": 152.079, + "eval_steps_per_second": 38.02, + "step": 670000 + }, + { + "epoch": 0.72, + "learning_rate": 9.897508754377189e-06, + "loss": 0.8265, + "step": 670500 + }, + { + "epoch": 0.72, + "learning_rate": 9.882501250625313e-06, + "loss": 0.8296, + "step": 671000 + }, + { + "epoch": 0.72, + "learning_rate": 9.867493746873438e-06, + "loss": 0.8207, + "step": 671500 + }, + { + "epoch": 0.72, + "learning_rate": 9.85248624312156e-06, + "loss": 0.8193, + "step": 672000 + }, + { + "epoch": 0.72, + "learning_rate": 9.837478739369684e-06, + "loss": 0.8323, + "step": 672500 + }, + { + "epoch": 0.72, + "learning_rate": 9.822501250625313e-06, + "loss": 0.8389, + "step": 673000 + }, + { + "epoch": 0.72, + "learning_rate": 9.807493746873437e-06, + "loss": 0.8307, + "step": 673500 + }, + { + "epoch": 0.72, + "learning_rate": 9.792486243121561e-06, + "loss": 0.8229, + "step": 674000 + }, + { + "epoch": 0.72, + "learning_rate": 9.777478739369685e-06, + "loss": 0.8343, + "step": 674500 + }, + { + "epoch": 0.72, + "learning_rate": 9.762471235617809e-06, + "loss": 0.8317, + "step": 675000 + }, + { + "epoch": 0.72, + "learning_rate": 9.747493746873438e-06, + "loss": 0.8269, + "step": 675500 + }, + { + "epoch": 0.72, + "learning_rate": 9.732486243121562e-06, + "loss": 0.8238, + "step": 676000 + }, + { + "epoch": 0.73, + "learning_rate": 9.717478739369684e-06, + "loss": 0.8096, + "step": 676500 + }, + { + "epoch": 0.73, + "learning_rate": 9.70247123561781e-06, + "loss": 0.8251, + "step": 677000 + }, + { + "epoch": 0.73, + "learning_rate": 9.687463731865933e-06, + "loss": 0.8358, + "step": 677500 + }, + { + "epoch": 0.73, + "learning_rate": 9.672456228114057e-06, + "loss": 0.8155, + "step": 678000 + }, + { + "epoch": 0.73, + "learning_rate": 9.657448724362181e-06, + "loss": 0.8232, + "step": 678500 + }, + { + "epoch": 0.73, + "learning_rate": 9.642441220610305e-06, + "loss": 0.8185, + "step": 679000 + }, + { + "epoch": 0.73, + "learning_rate": 9.627463731865932e-06, + "loss": 0.8348, + "step": 679500 + }, + { + "epoch": 0.73, + "learning_rate": 9.612456228114056e-06, + "loss": 0.8094, + "step": 680000 + }, + { + "epoch": 0.73, + "eval_loss": 0.7731601595878601, + "eval_runtime": 620.6555, + "eval_samples_per_second": 161.12, + "eval_steps_per_second": 40.28, + "step": 680000 + }, + { + "epoch": 0.73, + "learning_rate": 9.597448724362182e-06, + "loss": 0.8311, + "step": 680500 + }, + { + "epoch": 0.73, + "learning_rate": 9.582441220610306e-06, + "loss": 0.818, + "step": 681000 + }, + { + "epoch": 0.73, + "learning_rate": 9.56743371685843e-06, + "loss": 0.8162, + "step": 681500 + }, + { + "epoch": 0.73, + "learning_rate": 9.552426213106554e-06, + "loss": 0.8283, + "step": 682000 + }, + { + "epoch": 0.73, + "learning_rate": 9.537418709354678e-06, + "loss": 0.8117, + "step": 682500 + }, + { + "epoch": 0.73, + "learning_rate": 9.522441220610305e-06, + "loss": 0.8371, + "step": 683000 + }, + { + "epoch": 0.73, + "learning_rate": 9.50743371685843e-06, + "loss": 0.8156, + "step": 683500 + }, + { + "epoch": 0.73, + "learning_rate": 9.492426213106554e-06, + "loss": 0.828, + "step": 684000 + }, + { + "epoch": 0.73, + "learning_rate": 9.477418709354677e-06, + "loss": 0.8315, + "step": 684500 + }, + { + "epoch": 0.73, + "learning_rate": 9.462411205602802e-06, + "loss": 0.8242, + "step": 685000 + }, + { + "epoch": 0.74, + "learning_rate": 9.447403701850926e-06, + "loss": 0.8334, + "step": 685500 + }, + { + "epoch": 0.74, + "learning_rate": 9.43239619809905e-06, + "loss": 0.8207, + "step": 686000 + }, + { + "epoch": 0.74, + "learning_rate": 9.417388694347174e-06, + "loss": 0.8318, + "step": 686500 + }, + { + "epoch": 0.74, + "learning_rate": 9.402411205602803e-06, + "loss": 0.8288, + "step": 687000 + }, + { + "epoch": 0.74, + "learning_rate": 9.38743371685843e-06, + "loss": 0.8319, + "step": 687500 + }, + { + "epoch": 0.74, + "learning_rate": 9.372426213106554e-06, + "loss": 0.8174, + "step": 688000 + }, + { + "epoch": 0.74, + "learning_rate": 9.357418709354678e-06, + "loss": 0.8295, + "step": 688500 + }, + { + "epoch": 0.74, + "learning_rate": 9.342411205602802e-06, + "loss": 0.8265, + "step": 689000 + }, + { + "epoch": 0.74, + "learning_rate": 9.327403701850926e-06, + "loss": 0.8263, + "step": 689500 + }, + { + "epoch": 0.74, + "learning_rate": 9.31239619809905e-06, + "loss": 0.826, + "step": 690000 + }, + { + "epoch": 0.74, + "eval_loss": 0.7714752554893494, + "eval_runtime": 616.7754, + "eval_samples_per_second": 162.134, + "eval_steps_per_second": 40.533, + "step": 690000 + }, + { + "epoch": 0.74, + "learning_rate": 9.297418709354677e-06, + "loss": 0.8202, + "step": 690500 + }, + { + "epoch": 0.74, + "learning_rate": 9.282411205602802e-06, + "loss": 0.8123, + "step": 691000 + }, + { + "epoch": 0.74, + "learning_rate": 9.267403701850926e-06, + "loss": 0.8179, + "step": 691500 + }, + { + "epoch": 0.74, + "learning_rate": 9.252396198099048e-06, + "loss": 0.8245, + "step": 692000 + }, + { + "epoch": 0.74, + "learning_rate": 9.237388694347174e-06, + "loss": 0.8257, + "step": 692500 + }, + { + "epoch": 0.74, + "learning_rate": 9.222381190595298e-06, + "loss": 0.8111, + "step": 693000 + }, + { + "epoch": 0.74, + "learning_rate": 9.207403701850925e-06, + "loss": 0.821, + "step": 693500 + }, + { + "epoch": 0.74, + "learning_rate": 9.192396198099049e-06, + "loss": 0.8179, + "step": 694000 + }, + { + "epoch": 0.74, + "learning_rate": 9.177388694347175e-06, + "loss": 0.8151, + "step": 694500 + }, + { + "epoch": 0.75, + "learning_rate": 9.162381190595297e-06, + "loss": 0.8411, + "step": 695000 + }, + { + "epoch": 0.75, + "learning_rate": 9.147403701850926e-06, + "loss": 0.8441, + "step": 695500 + }, + { + "epoch": 0.75, + "learning_rate": 9.132396198099051e-06, + "loss": 0.8214, + "step": 696000 + }, + { + "epoch": 0.75, + "learning_rate": 9.117388694347173e-06, + "loss": 0.8247, + "step": 696500 + }, + { + "epoch": 0.75, + "learning_rate": 9.102381190595297e-06, + "loss": 0.8196, + "step": 697000 + }, + { + "epoch": 0.75, + "learning_rate": 9.087373686843423e-06, + "loss": 0.8204, + "step": 697500 + }, + { + "epoch": 0.75, + "learning_rate": 9.072366183091547e-06, + "loss": 0.8126, + "step": 698000 + }, + { + "epoch": 0.75, + "learning_rate": 9.057388694347174e-06, + "loss": 0.8344, + "step": 698500 + }, + { + "epoch": 0.75, + "learning_rate": 9.042381190595298e-06, + "loss": 0.825, + "step": 699000 + }, + { + "epoch": 0.75, + "learning_rate": 9.027373686843422e-06, + "loss": 0.8234, + "step": 699500 + }, + { + "epoch": 0.75, + "learning_rate": 9.01239619809905e-06, + "loss": 0.8289, + "step": 700000 + }, + { + "epoch": 0.75, + "eval_loss": 0.7685481905937195, + "eval_runtime": 617.0203, + "eval_samples_per_second": 162.069, + "eval_steps_per_second": 40.517, + "step": 700000 + }, + { + "epoch": 0.75, + "learning_rate": 8.997388694347175e-06, + "loss": 0.8284, + "step": 700500 + }, + { + "epoch": 0.75, + "learning_rate": 8.982381190595297e-06, + "loss": 0.8129, + "step": 701000 + }, + { + "epoch": 0.75, + "learning_rate": 8.967373686843422e-06, + "loss": 0.8313, + "step": 701500 + }, + { + "epoch": 0.75, + "learning_rate": 8.952366183091546e-06, + "loss": 0.813, + "step": 702000 + }, + { + "epoch": 0.75, + "learning_rate": 8.93735867933967e-06, + "loss": 0.8221, + "step": 702500 + }, + { + "epoch": 0.75, + "learning_rate": 8.922351175587794e-06, + "loss": 0.8273, + "step": 703000 + }, + { + "epoch": 0.75, + "learning_rate": 8.907343671835918e-06, + "loss": 0.8329, + "step": 703500 + }, + { + "epoch": 0.75, + "learning_rate": 8.892366183091545e-06, + "loss": 0.8303, + "step": 704000 + }, + { + "epoch": 0.76, + "learning_rate": 8.87735867933967e-06, + "loss": 0.8078, + "step": 704500 + }, + { + "epoch": 0.76, + "learning_rate": 8.862381190595298e-06, + "loss": 0.8184, + "step": 705000 + }, + { + "epoch": 0.76, + "learning_rate": 8.847373686843422e-06, + "loss": 0.8168, + "step": 705500 + }, + { + "epoch": 0.76, + "learning_rate": 8.832366183091546e-06, + "loss": 0.8302, + "step": 706000 + }, + { + "epoch": 0.76, + "learning_rate": 8.817388694347175e-06, + "loss": 0.8239, + "step": 706500 + }, + { + "epoch": 0.76, + "learning_rate": 8.802381190595297e-06, + "loss": 0.8246, + "step": 707000 + }, + { + "epoch": 0.76, + "learning_rate": 8.787373686843422e-06, + "loss": 0.8196, + "step": 707500 + }, + { + "epoch": 0.76, + "learning_rate": 8.772366183091546e-06, + "loss": 0.8266, + "step": 708000 + }, + { + "epoch": 0.76, + "learning_rate": 8.757358679339669e-06, + "loss": 0.8321, + "step": 708500 + }, + { + "epoch": 0.76, + "learning_rate": 8.742351175587794e-06, + "loss": 0.8268, + "step": 709000 + }, + { + "epoch": 0.76, + "learning_rate": 8.727343671835918e-06, + "loss": 0.8253, + "step": 709500 + }, + { + "epoch": 0.76, + "learning_rate": 8.712366183091545e-06, + "loss": 0.8222, + "step": 710000 + }, + { + "epoch": 0.76, + "eval_loss": 0.7680653929710388, + "eval_runtime": 586.5219, + "eval_samples_per_second": 170.497, + "eval_steps_per_second": 42.624, + "step": 710000 + }, + { + "epoch": 0.76, + "learning_rate": 8.697358679339671e-06, + "loss": 0.8269, + "step": 710500 + }, + { + "epoch": 0.76, + "learning_rate": 8.682351175587795e-06, + "loss": 0.8123, + "step": 711000 + }, + { + "epoch": 0.76, + "learning_rate": 8.667343671835917e-06, + "loss": 0.8246, + "step": 711500 + }, + { + "epoch": 0.76, + "learning_rate": 8.652336168084043e-06, + "loss": 0.808, + "step": 712000 + }, + { + "epoch": 0.76, + "learning_rate": 8.637328664332167e-06, + "loss": 0.8297, + "step": 712500 + }, + { + "epoch": 0.76, + "learning_rate": 8.62232116058029e-06, + "loss": 0.8164, + "step": 713000 + }, + { + "epoch": 0.77, + "learning_rate": 8.607313656828414e-06, + "loss": 0.8028, + "step": 713500 + }, + { + "epoch": 0.77, + "learning_rate": 8.592306153076538e-06, + "loss": 0.8048, + "step": 714000 + }, + { + "epoch": 0.77, + "learning_rate": 8.577298649324662e-06, + "loss": 0.8233, + "step": 714500 + }, + { + "epoch": 0.77, + "learning_rate": 8.562291145572786e-06, + "loss": 0.8238, + "step": 715000 + }, + { + "epoch": 0.77, + "learning_rate": 8.54728364182091e-06, + "loss": 0.8322, + "step": 715500 + }, + { + "epoch": 0.77, + "learning_rate": 8.532276138069034e-06, + "loss": 0.8237, + "step": 716000 + }, + { + "epoch": 0.77, + "learning_rate": 8.51726863431716e-06, + "loss": 0.8338, + "step": 716500 + }, + { + "epoch": 0.77, + "learning_rate": 8.502261130565282e-06, + "loss": 0.8221, + "step": 717000 + }, + { + "epoch": 0.77, + "learning_rate": 8.487253626813408e-06, + "loss": 0.809, + "step": 717500 + }, + { + "epoch": 0.77, + "learning_rate": 8.472276138069036e-06, + "loss": 0.8317, + "step": 718000 + }, + { + "epoch": 0.77, + "learning_rate": 8.457268634317159e-06, + "loss": 0.8149, + "step": 718500 + }, + { + "epoch": 0.77, + "learning_rate": 8.442261130565283e-06, + "loss": 0.8168, + "step": 719000 + }, + { + "epoch": 0.77, + "learning_rate": 8.427253626813408e-06, + "loss": 0.8143, + "step": 719500 + }, + { + "epoch": 0.77, + "learning_rate": 8.41224612306153e-06, + "loss": 0.8208, + "step": 720000 + }, + { + "epoch": 0.77, + "eval_loss": 0.7673591375350952, + "eval_runtime": 614.272, + "eval_samples_per_second": 162.794, + "eval_steps_per_second": 40.699, + "step": 720000 + }, + { + "epoch": 0.77, + "learning_rate": 8.397238619309654e-06, + "loss": 0.825, + "step": 720500 + }, + { + "epoch": 0.77, + "learning_rate": 8.38223111555778e-06, + "loss": 0.8217, + "step": 721000 + }, + { + "epoch": 0.77, + "learning_rate": 8.367223611805904e-06, + "loss": 0.8191, + "step": 721500 + }, + { + "epoch": 0.77, + "learning_rate": 8.352246123061531e-06, + "loss": 0.8185, + "step": 722000 + }, + { + "epoch": 0.77, + "learning_rate": 8.337238619309655e-06, + "loss": 0.8198, + "step": 722500 + }, + { + "epoch": 0.78, + "learning_rate": 8.322231115557779e-06, + "loss": 0.8174, + "step": 723000 + }, + { + "epoch": 0.78, + "learning_rate": 8.307223611805903e-06, + "loss": 0.8028, + "step": 723500 + }, + { + "epoch": 0.78, + "learning_rate": 8.292246123061532e-06, + "loss": 0.8118, + "step": 724000 + }, + { + "epoch": 0.78, + "learning_rate": 8.277238619309654e-06, + "loss": 0.8277, + "step": 724500 + }, + { + "epoch": 0.78, + "learning_rate": 8.262261130565283e-06, + "loss": 0.8261, + "step": 725000 + }, + { + "epoch": 0.78, + "learning_rate": 8.247253626813408e-06, + "loss": 0.8166, + "step": 725500 + }, + { + "epoch": 0.78, + "learning_rate": 8.23224612306153e-06, + "loss": 0.8226, + "step": 726000 + }, + { + "epoch": 0.78, + "learning_rate": 8.217238619309654e-06, + "loss": 0.8205, + "step": 726500 + }, + { + "epoch": 0.78, + "learning_rate": 8.20223111555778e-06, + "loss": 0.8266, + "step": 727000 + }, + { + "epoch": 0.78, + "learning_rate": 8.187223611805902e-06, + "loss": 0.8173, + "step": 727500 + }, + { + "epoch": 0.78, + "learning_rate": 8.172246123061531e-06, + "loss": 0.8102, + "step": 728000 + }, + { + "epoch": 0.78, + "learning_rate": 8.157238619309655e-06, + "loss": 0.8065, + "step": 728500 + }, + { + "epoch": 0.78, + "learning_rate": 8.142231115557779e-06, + "loss": 0.8224, + "step": 729000 + }, + { + "epoch": 0.78, + "learning_rate": 8.127223611805903e-06, + "loss": 0.8121, + "step": 729500 + }, + { + "epoch": 0.78, + "learning_rate": 8.112216108054028e-06, + "loss": 0.8068, + "step": 730000 + }, + { + "epoch": 0.78, + "eval_loss": 0.7651957869529724, + "eval_runtime": 605.1423, + "eval_samples_per_second": 165.25, + "eval_steps_per_second": 41.313, + "step": 730000 + }, + { + "epoch": 0.78, + "learning_rate": 8.09720860430215e-06, + "loss": 0.8107, + "step": 730500 + }, + { + "epoch": 0.78, + "learning_rate": 8.082201100550275e-06, + "loss": 0.82, + "step": 731000 + }, + { + "epoch": 0.78, + "learning_rate": 8.0671935967984e-06, + "loss": 0.8167, + "step": 731500 + }, + { + "epoch": 0.78, + "learning_rate": 8.052216108054027e-06, + "loss": 0.8155, + "step": 732000 + }, + { + "epoch": 0.79, + "learning_rate": 8.037208604302151e-06, + "loss": 0.8169, + "step": 732500 + }, + { + "epoch": 0.79, + "learning_rate": 8.022201100550275e-06, + "loss": 0.8167, + "step": 733000 + }, + { + "epoch": 0.79, + "learning_rate": 8.007223611805902e-06, + "loss": 0.8225, + "step": 733500 + }, + { + "epoch": 0.79, + "learning_rate": 7.992216108054028e-06, + "loss": 0.8243, + "step": 734000 + }, + { + "epoch": 0.79, + "learning_rate": 7.977208604302152e-06, + "loss": 0.8146, + "step": 734500 + }, + { + "epoch": 0.79, + "learning_rate": 7.962201100550274e-06, + "loss": 0.8102, + "step": 735000 + }, + { + "epoch": 0.79, + "learning_rate": 7.9471935967984e-06, + "loss": 0.814, + "step": 735500 + }, + { + "epoch": 0.79, + "learning_rate": 7.932186093046524e-06, + "loss": 0.8221, + "step": 736000 + }, + { + "epoch": 0.79, + "learning_rate": 7.917178589294648e-06, + "loss": 0.8207, + "step": 736500 + }, + { + "epoch": 0.79, + "learning_rate": 7.902171085542771e-06, + "loss": 0.817, + "step": 737000 + }, + { + "epoch": 0.79, + "learning_rate": 7.8871935967984e-06, + "loss": 0.8043, + "step": 737500 + }, + { + "epoch": 0.79, + "learning_rate": 7.872186093046524e-06, + "loss": 0.8082, + "step": 738000 + }, + { + "epoch": 0.79, + "learning_rate": 7.857178589294646e-06, + "loss": 0.8119, + "step": 738500 + }, + { + "epoch": 0.79, + "learning_rate": 7.842171085542772e-06, + "loss": 0.8165, + "step": 739000 + }, + { + "epoch": 0.79, + "learning_rate": 7.827193596798399e-06, + "loss": 0.8235, + "step": 739500 + }, + { + "epoch": 0.79, + "learning_rate": 7.812186093046523e-06, + "loss": 0.8118, + "step": 740000 + }, + { + "epoch": 0.79, + "eval_loss": 0.7625706195831299, + "eval_runtime": 571.8371, + "eval_samples_per_second": 174.875, + "eval_steps_per_second": 43.719, + "step": 740000 + }, + { + "epoch": 0.79, + "learning_rate": 7.797178589294647e-06, + "loss": 0.8077, + "step": 740500 + }, + { + "epoch": 0.79, + "learning_rate": 7.782171085542773e-06, + "loss": 0.8171, + "step": 741000 + }, + { + "epoch": 0.8, + "learning_rate": 7.7671935967984e-06, + "loss": 0.807, + "step": 741500 + }, + { + "epoch": 0.8, + "learning_rate": 7.752186093046524e-06, + "loss": 0.8073, + "step": 742000 + }, + { + "epoch": 0.8, + "learning_rate": 7.737178589294648e-06, + "loss": 0.824, + "step": 742500 + }, + { + "epoch": 0.8, + "learning_rate": 7.722171085542771e-06, + "loss": 0.8159, + "step": 743000 + }, + { + "epoch": 0.8, + "learning_rate": 7.7071935967984e-06, + "loss": 0.8057, + "step": 743500 + }, + { + "epoch": 0.8, + "learning_rate": 7.692216108054027e-06, + "loss": 0.8145, + "step": 744000 + }, + { + "epoch": 0.8, + "learning_rate": 7.677208604302151e-06, + "loss": 0.8241, + "step": 744500 + }, + { + "epoch": 0.8, + "learning_rate": 7.662201100550275e-06, + "loss": 0.8223, + "step": 745000 + }, + { + "epoch": 0.8, + "learning_rate": 7.6471935967984e-06, + "loss": 0.8127, + "step": 745500 + }, + { + "epoch": 0.8, + "learning_rate": 7.632186093046523e-06, + "loss": 0.8108, + "step": 746000 + }, + { + "epoch": 0.8, + "learning_rate": 7.617178589294648e-06, + "loss": 0.8077, + "step": 746500 + }, + { + "epoch": 0.8, + "learning_rate": 7.602201100550274e-06, + "loss": 0.8255, + "step": 747000 + }, + { + "epoch": 0.8, + "learning_rate": 7.5871935967984e-06, + "loss": 0.8239, + "step": 747500 + }, + { + "epoch": 0.8, + "learning_rate": 7.572216108054028e-06, + "loss": 0.815, + "step": 748000 + }, + { + "epoch": 0.8, + "learning_rate": 7.557208604302151e-06, + "loss": 0.8117, + "step": 748500 + }, + { + "epoch": 0.8, + "learning_rate": 7.542201100550276e-06, + "loss": 0.8061, + "step": 749000 + }, + { + "epoch": 0.8, + "learning_rate": 7.5271935967983995e-06, + "loss": 0.8132, + "step": 749500 + }, + { + "epoch": 0.8, + "learning_rate": 7.512186093046523e-06, + "loss": 0.8134, + "step": 750000 + }, + { + "epoch": 0.8, + "eval_loss": 0.761015772819519, + "eval_runtime": 586.4258, + "eval_samples_per_second": 170.525, + "eval_steps_per_second": 42.631, + "step": 750000 + }, + { + "epoch": 0.8, + "learning_rate": 7.497178589294647e-06, + "loss": 0.8265, + "step": 750500 + }, + { + "epoch": 0.81, + "learning_rate": 7.482171085542771e-06, + "loss": 0.8161, + "step": 751000 + }, + { + "epoch": 0.81, + "learning_rate": 7.467163581790896e-06, + "loss": 0.8179, + "step": 751500 + }, + { + "epoch": 0.81, + "learning_rate": 7.452156078039019e-06, + "loss": 0.8219, + "step": 752000 + }, + { + "epoch": 0.81, + "learning_rate": 7.437178589294647e-06, + "loss": 0.8043, + "step": 752500 + }, + { + "epoch": 0.81, + "learning_rate": 7.422171085542772e-06, + "loss": 0.8199, + "step": 753000 + }, + { + "epoch": 0.81, + "learning_rate": 7.407163581790896e-06, + "loss": 0.809, + "step": 753500 + }, + { + "epoch": 0.81, + "learning_rate": 7.39215607803902e-06, + "loss": 0.8071, + "step": 754000 + }, + { + "epoch": 0.81, + "learning_rate": 7.377148574287144e-06, + "loss": 0.8114, + "step": 754500 + }, + { + "epoch": 0.81, + "learning_rate": 7.3621710855427716e-06, + "loss": 0.8021, + "step": 755000 + }, + { + "epoch": 0.81, + "learning_rate": 7.3471635817908955e-06, + "loss": 0.8117, + "step": 755500 + }, + { + "epoch": 0.81, + "learning_rate": 7.33215607803902e-06, + "loss": 0.8195, + "step": 756000 + }, + { + "epoch": 0.81, + "learning_rate": 7.317148574287143e-06, + "loss": 0.807, + "step": 756500 + }, + { + "epoch": 0.81, + "learning_rate": 7.302141070535268e-06, + "loss": 0.8047, + "step": 757000 + }, + { + "epoch": 0.81, + "learning_rate": 7.287163581790896e-06, + "loss": 0.8079, + "step": 757500 + }, + { + "epoch": 0.81, + "learning_rate": 7.27215607803902e-06, + "loss": 0.821, + "step": 758000 + }, + { + "epoch": 0.81, + "learning_rate": 7.257148574287144e-06, + "loss": 0.7959, + "step": 758500 + }, + { + "epoch": 0.81, + "learning_rate": 7.242141070535268e-06, + "loss": 0.8177, + "step": 759000 + }, + { + "epoch": 0.81, + "learning_rate": 7.227163581790896e-06, + "loss": 0.8155, + "step": 759500 + }, + { + "epoch": 0.81, + "learning_rate": 7.21215607803902e-06, + "loss": 0.8152, + "step": 760000 + }, + { + "epoch": 0.81, + "eval_loss": 0.7589353919029236, + "eval_runtime": 594.6054, + "eval_samples_per_second": 168.179, + "eval_steps_per_second": 42.045, + "step": 760000 + }, + { + "epoch": 0.82, + "learning_rate": 7.197148574287144e-06, + "loss": 0.8072, + "step": 760500 + }, + { + "epoch": 0.82, + "learning_rate": 7.1821410705352676e-06, + "loss": 0.8052, + "step": 761000 + }, + { + "epoch": 0.82, + "learning_rate": 7.167133566783392e-06, + "loss": 0.8271, + "step": 761500 + }, + { + "epoch": 0.82, + "learning_rate": 7.152126063031516e-06, + "loss": 0.8098, + "step": 762000 + }, + { + "epoch": 0.82, + "learning_rate": 7.13711855927964e-06, + "loss": 0.8081, + "step": 762500 + }, + { + "epoch": 0.82, + "learning_rate": 7.122111055527764e-06, + "loss": 0.8172, + "step": 763000 + }, + { + "epoch": 0.82, + "learning_rate": 7.107103551775888e-06, + "loss": 0.8152, + "step": 763500 + }, + { + "epoch": 0.82, + "learning_rate": 7.092096048024012e-06, + "loss": 0.8143, + "step": 764000 + }, + { + "epoch": 0.82, + "learning_rate": 7.077088544272136e-06, + "loss": 0.8038, + "step": 764500 + }, + { + "epoch": 0.82, + "learning_rate": 7.062081040520261e-06, + "loss": 0.8107, + "step": 765000 + }, + { + "epoch": 0.82, + "learning_rate": 7.047103551775889e-06, + "loss": 0.7959, + "step": 765500 + }, + { + "epoch": 0.82, + "learning_rate": 7.032096048024012e-06, + "loss": 0.8229, + "step": 766000 + }, + { + "epoch": 0.82, + "learning_rate": 7.0170885442721365e-06, + "loss": 0.8147, + "step": 766500 + }, + { + "epoch": 0.82, + "learning_rate": 7.0020810405202604e-06, + "loss": 0.8026, + "step": 767000 + }, + { + "epoch": 0.82, + "learning_rate": 6.9871335667833915e-06, + "loss": 0.8015, + "step": 767500 + }, + { + "epoch": 0.82, + "learning_rate": 6.972126063031516e-06, + "loss": 0.8041, + "step": 768000 + }, + { + "epoch": 0.82, + "learning_rate": 6.957118559279639e-06, + "loss": 0.8258, + "step": 768500 + }, + { + "epoch": 0.82, + "learning_rate": 6.942111055527764e-06, + "loss": 0.8184, + "step": 769000 + }, + { + "epoch": 0.83, + "learning_rate": 6.927103551775888e-06, + "loss": 0.8076, + "step": 769500 + }, + { + "epoch": 0.83, + "learning_rate": 6.912096048024013e-06, + "loss": 0.8106, + "step": 770000 + }, + { + "epoch": 0.83, + "eval_loss": 0.7582979202270508, + "eval_runtime": 581.7955, + "eval_samples_per_second": 171.882, + "eval_steps_per_second": 42.97, + "step": 770000 + }, + { + "epoch": 0.83, + "learning_rate": 6.897088544272136e-06, + "loss": 0.8126, + "step": 770500 + }, + { + "epoch": 0.83, + "learning_rate": 6.882111055527764e-06, + "loss": 0.8049, + "step": 771000 + }, + { + "epoch": 0.83, + "learning_rate": 6.867103551775888e-06, + "loss": 0.8076, + "step": 771500 + }, + { + "epoch": 0.83, + "learning_rate": 6.852096048024013e-06, + "loss": 0.8063, + "step": 772000 + }, + { + "epoch": 0.83, + "learning_rate": 6.837088544272136e-06, + "loss": 0.8151, + "step": 772500 + }, + { + "epoch": 0.83, + "learning_rate": 6.8220810405202605e-06, + "loss": 0.8293, + "step": 773000 + }, + { + "epoch": 0.83, + "learning_rate": 6.807073536768384e-06, + "loss": 0.8075, + "step": 773500 + }, + { + "epoch": 0.83, + "learning_rate": 6.792096048024012e-06, + "loss": 0.8077, + "step": 774000 + }, + { + "epoch": 0.83, + "learning_rate": 6.777088544272136e-06, + "loss": 0.8028, + "step": 774500 + }, + { + "epoch": 0.83, + "learning_rate": 6.76208104052026e-06, + "loss": 0.8217, + "step": 775000 + }, + { + "epoch": 0.83, + "learning_rate": 6.747073536768384e-06, + "loss": 0.8213, + "step": 775500 + }, + { + "epoch": 0.83, + "learning_rate": 6.732066033016509e-06, + "loss": 0.8068, + "step": 776000 + }, + { + "epoch": 0.83, + "learning_rate": 6.717058529264632e-06, + "loss": 0.802, + "step": 776500 + }, + { + "epoch": 0.83, + "learning_rate": 6.702051025512757e-06, + "loss": 0.8199, + "step": 777000 + }, + { + "epoch": 0.83, + "learning_rate": 6.687043521760881e-06, + "loss": 0.8121, + "step": 777500 + }, + { + "epoch": 0.83, + "learning_rate": 6.672036018009005e-06, + "loss": 0.8154, + "step": 778000 + }, + { + "epoch": 0.83, + "learning_rate": 6.6570585292646326e-06, + "loss": 0.804, + "step": 778500 + }, + { + "epoch": 0.84, + "learning_rate": 6.64208104052026e-06, + "loss": 0.8103, + "step": 779000 + }, + { + "epoch": 0.84, + "learning_rate": 6.6270735367683844e-06, + "loss": 0.7978, + "step": 779500 + }, + { + "epoch": 0.84, + "learning_rate": 6.612096048024012e-06, + "loss": 0.812, + "step": 780000 + }, + { + "epoch": 0.84, + "eval_loss": 0.7558347582817078, + "eval_runtime": 600.211, + "eval_samples_per_second": 166.608, + "eval_steps_per_second": 41.652, + "step": 780000 + }, + { + "epoch": 0.84, + "learning_rate": 6.597088544272136e-06, + "loss": 0.7998, + "step": 780500 + }, + { + "epoch": 0.84, + "learning_rate": 6.58208104052026e-06, + "loss": 0.8003, + "step": 781000 + }, + { + "epoch": 0.84, + "learning_rate": 6.567073536768384e-06, + "loss": 0.8138, + "step": 781500 + }, + { + "epoch": 0.84, + "learning_rate": 6.552096048024012e-06, + "loss": 0.7981, + "step": 782000 + }, + { + "epoch": 0.84, + "learning_rate": 6.537088544272137e-06, + "loss": 0.8065, + "step": 782500 + }, + { + "epoch": 0.84, + "learning_rate": 6.52208104052026e-06, + "loss": 0.7939, + "step": 783000 + }, + { + "epoch": 0.84, + "learning_rate": 6.507073536768384e-06, + "loss": 0.7971, + "step": 783500 + }, + { + "epoch": 0.84, + "learning_rate": 6.492066033016509e-06, + "loss": 0.8023, + "step": 784000 + }, + { + "epoch": 0.84, + "learning_rate": 6.477058529264633e-06, + "loss": 0.8124, + "step": 784500 + }, + { + "epoch": 0.84, + "learning_rate": 6.4620510255127565e-06, + "loss": 0.8006, + "step": 785000 + }, + { + "epoch": 0.84, + "learning_rate": 6.4470435217608805e-06, + "loss": 0.8008, + "step": 785500 + }, + { + "epoch": 0.84, + "learning_rate": 6.432036018009005e-06, + "loss": 0.8218, + "step": 786000 + }, + { + "epoch": 0.84, + "learning_rate": 6.417028514257128e-06, + "loss": 0.8166, + "step": 786500 + }, + { + "epoch": 0.84, + "learning_rate": 6.402021010505252e-06, + "loss": 0.7998, + "step": 787000 + }, + { + "epoch": 0.84, + "learning_rate": 6.387013506753377e-06, + "loss": 0.814, + "step": 787500 + }, + { + "epoch": 0.85, + "learning_rate": 6.372006003001501e-06, + "loss": 0.8006, + "step": 788000 + }, + { + "epoch": 0.85, + "learning_rate": 6.356998499249625e-06, + "loss": 0.8054, + "step": 788500 + }, + { + "epoch": 0.85, + "learning_rate": 6.341990995497749e-06, + "loss": 0.8008, + "step": 789000 + }, + { + "epoch": 0.85, + "learning_rate": 6.326983491745874e-06, + "loss": 0.8155, + "step": 789500 + }, + { + "epoch": 0.85, + "learning_rate": 6.3120060030015015e-06, + "loss": 0.8077, + "step": 790000 + }, + { + "epoch": 0.85, + "eval_loss": 0.7543774843215942, + "eval_runtime": 577.6914, + "eval_samples_per_second": 173.103, + "eval_steps_per_second": 43.276, + "step": 790000 + }, + { + "epoch": 0.85, + "learning_rate": 6.296998499249625e-06, + "loss": 0.7926, + "step": 790500 + }, + { + "epoch": 0.85, + "learning_rate": 6.2819909954977485e-06, + "loss": 0.8047, + "step": 791000 + }, + { + "epoch": 0.85, + "learning_rate": 6.2670135067533765e-06, + "loss": 0.808, + "step": 791500 + }, + { + "epoch": 0.85, + "learning_rate": 6.252006003001501e-06, + "loss": 0.7919, + "step": 792000 + }, + { + "epoch": 0.85, + "learning_rate": 6.237028514257129e-06, + "loss": 0.7999, + "step": 792500 + }, + { + "epoch": 0.85, + "learning_rate": 6.222021010505252e-06, + "loss": 0.806, + "step": 793000 + }, + { + "epoch": 0.85, + "learning_rate": 6.207013506753377e-06, + "loss": 0.7944, + "step": 793500 + }, + { + "epoch": 0.85, + "learning_rate": 6.192006003001501e-06, + "loss": 0.8031, + "step": 794000 + }, + { + "epoch": 0.85, + "learning_rate": 6.177028514257129e-06, + "loss": 0.8094, + "step": 794500 + }, + { + "epoch": 0.85, + "learning_rate": 6.162021010505252e-06, + "loss": 0.8119, + "step": 795000 + }, + { + "epoch": 0.85, + "learning_rate": 6.147013506753377e-06, + "loss": 0.8092, + "step": 795500 + }, + { + "epoch": 0.85, + "learning_rate": 6.132006003001501e-06, + "loss": 0.7965, + "step": 796000 + }, + { + "epoch": 0.85, + "learning_rate": 6.1169984992496255e-06, + "loss": 0.8127, + "step": 796500 + }, + { + "epoch": 0.85, + "learning_rate": 6.1019909954977486e-06, + "loss": 0.8076, + "step": 797000 + }, + { + "epoch": 0.86, + "learning_rate": 6.086983491745873e-06, + "loss": 0.8006, + "step": 797500 + }, + { + "epoch": 0.86, + "learning_rate": 6.071975987993997e-06, + "loss": 0.8039, + "step": 798000 + }, + { + "epoch": 0.86, + "learning_rate": 6.056968484242122e-06, + "loss": 0.8076, + "step": 798500 + }, + { + "epoch": 0.86, + "learning_rate": 6.041960980490245e-06, + "loss": 0.7912, + "step": 799000 + }, + { + "epoch": 0.86, + "learning_rate": 6.026953476738369e-06, + "loss": 0.8113, + "step": 799500 + }, + { + "epoch": 0.86, + "learning_rate": 6.011945972986494e-06, + "loss": 0.8112, + "step": 800000 + }, + { + "epoch": 0.86, + "eval_loss": 0.7531115412712097, + "eval_runtime": 659.7719, + "eval_samples_per_second": 151.568, + "eval_steps_per_second": 37.892, + "step": 800000 + }, + { + "epoch": 0.86, + "learning_rate": 5.996968484242122e-06, + "loss": 0.7955, + "step": 800500 + }, + { + "epoch": 0.86, + "learning_rate": 5.981960980490245e-06, + "loss": 0.8159, + "step": 801000 + }, + { + "epoch": 0.86, + "learning_rate": 5.96695347673837e-06, + "loss": 0.8022, + "step": 801500 + }, + { + "epoch": 0.86, + "learning_rate": 5.9519459729864936e-06, + "loss": 0.806, + "step": 802000 + }, + { + "epoch": 0.86, + "learning_rate": 5.9369684842421215e-06, + "loss": 0.806, + "step": 802500 + }, + { + "epoch": 0.86, + "learning_rate": 5.921960980490245e-06, + "loss": 0.8175, + "step": 803000 + }, + { + "epoch": 0.86, + "learning_rate": 5.906953476738369e-06, + "loss": 0.8015, + "step": 803500 + }, + { + "epoch": 0.86, + "learning_rate": 5.891975987993997e-06, + "loss": 0.8171, + "step": 804000 + }, + { + "epoch": 0.86, + "learning_rate": 5.876968484242121e-06, + "loss": 0.8022, + "step": 804500 + }, + { + "epoch": 0.86, + "learning_rate": 5.861960980490245e-06, + "loss": 0.81, + "step": 805000 + }, + { + "epoch": 0.86, + "learning_rate": 5.846953476738369e-06, + "loss": 0.7946, + "step": 805500 + }, + { + "epoch": 0.86, + "learning_rate": 5.831945972986494e-06, + "loss": 0.8044, + "step": 806000 + }, + { + "epoch": 0.86, + "learning_rate": 5.816968484242122e-06, + "loss": 0.7969, + "step": 806500 + }, + { + "epoch": 0.87, + "learning_rate": 5.801960980490245e-06, + "loss": 0.8136, + "step": 807000 + }, + { + "epoch": 0.87, + "learning_rate": 5.786953476738369e-06, + "loss": 0.814, + "step": 807500 + }, + { + "epoch": 0.87, + "learning_rate": 5.771945972986494e-06, + "loss": 0.7976, + "step": 808000 + }, + { + "epoch": 0.87, + "learning_rate": 5.7569384692346175e-06, + "loss": 0.793, + "step": 808500 + }, + { + "epoch": 0.87, + "learning_rate": 5.7419309654827415e-06, + "loss": 0.8028, + "step": 809000 + }, + { + "epoch": 0.87, + "learning_rate": 5.726923461730865e-06, + "loss": 0.8023, + "step": 809500 + }, + { + "epoch": 0.87, + "learning_rate": 5.71191595797899e-06, + "loss": 0.8162, + "step": 810000 + }, + { + "epoch": 0.87, + "eval_loss": 0.7523924708366394, + "eval_runtime": 648.5551, + "eval_samples_per_second": 154.189, + "eval_steps_per_second": 38.547, + "step": 810000 + }, + { + "epoch": 0.87, + "learning_rate": 5.696938469234618e-06, + "loss": 0.809, + "step": 810500 + }, + { + "epoch": 0.87, + "learning_rate": 5.681930965482741e-06, + "loss": 0.8036, + "step": 811000 + }, + { + "epoch": 0.87, + "learning_rate": 5.666923461730865e-06, + "loss": 0.8188, + "step": 811500 + }, + { + "epoch": 0.87, + "learning_rate": 5.65191595797899e-06, + "loss": 0.8053, + "step": 812000 + }, + { + "epoch": 0.87, + "learning_rate": 5.636938469234618e-06, + "loss": 0.802, + "step": 812500 + }, + { + "epoch": 0.87, + "learning_rate": 5.621960980490246e-06, + "loss": 0.8064, + "step": 813000 + }, + { + "epoch": 0.87, + "learning_rate": 5.606953476738369e-06, + "loss": 0.8014, + "step": 813500 + }, + { + "epoch": 0.87, + "learning_rate": 5.591945972986494e-06, + "loss": 0.8038, + "step": 814000 + }, + { + "epoch": 0.87, + "learning_rate": 5.5769384692346176e-06, + "loss": 0.8161, + "step": 814500 + }, + { + "epoch": 0.87, + "learning_rate": 5.5619309654827415e-06, + "loss": 0.7968, + "step": 815000 + }, + { + "epoch": 0.87, + "learning_rate": 5.546953476738369e-06, + "loss": 0.7984, + "step": 815500 + }, + { + "epoch": 0.88, + "learning_rate": 5.531945972986493e-06, + "loss": 0.8034, + "step": 816000 + }, + { + "epoch": 0.88, + "learning_rate": 5.516938469234617e-06, + "loss": 0.8058, + "step": 816500 + }, + { + "epoch": 0.88, + "learning_rate": 5.501930965482742e-06, + "loss": 0.8205, + "step": 817000 + }, + { + "epoch": 0.88, + "learning_rate": 5.486923461730865e-06, + "loss": 0.794, + "step": 817500 + }, + { + "epoch": 0.88, + "learning_rate": 5.47191595797899e-06, + "loss": 0.8056, + "step": 818000 + }, + { + "epoch": 0.88, + "learning_rate": 5.456938469234618e-06, + "loss": 0.7963, + "step": 818500 + }, + { + "epoch": 0.88, + "learning_rate": 5.441930965482742e-06, + "loss": 0.8053, + "step": 819000 + }, + { + "epoch": 0.88, + "learning_rate": 5.426923461730865e-06, + "loss": 0.8077, + "step": 819500 + }, + { + "epoch": 0.88, + "learning_rate": 5.41191595797899e-06, + "loss": 0.7988, + "step": 820000 + }, + { + "epoch": 0.88, + "eval_loss": 0.7511362433433533, + "eval_runtime": 622.036, + "eval_samples_per_second": 160.762, + "eval_steps_per_second": 40.191, + "step": 820000 + }, + { + "epoch": 0.88, + "learning_rate": 5.396908454227114e-06, + "loss": 0.8059, + "step": 820500 + }, + { + "epoch": 0.88, + "learning_rate": 5.3819309654827415e-06, + "loss": 0.8038, + "step": 821000 + }, + { + "epoch": 0.88, + "learning_rate": 5.3669234617308654e-06, + "loss": 0.7953, + "step": 821500 + }, + { + "epoch": 0.88, + "learning_rate": 5.351915957978989e-06, + "loss": 0.7935, + "step": 822000 + }, + { + "epoch": 0.88, + "learning_rate": 5.336908454227114e-06, + "loss": 0.8052, + "step": 822500 + }, + { + "epoch": 0.88, + "learning_rate": 5.321900950475238e-06, + "loss": 0.8097, + "step": 823000 + }, + { + "epoch": 0.88, + "learning_rate": 5.306893446723361e-06, + "loss": 0.808, + "step": 823500 + }, + { + "epoch": 0.88, + "learning_rate": 5.291915957978989e-06, + "loss": 0.7991, + "step": 824000 + }, + { + "epoch": 0.88, + "learning_rate": 5.276908454227114e-06, + "loss": 0.8031, + "step": 824500 + }, + { + "epoch": 0.88, + "learning_rate": 5.261900950475238e-06, + "loss": 0.8172, + "step": 825000 + }, + { + "epoch": 0.89, + "learning_rate": 5.246893446723362e-06, + "loss": 0.7948, + "step": 825500 + }, + { + "epoch": 0.89, + "learning_rate": 5.231885942971486e-06, + "loss": 0.8089, + "step": 826000 + }, + { + "epoch": 0.89, + "learning_rate": 5.2168784392196105e-06, + "loss": 0.7969, + "step": 826500 + }, + { + "epoch": 0.89, + "learning_rate": 5.201870935467734e-06, + "loss": 0.8, + "step": 827000 + }, + { + "epoch": 0.89, + "learning_rate": 5.1868634317158575e-06, + "loss": 0.7848, + "step": 827500 + }, + { + "epoch": 0.89, + "learning_rate": 5.171885942971485e-06, + "loss": 0.8025, + "step": 828000 + }, + { + "epoch": 0.89, + "learning_rate": 5.15687843921961e-06, + "loss": 0.7814, + "step": 828500 + }, + { + "epoch": 0.89, + "learning_rate": 5.141900950475238e-06, + "loss": 0.7938, + "step": 829000 + }, + { + "epoch": 0.89, + "learning_rate": 5.126893446723362e-06, + "loss": 0.8011, + "step": 829500 + }, + { + "epoch": 0.89, + "learning_rate": 5.111885942971486e-06, + "loss": 0.8038, + "step": 830000 + }, + { + "epoch": 0.89, + "eval_loss": 0.7489431500434875, + "eval_runtime": 627.3417, + "eval_samples_per_second": 159.403, + "eval_steps_per_second": 39.851, + "step": 830000 + }, + { + "epoch": 0.89, + "learning_rate": 5.09687843921961e-06, + "loss": 0.8, + "step": 830500 + }, + { + "epoch": 0.89, + "learning_rate": 5.081870935467734e-06, + "loss": 0.7896, + "step": 831000 + }, + { + "epoch": 0.89, + "learning_rate": 5.066863431715858e-06, + "loss": 0.81, + "step": 831500 + }, + { + "epoch": 0.89, + "learning_rate": 5.051855927963982e-06, + "loss": 0.8004, + "step": 832000 + }, + { + "epoch": 0.89, + "learning_rate": 5.0368484242121065e-06, + "loss": 0.8019, + "step": 832500 + }, + { + "epoch": 0.89, + "learning_rate": 5.021870935467734e-06, + "loss": 0.794, + "step": 833000 + }, + { + "epoch": 0.89, + "learning_rate": 5.0068634317158575e-06, + "loss": 0.8003, + "step": 833500 + }, + { + "epoch": 0.89, + "learning_rate": 4.9918859429714854e-06, + "loss": 0.808, + "step": 834000 + }, + { + "epoch": 0.89, + "learning_rate": 4.97687843921961e-06, + "loss": 0.7961, + "step": 834500 + }, + { + "epoch": 0.9, + "learning_rate": 4.961870935467734e-06, + "loss": 0.7936, + "step": 835000 + }, + { + "epoch": 0.9, + "learning_rate": 4.946863431715858e-06, + "loss": 0.8034, + "step": 835500 + }, + { + "epoch": 0.9, + "learning_rate": 4.931855927963982e-06, + "loss": 0.7837, + "step": 836000 + }, + { + "epoch": 0.9, + "learning_rate": 4.916848424212106e-06, + "loss": 0.791, + "step": 836500 + }, + { + "epoch": 0.9, + "learning_rate": 4.901840920460231e-06, + "loss": 0.8138, + "step": 837000 + }, + { + "epoch": 0.9, + "learning_rate": 4.886833416708354e-06, + "loss": 0.8031, + "step": 837500 + }, + { + "epoch": 0.9, + "learning_rate": 4.871855927963982e-06, + "loss": 0.7994, + "step": 838000 + }, + { + "epoch": 0.9, + "learning_rate": 4.8568484242121065e-06, + "loss": 0.7981, + "step": 838500 + }, + { + "epoch": 0.9, + "learning_rate": 4.8418409204602304e-06, + "loss": 0.7969, + "step": 839000 + }, + { + "epoch": 0.9, + "learning_rate": 4.826833416708354e-06, + "loss": 0.7991, + "step": 839500 + }, + { + "epoch": 0.9, + "learning_rate": 4.8118559279639815e-06, + "loss": 0.7978, + "step": 840000 + }, + { + "epoch": 0.9, + "eval_loss": 0.7486736178398132, + "eval_runtime": 614.7602, + "eval_samples_per_second": 162.665, + "eval_steps_per_second": 40.666, + "step": 840000 + }, + { + "epoch": 0.9, + "learning_rate": 4.796848424212106e-06, + "loss": 0.7944, + "step": 840500 + }, + { + "epoch": 0.9, + "learning_rate": 4.78184092046023e-06, + "loss": 0.7894, + "step": 841000 + }, + { + "epoch": 0.9, + "learning_rate": 4.766833416708355e-06, + "loss": 0.8049, + "step": 841500 + }, + { + "epoch": 0.9, + "learning_rate": 4.751855927963982e-06, + "loss": 0.7895, + "step": 842000 + }, + { + "epoch": 0.9, + "learning_rate": 4.73687843921961e-06, + "loss": 0.7994, + "step": 842500 + }, + { + "epoch": 0.9, + "learning_rate": 4.721870935467734e-06, + "loss": 0.7957, + "step": 843000 + }, + { + "epoch": 0.9, + "learning_rate": 4.706863431715858e-06, + "loss": 0.8035, + "step": 843500 + }, + { + "epoch": 0.91, + "learning_rate": 4.691855927963983e-06, + "loss": 0.8004, + "step": 844000 + }, + { + "epoch": 0.91, + "learning_rate": 4.676848424212106e-06, + "loss": 0.8055, + "step": 844500 + }, + { + "epoch": 0.91, + "learning_rate": 4.661870935467734e-06, + "loss": 0.7879, + "step": 845000 + }, + { + "epoch": 0.91, + "learning_rate": 4.646863431715858e-06, + "loss": 0.7818, + "step": 845500 + }, + { + "epoch": 0.91, + "learning_rate": 4.631855927963982e-06, + "loss": 0.7879, + "step": 846000 + }, + { + "epoch": 0.91, + "learning_rate": 4.616848424212106e-06, + "loss": 0.7961, + "step": 846500 + }, + { + "epoch": 0.91, + "learning_rate": 4.60184092046023e-06, + "loss": 0.7985, + "step": 847000 + }, + { + "epoch": 0.91, + "learning_rate": 4.586833416708354e-06, + "loss": 0.7991, + "step": 847500 + }, + { + "epoch": 0.91, + "learning_rate": 4.571825912956478e-06, + "loss": 0.7947, + "step": 848000 + }, + { + "epoch": 0.91, + "learning_rate": 4.556818409204602e-06, + "loss": 0.7986, + "step": 848500 + }, + { + "epoch": 0.91, + "learning_rate": 4.541810905452727e-06, + "loss": 0.7947, + "step": 849000 + }, + { + "epoch": 0.91, + "learning_rate": 4.526863431715858e-06, + "loss": 0.7932, + "step": 849500 + }, + { + "epoch": 0.91, + "learning_rate": 4.511855927963983e-06, + "loss": 0.7925, + "step": 850000 + }, + { + "epoch": 0.91, + "eval_loss": 0.74729984998703, + "eval_runtime": 633.8245, + "eval_samples_per_second": 157.772, + "eval_steps_per_second": 39.443, + "step": 850000 + }, + { + "epoch": 0.91, + "learning_rate": 4.496848424212106e-06, + "loss": 0.7857, + "step": 850500 + }, + { + "epoch": 0.91, + "learning_rate": 4.481870935467734e-06, + "loss": 0.7798, + "step": 851000 + }, + { + "epoch": 0.91, + "learning_rate": 4.466863431715858e-06, + "loss": 0.8151, + "step": 851500 + }, + { + "epoch": 0.91, + "learning_rate": 4.451855927963982e-06, + "loss": 0.805, + "step": 852000 + }, + { + "epoch": 0.91, + "learning_rate": 4.436848424212106e-06, + "loss": 0.8009, + "step": 852500 + }, + { + "epoch": 0.91, + "learning_rate": 4.42184092046023e-06, + "loss": 0.7944, + "step": 853000 + }, + { + "epoch": 0.92, + "learning_rate": 4.406833416708354e-06, + "loss": 0.8048, + "step": 853500 + }, + { + "epoch": 0.92, + "learning_rate": 4.391825912956479e-06, + "loss": 0.7969, + "step": 854000 + }, + { + "epoch": 0.92, + "learning_rate": 4.376818409204602e-06, + "loss": 0.8027, + "step": 854500 + }, + { + "epoch": 0.92, + "learning_rate": 4.361810905452727e-06, + "loss": 0.7892, + "step": 855000 + }, + { + "epoch": 0.92, + "learning_rate": 4.346803401700851e-06, + "loss": 0.7969, + "step": 855500 + }, + { + "epoch": 0.92, + "learning_rate": 4.331795897948975e-06, + "loss": 0.8099, + "step": 856000 + }, + { + "epoch": 0.92, + "learning_rate": 4.316788394197099e-06, + "loss": 0.8151, + "step": 856500 + }, + { + "epoch": 0.92, + "learning_rate": 4.3018109054527265e-06, + "loss": 0.7951, + "step": 857000 + }, + { + "epoch": 0.92, + "learning_rate": 4.2868334167083545e-06, + "loss": 0.7952, + "step": 857500 + }, + { + "epoch": 0.92, + "learning_rate": 4.271825912956478e-06, + "loss": 0.7942, + "step": 858000 + }, + { + "epoch": 0.92, + "learning_rate": 4.256818409204603e-06, + "loss": 0.7968, + "step": 858500 + }, + { + "epoch": 0.92, + "learning_rate": 4.241810905452726e-06, + "loss": 0.7929, + "step": 859000 + }, + { + "epoch": 0.92, + "learning_rate": 4.22680340170085e-06, + "loss": 0.8059, + "step": 859500 + }, + { + "epoch": 0.92, + "learning_rate": 4.211825912956478e-06, + "loss": 0.7919, + "step": 860000 + }, + { + "epoch": 0.92, + "eval_loss": 0.7454802393913269, + "eval_runtime": 606.3303, + "eval_samples_per_second": 164.927, + "eval_steps_per_second": 41.232, + "step": 860000 + }, + { + "epoch": 0.92, + "learning_rate": 4.196818409204603e-06, + "loss": 0.7904, + "step": 860500 + }, + { + "epoch": 0.92, + "learning_rate": 4.181810905452726e-06, + "loss": 0.8019, + "step": 861000 + }, + { + "epoch": 0.92, + "learning_rate": 4.166803401700851e-06, + "loss": 0.7888, + "step": 861500 + }, + { + "epoch": 0.92, + "learning_rate": 4.151795897948975e-06, + "loss": 0.7882, + "step": 862000 + }, + { + "epoch": 0.92, + "learning_rate": 4.136788394197099e-06, + "loss": 0.802, + "step": 862500 + }, + { + "epoch": 0.93, + "learning_rate": 4.1217808904452225e-06, + "loss": 0.798, + "step": 863000 + }, + { + "epoch": 0.93, + "learning_rate": 4.1068034017008505e-06, + "loss": 0.7931, + "step": 863500 + }, + { + "epoch": 0.93, + "learning_rate": 4.091825912956478e-06, + "loss": 0.8102, + "step": 864000 + }, + { + "epoch": 0.93, + "learning_rate": 4.076818409204602e-06, + "loss": 0.7989, + "step": 864500 + }, + { + "epoch": 0.93, + "learning_rate": 4.061810905452726e-06, + "loss": 0.7932, + "step": 865000 + }, + { + "epoch": 0.93, + "learning_rate": 4.04680340170085e-06, + "loss": 0.7873, + "step": 865500 + }, + { + "epoch": 0.93, + "learning_rate": 4.031795897948975e-06, + "loss": 0.7947, + "step": 866000 + }, + { + "epoch": 0.93, + "learning_rate": 4.016788394197099e-06, + "loss": 0.7919, + "step": 866500 + }, + { + "epoch": 0.93, + "learning_rate": 4.001780890445223e-06, + "loss": 0.7963, + "step": 867000 + }, + { + "epoch": 0.93, + "learning_rate": 3.986773386693347e-06, + "loss": 0.7956, + "step": 867500 + }, + { + "epoch": 0.93, + "learning_rate": 3.971765882941471e-06, + "loss": 0.7991, + "step": 868000 + }, + { + "epoch": 0.93, + "learning_rate": 3.956758379189595e-06, + "loss": 0.8067, + "step": 868500 + }, + { + "epoch": 0.93, + "learning_rate": 3.9417508754377186e-06, + "loss": 0.7864, + "step": 869000 + }, + { + "epoch": 0.93, + "learning_rate": 3.926743371685843e-06, + "loss": 0.7947, + "step": 869500 + }, + { + "epoch": 0.93, + "learning_rate": 3.911765882941471e-06, + "loss": 0.7883, + "step": 870000 + }, + { + "epoch": 0.93, + "eval_loss": 0.743596076965332, + "eval_runtime": 589.6574, + "eval_samples_per_second": 169.59, + "eval_steps_per_second": 42.398, + "step": 870000 + }, + { + "epoch": 0.93, + "learning_rate": 3.896758379189595e-06, + "loss": 0.7853, + "step": 870500 + }, + { + "epoch": 0.93, + "learning_rate": 3.881780890445222e-06, + "loss": 0.7879, + "step": 871000 + }, + { + "epoch": 0.93, + "learning_rate": 3.866773386693346e-06, + "loss": 0.797, + "step": 871500 + }, + { + "epoch": 0.94, + "learning_rate": 3.851765882941471e-06, + "loss": 0.8057, + "step": 872000 + }, + { + "epoch": 0.94, + "learning_rate": 3.836758379189595e-06, + "loss": 0.7998, + "step": 872500 + }, + { + "epoch": 0.94, + "learning_rate": 3.821750875437719e-06, + "loss": 0.8032, + "step": 873000 + }, + { + "epoch": 0.94, + "learning_rate": 3.806743371685843e-06, + "loss": 0.7969, + "step": 873500 + }, + { + "epoch": 0.94, + "learning_rate": 3.791735867933967e-06, + "loss": 0.8039, + "step": 874000 + }, + { + "epoch": 0.94, + "learning_rate": 3.7767283641820915e-06, + "loss": 0.7789, + "step": 874500 + }, + { + "epoch": 0.94, + "learning_rate": 3.7617508754377186e-06, + "loss": 0.7955, + "step": 875000 + }, + { + "epoch": 0.94, + "learning_rate": 3.746743371685843e-06, + "loss": 0.7855, + "step": 875500 + }, + { + "epoch": 0.94, + "learning_rate": 3.7317358679339673e-06, + "loss": 0.7946, + "step": 876000 + }, + { + "epoch": 0.94, + "learning_rate": 3.7167283641820912e-06, + "loss": 0.797, + "step": 876500 + }, + { + "epoch": 0.94, + "learning_rate": 3.7017208604302156e-06, + "loss": 0.7964, + "step": 877000 + }, + { + "epoch": 0.94, + "learning_rate": 3.6867433716858427e-06, + "loss": 0.7946, + "step": 877500 + }, + { + "epoch": 0.94, + "learning_rate": 3.671735867933967e-06, + "loss": 0.7962, + "step": 878000 + }, + { + "epoch": 0.94, + "learning_rate": 3.656728364182091e-06, + "loss": 0.7968, + "step": 878500 + }, + { + "epoch": 0.94, + "learning_rate": 3.6417208604302153e-06, + "loss": 0.7812, + "step": 879000 + }, + { + "epoch": 0.94, + "learning_rate": 3.6267133566783392e-06, + "loss": 0.797, + "step": 879500 + }, + { + "epoch": 0.94, + "learning_rate": 3.6117058529264636e-06, + "loss": 0.7819, + "step": 880000 + }, + { + "epoch": 0.94, + "eval_loss": 0.7429205179214478, + "eval_runtime": 604.5124, + "eval_samples_per_second": 165.423, + "eval_steps_per_second": 41.356, + "step": 880000 + }, + { + "epoch": 0.94, + "learning_rate": 3.5966983491745875e-06, + "loss": 0.7879, + "step": 880500 + }, + { + "epoch": 0.94, + "learning_rate": 3.5816908454227115e-06, + "loss": 0.7936, + "step": 881000 + }, + { + "epoch": 0.95, + "learning_rate": 3.566713356678339e-06, + "loss": 0.8068, + "step": 881500 + }, + { + "epoch": 0.95, + "learning_rate": 3.5517058529264633e-06, + "loss": 0.7924, + "step": 882000 + }, + { + "epoch": 0.95, + "learning_rate": 3.5366983491745873e-06, + "loss": 0.7907, + "step": 882500 + }, + { + "epoch": 0.95, + "learning_rate": 3.5216908454227116e-06, + "loss": 0.7876, + "step": 883000 + }, + { + "epoch": 0.95, + "learning_rate": 3.506713356678339e-06, + "loss": 0.7992, + "step": 883500 + }, + { + "epoch": 0.95, + "learning_rate": 3.4917058529264635e-06, + "loss": 0.7884, + "step": 884000 + }, + { + "epoch": 0.95, + "learning_rate": 3.4766983491745874e-06, + "loss": 0.7833, + "step": 884500 + }, + { + "epoch": 0.95, + "learning_rate": 3.4616908454227118e-06, + "loss": 0.7935, + "step": 885000 + }, + { + "epoch": 0.95, + "learning_rate": 3.4467133566783393e-06, + "loss": 0.7985, + "step": 885500 + }, + { + "epoch": 0.95, + "learning_rate": 3.431705852926463e-06, + "loss": 0.7935, + "step": 886000 + }, + { + "epoch": 0.95, + "learning_rate": 3.416698349174587e-06, + "loss": 0.7787, + "step": 886500 + }, + { + "epoch": 0.95, + "learning_rate": 3.4016908454227115e-06, + "loss": 0.7971, + "step": 887000 + }, + { + "epoch": 0.95, + "learning_rate": 3.3867133566783394e-06, + "loss": 0.7929, + "step": 887500 + }, + { + "epoch": 0.95, + "learning_rate": 3.3717058529264634e-06, + "loss": 0.7943, + "step": 888000 + }, + { + "epoch": 0.95, + "learning_rate": 3.3566983491745877e-06, + "loss": 0.7842, + "step": 888500 + }, + { + "epoch": 0.95, + "learning_rate": 3.3416908454227116e-06, + "loss": 0.7941, + "step": 889000 + }, + { + "epoch": 0.95, + "learning_rate": 3.326713356678339e-06, + "loss": 0.7939, + "step": 889500 + }, + { + "epoch": 0.95, + "learning_rate": 3.311735867933967e-06, + "loss": 0.7854, + "step": 890000 + }, + { + "epoch": 0.95, + "eval_loss": 0.742018461227417, + "eval_runtime": 599.0341, + "eval_samples_per_second": 166.935, + "eval_steps_per_second": 41.734, + "step": 890000 + }, + { + "epoch": 0.95, + "learning_rate": 3.296728364182091e-06, + "loss": 0.7893, + "step": 890500 + }, + { + "epoch": 0.96, + "learning_rate": 3.281720860430215e-06, + "loss": 0.803, + "step": 891000 + }, + { + "epoch": 0.96, + "learning_rate": 3.2667133566783393e-06, + "loss": 0.802, + "step": 891500 + }, + { + "epoch": 0.96, + "learning_rate": 3.2517058529264632e-06, + "loss": 0.7993, + "step": 892000 + }, + { + "epoch": 0.96, + "learning_rate": 3.2366983491745876e-06, + "loss": 0.8021, + "step": 892500 + }, + { + "epoch": 0.96, + "learning_rate": 3.2216908454227115e-06, + "loss": 0.7878, + "step": 893000 + }, + { + "epoch": 0.96, + "learning_rate": 3.2066833416708354e-06, + "loss": 0.781, + "step": 893500 + }, + { + "epoch": 0.96, + "learning_rate": 3.1916758379189594e-06, + "loss": 0.8049, + "step": 894000 + }, + { + "epoch": 0.96, + "learning_rate": 3.1766683341670837e-06, + "loss": 0.8089, + "step": 894500 + }, + { + "epoch": 0.96, + "learning_rate": 3.1616608304152077e-06, + "loss": 0.7842, + "step": 895000 + }, + { + "epoch": 0.96, + "learning_rate": 3.1466533266633316e-06, + "loss": 0.7958, + "step": 895500 + }, + { + "epoch": 0.96, + "learning_rate": 3.131645822911456e-06, + "loss": 0.7933, + "step": 896000 + }, + { + "epoch": 0.96, + "learning_rate": 3.116668334167084e-06, + "loss": 0.7816, + "step": 896500 + }, + { + "epoch": 0.96, + "learning_rate": 3.101660830415208e-06, + "loss": 0.798, + "step": 897000 + }, + { + "epoch": 0.96, + "learning_rate": 3.0866833416708353e-06, + "loss": 0.7837, + "step": 897500 + }, + { + "epoch": 0.96, + "learning_rate": 3.0716758379189593e-06, + "loss": 0.7781, + "step": 898000 + }, + { + "epoch": 0.96, + "learning_rate": 3.0566683341670836e-06, + "loss": 0.7966, + "step": 898500 + }, + { + "epoch": 0.96, + "learning_rate": 3.0416608304152075e-06, + "loss": 0.7896, + "step": 899000 + }, + { + "epoch": 0.96, + "learning_rate": 3.026653326663332e-06, + "loss": 0.776, + "step": 899500 + }, + { + "epoch": 0.97, + "learning_rate": 3.011645822911456e-06, + "loss": 0.7784, + "step": 900000 + }, + { + "epoch": 0.97, + "eval_loss": 0.7412576675415039, + "eval_runtime": 624.3312, + "eval_samples_per_second": 160.171, + "eval_steps_per_second": 40.043, + "step": 900000 + }, + { + "epoch": 0.97, + "learning_rate": 2.99663831915958e-06, + "loss": 0.7892, + "step": 900500 + }, + { + "epoch": 0.97, + "learning_rate": 2.9816608304152077e-06, + "loss": 0.7944, + "step": 901000 + }, + { + "epoch": 0.97, + "learning_rate": 2.9666533266633316e-06, + "loss": 0.7818, + "step": 901500 + }, + { + "epoch": 0.97, + "learning_rate": 2.9516458229114555e-06, + "loss": 0.7921, + "step": 902000 + }, + { + "epoch": 0.97, + "learning_rate": 2.93663831915958e-06, + "loss": 0.7927, + "step": 902500 + }, + { + "epoch": 0.97, + "learning_rate": 2.921630815407704e-06, + "loss": 0.7877, + "step": 903000 + }, + { + "epoch": 0.97, + "learning_rate": 2.906623311655828e-06, + "loss": 0.7997, + "step": 903500 + }, + { + "epoch": 0.97, + "learning_rate": 2.8916458229114557e-06, + "loss": 0.7847, + "step": 904000 + }, + { + "epoch": 0.97, + "learning_rate": 2.87663831915958e-06, + "loss": 0.7988, + "step": 904500 + }, + { + "epoch": 0.97, + "learning_rate": 2.861630815407704e-06, + "loss": 0.7929, + "step": 905000 + }, + { + "epoch": 0.97, + "learning_rate": 2.846623311655828e-06, + "loss": 0.7907, + "step": 905500 + }, + { + "epoch": 0.97, + "learning_rate": 2.831615807903952e-06, + "loss": 0.7932, + "step": 906000 + }, + { + "epoch": 0.97, + "learning_rate": 2.816608304152076e-06, + "loss": 0.7834, + "step": 906500 + }, + { + "epoch": 0.97, + "learning_rate": 2.8016008004002e-06, + "loss": 0.7859, + "step": 907000 + }, + { + "epoch": 0.97, + "learning_rate": 2.7865932966483245e-06, + "loss": 0.7873, + "step": 907500 + }, + { + "epoch": 0.97, + "learning_rate": 2.771615807903952e-06, + "loss": 0.786, + "step": 908000 + }, + { + "epoch": 0.97, + "learning_rate": 2.7566083041520763e-06, + "loss": 0.7941, + "step": 908500 + }, + { + "epoch": 0.97, + "learning_rate": 2.7416308154077043e-06, + "loss": 0.7962, + "step": 909000 + }, + { + "epoch": 0.98, + "learning_rate": 2.726623311655828e-06, + "loss": 0.7934, + "step": 909500 + }, + { + "epoch": 0.98, + "learning_rate": 2.7116158079039517e-06, + "loss": 0.7874, + "step": 910000 + }, + { + "epoch": 0.98, + "eval_loss": 0.7403942942619324, + "eval_runtime": 608.9146, + "eval_samples_per_second": 164.227, + "eval_steps_per_second": 41.057, + "step": 910000 + }, + { + "epoch": 0.98, + "learning_rate": 2.6966383191595797e-06, + "loss": 0.7744, + "step": 910500 + }, + { + "epoch": 0.98, + "learning_rate": 2.681630815407704e-06, + "loss": 0.8014, + "step": 911000 + }, + { + "epoch": 0.98, + "learning_rate": 2.666623311655828e-06, + "loss": 0.786, + "step": 911500 + }, + { + "epoch": 0.98, + "learning_rate": 2.6516158079039523e-06, + "loss": 0.7952, + "step": 912000 + }, + { + "epoch": 0.98, + "learning_rate": 2.6366083041520762e-06, + "loss": 0.7962, + "step": 912500 + }, + { + "epoch": 0.98, + "learning_rate": 2.621630815407704e-06, + "loss": 0.793, + "step": 913000 + }, + { + "epoch": 0.98, + "learning_rate": 2.6066233116558277e-06, + "loss": 0.7885, + "step": 913500 + }, + { + "epoch": 0.98, + "learning_rate": 2.591615807903952e-06, + "loss": 0.7883, + "step": 914000 + }, + { + "epoch": 0.98, + "learning_rate": 2.576608304152076e-06, + "loss": 0.7811, + "step": 914500 + }, + { + "epoch": 0.98, + "learning_rate": 2.5616008004002003e-06, + "loss": 0.7862, + "step": 915000 + }, + { + "epoch": 0.98, + "learning_rate": 2.5465932966483242e-06, + "loss": 0.7899, + "step": 915500 + }, + { + "epoch": 0.98, + "learning_rate": 2.531585792896448e-06, + "loss": 0.771, + "step": 916000 + }, + { + "epoch": 0.98, + "learning_rate": 2.5165782891445725e-06, + "loss": 0.7835, + "step": 916500 + }, + { + "epoch": 0.98, + "learning_rate": 2.5015707853926965e-06, + "loss": 0.779, + "step": 917000 + }, + { + "epoch": 0.98, + "learning_rate": 2.486593296648324e-06, + "loss": 0.7986, + "step": 917500 + }, + { + "epoch": 0.98, + "learning_rate": 2.4715857928964483e-06, + "loss": 0.7895, + "step": 918000 + }, + { + "epoch": 0.98, + "learning_rate": 2.4565782891445723e-06, + "loss": 0.7882, + "step": 918500 + }, + { + "epoch": 0.99, + "learning_rate": 2.441570785392696e-06, + "loss": 0.786, + "step": 919000 + }, + { + "epoch": 0.99, + "learning_rate": 2.4265632816408205e-06, + "loss": 0.7866, + "step": 919500 + }, + { + "epoch": 0.99, + "learning_rate": 2.4115557778889445e-06, + "loss": 0.7727, + "step": 920000 + }, + { + "epoch": 0.99, + "eval_loss": 0.7387272715568542, + "eval_runtime": 611.0966, + "eval_samples_per_second": 163.64, + "eval_steps_per_second": 40.91, + "step": 920000 + }, + { + "epoch": 0.99, + "learning_rate": 2.3965782891445724e-06, + "loss": 0.79, + "step": 920500 + }, + { + "epoch": 0.99, + "learning_rate": 2.3815707853926968e-06, + "loss": 0.7841, + "step": 921000 + }, + { + "epoch": 0.99, + "learning_rate": 2.3665632816408203e-06, + "loss": 0.797, + "step": 921500 + }, + { + "epoch": 0.99, + "learning_rate": 2.351555777888944e-06, + "loss": 0.7758, + "step": 922000 + }, + { + "epoch": 0.99, + "learning_rate": 2.3365482741370685e-06, + "loss": 0.7854, + "step": 922500 + }, + { + "epoch": 0.99, + "learning_rate": 2.3215707853926965e-06, + "loss": 0.7877, + "step": 923000 + }, + { + "epoch": 0.99, + "learning_rate": 2.3065632816408204e-06, + "loss": 0.787, + "step": 923500 + }, + { + "epoch": 0.99, + "learning_rate": 2.2915557778889448e-06, + "loss": 0.7823, + "step": 924000 + }, + { + "epoch": 0.99, + "learning_rate": 2.2765482741370687e-06, + "loss": 0.7831, + "step": 924500 + }, + { + "epoch": 0.99, + "learning_rate": 2.261540770385193e-06, + "loss": 0.7846, + "step": 925000 + }, + { + "epoch": 0.99, + "learning_rate": 2.2465332666333166e-06, + "loss": 0.7889, + "step": 925500 + }, + { + "epoch": 0.99, + "learning_rate": 2.2315257628814405e-06, + "loss": 0.7796, + "step": 926000 + }, + { + "epoch": 0.99, + "learning_rate": 2.216518259129565e-06, + "loss": 0.785, + "step": 926500 + }, + { + "epoch": 0.99, + "learning_rate": 2.2015407703851928e-06, + "loss": 0.8018, + "step": 927000 + }, + { + "epoch": 0.99, + "learning_rate": 2.1865332666333167e-06, + "loss": 0.787, + "step": 927500 + }, + { + "epoch": 1.0, + "learning_rate": 2.171525762881441e-06, + "loss": 0.7962, + "step": 928000 + }, + { + "epoch": 1.0, + "learning_rate": 2.156518259129565e-06, + "loss": 0.783, + "step": 928500 + }, + { + "epoch": 1.0, + "learning_rate": 2.141540770385193e-06, + "loss": 0.8057, + "step": 929000 + }, + { + "epoch": 1.0, + "learning_rate": 2.1265332666333164e-06, + "loss": 0.7904, + "step": 929500 + }, + { + "epoch": 1.0, + "learning_rate": 2.111525762881441e-06, + "loss": 0.7756, + "step": 930000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7374897003173828, + "eval_runtime": 578.7073, + "eval_samples_per_second": 172.799, + "eval_steps_per_second": 43.2, + "step": 930000 + }, + { + "epoch": 1.0, + "learning_rate": 2.0965182591295647e-06, + "loss": 0.7893, + "step": 930500 + }, + { + "epoch": 1.0, + "learning_rate": 2.081510755377689e-06, + "loss": 0.7763, + "step": 931000 + }, + { + "epoch": 1.0, + "learning_rate": 2.066503251625813e-06, + "loss": 0.7817, + "step": 931500 + }, + { + "epoch": 1.0, + "learning_rate": 2.051525762881441e-06, + "loss": 0.7807, + "step": 932000 + }, + { + "epoch": 1.0, + "learning_rate": 2.036518259129565e-06, + "loss": 0.7849, + "step": 932500 + }, + { + "epoch": 1.0, + "learning_rate": 2.0215107553776892e-06, + "loss": 0.7735, + "step": 933000 + }, + { + "epoch": 1.0, + "learning_rate": 2.0065032516258127e-06, + "loss": 0.7658, + "step": 933500 + }, + { + "epoch": 1.0, + "learning_rate": 1.991495747873937e-06, + "loss": 0.7667, + "step": 934000 + }, + { + "epoch": 1.0, + "learning_rate": 1.976488244122061e-06, + "loss": 0.7655, + "step": 934500 + }, + { + "epoch": 1.0, + "learning_rate": 1.961480740370185e-06, + "loss": 0.7673, + "step": 935000 + }, + { + "epoch": 1.0, + "learning_rate": 1.9464732366183093e-06, + "loss": 0.7603, + "step": 935500 + }, + { + "epoch": 1.0, + "learning_rate": 1.9314957478739372e-06, + "loss": 0.7651, + "step": 936000 + }, + { + "epoch": 1.0, + "learning_rate": 1.916488244122061e-06, + "loss": 0.76, + "step": 936500 + }, + { + "epoch": 1.0, + "learning_rate": 1.9014807403701853e-06, + "loss": 0.762, + "step": 937000 + }, + { + "epoch": 1.01, + "learning_rate": 1.8864732366183092e-06, + "loss": 0.7732, + "step": 937500 + }, + { + "epoch": 1.01, + "learning_rate": 1.8714657328664334e-06, + "loss": 0.764, + "step": 938000 + }, + { + "epoch": 1.01, + "learning_rate": 1.8564582291145573e-06, + "loss": 0.7633, + "step": 938500 + }, + { + "epoch": 1.01, + "learning_rate": 1.841480740370185e-06, + "loss": 0.7701, + "step": 939000 + }, + { + "epoch": 1.01, + "learning_rate": 1.8264732366183092e-06, + "loss": 0.7654, + "step": 939500 + }, + { + "epoch": 1.01, + "learning_rate": 1.8114657328664333e-06, + "loss": 0.7588, + "step": 940000 + }, + { + "epoch": 1.01, + "eval_loss": 0.737705409526825, + "eval_runtime": 579.0479, + "eval_samples_per_second": 172.697, + "eval_steps_per_second": 43.174, + "step": 940000 + }, + { + "epoch": 1.01, + "learning_rate": 1.7964582291145573e-06, + "loss": 0.7784, + "step": 940500 + }, + { + "epoch": 1.01, + "learning_rate": 1.7814507253626814e-06, + "loss": 0.7521, + "step": 941000 + }, + { + "epoch": 1.01, + "learning_rate": 1.7664432216108055e-06, + "loss": 0.7575, + "step": 941500 + }, + { + "epoch": 1.01, + "learning_rate": 1.7514357178589297e-06, + "loss": 0.7786, + "step": 942000 + }, + { + "epoch": 1.01, + "learning_rate": 1.7364582291145572e-06, + "loss": 0.7631, + "step": 942500 + }, + { + "epoch": 1.01, + "learning_rate": 1.7214507253626813e-06, + "loss": 0.756, + "step": 943000 + }, + { + "epoch": 1.01, + "learning_rate": 1.7064432216108055e-06, + "loss": 0.7576, + "step": 943500 + }, + { + "epoch": 1.01, + "learning_rate": 1.6914357178589296e-06, + "loss": 0.7712, + "step": 944000 + }, + { + "epoch": 1.01, + "learning_rate": 1.6764282141070535e-06, + "loss": 0.7553, + "step": 944500 + }, + { + "epoch": 1.01, + "learning_rate": 1.6614507253626813e-06, + "loss": 0.7592, + "step": 945000 + }, + { + "epoch": 1.01, + "learning_rate": 1.6464432216108054e-06, + "loss": 0.7677, + "step": 945500 + }, + { + "epoch": 1.01, + "learning_rate": 1.6314357178589296e-06, + "loss": 0.7628, + "step": 946000 + }, + { + "epoch": 1.01, + "learning_rate": 1.6164282141070535e-06, + "loss": 0.7724, + "step": 946500 + }, + { + "epoch": 1.02, + "learning_rate": 1.6014207103551776e-06, + "loss": 0.7541, + "step": 947000 + }, + { + "epoch": 1.02, + "learning_rate": 1.5864432216108056e-06, + "loss": 0.7656, + "step": 947500 + }, + { + "epoch": 1.02, + "learning_rate": 1.5714357178589295e-06, + "loss": 0.7726, + "step": 948000 + }, + { + "epoch": 1.02, + "learning_rate": 1.5564282141070534e-06, + "loss": 0.755, + "step": 948500 + }, + { + "epoch": 1.02, + "learning_rate": 1.5414207103551776e-06, + "loss": 0.7555, + "step": 949000 + }, + { + "epoch": 1.02, + "learning_rate": 1.5264132066033017e-06, + "loss": 0.765, + "step": 949500 + }, + { + "epoch": 1.02, + "learning_rate": 1.5114057028514259e-06, + "loss": 0.7734, + "step": 950000 + }, + { + "epoch": 1.02, + "eval_loss": 0.736987292766571, + "eval_runtime": 575.6732, + "eval_samples_per_second": 173.71, + "eval_steps_per_second": 43.427, + "step": 950000 + }, + { + "epoch": 1.02, + "learning_rate": 1.4963981990995498e-06, + "loss": 0.7605, + "step": 950500 + }, + { + "epoch": 1.02, + "learning_rate": 1.4814207103551775e-06, + "loss": 0.7677, + "step": 951000 + }, + { + "epoch": 1.02, + "learning_rate": 1.4664132066033016e-06, + "loss": 0.764, + "step": 951500 + }, + { + "epoch": 1.02, + "learning_rate": 1.4514057028514258e-06, + "loss": 0.7535, + "step": 952000 + }, + { + "epoch": 1.02, + "learning_rate": 1.4363981990995497e-06, + "loss": 0.7746, + "step": 952500 + }, + { + "epoch": 1.02, + "learning_rate": 1.4213906953476739e-06, + "loss": 0.7642, + "step": 953000 + }, + { + "epoch": 1.02, + "learning_rate": 1.406383191595798e-06, + "loss": 0.7574, + "step": 953500 + }, + { + "epoch": 1.02, + "learning_rate": 1.391375687843922e-06, + "loss": 0.7556, + "step": 954000 + }, + { + "epoch": 1.02, + "learning_rate": 1.3763981990995497e-06, + "loss": 0.7684, + "step": 954500 + }, + { + "epoch": 1.02, + "learning_rate": 1.3613906953476738e-06, + "loss": 0.7678, + "step": 955000 + }, + { + "epoch": 1.02, + "learning_rate": 1.346383191595798e-06, + "loss": 0.7582, + "step": 955500 + }, + { + "epoch": 1.03, + "learning_rate": 1.331375687843922e-06, + "loss": 0.7699, + "step": 956000 + }, + { + "epoch": 1.03, + "learning_rate": 1.316368184092046e-06, + "loss": 0.7704, + "step": 956500 + }, + { + "epoch": 1.03, + "learning_rate": 1.3013606803401702e-06, + "loss": 0.7631, + "step": 957000 + }, + { + "epoch": 1.03, + "learning_rate": 1.2863531765882943e-06, + "loss": 0.7781, + "step": 957500 + }, + { + "epoch": 1.03, + "learning_rate": 1.2713456728364182e-06, + "loss": 0.7707, + "step": 958000 + }, + { + "epoch": 1.03, + "learning_rate": 1.256368184092046e-06, + "loss": 0.7599, + "step": 958500 + }, + { + "epoch": 1.03, + "learning_rate": 1.24136068034017e-06, + "loss": 0.7639, + "step": 959000 + }, + { + "epoch": 1.03, + "learning_rate": 1.2263531765882942e-06, + "loss": 0.7784, + "step": 959500 + }, + { + "epoch": 1.03, + "learning_rate": 1.2113456728364182e-06, + "loss": 0.7563, + "step": 960000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7364427447319031, + "eval_runtime": 576.5734, + "eval_samples_per_second": 173.438, + "eval_steps_per_second": 43.36, + "step": 960000 + }, + { + "epoch": 1.03, + "learning_rate": 1.196368184092046e-06, + "loss": 0.7694, + "step": 960500 + }, + { + "epoch": 1.03, + "learning_rate": 1.18136068034017e-06, + "loss": 0.7588, + "step": 961000 + }, + { + "epoch": 1.03, + "learning_rate": 1.1663531765882942e-06, + "loss": 0.7441, + "step": 961500 + }, + { + "epoch": 1.03, + "learning_rate": 1.1513756878439221e-06, + "loss": 0.7665, + "step": 962000 + }, + { + "epoch": 1.03, + "learning_rate": 1.136368184092046e-06, + "loss": 0.7515, + "step": 962500 + }, + { + "epoch": 1.03, + "learning_rate": 1.1213606803401702e-06, + "loss": 0.7683, + "step": 963000 + }, + { + "epoch": 1.03, + "learning_rate": 1.1063531765882941e-06, + "loss": 0.7637, + "step": 963500 + }, + { + "epoch": 1.03, + "learning_rate": 1.0913456728364183e-06, + "loss": 0.7537, + "step": 964000 + }, + { + "epoch": 1.03, + "learning_rate": 1.076368184092046e-06, + "loss": 0.7667, + "step": 964500 + }, + { + "epoch": 1.03, + "learning_rate": 1.0613606803401701e-06, + "loss": 0.7663, + "step": 965000 + }, + { + "epoch": 1.04, + "learning_rate": 1.0463531765882943e-06, + "loss": 0.7604, + "step": 965500 + }, + { + "epoch": 1.04, + "learning_rate": 1.0313456728364184e-06, + "loss": 0.7534, + "step": 966000 + }, + { + "epoch": 1.04, + "learning_rate": 1.0163381690845423e-06, + "loss": 0.771, + "step": 966500 + }, + { + "epoch": 1.04, + "learning_rate": 1.0013306653326663e-06, + "loss": 0.7567, + "step": 967000 + }, + { + "epoch": 1.04, + "learning_rate": 9.863531765882942e-07, + "loss": 0.7632, + "step": 967500 + }, + { + "epoch": 1.04, + "learning_rate": 9.713456728364183e-07, + "loss": 0.7728, + "step": 968000 + }, + { + "epoch": 1.04, + "learning_rate": 9.563381690845423e-07, + "loss": 0.754, + "step": 968500 + }, + { + "epoch": 1.04, + "learning_rate": 9.413306653326664e-07, + "loss": 0.7629, + "step": 969000 + }, + { + "epoch": 1.04, + "learning_rate": 9.263231615807905e-07, + "loss": 0.7696, + "step": 969500 + }, + { + "epoch": 1.04, + "learning_rate": 9.113156578289145e-07, + "loss": 0.7753, + "step": 970000 + }, + { + "epoch": 1.04, + "eval_loss": 0.7353302240371704, + "eval_runtime": 645.1885, + "eval_samples_per_second": 154.993, + "eval_steps_per_second": 38.748, + "step": 970000 + }, + { + "epoch": 1.04, + "learning_rate": 8.963081540770385e-07, + "loss": 0.7666, + "step": 970500 + }, + { + "epoch": 1.04, + "learning_rate": 8.813006503251626e-07, + "loss": 0.7675, + "step": 971000 + }, + { + "epoch": 1.04, + "learning_rate": 8.663231615807905e-07, + "loss": 0.777, + "step": 971500 + }, + { + "epoch": 1.04, + "learning_rate": 8.513156578289144e-07, + "loss": 0.7657, + "step": 972000 + }, + { + "epoch": 1.04, + "learning_rate": 8.363081540770386e-07, + "loss": 0.7698, + "step": 972500 + }, + { + "epoch": 1.04, + "learning_rate": 8.213006503251626e-07, + "loss": 0.7571, + "step": 973000 + }, + { + "epoch": 1.04, + "learning_rate": 8.063231615807904e-07, + "loss": 0.7628, + "step": 973500 + }, + { + "epoch": 1.04, + "learning_rate": 7.913156578289145e-07, + "loss": 0.7672, + "step": 974000 + }, + { + "epoch": 1.05, + "learning_rate": 7.763081540770386e-07, + "loss": 0.7677, + "step": 974500 + }, + { + "epoch": 1.05, + "learning_rate": 7.613306653326663e-07, + "loss": 0.7624, + "step": 975000 + }, + { + "epoch": 1.05, + "learning_rate": 7.463231615807905e-07, + "loss": 0.7617, + "step": 975500 + }, + { + "epoch": 1.05, + "learning_rate": 7.313156578289144e-07, + "loss": 0.7627, + "step": 976000 + }, + { + "epoch": 1.05, + "learning_rate": 7.163081540770386e-07, + "loss": 0.7499, + "step": 976500 + }, + { + "epoch": 1.05, + "learning_rate": 7.013006503251626e-07, + "loss": 0.7753, + "step": 977000 + }, + { + "epoch": 1.05, + "learning_rate": 6.862931465732867e-07, + "loss": 0.7663, + "step": 977500 + }, + { + "epoch": 1.05, + "learning_rate": 6.713156578289145e-07, + "loss": 0.7773, + "step": 978000 + }, + { + "epoch": 1.05, + "learning_rate": 6.563381690845423e-07, + "loss": 0.7573, + "step": 978500 + }, + { + "epoch": 1.05, + "learning_rate": 6.413306653326663e-07, + "loss": 0.7679, + "step": 979000 + }, + { + "epoch": 1.05, + "learning_rate": 6.263231615807905e-07, + "loss": 0.7665, + "step": 979500 + }, + { + "epoch": 1.05, + "learning_rate": 6.113156578289144e-07, + "loss": 0.7651, + "step": 980000 + }, + { + "epoch": 1.05, + "eval_loss": 0.7347660064697266, + "eval_runtime": 596.5908, + "eval_samples_per_second": 167.619, + "eval_steps_per_second": 41.905, + "step": 980000 + }, + { + "epoch": 1.05, + "learning_rate": 5.963081540770385e-07, + "loss": 0.7661, + "step": 980500 + }, + { + "epoch": 1.05, + "learning_rate": 5.813006503251626e-07, + "loss": 0.7551, + "step": 981000 + }, + { + "epoch": 1.05, + "learning_rate": 5.662931465732867e-07, + "loss": 0.7647, + "step": 981500 + }, + { + "epoch": 1.05, + "learning_rate": 5.512856428214107e-07, + "loss": 0.7543, + "step": 982000 + }, + { + "epoch": 1.05, + "learning_rate": 5.362781390695348e-07, + "loss": 0.7666, + "step": 982500 + }, + { + "epoch": 1.05, + "learning_rate": 5.212706353176588e-07, + "loss": 0.7721, + "step": 983000 + }, + { + "epoch": 1.05, + "learning_rate": 5.06263131565783e-07, + "loss": 0.7605, + "step": 983500 + }, + { + "epoch": 1.06, + "learning_rate": 4.912856428214107e-07, + "loss": 0.7602, + "step": 984000 + }, + { + "epoch": 1.06, + "learning_rate": 4.762781390695348e-07, + "loss": 0.7553, + "step": 984500 + }, + { + "epoch": 1.06, + "learning_rate": 4.612706353176588e-07, + "loss": 0.7499, + "step": 985000 + }, + { + "epoch": 1.06, + "learning_rate": 4.462631315657829e-07, + "loss": 0.7592, + "step": 985500 + }, + { + "epoch": 1.06, + "learning_rate": 4.3125562781390695e-07, + "loss": 0.7653, + "step": 986000 + }, + { + "epoch": 1.06, + "learning_rate": 4.1624812406203104e-07, + "loss": 0.7705, + "step": 986500 + }, + { + "epoch": 1.06, + "learning_rate": 4.012406203101551e-07, + "loss": 0.7631, + "step": 987000 + }, + { + "epoch": 1.06, + "learning_rate": 3.862631315657829e-07, + "loss": 0.7501, + "step": 987500 + }, + { + "epoch": 1.06, + "learning_rate": 3.7125562781390694e-07, + "loss": 0.7564, + "step": 988000 + }, + { + "epoch": 1.06, + "learning_rate": 3.5624812406203103e-07, + "loss": 0.7495, + "step": 988500 + }, + { + "epoch": 1.06, + "learning_rate": 3.4124062031015507e-07, + "loss": 0.7605, + "step": 989000 + }, + { + "epoch": 1.06, + "learning_rate": 3.2623311655827916e-07, + "loss": 0.7698, + "step": 989500 + }, + { + "epoch": 1.06, + "learning_rate": 3.1125562781390693e-07, + "loss": 0.7526, + "step": 990000 + }, + { + "epoch": 1.06, + "eval_loss": 0.7347068190574646, + "eval_runtime": 582.4895, + "eval_samples_per_second": 171.677, + "eval_steps_per_second": 42.919, + "step": 990000 + }, + { + "epoch": 1.06, + "learning_rate": 2.96248124062031e-07, + "loss": 0.7615, + "step": 990500 + }, + { + "epoch": 1.06, + "learning_rate": 2.8124062031015506e-07, + "loss": 0.7677, + "step": 991000 + }, + { + "epoch": 1.06, + "learning_rate": 2.6623311655827915e-07, + "loss": 0.754, + "step": 991500 + }, + { + "epoch": 1.06, + "learning_rate": 2.512256128064032e-07, + "loss": 0.7632, + "step": 992000 + }, + { + "epoch": 1.06, + "learning_rate": 2.3621810905452727e-07, + "loss": 0.7677, + "step": 992500 + }, + { + "epoch": 1.06, + "learning_rate": 2.2121060530265134e-07, + "loss": 0.7546, + "step": 993000 + }, + { + "epoch": 1.07, + "learning_rate": 2.0623311655827914e-07, + "loss": 0.7602, + "step": 993500 + }, + { + "epoch": 1.07, + "learning_rate": 1.9125562781390697e-07, + "loss": 0.7498, + "step": 994000 + }, + { + "epoch": 1.07, + "learning_rate": 1.7624812406203103e-07, + "loss": 0.7587, + "step": 994500 + }, + { + "epoch": 1.07, + "learning_rate": 1.612406203101551e-07, + "loss": 0.7581, + "step": 995000 + }, + { + "epoch": 1.07, + "learning_rate": 1.4623311655827916e-07, + "loss": 0.7569, + "step": 995500 + }, + { + "epoch": 1.07, + "learning_rate": 1.3122561280640322e-07, + "loss": 0.7696, + "step": 996000 + }, + { + "epoch": 1.07, + "learning_rate": 1.1621810905452727e-07, + "loss": 0.7511, + "step": 996500 + }, + { + "epoch": 1.07, + "learning_rate": 1.0121060530265133e-07, + "loss": 0.7672, + "step": 997000 + }, + { + "epoch": 1.07, + "learning_rate": 8.620310155077538e-08, + "loss": 0.7461, + "step": 997500 + }, + { + "epoch": 1.07, + "learning_rate": 7.119559779889944e-08, + "loss": 0.7562, + "step": 998000 + }, + { + "epoch": 1.07, + "learning_rate": 5.6188094047023514e-08, + "loss": 0.7609, + "step": 998500 + }, + { + "epoch": 1.07, + "learning_rate": 4.1210605302651325e-08, + "loss": 0.7588, + "step": 999000 + }, + { + "epoch": 1.07, + "learning_rate": 2.6203101550775387e-08, + "loss": 0.7513, + "step": 999500 + }, + { + "epoch": 1.07, + "learning_rate": 1.119559779889945e-08, + "loss": 0.755, + "step": 1000000 + }, + { + "epoch": 1.07, + "eval_loss": 0.7345585823059082, + "eval_runtime": 579.2567, + "eval_samples_per_second": 172.635, + "eval_steps_per_second": 43.159, + "step": 1000000 + } + ], + "max_steps": 1000000, + "num_train_epochs": 2, + "total_flos": 2.242924955382055e+18, + "trial_name": null, + "trial_params": null +}