{ "best_metric": 2.785961627960205, "best_model_checkpoint": "experiments/qg/google/mt5-large_all/checkpoint-126000", "epoch": 1.9977723341340248, "global_step": 126000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3e-05, "loss": 8.8865, "step": 500 }, { "epoch": 0.02, "learning_rate": 2.9976277372262774e-05, "loss": 4.0325, "step": 1000 }, { "epoch": 0.02, "eval_loss": 3.3621604442596436, "eval_runtime": 444.6073, "eval_samples_per_second": 89.967, "eval_steps_per_second": 11.246, "step": 1000 }, { "epoch": 0.02, "learning_rate": 2.9952554744525548e-05, "loss": 3.6162, "step": 1500 }, { "epoch": 0.03, "learning_rate": 2.992883211678832e-05, "loss": 3.4961, "step": 2000 }, { "epoch": 0.03, "eval_loss": 3.1849684715270996, "eval_runtime": 443.8466, "eval_samples_per_second": 90.121, "eval_steps_per_second": 11.265, "step": 2000 }, { "epoch": 0.04, "learning_rate": 2.9905109489051095e-05, "loss": 3.4531, "step": 2500 }, { "epoch": 0.05, "learning_rate": 2.988138686131387e-05, "loss": 3.3883, "step": 3000 }, { "epoch": 0.05, "eval_loss": 3.0837316513061523, "eval_runtime": 451.6674, "eval_samples_per_second": 88.561, "eval_steps_per_second": 11.07, "step": 3000 }, { "epoch": 0.06, "learning_rate": 2.9857664233576645e-05, "loss": 3.3483, "step": 3500 }, { "epoch": 0.06, "learning_rate": 2.9833941605839416e-05, "loss": 3.281, "step": 4000 }, { "epoch": 0.06, "eval_loss": 3.053595781326294, "eval_runtime": 451.6413, "eval_samples_per_second": 88.566, "eval_steps_per_second": 11.071, "step": 4000 }, { "epoch": 0.07, "learning_rate": 2.9810218978102192e-05, "loss": 3.2901, "step": 4500 }, { "epoch": 0.08, "learning_rate": 2.9786496350364963e-05, "loss": 3.2764, "step": 5000 }, { "epoch": 0.08, "eval_loss": 3.045032262802124, "eval_runtime": 450.7805, "eval_samples_per_second": 88.735, "eval_steps_per_second": 11.092, "step": 5000 }, { "epoch": 0.09, "learning_rate": 2.9762773722627736e-05, "loss": 3.255, "step": 5500 }, { "epoch": 0.1, "learning_rate": 2.9739051094890513e-05, "loss": 3.2352, "step": 6000 }, { "epoch": 0.1, "eval_loss": 3.018794059753418, "eval_runtime": 451.4803, "eval_samples_per_second": 88.597, "eval_steps_per_second": 11.075, "step": 6000 }, { "epoch": 0.1, "learning_rate": 2.9715328467153283e-05, "loss": 3.1954, "step": 6500 }, { "epoch": 0.11, "learning_rate": 2.969160583941606e-05, "loss": 3.1939, "step": 7000 }, { "epoch": 0.11, "eval_loss": 3.000720500946045, "eval_runtime": 450.9264, "eval_samples_per_second": 88.706, "eval_steps_per_second": 11.088, "step": 7000 }, { "epoch": 0.12, "learning_rate": 2.966788321167883e-05, "loss": 3.1893, "step": 7500 }, { "epoch": 0.13, "learning_rate": 2.9644160583941607e-05, "loss": 3.1936, "step": 8000 }, { "epoch": 0.13, "eval_loss": 2.988145589828491, "eval_runtime": 451.5214, "eval_samples_per_second": 88.589, "eval_steps_per_second": 11.074, "step": 8000 }, { "epoch": 0.13, "learning_rate": 2.962043795620438e-05, "loss": 3.1604, "step": 8500 }, { "epoch": 0.14, "learning_rate": 2.9596715328467154e-05, "loss": 3.1765, "step": 9000 }, { "epoch": 0.14, "eval_loss": 2.9744112491607666, "eval_runtime": 449.7283, "eval_samples_per_second": 88.943, "eval_steps_per_second": 11.118, "step": 9000 }, { "epoch": 0.15, "learning_rate": 2.9572992700729928e-05, "loss": 3.1478, "step": 9500 }, { "epoch": 0.16, "learning_rate": 2.95492700729927e-05, "loss": 3.1532, "step": 10000 }, { "epoch": 0.16, "eval_loss": 2.960232734680176, "eval_runtime": 450.2869, "eval_samples_per_second": 88.832, "eval_steps_per_second": 11.104, "step": 10000 }, { "epoch": 0.17, "learning_rate": 2.9525547445255475e-05, "loss": 3.1187, "step": 10500 }, { "epoch": 0.17, "learning_rate": 2.950182481751825e-05, "loss": 3.1318, "step": 11000 }, { "epoch": 0.17, "eval_loss": 2.956113338470459, "eval_runtime": 450.2104, "eval_samples_per_second": 88.847, "eval_steps_per_second": 11.106, "step": 11000 }, { "epoch": 0.18, "learning_rate": 2.9478102189781022e-05, "loss": 3.1086, "step": 11500 }, { "epoch": 0.19, "learning_rate": 2.9454379562043796e-05, "loss": 3.0946, "step": 12000 }, { "epoch": 0.19, "eval_loss": 2.946828842163086, "eval_runtime": 450.0294, "eval_samples_per_second": 88.883, "eval_steps_per_second": 11.11, "step": 12000 }, { "epoch": 0.2, "learning_rate": 2.9430656934306573e-05, "loss": 3.114, "step": 12500 }, { "epoch": 0.21, "learning_rate": 2.9406934306569343e-05, "loss": 3.0903, "step": 13000 }, { "epoch": 0.21, "eval_loss": 2.936713457107544, "eval_runtime": 449.1132, "eval_samples_per_second": 89.064, "eval_steps_per_second": 11.133, "step": 13000 }, { "epoch": 0.21, "learning_rate": 2.938321167883212e-05, "loss": 3.102, "step": 13500 }, { "epoch": 0.22, "learning_rate": 2.935948905109489e-05, "loss": 3.0916, "step": 14000 }, { "epoch": 0.22, "eval_loss": 2.9305248260498047, "eval_runtime": 449.5477, "eval_samples_per_second": 88.978, "eval_steps_per_second": 11.122, "step": 14000 }, { "epoch": 0.23, "learning_rate": 2.9335766423357667e-05, "loss": 3.0578, "step": 14500 }, { "epoch": 0.24, "learning_rate": 2.931204379562044e-05, "loss": 3.0754, "step": 15000 }, { "epoch": 0.24, "eval_loss": 2.92580246925354, "eval_runtime": 449.9439, "eval_samples_per_second": 88.9, "eval_steps_per_second": 11.112, "step": 15000 }, { "epoch": 0.25, "learning_rate": 2.928832116788321e-05, "loss": 3.0673, "step": 15500 }, { "epoch": 0.25, "learning_rate": 2.9264598540145987e-05, "loss": 3.0606, "step": 16000 }, { "epoch": 0.25, "eval_loss": 2.9198672771453857, "eval_runtime": 450.1912, "eval_samples_per_second": 88.851, "eval_steps_per_second": 11.106, "step": 16000 }, { "epoch": 0.26, "learning_rate": 2.9240875912408757e-05, "loss": 3.0472, "step": 16500 }, { "epoch": 0.27, "learning_rate": 2.9217153284671534e-05, "loss": 3.0598, "step": 17000 }, { "epoch": 0.27, "eval_loss": 2.91711163520813, "eval_runtime": 449.4944, "eval_samples_per_second": 88.989, "eval_steps_per_second": 11.124, "step": 17000 }, { "epoch": 0.28, "learning_rate": 2.9193430656934308e-05, "loss": 3.0376, "step": 17500 }, { "epoch": 0.29, "learning_rate": 2.916970802919708e-05, "loss": 3.0369, "step": 18000 }, { "epoch": 0.29, "eval_loss": 2.9090449810028076, "eval_runtime": 450.1639, "eval_samples_per_second": 88.857, "eval_steps_per_second": 11.107, "step": 18000 }, { "epoch": 0.29, "learning_rate": 2.9145985401459855e-05, "loss": 3.0517, "step": 18500 }, { "epoch": 0.3, "learning_rate": 2.912226277372263e-05, "loss": 3.0312, "step": 19000 }, { "epoch": 0.3, "eval_loss": 2.903899669647217, "eval_runtime": 450.5358, "eval_samples_per_second": 88.783, "eval_steps_per_second": 11.098, "step": 19000 }, { "epoch": 0.31, "learning_rate": 2.9098540145985402e-05, "loss": 3.0135, "step": 19500 }, { "epoch": 0.32, "learning_rate": 2.9074817518248176e-05, "loss": 3.0304, "step": 20000 }, { "epoch": 0.32, "eval_loss": 2.8915276527404785, "eval_runtime": 449.8834, "eval_samples_per_second": 88.912, "eval_steps_per_second": 11.114, "step": 20000 }, { "epoch": 0.33, "learning_rate": 2.905109489051095e-05, "loss": 3.0157, "step": 20500 }, { "epoch": 0.33, "learning_rate": 2.9027372262773723e-05, "loss": 3.008, "step": 21000 }, { "epoch": 0.33, "eval_loss": 2.89363431930542, "eval_runtime": 450.0699, "eval_samples_per_second": 88.875, "eval_steps_per_second": 11.109, "step": 21000 }, { "epoch": 0.34, "learning_rate": 2.9003649635036496e-05, "loss": 3.0125, "step": 21500 }, { "epoch": 0.35, "learning_rate": 2.897992700729927e-05, "loss": 3.0373, "step": 22000 }, { "epoch": 0.35, "eval_loss": 2.8869340419769287, "eval_runtime": 450.8527, "eval_samples_per_second": 88.721, "eval_steps_per_second": 11.09, "step": 22000 }, { "epoch": 0.36, "learning_rate": 2.8956204379562047e-05, "loss": 3.0246, "step": 22500 }, { "epoch": 0.36, "learning_rate": 2.8932481751824817e-05, "loss": 3.0107, "step": 23000 }, { "epoch": 0.36, "eval_loss": 2.882735252380371, "eval_runtime": 451.0618, "eval_samples_per_second": 88.68, "eval_steps_per_second": 11.085, "step": 23000 }, { "epoch": 0.37, "learning_rate": 2.8908759124087594e-05, "loss": 3.0006, "step": 23500 }, { "epoch": 0.38, "learning_rate": 2.8885036496350364e-05, "loss": 2.9781, "step": 24000 }, { "epoch": 0.38, "eval_loss": 2.8770592212677, "eval_runtime": 450.672, "eval_samples_per_second": 88.756, "eval_steps_per_second": 11.095, "step": 24000 }, { "epoch": 0.39, "learning_rate": 2.886131386861314e-05, "loss": 2.9744, "step": 24500 }, { "epoch": 0.4, "learning_rate": 2.8837591240875914e-05, "loss": 3.0146, "step": 25000 }, { "epoch": 0.4, "eval_loss": 2.8825135231018066, "eval_runtime": 450.4517, "eval_samples_per_second": 88.8, "eval_steps_per_second": 11.1, "step": 25000 }, { "epoch": 0.4, "learning_rate": 2.8813868613138685e-05, "loss": 2.997, "step": 25500 }, { "epoch": 0.41, "learning_rate": 2.879014598540146e-05, "loss": 2.9856, "step": 26000 }, { "epoch": 0.41, "eval_loss": 2.8855531215667725, "eval_runtime": 450.7359, "eval_samples_per_second": 88.744, "eval_steps_per_second": 11.093, "step": 26000 }, { "epoch": 0.42, "learning_rate": 2.8766423357664232e-05, "loss": 2.986, "step": 26500 }, { "epoch": 0.43, "learning_rate": 2.874270072992701e-05, "loss": 3.0001, "step": 27000 }, { "epoch": 0.43, "eval_loss": 2.886821985244751, "eval_runtime": 450.0432, "eval_samples_per_second": 88.88, "eval_steps_per_second": 11.11, "step": 27000 }, { "epoch": 0.44, "learning_rate": 2.8718978102189782e-05, "loss": 2.9924, "step": 27500 }, { "epoch": 0.44, "learning_rate": 2.8695255474452556e-05, "loss": 2.9633, "step": 28000 }, { "epoch": 0.44, "eval_loss": 2.8713860511779785, "eval_runtime": 450.4908, "eval_samples_per_second": 88.792, "eval_steps_per_second": 11.099, "step": 28000 }, { "epoch": 0.45, "learning_rate": 2.867153284671533e-05, "loss": 2.9684, "step": 28500 }, { "epoch": 0.46, "learning_rate": 2.8647810218978103e-05, "loss": 2.9563, "step": 29000 }, { "epoch": 0.46, "eval_loss": 2.8724424839019775, "eval_runtime": 450.9726, "eval_samples_per_second": 88.697, "eval_steps_per_second": 11.087, "step": 29000 }, { "epoch": 0.47, "learning_rate": 2.8624087591240876e-05, "loss": 2.9573, "step": 29500 }, { "epoch": 0.48, "learning_rate": 2.860036496350365e-05, "loss": 2.9812, "step": 30000 }, { "epoch": 0.48, "eval_loss": 2.86970591545105, "eval_runtime": 450.3055, "eval_samples_per_second": 88.829, "eval_steps_per_second": 11.104, "step": 30000 }, { "epoch": 0.48, "learning_rate": 2.8576642335766423e-05, "loss": 2.9677, "step": 30500 }, { "epoch": 0.49, "learning_rate": 2.8552919708029197e-05, "loss": 2.9773, "step": 31000 }, { "epoch": 0.49, "eval_loss": 2.8727774620056152, "eval_runtime": 449.455, "eval_samples_per_second": 88.997, "eval_steps_per_second": 11.125, "step": 31000 }, { "epoch": 0.5, "learning_rate": 2.8529197080291974e-05, "loss": 2.971, "step": 31500 }, { "epoch": 0.51, "learning_rate": 2.8505474452554744e-05, "loss": 2.9743, "step": 32000 }, { "epoch": 0.51, "eval_loss": 2.8665173053741455, "eval_runtime": 449.7846, "eval_samples_per_second": 88.931, "eval_steps_per_second": 11.116, "step": 32000 }, { "epoch": 0.52, "learning_rate": 2.848175182481752e-05, "loss": 2.9585, "step": 32500 }, { "epoch": 0.52, "learning_rate": 2.845802919708029e-05, "loss": 2.9428, "step": 33000 }, { "epoch": 0.52, "eval_loss": 2.8613767623901367, "eval_runtime": 450.8331, "eval_samples_per_second": 88.725, "eval_steps_per_second": 11.091, "step": 33000 }, { "epoch": 0.53, "learning_rate": 2.8434306569343068e-05, "loss": 2.9509, "step": 33500 }, { "epoch": 0.54, "learning_rate": 2.841058394160584e-05, "loss": 2.9683, "step": 34000 }, { "epoch": 0.54, "eval_loss": 2.86824369430542, "eval_runtime": 450.4958, "eval_samples_per_second": 88.791, "eval_steps_per_second": 11.099, "step": 34000 }, { "epoch": 0.55, "learning_rate": 2.8386861313868612e-05, "loss": 2.9332, "step": 34500 }, { "epoch": 0.55, "learning_rate": 2.836313868613139e-05, "loss": 2.9567, "step": 35000 }, { "epoch": 0.55, "eval_loss": 2.854598045349121, "eval_runtime": 450.2268, "eval_samples_per_second": 88.844, "eval_steps_per_second": 11.106, "step": 35000 }, { "epoch": 0.56, "learning_rate": 2.833941605839416e-05, "loss": 2.9529, "step": 35500 }, { "epoch": 0.57, "learning_rate": 2.8315693430656936e-05, "loss": 2.9515, "step": 36000 }, { "epoch": 0.57, "eval_loss": 2.8551025390625, "eval_runtime": 450.4445, "eval_samples_per_second": 88.801, "eval_steps_per_second": 11.1, "step": 36000 }, { "epoch": 0.58, "learning_rate": 2.829197080291971e-05, "loss": 2.9273, "step": 36500 }, { "epoch": 0.59, "learning_rate": 2.8268248175182483e-05, "loss": 2.965, "step": 37000 }, { "epoch": 0.59, "eval_loss": 2.8546838760375977, "eval_runtime": 450.6163, "eval_samples_per_second": 88.767, "eval_steps_per_second": 11.096, "step": 37000 }, { "epoch": 0.59, "learning_rate": 2.8244525547445256e-05, "loss": 2.9441, "step": 37500 }, { "epoch": 0.6, "learning_rate": 2.822080291970803e-05, "loss": 2.9326, "step": 38000 }, { "epoch": 0.6, "eval_loss": 2.848287582397461, "eval_runtime": 450.6373, "eval_samples_per_second": 88.763, "eval_steps_per_second": 11.095, "step": 38000 }, { "epoch": 0.61, "learning_rate": 2.8197080291970803e-05, "loss": 2.9547, "step": 38500 }, { "epoch": 0.62, "learning_rate": 2.817335766423358e-05, "loss": 2.9096, "step": 39000 }, { "epoch": 0.62, "eval_loss": 2.8502256870269775, "eval_runtime": 450.3731, "eval_samples_per_second": 88.815, "eval_steps_per_second": 11.102, "step": 39000 }, { "epoch": 0.63, "learning_rate": 2.814963503649635e-05, "loss": 2.9222, "step": 39500 }, { "epoch": 0.63, "learning_rate": 2.8125912408759124e-05, "loss": 2.9132, "step": 40000 }, { "epoch": 0.63, "eval_loss": 2.8453314304351807, "eval_runtime": 450.4787, "eval_samples_per_second": 88.794, "eval_steps_per_second": 11.099, "step": 40000 }, { "epoch": 0.64, "learning_rate": 2.8102189781021898e-05, "loss": 2.9172, "step": 40500 }, { "epoch": 0.65, "learning_rate": 2.807846715328467e-05, "loss": 2.9373, "step": 41000 }, { "epoch": 0.65, "eval_loss": 2.8487000465393066, "eval_runtime": 450.8104, "eval_samples_per_second": 88.729, "eval_steps_per_second": 11.091, "step": 41000 }, { "epoch": 0.66, "learning_rate": 2.8054744525547448e-05, "loss": 2.9451, "step": 41500 }, { "epoch": 0.67, "learning_rate": 2.8031021897810218e-05, "loss": 2.9333, "step": 42000 }, { "epoch": 0.67, "eval_loss": 2.8426833152770996, "eval_runtime": 449.7638, "eval_samples_per_second": 88.936, "eval_steps_per_second": 11.117, "step": 42000 }, { "epoch": 0.67, "learning_rate": 2.8007299270072995e-05, "loss": 2.9263, "step": 42500 }, { "epoch": 0.68, "learning_rate": 2.7983576642335765e-05, "loss": 2.9242, "step": 43000 }, { "epoch": 0.68, "eval_loss": 2.8440945148468018, "eval_runtime": 450.8592, "eval_samples_per_second": 88.719, "eval_steps_per_second": 11.09, "step": 43000 }, { "epoch": 0.69, "learning_rate": 2.795985401459854e-05, "loss": 2.9226, "step": 43500 }, { "epoch": 0.7, "learning_rate": 2.7936131386861316e-05, "loss": 2.9058, "step": 44000 }, { "epoch": 0.7, "eval_loss": 2.8383543491363525, "eval_runtime": 450.5891, "eval_samples_per_second": 88.773, "eval_steps_per_second": 11.097, "step": 44000 }, { "epoch": 0.71, "learning_rate": 2.7912408759124086e-05, "loss": 2.9113, "step": 44500 }, { "epoch": 0.71, "learning_rate": 2.7888686131386863e-05, "loss": 2.9237, "step": 45000 }, { "epoch": 0.71, "eval_loss": 2.8444628715515137, "eval_runtime": 450.7048, "eval_samples_per_second": 88.75, "eval_steps_per_second": 11.094, "step": 45000 }, { "epoch": 0.72, "learning_rate": 2.7864963503649633e-05, "loss": 2.904, "step": 45500 }, { "epoch": 0.73, "learning_rate": 2.784124087591241e-05, "loss": 2.9101, "step": 46000 }, { "epoch": 0.73, "eval_loss": 2.8378028869628906, "eval_runtime": 450.6185, "eval_samples_per_second": 88.767, "eval_steps_per_second": 11.096, "step": 46000 }, { "epoch": 0.74, "learning_rate": 2.7817518248175184e-05, "loss": 2.913, "step": 46500 }, { "epoch": 0.75, "learning_rate": 2.7793795620437957e-05, "loss": 2.9053, "step": 47000 }, { "epoch": 0.75, "eval_loss": 2.840611219406128, "eval_runtime": 448.4181, "eval_samples_per_second": 89.202, "eval_steps_per_second": 11.15, "step": 47000 }, { "epoch": 0.75, "learning_rate": 2.777007299270073e-05, "loss": 2.8649, "step": 47500 }, { "epoch": 0.76, "learning_rate": 2.7746350364963504e-05, "loss": 2.9016, "step": 48000 }, { "epoch": 0.76, "eval_loss": 2.832401752471924, "eval_runtime": 448.5723, "eval_samples_per_second": 89.172, "eval_steps_per_second": 11.146, "step": 48000 }, { "epoch": 0.77, "learning_rate": 2.7722627737226278e-05, "loss": 2.9178, "step": 48500 }, { "epoch": 0.78, "learning_rate": 2.769890510948905e-05, "loss": 2.8928, "step": 49000 }, { "epoch": 0.78, "eval_loss": 2.8320999145507812, "eval_runtime": 449.0119, "eval_samples_per_second": 89.084, "eval_steps_per_second": 11.136, "step": 49000 }, { "epoch": 0.78, "learning_rate": 2.7675182481751825e-05, "loss": 2.903, "step": 49500 }, { "epoch": 0.79, "learning_rate": 2.76514598540146e-05, "loss": 2.8792, "step": 50000 }, { "epoch": 0.79, "eval_loss": 2.836050033569336, "eval_runtime": 447.2366, "eval_samples_per_second": 89.438, "eval_steps_per_second": 11.18, "step": 50000 }, { "epoch": 0.8, "learning_rate": 2.7627737226277375e-05, "loss": 2.8807, "step": 50500 }, { "epoch": 0.81, "learning_rate": 2.7604014598540145e-05, "loss": 2.896, "step": 51000 }, { "epoch": 0.81, "eval_loss": 2.8326449394226074, "eval_runtime": 447.8025, "eval_samples_per_second": 89.325, "eval_steps_per_second": 11.166, "step": 51000 }, { "epoch": 0.82, "learning_rate": 2.7580291970802922e-05, "loss": 2.8805, "step": 51500 }, { "epoch": 0.82, "learning_rate": 2.7556569343065692e-05, "loss": 2.8708, "step": 52000 }, { "epoch": 0.82, "eval_loss": 2.830998420715332, "eval_runtime": 441.655, "eval_samples_per_second": 90.568, "eval_steps_per_second": 11.321, "step": 52000 }, { "epoch": 0.83, "learning_rate": 2.753284671532847e-05, "loss": 2.8656, "step": 52500 }, { "epoch": 0.84, "learning_rate": 2.7509124087591243e-05, "loss": 2.8882, "step": 53000 }, { "epoch": 0.84, "eval_loss": 2.8259706497192383, "eval_runtime": 442.0896, "eval_samples_per_second": 90.479, "eval_steps_per_second": 11.31, "step": 53000 }, { "epoch": 0.85, "learning_rate": 2.7485401459854017e-05, "loss": 2.8818, "step": 53500 }, { "epoch": 0.86, "learning_rate": 2.746167883211679e-05, "loss": 2.8682, "step": 54000 }, { "epoch": 0.86, "eval_loss": 2.8257410526275635, "eval_runtime": 441.6514, "eval_samples_per_second": 90.569, "eval_steps_per_second": 11.321, "step": 54000 }, { "epoch": 0.86, "learning_rate": 2.743795620437956e-05, "loss": 2.8765, "step": 54500 }, { "epoch": 0.87, "learning_rate": 2.7414233576642337e-05, "loss": 2.8843, "step": 55000 }, { "epoch": 0.87, "eval_loss": 2.8210554122924805, "eval_runtime": 441.9171, "eval_samples_per_second": 90.515, "eval_steps_per_second": 11.314, "step": 55000 }, { "epoch": 0.88, "learning_rate": 2.739051094890511e-05, "loss": 2.8785, "step": 55500 }, { "epoch": 0.89, "learning_rate": 2.7366788321167884e-05, "loss": 2.8631, "step": 56000 }, { "epoch": 0.89, "eval_loss": 2.825152635574341, "eval_runtime": 442.1075, "eval_samples_per_second": 90.476, "eval_steps_per_second": 11.309, "step": 56000 }, { "epoch": 0.9, "learning_rate": 2.7343065693430658e-05, "loss": 2.8557, "step": 56500 }, { "epoch": 0.9, "learning_rate": 2.731934306569343e-05, "loss": 2.8643, "step": 57000 }, { "epoch": 0.9, "eval_loss": 2.8247392177581787, "eval_runtime": 441.9526, "eval_samples_per_second": 90.507, "eval_steps_per_second": 11.313, "step": 57000 }, { "epoch": 0.91, "learning_rate": 2.7295620437956205e-05, "loss": 2.8976, "step": 57500 }, { "epoch": 0.92, "learning_rate": 2.727189781021898e-05, "loss": 2.8658, "step": 58000 }, { "epoch": 0.92, "eval_loss": 2.8236825466156006, "eval_runtime": 441.9598, "eval_samples_per_second": 90.506, "eval_steps_per_second": 11.313, "step": 58000 }, { "epoch": 0.93, "learning_rate": 2.7248175182481752e-05, "loss": 2.8608, "step": 58500 }, { "epoch": 0.94, "learning_rate": 2.7224452554744525e-05, "loss": 2.875, "step": 59000 }, { "epoch": 0.94, "eval_loss": 2.820819139480591, "eval_runtime": 442.0566, "eval_samples_per_second": 90.486, "eval_steps_per_second": 11.311, "step": 59000 }, { "epoch": 0.94, "learning_rate": 2.72007299270073e-05, "loss": 2.8704, "step": 59500 }, { "epoch": 0.95, "learning_rate": 2.7177007299270073e-05, "loss": 2.8531, "step": 60000 }, { "epoch": 0.95, "eval_loss": 2.821174144744873, "eval_runtime": 442.2056, "eval_samples_per_second": 90.456, "eval_steps_per_second": 11.307, "step": 60000 }, { "epoch": 0.96, "learning_rate": 2.715328467153285e-05, "loss": 2.8748, "step": 60500 }, { "epoch": 0.97, "learning_rate": 2.712956204379562e-05, "loss": 2.8654, "step": 61000 }, { "epoch": 0.97, "eval_loss": 2.825101613998413, "eval_runtime": 442.0952, "eval_samples_per_second": 90.478, "eval_steps_per_second": 11.31, "step": 61000 }, { "epoch": 0.98, "learning_rate": 2.7105839416058397e-05, "loss": 2.8515, "step": 61500 }, { "epoch": 0.98, "learning_rate": 2.7082116788321167e-05, "loss": 2.8432, "step": 62000 }, { "epoch": 0.98, "eval_loss": 2.819878101348877, "eval_runtime": 441.9364, "eval_samples_per_second": 90.511, "eval_steps_per_second": 11.314, "step": 62000 }, { "epoch": 0.99, "learning_rate": 2.7058394160583944e-05, "loss": 2.8591, "step": 62500 }, { "epoch": 1.0, "learning_rate": 2.7034671532846717e-05, "loss": 2.8871, "step": 63000 }, { "epoch": 1.0, "eval_loss": 2.8195338249206543, "eval_runtime": 442.0595, "eval_samples_per_second": 90.486, "eval_steps_per_second": 11.311, "step": 63000 }, { "epoch": 1.01, "learning_rate": 2.7010948905109487e-05, "loss": 2.7957, "step": 63500 }, { "epoch": 1.01, "learning_rate": 2.6987226277372264e-05, "loss": 2.7854, "step": 64000 }, { "epoch": 1.01, "eval_loss": 2.817440986633301, "eval_runtime": 442.0252, "eval_samples_per_second": 90.493, "eval_steps_per_second": 11.312, "step": 64000 }, { "epoch": 1.02, "learning_rate": 2.6963503649635034e-05, "loss": 2.7874, "step": 64500 }, { "epoch": 1.03, "learning_rate": 2.693978102189781e-05, "loss": 2.7703, "step": 65000 }, { "epoch": 1.03, "eval_loss": 2.8208272457122803, "eval_runtime": 442.1489, "eval_samples_per_second": 90.467, "eval_steps_per_second": 11.308, "step": 65000 }, { "epoch": 1.04, "learning_rate": 2.6916058394160585e-05, "loss": 2.7773, "step": 65500 }, { "epoch": 1.05, "learning_rate": 2.689233576642336e-05, "loss": 2.7764, "step": 66000 }, { "epoch": 1.05, "eval_loss": 2.8173325061798096, "eval_runtime": 443.1184, "eval_samples_per_second": 90.269, "eval_steps_per_second": 11.284, "step": 66000 }, { "epoch": 1.05, "learning_rate": 2.6868613138686132e-05, "loss": 2.7838, "step": 66500 }, { "epoch": 1.06, "learning_rate": 2.6844890510948906e-05, "loss": 2.7734, "step": 67000 }, { "epoch": 1.06, "eval_loss": 2.812627077102661, "eval_runtime": 443.0624, "eval_samples_per_second": 90.281, "eval_steps_per_second": 11.285, "step": 67000 }, { "epoch": 1.07, "learning_rate": 2.682116788321168e-05, "loss": 2.7541, "step": 67500 }, { "epoch": 1.08, "learning_rate": 2.6797445255474456e-05, "loss": 2.7637, "step": 68000 }, { "epoch": 1.08, "eval_loss": 2.817030668258667, "eval_runtime": 443.2279, "eval_samples_per_second": 90.247, "eval_steps_per_second": 11.281, "step": 68000 }, { "epoch": 1.09, "learning_rate": 2.6773722627737226e-05, "loss": 2.7809, "step": 68500 }, { "epoch": 1.09, "learning_rate": 2.675e-05, "loss": 2.775, "step": 69000 }, { "epoch": 1.09, "eval_loss": 2.819810628890991, "eval_runtime": 442.8797, "eval_samples_per_second": 90.318, "eval_steps_per_second": 11.29, "step": 69000 }, { "epoch": 1.1, "learning_rate": 2.6726277372262777e-05, "loss": 2.786, "step": 69500 }, { "epoch": 1.11, "learning_rate": 2.6702554744525547e-05, "loss": 2.747, "step": 70000 }, { "epoch": 1.11, "eval_loss": 2.820467233657837, "eval_runtime": 443.0381, "eval_samples_per_second": 90.286, "eval_steps_per_second": 11.286, "step": 70000 }, { "epoch": 1.12, "learning_rate": 2.6678832116788324e-05, "loss": 2.801, "step": 70500 }, { "epoch": 1.13, "learning_rate": 2.6655109489051094e-05, "loss": 2.7589, "step": 71000 }, { "epoch": 1.13, "eval_loss": 2.8193023204803467, "eval_runtime": 443.2053, "eval_samples_per_second": 90.252, "eval_steps_per_second": 11.281, "step": 71000 }, { "epoch": 1.13, "learning_rate": 2.663138686131387e-05, "loss": 2.7801, "step": 71500 }, { "epoch": 1.14, "learning_rate": 2.6607664233576644e-05, "loss": 2.7521, "step": 72000 }, { "epoch": 1.14, "eval_loss": 2.813857078552246, "eval_runtime": 443.2947, "eval_samples_per_second": 90.233, "eval_steps_per_second": 11.279, "step": 72000 }, { "epoch": 1.15, "learning_rate": 2.6583941605839414e-05, "loss": 2.7876, "step": 72500 }, { "epoch": 1.16, "learning_rate": 2.656021897810219e-05, "loss": 2.772, "step": 73000 }, { "epoch": 1.16, "eval_loss": 2.814068555831909, "eval_runtime": 443.3415, "eval_samples_per_second": 90.224, "eval_steps_per_second": 11.278, "step": 73000 }, { "epoch": 1.17, "learning_rate": 2.653649635036496e-05, "loss": 2.7618, "step": 73500 }, { "epoch": 1.17, "learning_rate": 2.651277372262774e-05, "loss": 2.785, "step": 74000 }, { "epoch": 1.17, "eval_loss": 2.8143815994262695, "eval_runtime": 443.2942, "eval_samples_per_second": 90.234, "eval_steps_per_second": 11.279, "step": 74000 }, { "epoch": 1.18, "learning_rate": 2.6489051094890512e-05, "loss": 2.7878, "step": 74500 }, { "epoch": 1.19, "learning_rate": 2.6465328467153286e-05, "loss": 2.743, "step": 75000 }, { "epoch": 1.19, "eval_loss": 2.8167881965637207, "eval_runtime": 443.0056, "eval_samples_per_second": 90.292, "eval_steps_per_second": 11.287, "step": 75000 }, { "epoch": 1.2, "learning_rate": 2.644160583941606e-05, "loss": 2.7564, "step": 75500 }, { "epoch": 1.21, "learning_rate": 2.6417883211678833e-05, "loss": 2.7694, "step": 76000 }, { "epoch": 1.21, "eval_loss": 2.8128414154052734, "eval_runtime": 443.1543, "eval_samples_per_second": 90.262, "eval_steps_per_second": 11.283, "step": 76000 }, { "epoch": 1.21, "learning_rate": 2.6394160583941606e-05, "loss": 2.7583, "step": 76500 }, { "epoch": 1.22, "learning_rate": 2.6370437956204383e-05, "loss": 2.7769, "step": 77000 }, { "epoch": 1.22, "eval_loss": 2.8131983280181885, "eval_runtime": 443.0764, "eval_samples_per_second": 90.278, "eval_steps_per_second": 11.285, "step": 77000 }, { "epoch": 1.23, "learning_rate": 2.6346715328467153e-05, "loss": 2.7623, "step": 77500 }, { "epoch": 1.24, "learning_rate": 2.6322992700729927e-05, "loss": 2.7523, "step": 78000 }, { "epoch": 1.24, "eval_loss": 2.813045024871826, "eval_runtime": 443.0803, "eval_samples_per_second": 90.277, "eval_steps_per_second": 11.285, "step": 78000 }, { "epoch": 1.24, "learning_rate": 2.62992700729927e-05, "loss": 2.7569, "step": 78500 }, { "epoch": 1.25, "learning_rate": 2.6275547445255474e-05, "loss": 2.7572, "step": 79000 }, { "epoch": 1.25, "eval_loss": 2.8214099407196045, "eval_runtime": 442.9545, "eval_samples_per_second": 90.303, "eval_steps_per_second": 11.288, "step": 79000 }, { "epoch": 1.26, "learning_rate": 2.625182481751825e-05, "loss": 2.7564, "step": 79500 }, { "epoch": 1.27, "learning_rate": 2.622810218978102e-05, "loss": 2.7573, "step": 80000 }, { "epoch": 1.27, "eval_loss": 2.811310052871704, "eval_runtime": 442.9337, "eval_samples_per_second": 90.307, "eval_steps_per_second": 11.288, "step": 80000 }, { "epoch": 1.28, "learning_rate": 2.6204379562043798e-05, "loss": 2.7635, "step": 80500 }, { "epoch": 1.28, "learning_rate": 2.6180656934306568e-05, "loss": 2.7514, "step": 81000 }, { "epoch": 1.28, "eval_loss": 2.8111774921417236, "eval_runtime": 442.9468, "eval_samples_per_second": 90.304, "eval_steps_per_second": 11.288, "step": 81000 }, { "epoch": 1.29, "learning_rate": 2.615693430656934e-05, "loss": 2.7534, "step": 81500 }, { "epoch": 1.3, "learning_rate": 2.613321167883212e-05, "loss": 2.7676, "step": 82000 }, { "epoch": 1.3, "eval_loss": 2.8070504665374756, "eval_runtime": 442.7918, "eval_samples_per_second": 90.336, "eval_steps_per_second": 11.292, "step": 82000 }, { "epoch": 1.31, "learning_rate": 2.6109489051094892e-05, "loss": 2.7212, "step": 82500 }, { "epoch": 1.32, "learning_rate": 2.6085766423357666e-05, "loss": 2.7044, "step": 83000 }, { "epoch": 1.32, "eval_loss": 2.8113818168640137, "eval_runtime": 440.1567, "eval_samples_per_second": 90.877, "eval_steps_per_second": 11.36, "step": 83000 }, { "epoch": 1.32, "learning_rate": 2.6062043795620436e-05, "loss": 2.742, "step": 83500 }, { "epoch": 1.33, "learning_rate": 2.6038321167883213e-05, "loss": 2.7205, "step": 84000 }, { "epoch": 1.33, "eval_loss": 2.810499906539917, "eval_runtime": 440.19, "eval_samples_per_second": 90.87, "eval_steps_per_second": 11.359, "step": 84000 }, { "epoch": 1.34, "learning_rate": 2.6014598540145986e-05, "loss": 2.7065, "step": 84500 }, { "epoch": 1.35, "learning_rate": 2.599087591240876e-05, "loss": 2.7168, "step": 85000 }, { "epoch": 1.35, "eval_loss": 2.813863754272461, "eval_runtime": 440.1828, "eval_samples_per_second": 90.871, "eval_steps_per_second": 11.359, "step": 85000 }, { "epoch": 1.36, "learning_rate": 2.5967153284671533e-05, "loss": 2.7367, "step": 85500 }, { "epoch": 1.36, "learning_rate": 2.5943430656934307e-05, "loss": 2.7337, "step": 86000 }, { "epoch": 1.36, "eval_loss": 2.80753231048584, "eval_runtime": 439.9369, "eval_samples_per_second": 90.922, "eval_steps_per_second": 11.365, "step": 86000 }, { "epoch": 1.37, "learning_rate": 2.591970802919708e-05, "loss": 2.718, "step": 86500 }, { "epoch": 1.38, "learning_rate": 2.5895985401459854e-05, "loss": 2.709, "step": 87000 }, { "epoch": 1.38, "eval_loss": 2.8110671043395996, "eval_runtime": 439.984, "eval_samples_per_second": 90.912, "eval_steps_per_second": 11.364, "step": 87000 }, { "epoch": 1.39, "learning_rate": 2.5872262773722628e-05, "loss": 2.7045, "step": 87500 }, { "epoch": 1.4, "learning_rate": 2.58485401459854e-05, "loss": 2.7449, "step": 88000 }, { "epoch": 1.4, "eval_loss": 2.806777238845825, "eval_runtime": 439.8672, "eval_samples_per_second": 90.937, "eval_steps_per_second": 11.367, "step": 88000 }, { "epoch": 1.4, "learning_rate": 2.5824817518248178e-05, "loss": 2.704, "step": 88500 }, { "epoch": 1.41, "learning_rate": 2.5801094890510948e-05, "loss": 2.7175, "step": 89000 }, { "epoch": 1.41, "eval_loss": 2.8058323860168457, "eval_runtime": 439.8993, "eval_samples_per_second": 90.93, "eval_steps_per_second": 11.366, "step": 89000 }, { "epoch": 1.42, "learning_rate": 2.5777372262773725e-05, "loss": 2.7292, "step": 89500 }, { "epoch": 1.43, "learning_rate": 2.5753649635036495e-05, "loss": 2.7096, "step": 90000 }, { "epoch": 1.43, "eval_loss": 2.808027744293213, "eval_runtime": 439.7855, "eval_samples_per_second": 90.953, "eval_steps_per_second": 11.369, "step": 90000 }, { "epoch": 1.43, "learning_rate": 2.572992700729927e-05, "loss": 2.7125, "step": 90500 }, { "epoch": 1.44, "learning_rate": 2.5706204379562046e-05, "loss": 2.7272, "step": 91000 }, { "epoch": 1.44, "eval_loss": 2.8089358806610107, "eval_runtime": 439.8804, "eval_samples_per_second": 90.934, "eval_steps_per_second": 11.367, "step": 91000 }, { "epoch": 1.45, "learning_rate": 2.568248175182482e-05, "loss": 2.719, "step": 91500 }, { "epoch": 1.46, "learning_rate": 2.5658759124087593e-05, "loss": 2.7092, "step": 92000 }, { "epoch": 1.46, "eval_loss": 2.806529998779297, "eval_runtime": 439.7422, "eval_samples_per_second": 90.962, "eval_steps_per_second": 11.37, "step": 92000 }, { "epoch": 1.47, "learning_rate": 2.5635036496350363e-05, "loss": 2.7121, "step": 92500 }, { "epoch": 1.47, "learning_rate": 2.561131386861314e-05, "loss": 2.6846, "step": 93000 }, { "epoch": 1.47, "eval_loss": 2.804901123046875, "eval_runtime": 440.0389, "eval_samples_per_second": 90.901, "eval_steps_per_second": 11.363, "step": 93000 }, { "epoch": 1.48, "learning_rate": 2.5587591240875913e-05, "loss": 2.7301, "step": 93500 }, { "epoch": 1.49, "learning_rate": 2.5563868613138687e-05, "loss": 2.6976, "step": 94000 }, { "epoch": 1.49, "eval_loss": 2.8061363697052, "eval_runtime": 439.8304, "eval_samples_per_second": 90.944, "eval_steps_per_second": 11.368, "step": 94000 }, { "epoch": 1.5, "learning_rate": 2.554014598540146e-05, "loss": 2.7008, "step": 94500 }, { "epoch": 1.51, "learning_rate": 2.5516423357664234e-05, "loss": 2.7057, "step": 95000 }, { "epoch": 1.51, "eval_loss": 2.815894603729248, "eval_runtime": 439.7552, "eval_samples_per_second": 90.96, "eval_steps_per_second": 11.37, "step": 95000 }, { "epoch": 1.51, "learning_rate": 2.5492700729927008e-05, "loss": 2.724, "step": 95500 }, { "epoch": 1.52, "learning_rate": 2.546897810218978e-05, "loss": 2.6922, "step": 96000 }, { "epoch": 1.52, "eval_loss": 2.8022561073303223, "eval_runtime": 439.8096, "eval_samples_per_second": 90.948, "eval_steps_per_second": 11.369, "step": 96000 }, { "epoch": 1.53, "learning_rate": 2.5445255474452555e-05, "loss": 2.7095, "step": 96500 }, { "epoch": 1.54, "learning_rate": 2.542153284671533e-05, "loss": 2.7011, "step": 97000 }, { "epoch": 1.54, "eval_loss": 2.801751136779785, "eval_runtime": 439.7019, "eval_samples_per_second": 90.971, "eval_steps_per_second": 11.371, "step": 97000 }, { "epoch": 1.55, "learning_rate": 2.5397810218978102e-05, "loss": 2.7212, "step": 97500 }, { "epoch": 1.55, "learning_rate": 2.5374087591240875e-05, "loss": 2.7029, "step": 98000 }, { "epoch": 1.55, "eval_loss": 2.8029119968414307, "eval_runtime": 439.744, "eval_samples_per_second": 90.962, "eval_steps_per_second": 11.37, "step": 98000 }, { "epoch": 1.56, "learning_rate": 2.5350364963503652e-05, "loss": 2.7055, "step": 98500 }, { "epoch": 1.57, "learning_rate": 2.5326642335766422e-05, "loss": 2.701, "step": 99000 }, { "epoch": 1.57, "eval_loss": 2.800555944442749, "eval_runtime": 439.7759, "eval_samples_per_second": 90.955, "eval_steps_per_second": 11.369, "step": 99000 }, { "epoch": 1.58, "learning_rate": 2.53029197080292e-05, "loss": 2.7136, "step": 99500 }, { "epoch": 1.59, "learning_rate": 2.527919708029197e-05, "loss": 2.7044, "step": 100000 }, { "epoch": 1.59, "eval_loss": 2.8019235134124756, "eval_runtime": 439.7518, "eval_samples_per_second": 90.96, "eval_steps_per_second": 11.37, "step": 100000 }, { "epoch": 1.59, "learning_rate": 2.5255474452554746e-05, "loss": 2.7141, "step": 100500 }, { "epoch": 1.6, "learning_rate": 2.523175182481752e-05, "loss": 2.7311, "step": 101000 }, { "epoch": 1.6, "eval_loss": 2.8053483963012695, "eval_runtime": 439.6812, "eval_samples_per_second": 90.975, "eval_steps_per_second": 11.372, "step": 101000 }, { "epoch": 1.61, "learning_rate": 2.520802919708029e-05, "loss": 2.7107, "step": 101500 }, { "epoch": 1.62, "learning_rate": 2.5184306569343067e-05, "loss": 2.711, "step": 102000 }, { "epoch": 1.62, "eval_loss": 2.805739641189575, "eval_runtime": 439.6645, "eval_samples_per_second": 90.978, "eval_steps_per_second": 11.372, "step": 102000 }, { "epoch": 1.63, "learning_rate": 2.5160583941605837e-05, "loss": 2.7212, "step": 102500 }, { "epoch": 1.63, "learning_rate": 2.5136861313868614e-05, "loss": 2.7344, "step": 103000 }, { "epoch": 1.63, "eval_loss": 2.799654722213745, "eval_runtime": 439.7601, "eval_samples_per_second": 90.959, "eval_steps_per_second": 11.37, "step": 103000 }, { "epoch": 1.64, "learning_rate": 2.5113138686131388e-05, "loss": 2.7122, "step": 103500 }, { "epoch": 1.65, "learning_rate": 2.508941605839416e-05, "loss": 2.7236, "step": 104000 }, { "epoch": 1.65, "eval_loss": 2.803441047668457, "eval_runtime": 439.7253, "eval_samples_per_second": 90.966, "eval_steps_per_second": 11.371, "step": 104000 }, { "epoch": 1.66, "learning_rate": 2.5065693430656935e-05, "loss": 2.7226, "step": 104500 }, { "epoch": 1.66, "learning_rate": 2.5041970802919705e-05, "loss": 2.6879, "step": 105000 }, { "epoch": 1.66, "eval_loss": 2.801588773727417, "eval_runtime": 439.6315, "eval_samples_per_second": 90.985, "eval_steps_per_second": 11.373, "step": 105000 }, { "epoch": 1.67, "learning_rate": 2.5018248175182482e-05, "loss": 2.732, "step": 105500 }, { "epoch": 1.68, "learning_rate": 2.499452554744526e-05, "loss": 2.7188, "step": 106000 }, { "epoch": 1.68, "eval_loss": 2.7991223335266113, "eval_runtime": 439.7752, "eval_samples_per_second": 90.956, "eval_steps_per_second": 11.369, "step": 106000 }, { "epoch": 1.69, "learning_rate": 2.497080291970803e-05, "loss": 2.7164, "step": 106500 }, { "epoch": 1.7, "learning_rate": 2.4947080291970802e-05, "loss": 2.6994, "step": 107000 }, { "epoch": 1.7, "eval_loss": 2.8013699054718018, "eval_runtime": 439.7722, "eval_samples_per_second": 90.956, "eval_steps_per_second": 11.37, "step": 107000 }, { "epoch": 1.7, "learning_rate": 2.492335766423358e-05, "loss": 2.6989, "step": 107500 }, { "epoch": 1.71, "learning_rate": 2.489963503649635e-05, "loss": 2.7129, "step": 108000 }, { "epoch": 1.71, "eval_loss": 2.7973504066467285, "eval_runtime": 439.8695, "eval_samples_per_second": 90.936, "eval_steps_per_second": 11.367, "step": 108000 }, { "epoch": 1.72, "learning_rate": 2.4875912408759126e-05, "loss": 2.7056, "step": 108500 }, { "epoch": 1.73, "learning_rate": 2.4852189781021897e-05, "loss": 2.7095, "step": 109000 }, { "epoch": 1.73, "eval_loss": 2.8017683029174805, "eval_runtime": 439.7668, "eval_samples_per_second": 90.957, "eval_steps_per_second": 11.37, "step": 109000 }, { "epoch": 1.74, "learning_rate": 2.4828467153284674e-05, "loss": 2.7021, "step": 109500 }, { "epoch": 1.74, "learning_rate": 2.4804744525547447e-05, "loss": 2.7029, "step": 110000 }, { "epoch": 1.74, "eval_loss": 2.7975335121154785, "eval_runtime": 439.9718, "eval_samples_per_second": 90.915, "eval_steps_per_second": 11.364, "step": 110000 }, { "epoch": 1.75, "learning_rate": 2.4781021897810217e-05, "loss": 2.7046, "step": 110500 }, { "epoch": 1.76, "learning_rate": 2.4757299270072994e-05, "loss": 2.7042, "step": 111000 }, { "epoch": 1.76, "eval_loss": 2.7961161136627197, "eval_runtime": 439.788, "eval_samples_per_second": 90.953, "eval_steps_per_second": 11.369, "step": 111000 }, { "epoch": 1.77, "learning_rate": 2.4733576642335768e-05, "loss": 2.7234, "step": 111500 }, { "epoch": 1.78, "learning_rate": 2.470985401459854e-05, "loss": 2.7134, "step": 112000 }, { "epoch": 1.78, "eval_loss": 2.795431613922119, "eval_runtime": 439.911, "eval_samples_per_second": 90.927, "eval_steps_per_second": 11.366, "step": 112000 }, { "epoch": 1.78, "learning_rate": 2.4686131386861315e-05, "loss": 2.6937, "step": 112500 }, { "epoch": 1.79, "learning_rate": 2.4662408759124088e-05, "loss": 2.6905, "step": 113000 }, { "epoch": 1.79, "eval_loss": 2.7966976165771484, "eval_runtime": 439.7831, "eval_samples_per_second": 90.954, "eval_steps_per_second": 11.369, "step": 113000 }, { "epoch": 1.8, "learning_rate": 2.4638686131386862e-05, "loss": 2.7027, "step": 113500 }, { "epoch": 1.81, "learning_rate": 2.4614963503649635e-05, "loss": 2.6871, "step": 114000 }, { "epoch": 1.81, "eval_loss": 2.7926506996154785, "eval_runtime": 440.0624, "eval_samples_per_second": 90.896, "eval_steps_per_second": 11.362, "step": 114000 }, { "epoch": 1.82, "learning_rate": 2.459124087591241e-05, "loss": 2.697, "step": 114500 }, { "epoch": 1.82, "learning_rate": 2.4567518248175186e-05, "loss": 2.6931, "step": 115000 }, { "epoch": 1.82, "eval_loss": 2.8011257648468018, "eval_runtime": 439.8116, "eval_samples_per_second": 90.948, "eval_steps_per_second": 11.369, "step": 115000 }, { "epoch": 1.83, "learning_rate": 2.4543795620437956e-05, "loss": 2.7136, "step": 115500 }, { "epoch": 1.84, "learning_rate": 2.452007299270073e-05, "loss": 2.702, "step": 116000 }, { "epoch": 1.84, "eval_loss": 2.7955451011657715, "eval_runtime": 439.914, "eval_samples_per_second": 90.927, "eval_steps_per_second": 11.366, "step": 116000 }, { "epoch": 1.85, "learning_rate": 2.4496350364963503e-05, "loss": 2.6859, "step": 116500 }, { "epoch": 1.86, "learning_rate": 2.4472627737226277e-05, "loss": 2.712, "step": 117000 }, { "epoch": 1.86, "eval_loss": 2.7944655418395996, "eval_runtime": 439.809, "eval_samples_per_second": 90.949, "eval_steps_per_second": 11.369, "step": 117000 }, { "epoch": 1.86, "learning_rate": 2.4448905109489054e-05, "loss": 2.6984, "step": 117500 }, { "epoch": 1.87, "learning_rate": 2.4425182481751824e-05, "loss": 2.7091, "step": 118000 }, { "epoch": 1.87, "eval_loss": 2.7959797382354736, "eval_runtime": 439.9158, "eval_samples_per_second": 90.926, "eval_steps_per_second": 11.366, "step": 118000 }, { "epoch": 1.88, "learning_rate": 2.44014598540146e-05, "loss": 2.6978, "step": 118500 }, { "epoch": 1.89, "learning_rate": 2.437773722627737e-05, "loss": 2.6913, "step": 119000 }, { "epoch": 1.89, "eval_loss": 2.79091739654541, "eval_runtime": 439.7923, "eval_samples_per_second": 90.952, "eval_steps_per_second": 11.369, "step": 119000 }, { "epoch": 1.89, "learning_rate": 2.4354014598540144e-05, "loss": 2.6964, "step": 119500 }, { "epoch": 1.9, "learning_rate": 2.433029197080292e-05, "loss": 2.7008, "step": 120000 }, { "epoch": 1.9, "eval_loss": 2.7938427925109863, "eval_runtime": 439.9595, "eval_samples_per_second": 90.917, "eval_steps_per_second": 11.365, "step": 120000 }, { "epoch": 1.91, "learning_rate": 2.4306569343065695e-05, "loss": 2.7015, "step": 120500 }, { "epoch": 1.92, "learning_rate": 2.428284671532847e-05, "loss": 2.6968, "step": 121000 }, { "epoch": 1.92, "eval_loss": 2.796417236328125, "eval_runtime": 439.8462, "eval_samples_per_second": 90.941, "eval_steps_per_second": 11.368, "step": 121000 }, { "epoch": 1.93, "learning_rate": 2.425912408759124e-05, "loss": 2.6936, "step": 121500 }, { "epoch": 1.93, "learning_rate": 2.4235401459854015e-05, "loss": 2.688, "step": 122000 }, { "epoch": 1.93, "eval_loss": 2.790297269821167, "eval_runtime": 439.89, "eval_samples_per_second": 90.932, "eval_steps_per_second": 11.366, "step": 122000 }, { "epoch": 1.94, "learning_rate": 2.421167883211679e-05, "loss": 2.6885, "step": 122500 }, { "epoch": 1.95, "learning_rate": 2.4187956204379563e-05, "loss": 2.7015, "step": 123000 }, { "epoch": 1.95, "eval_loss": 2.7968406677246094, "eval_runtime": 439.8161, "eval_samples_per_second": 90.947, "eval_steps_per_second": 11.368, "step": 123000 }, { "epoch": 1.96, "learning_rate": 2.4164233576642336e-05, "loss": 2.7095, "step": 123500 }, { "epoch": 1.97, "learning_rate": 2.414051094890511e-05, "loss": 2.7038, "step": 124000 }, { "epoch": 1.97, "eval_loss": 2.79244327545166, "eval_runtime": 439.8929, "eval_samples_per_second": 90.931, "eval_steps_per_second": 11.366, "step": 124000 }, { "epoch": 1.97, "learning_rate": 2.4116788321167883e-05, "loss": 2.6901, "step": 124500 }, { "epoch": 1.98, "learning_rate": 2.4093065693430657e-05, "loss": 2.6991, "step": 125000 }, { "epoch": 1.98, "eval_loss": 2.7943027019500732, "eval_runtime": 439.7901, "eval_samples_per_second": 90.952, "eval_steps_per_second": 11.369, "step": 125000 }, { "epoch": 1.99, "learning_rate": 2.406934306569343e-05, "loss": 2.6954, "step": 125500 }, { "epoch": 2.0, "learning_rate": 2.4045620437956207e-05, "loss": 2.6804, "step": 126000 }, { "epoch": 2.0, "eval_loss": 2.785961627960205, "eval_runtime": 440.0425, "eval_samples_per_second": 90.9, "eval_steps_per_second": 11.363, "step": 126000 } ], "max_steps": 630700, "num_train_epochs": 10, "total_flos": 8.47096165805568e+17, "trial_name": null, "trial_params": null }