diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,46345 @@ +{ + "best_metric": 0.3712206780910492, + "best_model_checkpoint": "./outputs/checkpoint-59925", + "epoch": 1200.0, + "global_step": 61200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.2, + "learning_rate": 4.901960784313725e-07, + "loss": 1.1885, + "step": 10 + }, + { + "epoch": 0.39, + "learning_rate": 9.80392156862745e-07, + "loss": 1.182, + "step": 20 + }, + { + "epoch": 0.59, + "learning_rate": 1.4705882352941175e-06, + "loss": 1.1666, + "step": 30 + }, + { + "epoch": 0.78, + "learning_rate": 1.96078431372549e-06, + "loss": 1.1481, + "step": 40 + }, + { + "epoch": 0.98, + "learning_rate": 2.450980392156863e-06, + "loss": 1.1257, + "step": 50 + }, + { + "epoch": 1.0, + "eval_loss": 1.1119023561477661, + "eval_runtime": 2.2391, + "eval_samples_per_second": 1017.827, + "eval_steps_per_second": 4.02, + "step": 51 + }, + { + "epoch": 1.18, + "learning_rate": 2.941176470588235e-06, + "loss": 1.1055, + "step": 60 + }, + { + "epoch": 1.37, + "learning_rate": 3.4313725490196073e-06, + "loss": 1.0888, + "step": 70 + }, + { + "epoch": 1.57, + "learning_rate": 3.92156862745098e-06, + "loss": 1.0737, + "step": 80 + }, + { + "epoch": 1.76, + "learning_rate": 4.4117647058823526e-06, + "loss": 1.0625, + "step": 90 + }, + { + "epoch": 1.96, + "learning_rate": 4.901960784313726e-06, + "loss": 1.0507, + "step": 100 + }, + { + "epoch": 2.0, + "eval_loss": 1.0434480905532837, + "eval_runtime": 2.0694, + "eval_samples_per_second": 1101.261, + "eval_steps_per_second": 4.349, + "step": 102 + }, + { + "epoch": 2.16, + "learning_rate": 5.392156862745097e-06, + "loss": 1.0403, + "step": 110 + }, + { + "epoch": 2.35, + "learning_rate": 5.88235294117647e-06, + "loss": 1.0303, + "step": 120 + }, + { + "epoch": 2.55, + "learning_rate": 6.372549019607843e-06, + "loss": 1.0213, + "step": 130 + }, + { + "epoch": 2.75, + "learning_rate": 6.8627450980392145e-06, + "loss": 1.013, + "step": 140 + }, + { + "epoch": 2.94, + "learning_rate": 7.352941176470588e-06, + "loss": 1.0046, + "step": 150 + }, + { + "epoch": 3.0, + "eval_loss": 0.9987890720367432, + "eval_runtime": 2.0662, + "eval_samples_per_second": 1103.006, + "eval_steps_per_second": 4.356, + "step": 153 + }, + { + "epoch": 3.14, + "learning_rate": 7.84313725490196e-06, + "loss": 0.9976, + "step": 160 + }, + { + "epoch": 3.33, + "learning_rate": 8.333333333333332e-06, + "loss": 0.9917, + "step": 170 + }, + { + "epoch": 3.53, + "learning_rate": 8.823529411764705e-06, + "loss": 0.9867, + "step": 180 + }, + { + "epoch": 3.73, + "learning_rate": 9.313725490196078e-06, + "loss": 0.9803, + "step": 190 + }, + { + "epoch": 3.92, + "learning_rate": 9.803921568627451e-06, + "loss": 0.9761, + "step": 200 + }, + { + "epoch": 4.0, + "eval_loss": 0.9724870920181274, + "eval_runtime": 2.0693, + "eval_samples_per_second": 1101.346, + "eval_steps_per_second": 4.349, + "step": 204 + }, + { + "epoch": 4.12, + "learning_rate": 1.0294117647058824e-05, + "loss": 0.9732, + "step": 210 + }, + { + "epoch": 4.31, + "learning_rate": 1.0784313725490194e-05, + "loss": 0.969, + "step": 220 + }, + { + "epoch": 4.51, + "learning_rate": 1.1274509803921567e-05, + "loss": 0.9652, + "step": 230 + }, + { + "epoch": 4.71, + "learning_rate": 1.176470588235294e-05, + "loss": 0.9613, + "step": 240 + }, + { + "epoch": 4.9, + "learning_rate": 1.2254901960784313e-05, + "loss": 0.9572, + "step": 250 + }, + { + "epoch": 5.0, + "eval_loss": 0.952890157699585, + "eval_runtime": 2.0561, + "eval_samples_per_second": 1108.418, + "eval_steps_per_second": 4.377, + "step": 255 + }, + { + "epoch": 5.1, + "learning_rate": 1.2745098039215686e-05, + "loss": 0.9535, + "step": 260 + }, + { + "epoch": 5.29, + "learning_rate": 1.323529411764706e-05, + "loss": 0.9486, + "step": 270 + }, + { + "epoch": 5.49, + "learning_rate": 1.3725490196078429e-05, + "loss": 0.9442, + "step": 280 + }, + { + "epoch": 5.69, + "learning_rate": 1.4215686274509802e-05, + "loss": 0.9397, + "step": 290 + }, + { + "epoch": 5.88, + "learning_rate": 1.4705882352941175e-05, + "loss": 0.9357, + "step": 300 + }, + { + "epoch": 6.0, + "eval_loss": 0.9303520917892456, + "eval_runtime": 2.0452, + "eval_samples_per_second": 1114.339, + "eval_steps_per_second": 4.401, + "step": 306 + }, + { + "epoch": 6.08, + "learning_rate": 1.5196078431372548e-05, + "loss": 0.933, + "step": 310 + }, + { + "epoch": 6.27, + "learning_rate": 1.568627450980392e-05, + "loss": 0.9253, + "step": 320 + }, + { + "epoch": 6.47, + "learning_rate": 1.6176470588235293e-05, + "loss": 0.9209, + "step": 330 + }, + { + "epoch": 6.67, + "learning_rate": 1.6666666666666664e-05, + "loss": 0.9187, + "step": 340 + }, + { + "epoch": 6.86, + "learning_rate": 1.7156862745098035e-05, + "loss": 0.9128, + "step": 350 + }, + { + "epoch": 7.0, + "eval_loss": 0.9099854826927185, + "eval_runtime": 2.1224, + "eval_samples_per_second": 1073.784, + "eval_steps_per_second": 4.24, + "step": 357 + }, + { + "epoch": 7.06, + "learning_rate": 1.764705882352941e-05, + "loss": 0.9093, + "step": 360 + }, + { + "epoch": 7.25, + "learning_rate": 1.813725490196078e-05, + "loss": 0.9083, + "step": 370 + }, + { + "epoch": 7.45, + "learning_rate": 1.8627450980392156e-05, + "loss": 0.9076, + "step": 380 + }, + { + "epoch": 7.65, + "learning_rate": 1.9117647058823528e-05, + "loss": 0.9058, + "step": 390 + }, + { + "epoch": 7.84, + "learning_rate": 1.9607843137254903e-05, + "loss": 0.9037, + "step": 400 + }, + { + "epoch": 8.0, + "eval_loss": 0.9003704786300659, + "eval_runtime": 2.1145, + "eval_samples_per_second": 1077.8, + "eval_steps_per_second": 4.256, + "step": 408 + }, + { + "epoch": 8.04, + "learning_rate": 2.009803921568627e-05, + "loss": 0.9048, + "step": 410 + }, + { + "epoch": 8.24, + "learning_rate": 2.058823529411765e-05, + "loss": 0.8982, + "step": 420 + }, + { + "epoch": 8.43, + "learning_rate": 2.1078431372549017e-05, + "loss": 0.8997, + "step": 430 + }, + { + "epoch": 8.63, + "learning_rate": 2.1568627450980388e-05, + "loss": 0.8968, + "step": 440 + }, + { + "epoch": 8.82, + "learning_rate": 2.2058823529411763e-05, + "loss": 0.8984, + "step": 450 + }, + { + "epoch": 9.0, + "eval_loss": 0.8941003084182739, + "eval_runtime": 2.0854, + "eval_samples_per_second": 1092.816, + "eval_steps_per_second": 4.316, + "step": 459 + }, + { + "epoch": 9.02, + "learning_rate": 2.2549019607843134e-05, + "loss": 0.8978, + "step": 460 + }, + { + "epoch": 9.22, + "learning_rate": 2.303921568627451e-05, + "loss": 0.8946, + "step": 470 + }, + { + "epoch": 9.41, + "learning_rate": 2.352941176470588e-05, + "loss": 0.8958, + "step": 480 + }, + { + "epoch": 9.61, + "learning_rate": 2.401960784313725e-05, + "loss": 0.8906, + "step": 490 + }, + { + "epoch": 9.8, + "learning_rate": 2.4509803921568626e-05, + "loss": 0.8909, + "step": 500 + }, + { + "epoch": 10.0, + "learning_rate": 2.4999999999999998e-05, + "loss": 0.8904, + "step": 510 + }, + { + "epoch": 10.0, + "eval_loss": 0.8895702958106995, + "eval_runtime": 2.056, + "eval_samples_per_second": 1108.47, + "eval_steps_per_second": 4.377, + "step": 510 + }, + { + "epoch": 10.2, + "learning_rate": 2.5490196078431373e-05, + "loss": 0.8899, + "step": 520 + }, + { + "epoch": 10.39, + "learning_rate": 2.598039215686274e-05, + "loss": 0.8879, + "step": 530 + }, + { + "epoch": 10.59, + "learning_rate": 2.647058823529412e-05, + "loss": 0.8879, + "step": 540 + }, + { + "epoch": 10.78, + "learning_rate": 2.6960784313725487e-05, + "loss": 0.885, + "step": 550 + }, + { + "epoch": 10.98, + "learning_rate": 2.7450980392156858e-05, + "loss": 0.8846, + "step": 560 + }, + { + "epoch": 11.0, + "eval_loss": 0.8801982998847961, + "eval_runtime": 2.1061, + "eval_samples_per_second": 1082.111, + "eval_steps_per_second": 4.273, + "step": 561 + }, + { + "epoch": 11.18, + "learning_rate": 2.7941176470588233e-05, + "loss": 0.8806, + "step": 570 + }, + { + "epoch": 11.37, + "learning_rate": 2.8431372549019604e-05, + "loss": 0.8803, + "step": 580 + }, + { + "epoch": 11.57, + "learning_rate": 2.892156862745098e-05, + "loss": 0.8801, + "step": 590 + }, + { + "epoch": 11.76, + "learning_rate": 2.941176470588235e-05, + "loss": 0.8795, + "step": 600 + }, + { + "epoch": 11.96, + "learning_rate": 2.9901960784313725e-05, + "loss": 0.8748, + "step": 610 + }, + { + "epoch": 12.0, + "eval_loss": 0.8775041699409485, + "eval_runtime": 2.1281, + "eval_samples_per_second": 1070.893, + "eval_steps_per_second": 4.229, + "step": 612 + }, + { + "epoch": 12.16, + "learning_rate": 3.0392156862745097e-05, + "loss": 0.8723, + "step": 620 + }, + { + "epoch": 12.35, + "learning_rate": 3.088235294117647e-05, + "loss": 0.8733, + "step": 630 + }, + { + "epoch": 12.55, + "learning_rate": 3.137254901960784e-05, + "loss": 0.8739, + "step": 640 + }, + { + "epoch": 12.75, + "learning_rate": 3.186274509803921e-05, + "loss": 0.8722, + "step": 650 + }, + { + "epoch": 12.94, + "learning_rate": 3.2352941176470585e-05, + "loss": 0.8692, + "step": 660 + }, + { + "epoch": 13.0, + "eval_loss": 0.8685004115104675, + "eval_runtime": 2.1995, + "eval_samples_per_second": 1036.155, + "eval_steps_per_second": 4.092, + "step": 663 + }, + { + "epoch": 13.14, + "learning_rate": 3.284313725490196e-05, + "loss": 0.8681, + "step": 670 + }, + { + "epoch": 13.33, + "learning_rate": 3.333333333333333e-05, + "loss": 0.865, + "step": 680 + }, + { + "epoch": 13.53, + "learning_rate": 3.38235294117647e-05, + "loss": 0.8655, + "step": 690 + }, + { + "epoch": 13.73, + "learning_rate": 3.431372549019607e-05, + "loss": 0.8673, + "step": 700 + }, + { + "epoch": 13.92, + "learning_rate": 3.480392156862745e-05, + "loss": 0.8656, + "step": 710 + }, + { + "epoch": 14.0, + "eval_loss": 0.866548478603363, + "eval_runtime": 2.0997, + "eval_samples_per_second": 1085.376, + "eval_steps_per_second": 4.286, + "step": 714 + }, + { + "epoch": 14.12, + "learning_rate": 3.529411764705882e-05, + "loss": 0.8644, + "step": 720 + }, + { + "epoch": 14.31, + "learning_rate": 3.5784313725490195e-05, + "loss": 0.8649, + "step": 730 + }, + { + "epoch": 14.51, + "learning_rate": 3.627450980392156e-05, + "loss": 0.8648, + "step": 740 + }, + { + "epoch": 14.71, + "learning_rate": 3.676470588235294e-05, + "loss": 0.8614, + "step": 750 + }, + { + "epoch": 14.9, + "learning_rate": 3.725490196078431e-05, + "loss": 0.8634, + "step": 760 + }, + { + "epoch": 15.0, + "eval_loss": 0.8607373833656311, + "eval_runtime": 2.2131, + "eval_samples_per_second": 1029.772, + "eval_steps_per_second": 4.067, + "step": 765 + }, + { + "epoch": 15.1, + "learning_rate": 3.774509803921568e-05, + "loss": 0.8596, + "step": 770 + }, + { + "epoch": 15.29, + "learning_rate": 3.8235294117647055e-05, + "loss": 0.861, + "step": 780 + }, + { + "epoch": 15.49, + "learning_rate": 3.872549019607843e-05, + "loss": 0.8584, + "step": 790 + }, + { + "epoch": 15.69, + "learning_rate": 3.9215686274509805e-05, + "loss": 0.8613, + "step": 800 + }, + { + "epoch": 15.88, + "learning_rate": 3.970588235294117e-05, + "loss": 0.8565, + "step": 810 + }, + { + "epoch": 16.0, + "eval_loss": 0.8560643196105957, + "eval_runtime": 2.0515, + "eval_samples_per_second": 1110.887, + "eval_steps_per_second": 4.387, + "step": 816 + }, + { + "epoch": 16.08, + "learning_rate": 4.019607843137254e-05, + "loss": 0.8546, + "step": 820 + }, + { + "epoch": 16.27, + "learning_rate": 4.0686274509803916e-05, + "loss": 0.8574, + "step": 830 + }, + { + "epoch": 16.47, + "learning_rate": 4.11764705882353e-05, + "loss": 0.8572, + "step": 840 + }, + { + "epoch": 16.67, + "learning_rate": 4.1666666666666665e-05, + "loss": 0.8582, + "step": 850 + }, + { + "epoch": 16.86, + "learning_rate": 4.215686274509803e-05, + "loss": 0.8555, + "step": 860 + }, + { + "epoch": 17.0, + "eval_loss": 0.8547664880752563, + "eval_runtime": 2.1626, + "eval_samples_per_second": 1053.802, + "eval_steps_per_second": 4.162, + "step": 867 + }, + { + "epoch": 17.06, + "learning_rate": 4.264705882352941e-05, + "loss": 0.8562, + "step": 870 + }, + { + "epoch": 17.25, + "learning_rate": 4.3137254901960776e-05, + "loss": 0.8546, + "step": 880 + }, + { + "epoch": 17.45, + "learning_rate": 4.362745098039216e-05, + "loss": 0.8538, + "step": 890 + }, + { + "epoch": 17.65, + "learning_rate": 4.4117647058823526e-05, + "loss": 0.8529, + "step": 900 + }, + { + "epoch": 17.84, + "learning_rate": 4.46078431372549e-05, + "loss": 0.8521, + "step": 910 + }, + { + "epoch": 18.0, + "eval_loss": 0.8463531732559204, + "eval_runtime": 2.1408, + "eval_samples_per_second": 1064.533, + "eval_steps_per_second": 4.204, + "step": 918 + }, + { + "epoch": 18.04, + "learning_rate": 4.509803921568627e-05, + "loss": 0.8503, + "step": 920 + }, + { + "epoch": 18.24, + "learning_rate": 4.5588235294117636e-05, + "loss": 0.849, + "step": 930 + }, + { + "epoch": 18.43, + "learning_rate": 4.607843137254902e-05, + "loss": 0.8514, + "step": 940 + }, + { + "epoch": 18.63, + "learning_rate": 4.6568627450980386e-05, + "loss": 0.8518, + "step": 950 + }, + { + "epoch": 18.82, + "learning_rate": 4.705882352941176e-05, + "loss": 0.8478, + "step": 960 + }, + { + "epoch": 19.0, + "eval_loss": 0.8448628783226013, + "eval_runtime": 2.1443, + "eval_samples_per_second": 1062.799, + "eval_steps_per_second": 4.197, + "step": 969 + }, + { + "epoch": 19.02, + "learning_rate": 4.754901960784313e-05, + "loss": 0.8473, + "step": 970 + }, + { + "epoch": 19.22, + "learning_rate": 4.80392156862745e-05, + "loss": 0.8483, + "step": 980 + }, + { + "epoch": 19.41, + "learning_rate": 4.852941176470588e-05, + "loss": 0.8437, + "step": 990 + }, + { + "epoch": 19.61, + "learning_rate": 4.901960784313725e-05, + "loss": 0.8462, + "step": 1000 + }, + { + "epoch": 19.8, + "learning_rate": 4.950980392156862e-05, + "loss": 0.8491, + "step": 1010 + }, + { + "epoch": 20.0, + "learning_rate": 4.9999999999999996e-05, + "loss": 0.847, + "step": 1020 + }, + { + "epoch": 20.0, + "eval_loss": 0.8455402255058289, + "eval_runtime": 2.0671, + "eval_samples_per_second": 1102.533, + "eval_steps_per_second": 4.354, + "step": 1020 + }, + { + "epoch": 20.2, + "learning_rate": 5.049019607843137e-05, + "loss": 0.8424, + "step": 1030 + }, + { + "epoch": 20.39, + "learning_rate": 5.0980392156862745e-05, + "loss": 0.8427, + "step": 1040 + }, + { + "epoch": 20.59, + "learning_rate": 5.147058823529411e-05, + "loss": 0.8415, + "step": 1050 + }, + { + "epoch": 20.78, + "learning_rate": 5.196078431372548e-05, + "loss": 0.8443, + "step": 1060 + }, + { + "epoch": 20.98, + "learning_rate": 5.2450980392156856e-05, + "loss": 0.842, + "step": 1070 + }, + { + "epoch": 21.0, + "eval_loss": 0.8377746939659119, + "eval_runtime": 2.1007, + "eval_samples_per_second": 1084.901, + "eval_steps_per_second": 4.284, + "step": 1071 + }, + { + "epoch": 21.18, + "learning_rate": 5.294117647058824e-05, + "loss": 0.836, + "step": 1080 + }, + { + "epoch": 21.37, + "learning_rate": 5.3431372549019605e-05, + "loss": 0.8397, + "step": 1090 + }, + { + "epoch": 21.57, + "learning_rate": 5.3921568627450973e-05, + "loss": 0.8357, + "step": 1100 + }, + { + "epoch": 21.76, + "learning_rate": 5.441176470588235e-05, + "loss": 0.8377, + "step": 1110 + }, + { + "epoch": 21.96, + "learning_rate": 5.4901960784313716e-05, + "loss": 0.8385, + "step": 1120 + }, + { + "epoch": 22.0, + "eval_loss": 0.8358024954795837, + "eval_runtime": 2.1023, + "eval_samples_per_second": 1084.065, + "eval_steps_per_second": 4.281, + "step": 1122 + }, + { + "epoch": 22.16, + "learning_rate": 5.53921568627451e-05, + "loss": 0.836, + "step": 1130 + }, + { + "epoch": 22.35, + "learning_rate": 5.5882352941176466e-05, + "loss": 0.8319, + "step": 1140 + }, + { + "epoch": 22.55, + "learning_rate": 5.637254901960784e-05, + "loss": 0.8307, + "step": 1150 + }, + { + "epoch": 22.75, + "learning_rate": 5.686274509803921e-05, + "loss": 0.8343, + "step": 1160 + }, + { + "epoch": 22.94, + "learning_rate": 5.7352941176470576e-05, + "loss": 0.8319, + "step": 1170 + }, + { + "epoch": 23.0, + "eval_loss": 0.8331688046455383, + "eval_runtime": 2.1797, + "eval_samples_per_second": 1045.554, + "eval_steps_per_second": 4.129, + "step": 1173 + }, + { + "epoch": 23.14, + "learning_rate": 5.784313725490196e-05, + "loss": 0.8339, + "step": 1180 + }, + { + "epoch": 23.33, + "learning_rate": 5.8333333333333326e-05, + "loss": 0.8361, + "step": 1190 + }, + { + "epoch": 23.53, + "learning_rate": 5.88235294117647e-05, + "loss": 0.8297, + "step": 1200 + }, + { + "epoch": 23.73, + "learning_rate": 5.931372549019607e-05, + "loss": 0.83, + "step": 1210 + }, + { + "epoch": 23.92, + "learning_rate": 5.980392156862745e-05, + "loss": 0.8267, + "step": 1220 + }, + { + "epoch": 24.0, + "eval_loss": 0.8347041606903076, + "eval_runtime": 2.0517, + "eval_samples_per_second": 1110.806, + "eval_steps_per_second": 4.387, + "step": 1224 + }, + { + "epoch": 24.12, + "learning_rate": 6.029411764705882e-05, + "loss": 0.8316, + "step": 1230 + }, + { + "epoch": 24.31, + "learning_rate": 6.078431372549019e-05, + "loss": 0.825, + "step": 1240 + }, + { + "epoch": 24.51, + "learning_rate": 6.127450980392157e-05, + "loss": 0.8269, + "step": 1250 + }, + { + "epoch": 24.71, + "learning_rate": 6.176470588235294e-05, + "loss": 0.8267, + "step": 1260 + }, + { + "epoch": 24.9, + "learning_rate": 6.225490196078432e-05, + "loss": 0.8266, + "step": 1270 + }, + { + "epoch": 25.0, + "eval_loss": 0.8246671557426453, + "eval_runtime": 2.0855, + "eval_samples_per_second": 1092.759, + "eval_steps_per_second": 4.315, + "step": 1275 + }, + { + "epoch": 25.1, + "learning_rate": 6.274509803921569e-05, + "loss": 0.8247, + "step": 1280 + }, + { + "epoch": 25.29, + "learning_rate": 6.323529411764705e-05, + "loss": 0.8223, + "step": 1290 + }, + { + "epoch": 25.49, + "learning_rate": 6.372549019607842e-05, + "loss": 0.822, + "step": 1300 + }, + { + "epoch": 25.69, + "learning_rate": 6.421568627450979e-05, + "loss": 0.8234, + "step": 1310 + }, + { + "epoch": 25.88, + "learning_rate": 6.470588235294117e-05, + "loss": 0.8242, + "step": 1320 + }, + { + "epoch": 26.0, + "eval_loss": 0.8241580128669739, + "eval_runtime": 2.044, + "eval_samples_per_second": 1114.944, + "eval_steps_per_second": 4.403, + "step": 1326 + }, + { + "epoch": 26.08, + "learning_rate": 6.519607843137254e-05, + "loss": 0.8246, + "step": 1330 + }, + { + "epoch": 26.27, + "learning_rate": 6.568627450980392e-05, + "loss": 0.8212, + "step": 1340 + }, + { + "epoch": 26.47, + "learning_rate": 6.617647058823529e-05, + "loss": 0.8264, + "step": 1350 + }, + { + "epoch": 26.67, + "learning_rate": 6.666666666666666e-05, + "loss": 0.8215, + "step": 1360 + }, + { + "epoch": 26.86, + "learning_rate": 6.715686274509804e-05, + "loss": 0.8215, + "step": 1370 + }, + { + "epoch": 27.0, + "eval_loss": 0.8191553354263306, + "eval_runtime": 2.1872, + "eval_samples_per_second": 1041.969, + "eval_steps_per_second": 4.115, + "step": 1377 + }, + { + "epoch": 27.06, + "learning_rate": 6.76470588235294e-05, + "loss": 0.8234, + "step": 1380 + }, + { + "epoch": 27.25, + "learning_rate": 6.813725490196077e-05, + "loss": 0.8182, + "step": 1390 + }, + { + "epoch": 27.45, + "learning_rate": 6.862745098039214e-05, + "loss": 0.8173, + "step": 1400 + }, + { + "epoch": 27.65, + "learning_rate": 6.911764705882352e-05, + "loss": 0.816, + "step": 1410 + }, + { + "epoch": 27.84, + "learning_rate": 6.96078431372549e-05, + "loss": 0.8171, + "step": 1420 + }, + { + "epoch": 28.0, + "eval_loss": 0.8213248252868652, + "eval_runtime": 2.2192, + "eval_samples_per_second": 1026.926, + "eval_steps_per_second": 4.055, + "step": 1428 + }, + { + "epoch": 28.04, + "learning_rate": 7.009803921568627e-05, + "loss": 0.8199, + "step": 1430 + }, + { + "epoch": 28.24, + "learning_rate": 7.058823529411764e-05, + "loss": 0.8202, + "step": 1440 + }, + { + "epoch": 28.43, + "learning_rate": 7.107843137254901e-05, + "loss": 0.8149, + "step": 1450 + }, + { + "epoch": 28.63, + "learning_rate": 7.156862745098039e-05, + "loss": 0.8163, + "step": 1460 + }, + { + "epoch": 28.82, + "learning_rate": 7.205882352941176e-05, + "loss": 0.8176, + "step": 1470 + }, + { + "epoch": 29.0, + "eval_loss": 0.8160317540168762, + "eval_runtime": 2.1369, + "eval_samples_per_second": 1066.475, + "eval_steps_per_second": 4.212, + "step": 1479 + }, + { + "epoch": 29.02, + "learning_rate": 7.254901960784313e-05, + "loss": 0.8171, + "step": 1480 + }, + { + "epoch": 29.22, + "learning_rate": 7.303921568627451e-05, + "loss": 0.8171, + "step": 1490 + }, + { + "epoch": 29.41, + "learning_rate": 7.352941176470588e-05, + "loss": 0.8127, + "step": 1500 + }, + { + "epoch": 29.61, + "learning_rate": 7.401960784313726e-05, + "loss": 0.8192, + "step": 1510 + }, + { + "epoch": 29.8, + "learning_rate": 7.450980392156863e-05, + "loss": 0.8142, + "step": 1520 + }, + { + "epoch": 30.0, + "learning_rate": 7.5e-05, + "loss": 0.8122, + "step": 1530 + }, + { + "epoch": 30.0, + "eval_loss": 0.8127588629722595, + "eval_runtime": 2.1677, + "eval_samples_per_second": 1051.341, + "eval_steps_per_second": 4.152, + "step": 1530 + }, + { + "epoch": 30.2, + "learning_rate": 7.549019607843136e-05, + "loss": 0.8164, + "step": 1540 + }, + { + "epoch": 30.39, + "learning_rate": 7.598039215686273e-05, + "loss": 0.8143, + "step": 1550 + }, + { + "epoch": 30.59, + "learning_rate": 7.647058823529411e-05, + "loss": 0.8113, + "step": 1560 + }, + { + "epoch": 30.78, + "learning_rate": 7.696078431372548e-05, + "loss": 0.8075, + "step": 1570 + }, + { + "epoch": 30.98, + "learning_rate": 7.745098039215686e-05, + "loss": 0.8107, + "step": 1580 + }, + { + "epoch": 31.0, + "eval_loss": 0.803588330745697, + "eval_runtime": 2.1888, + "eval_samples_per_second": 1041.189, + "eval_steps_per_second": 4.112, + "step": 1581 + }, + { + "epoch": 31.18, + "learning_rate": 7.794117647058823e-05, + "loss": 0.8038, + "step": 1590 + }, + { + "epoch": 31.37, + "learning_rate": 7.843137254901961e-05, + "loss": 0.8105, + "step": 1600 + }, + { + "epoch": 31.57, + "learning_rate": 7.892156862745098e-05, + "loss": 0.8057, + "step": 1610 + }, + { + "epoch": 31.76, + "learning_rate": 7.941176470588235e-05, + "loss": 0.8085, + "step": 1620 + }, + { + "epoch": 31.96, + "learning_rate": 7.990196078431371e-05, + "loss": 0.8069, + "step": 1630 + }, + { + "epoch": 32.0, + "eval_loss": 0.8068580031394958, + "eval_runtime": 2.182, + "eval_samples_per_second": 1044.445, + "eval_steps_per_second": 4.125, + "step": 1632 + }, + { + "epoch": 32.16, + "learning_rate": 8.039215686274508e-05, + "loss": 0.8078, + "step": 1640 + }, + { + "epoch": 32.35, + "learning_rate": 8.088235294117646e-05, + "loss": 0.8101, + "step": 1650 + }, + { + "epoch": 32.55, + "learning_rate": 8.137254901960783e-05, + "loss": 0.8126, + "step": 1660 + }, + { + "epoch": 32.75, + "learning_rate": 8.18627450980392e-05, + "loss": 0.8108, + "step": 1670 + }, + { + "epoch": 32.94, + "learning_rate": 8.23529411764706e-05, + "loss": 0.8081, + "step": 1680 + }, + { + "epoch": 33.0, + "eval_loss": 0.8022666573524475, + "eval_runtime": 2.2411, + "eval_samples_per_second": 1016.898, + "eval_steps_per_second": 4.016, + "step": 1683 + }, + { + "epoch": 33.14, + "learning_rate": 8.284313725490196e-05, + "loss": 0.8091, + "step": 1690 + }, + { + "epoch": 33.33, + "learning_rate": 8.333333333333333e-05, + "loss": 0.8027, + "step": 1700 + }, + { + "epoch": 33.53, + "learning_rate": 8.38235294117647e-05, + "loss": 0.8029, + "step": 1710 + }, + { + "epoch": 33.73, + "learning_rate": 8.431372549019607e-05, + "loss": 0.8044, + "step": 1720 + }, + { + "epoch": 33.92, + "learning_rate": 8.480392156862745e-05, + "loss": 0.8043, + "step": 1730 + }, + { + "epoch": 34.0, + "eval_loss": 0.8047605752944946, + "eval_runtime": 2.0682, + "eval_samples_per_second": 1101.943, + "eval_steps_per_second": 4.352, + "step": 1734 + }, + { + "epoch": 34.12, + "learning_rate": 8.529411764705882e-05, + "loss": 0.8112, + "step": 1740 + }, + { + "epoch": 34.31, + "learning_rate": 8.578431372549018e-05, + "loss": 0.8091, + "step": 1750 + }, + { + "epoch": 34.51, + "learning_rate": 8.627450980392155e-05, + "loss": 0.8104, + "step": 1760 + }, + { + "epoch": 34.71, + "learning_rate": 8.676470588235295e-05, + "loss": 0.8062, + "step": 1770 + }, + { + "epoch": 34.9, + "learning_rate": 8.725490196078432e-05, + "loss": 0.8071, + "step": 1780 + }, + { + "epoch": 35.0, + "eval_loss": 0.8082063794136047, + "eval_runtime": 2.0463, + "eval_samples_per_second": 1113.701, + "eval_steps_per_second": 4.398, + "step": 1785 + }, + { + "epoch": 35.1, + "learning_rate": 8.774509803921568e-05, + "loss": 0.8066, + "step": 1790 + }, + { + "epoch": 35.29, + "learning_rate": 8.823529411764705e-05, + "loss": 0.8041, + "step": 1800 + }, + { + "epoch": 35.49, + "learning_rate": 8.872549019607842e-05, + "loss": 0.8009, + "step": 1810 + }, + { + "epoch": 35.69, + "learning_rate": 8.92156862745098e-05, + "loss": 0.7993, + "step": 1820 + }, + { + "epoch": 35.88, + "learning_rate": 8.970588235294117e-05, + "loss": 0.8017, + "step": 1830 + }, + { + "epoch": 36.0, + "eval_loss": 0.7971303462982178, + "eval_runtime": 2.1621, + "eval_samples_per_second": 1054.087, + "eval_steps_per_second": 4.163, + "step": 1836 + }, + { + "epoch": 36.08, + "learning_rate": 9.019607843137254e-05, + "loss": 0.7994, + "step": 1840 + }, + { + "epoch": 36.27, + "learning_rate": 9.06862745098039e-05, + "loss": 0.7953, + "step": 1850 + }, + { + "epoch": 36.47, + "learning_rate": 9.117647058823527e-05, + "loss": 0.8009, + "step": 1860 + }, + { + "epoch": 36.67, + "learning_rate": 9.166666666666667e-05, + "loss": 0.7986, + "step": 1870 + }, + { + "epoch": 36.86, + "learning_rate": 9.215686274509804e-05, + "loss": 0.7965, + "step": 1880 + }, + { + "epoch": 37.0, + "eval_loss": 0.7953115105628967, + "eval_runtime": 2.096, + "eval_samples_per_second": 1087.315, + "eval_steps_per_second": 4.294, + "step": 1887 + }, + { + "epoch": 37.06, + "learning_rate": 9.26470588235294e-05, + "loss": 0.7974, + "step": 1890 + }, + { + "epoch": 37.25, + "learning_rate": 9.313725490196077e-05, + "loss": 0.7956, + "step": 1900 + }, + { + "epoch": 37.45, + "learning_rate": 9.362745098039215e-05, + "loss": 0.7954, + "step": 1910 + }, + { + "epoch": 37.65, + "learning_rate": 9.411764705882352e-05, + "loss": 0.7934, + "step": 1920 + }, + { + "epoch": 37.84, + "learning_rate": 9.460784313725489e-05, + "loss": 0.7953, + "step": 1930 + }, + { + "epoch": 38.0, + "eval_loss": 0.8111655712127686, + "eval_runtime": 2.1785, + "eval_samples_per_second": 1046.146, + "eval_steps_per_second": 4.131, + "step": 1938 + }, + { + "epoch": 38.04, + "learning_rate": 9.509803921568626e-05, + "loss": 0.7976, + "step": 1940 + }, + { + "epoch": 38.24, + "learning_rate": 9.558823529411764e-05, + "loss": 0.8007, + "step": 1950 + }, + { + "epoch": 38.43, + "learning_rate": 9.6078431372549e-05, + "loss": 0.8014, + "step": 1960 + }, + { + "epoch": 38.63, + "learning_rate": 9.656862745098039e-05, + "loss": 0.7939, + "step": 1970 + }, + { + "epoch": 38.82, + "learning_rate": 9.705882352941176e-05, + "loss": 0.7979, + "step": 1980 + }, + { + "epoch": 39.0, + "eval_loss": 0.7954539656639099, + "eval_runtime": 2.1921, + "eval_samples_per_second": 1039.635, + "eval_steps_per_second": 4.106, + "step": 1989 + }, + { + "epoch": 39.02, + "learning_rate": 9.754901960784314e-05, + "loss": 0.7966, + "step": 1990 + }, + { + "epoch": 39.22, + "learning_rate": 9.80392156862745e-05, + "loss": 0.792, + "step": 2000 + }, + { + "epoch": 39.41, + "learning_rate": 9.852941176470587e-05, + "loss": 0.7913, + "step": 2010 + }, + { + "epoch": 39.61, + "learning_rate": 9.901960784313724e-05, + "loss": 0.7892, + "step": 2020 + }, + { + "epoch": 39.8, + "learning_rate": 9.950980392156861e-05, + "loss": 0.7892, + "step": 2030 + }, + { + "epoch": 40.0, + "learning_rate": 9.999999999999999e-05, + "loss": 0.7887, + "step": 2040 + }, + { + "epoch": 40.0, + "eval_loss": 0.7966196537017822, + "eval_runtime": 2.1656, + "eval_samples_per_second": 1052.34, + "eval_steps_per_second": 4.156, + "step": 2040 + }, + { + "epoch": 40.2, + "learning_rate": 0.00010049019607843136, + "loss": 0.7982, + "step": 2050 + }, + { + "epoch": 40.39, + "learning_rate": 0.00010098039215686274, + "loss": 0.7965, + "step": 2060 + }, + { + "epoch": 40.59, + "learning_rate": 0.00010147058823529411, + "loss": 0.7897, + "step": 2070 + }, + { + "epoch": 40.78, + "learning_rate": 0.00010196078431372549, + "loss": 0.7892, + "step": 2080 + }, + { + "epoch": 40.98, + "learning_rate": 0.00010245098039215686, + "loss": 0.7866, + "step": 2090 + }, + { + "epoch": 41.0, + "eval_loss": 0.7878917455673218, + "eval_runtime": 2.1957, + "eval_samples_per_second": 1037.957, + "eval_steps_per_second": 4.099, + "step": 2091 + }, + { + "epoch": 41.18, + "learning_rate": 0.00010294117647058823, + "loss": 0.7895, + "step": 2100 + }, + { + "epoch": 41.37, + "learning_rate": 0.0001034313725490196, + "loss": 0.7871, + "step": 2110 + }, + { + "epoch": 41.57, + "learning_rate": 0.00010392156862745096, + "loss": 0.785, + "step": 2120 + }, + { + "epoch": 41.76, + "learning_rate": 0.00010441176470588234, + "loss": 0.7827, + "step": 2130 + }, + { + "epoch": 41.96, + "learning_rate": 0.00010490196078431371, + "loss": 0.7862, + "step": 2140 + }, + { + "epoch": 42.0, + "eval_loss": 0.7828369736671448, + "eval_runtime": 2.2171, + "eval_samples_per_second": 1027.911, + "eval_steps_per_second": 4.059, + "step": 2142 + }, + { + "epoch": 42.16, + "learning_rate": 0.00010539215686274508, + "loss": 0.7855, + "step": 2150 + }, + { + "epoch": 42.35, + "learning_rate": 0.00010588235294117647, + "loss": 0.785, + "step": 2160 + }, + { + "epoch": 42.55, + "learning_rate": 0.00010637254901960784, + "loss": 0.7865, + "step": 2170 + }, + { + "epoch": 42.75, + "learning_rate": 0.00010686274509803921, + "loss": 0.7823, + "step": 2180 + }, + { + "epoch": 42.94, + "learning_rate": 0.00010735294117647058, + "loss": 0.7836, + "step": 2190 + }, + { + "epoch": 43.0, + "eval_loss": 0.7864591479301453, + "eval_runtime": 2.1233, + "eval_samples_per_second": 1073.307, + "eval_steps_per_second": 4.239, + "step": 2193 + }, + { + "epoch": 43.14, + "learning_rate": 0.00010784313725490195, + "loss": 0.7888, + "step": 2200 + }, + { + "epoch": 43.33, + "learning_rate": 0.00010833333333333333, + "loss": 0.7871, + "step": 2210 + }, + { + "epoch": 43.53, + "learning_rate": 0.0001088235294117647, + "loss": 0.7861, + "step": 2220 + }, + { + "epoch": 43.73, + "learning_rate": 0.00010931372549019606, + "loss": 0.7843, + "step": 2230 + }, + { + "epoch": 43.92, + "learning_rate": 0.00010980392156862743, + "loss": 0.7851, + "step": 2240 + }, + { + "epoch": 44.0, + "eval_loss": 0.7829829454421997, + "eval_runtime": 2.1348, + "eval_samples_per_second": 1067.557, + "eval_steps_per_second": 4.216, + "step": 2244 + }, + { + "epoch": 44.12, + "learning_rate": 0.00011029411764705883, + "loss": 0.784, + "step": 2250 + }, + { + "epoch": 44.31, + "learning_rate": 0.0001107843137254902, + "loss": 0.784, + "step": 2260 + }, + { + "epoch": 44.51, + "learning_rate": 0.00011127450980392156, + "loss": 0.785, + "step": 2270 + }, + { + "epoch": 44.71, + "learning_rate": 0.00011176470588235293, + "loss": 0.7807, + "step": 2280 + }, + { + "epoch": 44.9, + "learning_rate": 0.0001122549019607843, + "loss": 0.7813, + "step": 2290 + }, + { + "epoch": 45.0, + "eval_loss": 0.7840399146080017, + "eval_runtime": 2.1519, + "eval_samples_per_second": 1059.088, + "eval_steps_per_second": 4.182, + "step": 2295 + }, + { + "epoch": 45.1, + "learning_rate": 0.00011274509803921568, + "loss": 0.7796, + "step": 2300 + }, + { + "epoch": 45.29, + "learning_rate": 0.00011323529411764705, + "loss": 0.7796, + "step": 2310 + }, + { + "epoch": 45.49, + "learning_rate": 0.00011372549019607842, + "loss": 0.7776, + "step": 2320 + }, + { + "epoch": 45.69, + "learning_rate": 0.00011421568627450978, + "loss": 0.7792, + "step": 2330 + }, + { + "epoch": 45.88, + "learning_rate": 0.00011470588235294115, + "loss": 0.78, + "step": 2340 + }, + { + "epoch": 46.0, + "eval_loss": 0.7749137878417969, + "eval_runtime": 2.1918, + "eval_samples_per_second": 1039.761, + "eval_steps_per_second": 4.106, + "step": 2346 + }, + { + "epoch": 46.08, + "learning_rate": 0.00011519607843137255, + "loss": 0.7761, + "step": 2350 + }, + { + "epoch": 46.27, + "learning_rate": 0.00011568627450980392, + "loss": 0.7747, + "step": 2360 + }, + { + "epoch": 46.47, + "learning_rate": 0.00011617647058823528, + "loss": 0.772, + "step": 2370 + }, + { + "epoch": 46.67, + "learning_rate": 0.00011666666666666665, + "loss": 0.7748, + "step": 2380 + }, + { + "epoch": 46.86, + "learning_rate": 0.00011715686274509803, + "loss": 0.779, + "step": 2390 + }, + { + "epoch": 47.0, + "eval_loss": 0.7824994325637817, + "eval_runtime": 2.077, + "eval_samples_per_second": 1097.254, + "eval_steps_per_second": 4.333, + "step": 2397 + }, + { + "epoch": 47.06, + "learning_rate": 0.0001176470588235294, + "loss": 0.7811, + "step": 2400 + }, + { + "epoch": 47.25, + "learning_rate": 0.00011813725490196077, + "loss": 0.7817, + "step": 2410 + }, + { + "epoch": 47.45, + "learning_rate": 0.00011862745098039214, + "loss": 0.7805, + "step": 2420 + }, + { + "epoch": 47.65, + "learning_rate": 0.0001191176470588235, + "loss": 0.7783, + "step": 2430 + }, + { + "epoch": 47.84, + "learning_rate": 0.0001196078431372549, + "loss": 0.7762, + "step": 2440 + }, + { + "epoch": 48.0, + "eval_loss": 0.7712346911430359, + "eval_runtime": 2.1789, + "eval_samples_per_second": 1045.947, + "eval_steps_per_second": 4.131, + "step": 2448 + }, + { + "epoch": 48.04, + "learning_rate": 0.00012009803921568627, + "loss": 0.7758, + "step": 2450 + }, + { + "epoch": 48.24, + "learning_rate": 0.00012058823529411764, + "loss": 0.7734, + "step": 2460 + }, + { + "epoch": 48.43, + "learning_rate": 0.00012107843137254902, + "loss": 0.7697, + "step": 2470 + }, + { + "epoch": 48.63, + "learning_rate": 0.00012156862745098039, + "loss": 0.7663, + "step": 2480 + }, + { + "epoch": 48.82, + "learning_rate": 0.00012205882352941175, + "loss": 0.7676, + "step": 2490 + }, + { + "epoch": 49.0, + "eval_loss": 0.7674837112426758, + "eval_runtime": 2.1122, + "eval_samples_per_second": 1078.947, + "eval_steps_per_second": 4.261, + "step": 2499 + }, + { + "epoch": 49.02, + "learning_rate": 0.00012254901960784314, + "loss": 0.7657, + "step": 2500 + }, + { + "epoch": 49.22, + "learning_rate": 0.0001230392156862745, + "loss": 0.7671, + "step": 2510 + }, + { + "epoch": 49.41, + "learning_rate": 0.00012352941176470587, + "loss": 0.7691, + "step": 2520 + }, + { + "epoch": 49.61, + "learning_rate": 0.00012401960784313724, + "loss": 0.7623, + "step": 2530 + }, + { + "epoch": 49.8, + "learning_rate": 0.00012450980392156863, + "loss": 0.7683, + "step": 2540 + }, + { + "epoch": 50.0, + "learning_rate": 0.000125, + "loss": 0.7638, + "step": 2550 + }, + { + "epoch": 50.0, + "eval_loss": 0.7645083069801331, + "eval_runtime": 2.1712, + "eval_samples_per_second": 1049.658, + "eval_steps_per_second": 4.145, + "step": 2550 + }, + { + "epoch": 50.2, + "learning_rate": 0.00012549019607843137, + "loss": 0.7679, + "step": 2560 + }, + { + "epoch": 50.39, + "learning_rate": 0.00012598039215686274, + "loss": 0.7716, + "step": 2570 + }, + { + "epoch": 50.59, + "learning_rate": 0.0001264705882352941, + "loss": 0.772, + "step": 2580 + }, + { + "epoch": 50.78, + "learning_rate": 0.00012696078431372547, + "loss": 0.7757, + "step": 2590 + }, + { + "epoch": 50.98, + "learning_rate": 0.00012745098039215684, + "loss": 0.7826, + "step": 2600 + }, + { + "epoch": 51.0, + "eval_loss": 0.7879320979118347, + "eval_runtime": 2.139, + "eval_samples_per_second": 1065.46, + "eval_steps_per_second": 4.208, + "step": 2601 + }, + { + "epoch": 51.18, + "learning_rate": 0.0001279411764705882, + "loss": 0.7886, + "step": 2610 + }, + { + "epoch": 51.37, + "learning_rate": 0.00012843137254901958, + "loss": 0.7883, + "step": 2620 + }, + { + "epoch": 51.57, + "learning_rate": 0.00012892156862745097, + "loss": 0.7851, + "step": 2630 + }, + { + "epoch": 51.76, + "learning_rate": 0.00012941176470588234, + "loss": 0.7797, + "step": 2640 + }, + { + "epoch": 51.96, + "learning_rate": 0.0001299019607843137, + "loss": 0.7728, + "step": 2650 + }, + { + "epoch": 52.0, + "eval_loss": 0.7729543447494507, + "eval_runtime": 2.2366, + "eval_samples_per_second": 1018.972, + "eval_steps_per_second": 4.024, + "step": 2652 + }, + { + "epoch": 52.16, + "learning_rate": 0.00013039215686274508, + "loss": 0.7681, + "step": 2660 + }, + { + "epoch": 52.35, + "learning_rate": 0.00013088235294117647, + "loss": 0.7614, + "step": 2670 + }, + { + "epoch": 52.55, + "learning_rate": 0.00013137254901960784, + "loss": 0.7626, + "step": 2680 + }, + { + "epoch": 52.75, + "learning_rate": 0.0001318627450980392, + "loss": 0.7621, + "step": 2690 + }, + { + "epoch": 52.94, + "learning_rate": 0.00013235294117647058, + "loss": 0.7629, + "step": 2700 + }, + { + "epoch": 53.0, + "eval_loss": 0.7606103420257568, + "eval_runtime": 2.2103, + "eval_samples_per_second": 1031.088, + "eval_steps_per_second": 4.072, + "step": 2703 + }, + { + "epoch": 53.14, + "learning_rate": 0.00013284313725490194, + "loss": 0.7619, + "step": 2710 + }, + { + "epoch": 53.33, + "learning_rate": 0.0001333333333333333, + "loss": 0.7705, + "step": 2720 + }, + { + "epoch": 53.53, + "learning_rate": 0.0001338235294117647, + "loss": 0.7831, + "step": 2730 + }, + { + "epoch": 53.73, + "learning_rate": 0.00013431372549019608, + "loss": 0.7841, + "step": 2740 + }, + { + "epoch": 53.92, + "learning_rate": 0.00013480392156862744, + "loss": 0.7819, + "step": 2750 + }, + { + "epoch": 54.0, + "eval_loss": 0.7718145847320557, + "eval_runtime": 2.0561, + "eval_samples_per_second": 1108.406, + "eval_steps_per_second": 4.377, + "step": 2754 + }, + { + "epoch": 54.12, + "learning_rate": 0.0001352941176470588, + "loss": 0.7737, + "step": 2760 + }, + { + "epoch": 54.31, + "learning_rate": 0.00013578431372549018, + "loss": 0.7763, + "step": 2770 + }, + { + "epoch": 54.51, + "learning_rate": 0.00013627450980392155, + "loss": 0.7791, + "step": 2780 + }, + { + "epoch": 54.71, + "learning_rate": 0.00013676470588235292, + "loss": 0.7741, + "step": 2790 + }, + { + "epoch": 54.9, + "learning_rate": 0.00013725490196078428, + "loss": 0.7802, + "step": 2800 + }, + { + "epoch": 55.0, + "eval_loss": 0.7808622121810913, + "eval_runtime": 2.1507, + "eval_samples_per_second": 1059.636, + "eval_steps_per_second": 4.185, + "step": 2805 + }, + { + "epoch": 55.1, + "learning_rate": 0.00013774509803921568, + "loss": 0.7777, + "step": 2810 + }, + { + "epoch": 55.29, + "learning_rate": 0.00013823529411764705, + "loss": 0.781, + "step": 2820 + }, + { + "epoch": 55.49, + "learning_rate": 0.00013872549019607841, + "loss": 0.7691, + "step": 2830 + }, + { + "epoch": 55.69, + "learning_rate": 0.0001392156862745098, + "loss": 0.7677, + "step": 2840 + }, + { + "epoch": 55.88, + "learning_rate": 0.00013970588235294118, + "loss": 0.7632, + "step": 2850 + }, + { + "epoch": 56.0, + "eval_loss": 0.7576876878738403, + "eval_runtime": 2.1386, + "eval_samples_per_second": 1065.672, + "eval_steps_per_second": 4.208, + "step": 2856 + }, + { + "epoch": 56.08, + "learning_rate": 0.00014019607843137255, + "loss": 0.7594, + "step": 2860 + }, + { + "epoch": 56.27, + "learning_rate": 0.00014068627450980391, + "loss": 0.7557, + "step": 2870 + }, + { + "epoch": 56.47, + "learning_rate": 0.00014117647058823528, + "loss": 0.7603, + "step": 2880 + }, + { + "epoch": 56.67, + "learning_rate": 0.00014166666666666665, + "loss": 0.7581, + "step": 2890 + }, + { + "epoch": 56.86, + "learning_rate": 0.00014215686274509802, + "loss": 0.7567, + "step": 2900 + }, + { + "epoch": 57.0, + "eval_loss": 0.7653807401657104, + "eval_runtime": 2.1852, + "eval_samples_per_second": 1042.923, + "eval_steps_per_second": 4.119, + "step": 2907 + }, + { + "epoch": 57.06, + "learning_rate": 0.00014264705882352939, + "loss": 0.7603, + "step": 2910 + }, + { + "epoch": 57.25, + "learning_rate": 0.00014313725490196078, + "loss": 0.7607, + "step": 2920 + }, + { + "epoch": 57.45, + "learning_rate": 0.00014362745098039215, + "loss": 0.759, + "step": 2930 + }, + { + "epoch": 57.65, + "learning_rate": 0.00014411764705882352, + "loss": 0.7567, + "step": 2940 + }, + { + "epoch": 57.84, + "learning_rate": 0.00014460784313725488, + "loss": 0.7564, + "step": 2950 + }, + { + "epoch": 58.0, + "eval_loss": 0.7573947906494141, + "eval_runtime": 2.2323, + "eval_samples_per_second": 1020.928, + "eval_steps_per_second": 4.032, + "step": 2958 + }, + { + "epoch": 58.04, + "learning_rate": 0.00014509803921568625, + "loss": 0.7568, + "step": 2960 + }, + { + "epoch": 58.24, + "learning_rate": 0.00014558823529411762, + "loss": 0.7602, + "step": 2970 + }, + { + "epoch": 58.43, + "learning_rate": 0.00014607843137254902, + "loss": 0.7537, + "step": 2980 + }, + { + "epoch": 58.63, + "learning_rate": 0.00014656862745098038, + "loss": 0.752, + "step": 2990 + }, + { + "epoch": 58.82, + "learning_rate": 0.00014705882352941175, + "loss": 0.7535, + "step": 3000 + }, + { + "epoch": 59.0, + "eval_loss": 0.755523145198822, + "eval_runtime": 2.1203, + "eval_samples_per_second": 1074.859, + "eval_steps_per_second": 4.245, + "step": 3009 + }, + { + "epoch": 59.02, + "learning_rate": 0.00014754901960784312, + "loss": 0.7549, + "step": 3010 + }, + { + "epoch": 59.22, + "learning_rate": 0.00014803921568627451, + "loss": 0.7576, + "step": 3020 + }, + { + "epoch": 59.41, + "learning_rate": 0.00014852941176470588, + "loss": 0.7449, + "step": 3030 + }, + { + "epoch": 59.61, + "learning_rate": 0.00014901960784313725, + "loss": 0.75, + "step": 3040 + }, + { + "epoch": 59.8, + "learning_rate": 0.00014950980392156862, + "loss": 0.7489, + "step": 3050 + }, + { + "epoch": 60.0, + "learning_rate": 0.00015, + "loss": 0.75, + "step": 3060 + }, + { + "epoch": 60.0, + "eval_loss": 0.7484251856803894, + "eval_runtime": 2.2273, + "eval_samples_per_second": 1023.23, + "eval_steps_per_second": 4.041, + "step": 3060 + }, + { + "epoch": 60.2, + "learning_rate": 0.00014999998905083632, + "loss": 0.7529, + "step": 3070 + }, + { + "epoch": 60.39, + "learning_rate": 0.00014999995620334851, + "loss": 0.7475, + "step": 3080 + }, + { + "epoch": 60.59, + "learning_rate": 0.00014999990145754617, + "loss": 0.7475, + "step": 3090 + }, + { + "epoch": 60.78, + "learning_rate": 0.00014999982481344522, + "loss": 0.7475, + "step": 3100 + }, + { + "epoch": 60.98, + "learning_rate": 0.0001499997262710681, + "loss": 0.7512, + "step": 3110 + }, + { + "epoch": 61.0, + "eval_loss": 0.7487233877182007, + "eval_runtime": 2.2297, + "eval_samples_per_second": 1022.1, + "eval_steps_per_second": 4.036, + "step": 3111 + }, + { + "epoch": 61.18, + "learning_rate": 0.0001499996058304436, + "loss": 0.7474, + "step": 3120 + }, + { + "epoch": 61.37, + "learning_rate": 0.0001499994634916068, + "loss": 0.7481, + "step": 3130 + }, + { + "epoch": 61.57, + "learning_rate": 0.00014999929925459934, + "loss": 0.7491, + "step": 3140 + }, + { + "epoch": 61.76, + "learning_rate": 0.00014999911311946914, + "loss": 0.757, + "step": 3150 + }, + { + "epoch": 61.96, + "learning_rate": 0.00014999890508627056, + "loss": 0.7493, + "step": 3160 + }, + { + "epoch": 62.0, + "eval_loss": 0.7462049722671509, + "eval_runtime": 2.0885, + "eval_samples_per_second": 1091.196, + "eval_steps_per_second": 4.309, + "step": 3162 + }, + { + "epoch": 62.16, + "learning_rate": 0.00014999867515506433, + "loss": 0.7456, + "step": 3170 + }, + { + "epoch": 62.35, + "learning_rate": 0.00014999842332591757, + "loss": 0.747, + "step": 3180 + }, + { + "epoch": 62.55, + "learning_rate": 0.00014999814959890383, + "loss": 0.741, + "step": 3190 + }, + { + "epoch": 62.75, + "learning_rate": 0.00014999785397410304, + "loss": 0.7395, + "step": 3200 + }, + { + "epoch": 62.94, + "learning_rate": 0.0001499975364516015, + "loss": 0.742, + "step": 3210 + }, + { + "epoch": 63.0, + "eval_loss": 0.7449880838394165, + "eval_runtime": 2.2315, + "eval_samples_per_second": 1021.27, + "eval_steps_per_second": 4.033, + "step": 3213 + }, + { + "epoch": 63.14, + "learning_rate": 0.00014999719703149192, + "loss": 0.7464, + "step": 3220 + }, + { + "epoch": 63.33, + "learning_rate": 0.0001499968357138734, + "loss": 0.7437, + "step": 3230 + }, + { + "epoch": 63.53, + "learning_rate": 0.00014999645249885146, + "loss": 0.7401, + "step": 3240 + }, + { + "epoch": 63.73, + "learning_rate": 0.00014999604738653798, + "loss": 0.7441, + "step": 3250 + }, + { + "epoch": 63.92, + "learning_rate": 0.0001499956203770512, + "loss": 0.7469, + "step": 3260 + }, + { + "epoch": 64.0, + "eval_loss": 0.7464487552642822, + "eval_runtime": 2.1124, + "eval_samples_per_second": 1078.89, + "eval_steps_per_second": 4.261, + "step": 3264 + }, + { + "epoch": 64.12, + "learning_rate": 0.00014999517147051586, + "loss": 0.744, + "step": 3270 + }, + { + "epoch": 64.31, + "learning_rate": 0.00014999470066706303, + "loss": 0.7427, + "step": 3280 + }, + { + "epoch": 64.51, + "learning_rate": 0.0001499942079668301, + "loss": 0.7379, + "step": 3290 + }, + { + "epoch": 64.71, + "learning_rate": 0.00014999369336996101, + "loss": 0.7426, + "step": 3300 + }, + { + "epoch": 64.9, + "learning_rate": 0.00014999315687660596, + "loss": 0.7449, + "step": 3310 + }, + { + "epoch": 65.0, + "eval_loss": 0.7393355369567871, + "eval_runtime": 2.0866, + "eval_samples_per_second": 1092.226, + "eval_steps_per_second": 4.313, + "step": 3315 + }, + { + "epoch": 65.1, + "learning_rate": 0.0001499925984869216, + "loss": 0.7412, + "step": 3320 + }, + { + "epoch": 65.29, + "learning_rate": 0.00014999201820107102, + "loss": 0.7426, + "step": 3330 + }, + { + "epoch": 65.49, + "learning_rate": 0.0001499914160192236, + "loss": 0.7328, + "step": 3340 + }, + { + "epoch": 65.69, + "learning_rate": 0.00014999079194155516, + "loss": 0.7346, + "step": 3350 + }, + { + "epoch": 65.88, + "learning_rate": 0.0001499901459682479, + "loss": 0.7321, + "step": 3360 + }, + { + "epoch": 66.0, + "eval_loss": 0.7425104975700378, + "eval_runtime": 2.1171, + "eval_samples_per_second": 1076.448, + "eval_steps_per_second": 4.251, + "step": 3366 + }, + { + "epoch": 66.08, + "learning_rate": 0.0001499894780994905, + "loss": 0.7411, + "step": 3370 + }, + { + "epoch": 66.27, + "learning_rate": 0.0001499887883354779, + "loss": 0.7411, + "step": 3380 + }, + { + "epoch": 66.47, + "learning_rate": 0.00014998807667641157, + "loss": 0.7395, + "step": 3390 + }, + { + "epoch": 66.67, + "learning_rate": 0.0001499873431224992, + "loss": 0.7384, + "step": 3400 + }, + { + "epoch": 66.86, + "learning_rate": 0.00014998658767395503, + "loss": 0.7411, + "step": 3410 + }, + { + "epoch": 67.0, + "eval_loss": 0.7390904426574707, + "eval_runtime": 2.1435, + "eval_samples_per_second": 1063.218, + "eval_steps_per_second": 4.199, + "step": 3417 + }, + { + "epoch": 67.06, + "learning_rate": 0.00014998581033099962, + "loss": 0.7368, + "step": 3420 + }, + { + "epoch": 67.25, + "learning_rate": 0.00014998501109385993, + "loss": 0.7379, + "step": 3430 + }, + { + "epoch": 67.45, + "learning_rate": 0.00014998418996276934, + "loss": 0.739, + "step": 3440 + }, + { + "epoch": 67.65, + "learning_rate": 0.00014998334693796758, + "loss": 0.7388, + "step": 3450 + }, + { + "epoch": 67.84, + "learning_rate": 0.00014998248201970082, + "loss": 0.7394, + "step": 3460 + }, + { + "epoch": 68.0, + "eval_loss": 0.7413014769554138, + "eval_runtime": 2.1617, + "eval_samples_per_second": 1054.246, + "eval_steps_per_second": 4.163, + "step": 3468 + }, + { + "epoch": 68.04, + "learning_rate": 0.00014998159520822156, + "loss": 0.7424, + "step": 3470 + }, + { + "epoch": 68.24, + "learning_rate": 0.00014998068650378876, + "loss": 0.7376, + "step": 3480 + }, + { + "epoch": 68.43, + "learning_rate": 0.00014997975590666775, + "loss": 0.734, + "step": 3490 + }, + { + "epoch": 68.63, + "learning_rate": 0.0001499788034171302, + "loss": 0.7281, + "step": 3500 + }, + { + "epoch": 68.82, + "learning_rate": 0.00014997782903545428, + "loss": 0.7301, + "step": 3510 + }, + { + "epoch": 69.0, + "eval_loss": 0.7344282865524292, + "eval_runtime": 2.1969, + "eval_samples_per_second": 1037.378, + "eval_steps_per_second": 4.097, + "step": 3519 + }, + { + "epoch": 69.02, + "learning_rate": 0.0001499768327619244, + "loss": 0.7302, + "step": 3520 + }, + { + "epoch": 69.22, + "learning_rate": 0.00014997581459683156, + "loss": 0.7292, + "step": 3530 + }, + { + "epoch": 69.41, + "learning_rate": 0.00014997477454047295, + "loss": 0.7285, + "step": 3540 + }, + { + "epoch": 69.61, + "learning_rate": 0.00014997371259315226, + "loss": 0.7297, + "step": 3550 + }, + { + "epoch": 69.8, + "learning_rate": 0.00014997262875517955, + "loss": 0.7245, + "step": 3560 + }, + { + "epoch": 70.0, + "learning_rate": 0.00014997152302687134, + "loss": 0.7208, + "step": 3570 + }, + { + "epoch": 70.0, + "eval_loss": 0.7255963683128357, + "eval_runtime": 2.129, + "eval_samples_per_second": 1070.465, + "eval_steps_per_second": 4.227, + "step": 3570 + }, + { + "epoch": 70.2, + "learning_rate": 0.00014997039540855041, + "loss": 0.7311, + "step": 3580 + }, + { + "epoch": 70.39, + "learning_rate": 0.00014996924590054603, + "loss": 0.7263, + "step": 3590 + }, + { + "epoch": 70.59, + "learning_rate": 0.00014996807450319381, + "loss": 0.7231, + "step": 3600 + }, + { + "epoch": 70.78, + "learning_rate": 0.00014996688121683582, + "loss": 0.7232, + "step": 3610 + }, + { + "epoch": 70.98, + "learning_rate": 0.00014996566604182042, + "loss": 0.7211, + "step": 3620 + }, + { + "epoch": 71.0, + "eval_loss": 0.7225197553634644, + "eval_runtime": 2.2345, + "eval_samples_per_second": 1019.927, + "eval_steps_per_second": 4.028, + "step": 3621 + }, + { + "epoch": 71.18, + "learning_rate": 0.00014996442897850245, + "loss": 0.7261, + "step": 3630 + }, + { + "epoch": 71.37, + "learning_rate": 0.00014996317002724305, + "loss": 0.7247, + "step": 3640 + }, + { + "epoch": 71.57, + "learning_rate": 0.00014996188918840986, + "loss": 0.7322, + "step": 3650 + }, + { + "epoch": 71.76, + "learning_rate": 0.00014996058646237682, + "loss": 0.7333, + "step": 3660 + }, + { + "epoch": 71.96, + "learning_rate": 0.00014995926184952434, + "loss": 0.7273, + "step": 3670 + }, + { + "epoch": 72.0, + "eval_loss": 0.726405918598175, + "eval_runtime": 2.1479, + "eval_samples_per_second": 1061.037, + "eval_steps_per_second": 4.19, + "step": 3672 + }, + { + "epoch": 72.16, + "learning_rate": 0.00014995791535023917, + "loss": 0.7277, + "step": 3680 + }, + { + "epoch": 72.35, + "learning_rate": 0.0001499565469649144, + "loss": 0.7324, + "step": 3690 + }, + { + "epoch": 72.55, + "learning_rate": 0.00014995515669394963, + "loss": 0.7308, + "step": 3700 + }, + { + "epoch": 72.75, + "learning_rate": 0.00014995374453775077, + "loss": 0.7262, + "step": 3710 + }, + { + "epoch": 72.94, + "learning_rate": 0.00014995231049673012, + "loss": 0.7267, + "step": 3720 + }, + { + "epoch": 73.0, + "eval_loss": 0.7220990061759949, + "eval_runtime": 2.1293, + "eval_samples_per_second": 1070.303, + "eval_steps_per_second": 4.227, + "step": 3723 + }, + { + "epoch": 73.14, + "learning_rate": 0.00014995085457130645, + "loss": 0.7224, + "step": 3730 + }, + { + "epoch": 73.33, + "learning_rate": 0.00014994937676190477, + "loss": 0.7255, + "step": 3740 + }, + { + "epoch": 73.53, + "learning_rate": 0.0001499478770689566, + "loss": 0.7224, + "step": 3750 + }, + { + "epoch": 73.73, + "learning_rate": 0.00014994635549289985, + "loss": 0.7241, + "step": 3760 + }, + { + "epoch": 73.92, + "learning_rate": 0.00014994481203417875, + "loss": 0.7222, + "step": 3770 + }, + { + "epoch": 74.0, + "eval_loss": 0.7255922555923462, + "eval_runtime": 2.0896, + "eval_samples_per_second": 1090.633, + "eval_steps_per_second": 4.307, + "step": 3774 + }, + { + "epoch": 74.12, + "learning_rate": 0.00014994324669324397, + "loss": 0.7213, + "step": 3780 + }, + { + "epoch": 74.31, + "learning_rate": 0.00014994165947055255, + "loss": 0.7197, + "step": 3790 + }, + { + "epoch": 74.51, + "learning_rate": 0.00014994005036656795, + "loss": 0.7233, + "step": 3800 + }, + { + "epoch": 74.71, + "learning_rate": 0.00014993841938175994, + "loss": 0.7224, + "step": 3810 + }, + { + "epoch": 74.9, + "learning_rate": 0.00014993676651660478, + "loss": 0.7175, + "step": 3820 + }, + { + "epoch": 75.0, + "eval_loss": 0.7201787829399109, + "eval_runtime": 2.0903, + "eval_samples_per_second": 1090.287, + "eval_steps_per_second": 4.306, + "step": 3825 + }, + { + "epoch": 75.1, + "learning_rate": 0.00014993509177158503, + "loss": 0.7198, + "step": 3830 + }, + { + "epoch": 75.29, + "learning_rate": 0.00014993339514718972, + "loss": 0.7277, + "step": 3840 + }, + { + "epoch": 75.49, + "learning_rate": 0.00014993167664391417, + "loss": 0.7224, + "step": 3850 + }, + { + "epoch": 75.69, + "learning_rate": 0.00014992993626226022, + "loss": 0.721, + "step": 3860 + }, + { + "epoch": 75.88, + "learning_rate": 0.00014992817400273593, + "loss": 0.7174, + "step": 3870 + }, + { + "epoch": 76.0, + "eval_loss": 0.714850127696991, + "eval_runtime": 2.0699, + "eval_samples_per_second": 1101.043, + "eval_steps_per_second": 4.348, + "step": 3876 + }, + { + "epoch": 76.08, + "learning_rate": 0.00014992638986585592, + "loss": 0.717, + "step": 3880 + }, + { + "epoch": 76.27, + "learning_rate": 0.0001499245838521411, + "loss": 0.7163, + "step": 3890 + }, + { + "epoch": 76.47, + "learning_rate": 0.00014992275596211878, + "loss": 0.7175, + "step": 3900 + }, + { + "epoch": 76.67, + "learning_rate": 0.00014992090619632265, + "loss": 0.7098, + "step": 3910 + }, + { + "epoch": 76.86, + "learning_rate": 0.00014991903455529278, + "loss": 0.7143, + "step": 3920 + }, + { + "epoch": 77.0, + "eval_loss": 0.7127418518066406, + "eval_runtime": 2.101, + "eval_samples_per_second": 1084.704, + "eval_steps_per_second": 4.284, + "step": 3927 + }, + { + "epoch": 77.06, + "learning_rate": 0.00014991714103957572, + "loss": 0.7136, + "step": 3930 + }, + { + "epoch": 77.25, + "learning_rate": 0.00014991522564972428, + "loss": 0.7134, + "step": 3940 + }, + { + "epoch": 77.45, + "learning_rate": 0.0001499132883862977, + "loss": 0.713, + "step": 3950 + }, + { + "epoch": 77.65, + "learning_rate": 0.00014991132924986164, + "loss": 0.7119, + "step": 3960 + }, + { + "epoch": 77.84, + "learning_rate": 0.00014990934824098815, + "loss": 0.7106, + "step": 3970 + }, + { + "epoch": 78.0, + "eval_loss": 0.7061274647712708, + "eval_runtime": 2.0952, + "eval_samples_per_second": 1087.724, + "eval_steps_per_second": 4.296, + "step": 3978 + }, + { + "epoch": 78.04, + "learning_rate": 0.0001499073453602556, + "loss": 0.7079, + "step": 3980 + }, + { + "epoch": 78.24, + "learning_rate": 0.00014990532060824878, + "loss": 0.7104, + "step": 3990 + }, + { + "epoch": 78.43, + "learning_rate": 0.00014990327398555894, + "loss": 0.7088, + "step": 4000 + }, + { + "epoch": 78.63, + "learning_rate": 0.00014990120549278357, + "loss": 0.7098, + "step": 4010 + }, + { + "epoch": 78.82, + "learning_rate": 0.00014989911513052666, + "loss": 0.7188, + "step": 4020 + }, + { + "epoch": 79.0, + "eval_loss": 0.7153338193893433, + "eval_runtime": 2.0939, + "eval_samples_per_second": 1088.393, + "eval_steps_per_second": 4.298, + "step": 4029 + }, + { + "epoch": 79.02, + "learning_rate": 0.00014989700289939854, + "loss": 0.7183, + "step": 4030 + }, + { + "epoch": 79.22, + "learning_rate": 0.00014989486880001595, + "loss": 0.7099, + "step": 4040 + }, + { + "epoch": 79.41, + "learning_rate": 0.00014989271283300198, + "loss": 0.7068, + "step": 4050 + }, + { + "epoch": 79.61, + "learning_rate": 0.00014989053499898613, + "loss": 0.707, + "step": 4060 + }, + { + "epoch": 79.8, + "learning_rate": 0.0001498883352986043, + "loss": 0.7039, + "step": 4070 + }, + { + "epoch": 80.0, + "learning_rate": 0.0001498861137324987, + "loss": 0.7103, + "step": 4080 + }, + { + "epoch": 80.0, + "eval_loss": 0.7086203694343567, + "eval_runtime": 2.1491, + "eval_samples_per_second": 1060.423, + "eval_steps_per_second": 4.188, + "step": 4080 + }, + { + "epoch": 80.2, + "learning_rate": 0.00014988387030131803, + "loss": 0.7066, + "step": 4090 + }, + { + "epoch": 80.39, + "learning_rate": 0.0001498816050057173, + "loss": 0.705, + "step": 4100 + }, + { + "epoch": 80.59, + "learning_rate": 0.00014987931784635796, + "loss": 0.7077, + "step": 4110 + }, + { + "epoch": 80.78, + "learning_rate": 0.00014987700882390775, + "loss": 0.7073, + "step": 4120 + }, + { + "epoch": 80.98, + "learning_rate": 0.0001498746779390409, + "loss": 0.7055, + "step": 4130 + }, + { + "epoch": 81.0, + "eval_loss": 0.709750235080719, + "eval_runtime": 2.1143, + "eval_samples_per_second": 1077.874, + "eval_steps_per_second": 4.257, + "step": 4131 + }, + { + "epoch": 81.18, + "learning_rate": 0.00014987232519243792, + "loss": 0.7106, + "step": 4140 + }, + { + "epoch": 81.37, + "learning_rate": 0.00014986995058478584, + "loss": 0.7064, + "step": 4150 + }, + { + "epoch": 81.57, + "learning_rate": 0.00014986755411677794, + "loss": 0.7061, + "step": 4160 + }, + { + "epoch": 81.76, + "learning_rate": 0.00014986513578911395, + "loss": 0.7081, + "step": 4170 + }, + { + "epoch": 81.96, + "learning_rate": 0.00014986269560249995, + "loss": 0.7026, + "step": 4180 + }, + { + "epoch": 82.0, + "eval_loss": 0.7075186967849731, + "eval_runtime": 2.1061, + "eval_samples_per_second": 1082.105, + "eval_steps_per_second": 4.273, + "step": 4182 + }, + { + "epoch": 82.16, + "learning_rate": 0.00014986023355764846, + "loss": 0.7069, + "step": 4190 + }, + { + "epoch": 82.35, + "learning_rate": 0.0001498577496552783, + "loss": 0.7038, + "step": 4200 + }, + { + "epoch": 82.55, + "learning_rate": 0.0001498552438961147, + "loss": 0.7068, + "step": 4210 + }, + { + "epoch": 82.75, + "learning_rate": 0.00014985271628088938, + "loss": 0.7073, + "step": 4220 + }, + { + "epoch": 82.94, + "learning_rate": 0.00014985016681034024, + "loss": 0.7191, + "step": 4230 + }, + { + "epoch": 83.0, + "eval_loss": 0.7127190828323364, + "eval_runtime": 2.1082, + "eval_samples_per_second": 1081.004, + "eval_steps_per_second": 4.269, + "step": 4233 + }, + { + "epoch": 83.14, + "learning_rate": 0.00014984759548521172, + "loss": 0.7117, + "step": 4240 + }, + { + "epoch": 83.33, + "learning_rate": 0.0001498450023062546, + "loss": 0.7083, + "step": 4250 + }, + { + "epoch": 83.53, + "learning_rate": 0.00014984238727422597, + "loss": 0.7064, + "step": 4260 + }, + { + "epoch": 83.73, + "learning_rate": 0.00014983975038988944, + "loss": 0.7066, + "step": 4270 + }, + { + "epoch": 83.92, + "learning_rate": 0.00014983709165401489, + "loss": 0.7027, + "step": 4280 + }, + { + "epoch": 84.0, + "eval_loss": 0.7171905040740967, + "eval_runtime": 2.2202, + "eval_samples_per_second": 1026.505, + "eval_steps_per_second": 4.054, + "step": 4284 + }, + { + "epoch": 84.12, + "learning_rate": 0.00014983441106737857, + "loss": 0.7064, + "step": 4290 + }, + { + "epoch": 84.31, + "learning_rate": 0.0001498317086307632, + "loss": 0.7106, + "step": 4300 + }, + { + "epoch": 84.51, + "learning_rate": 0.00014982898434495783, + "loss": 0.7057, + "step": 4310 + }, + { + "epoch": 84.71, + "learning_rate": 0.0001498262382107579, + "loss": 0.7022, + "step": 4320 + }, + { + "epoch": 84.9, + "learning_rate": 0.00014982347022896516, + "loss": 0.6981, + "step": 4330 + }, + { + "epoch": 85.0, + "eval_loss": 0.7069874405860901, + "eval_runtime": 2.0965, + "eval_samples_per_second": 1087.073, + "eval_steps_per_second": 4.293, + "step": 4335 + }, + { + "epoch": 85.1, + "learning_rate": 0.00014982068040038786, + "loss": 0.7042, + "step": 4340 + }, + { + "epoch": 85.29, + "learning_rate": 0.00014981786872584056, + "loss": 0.7022, + "step": 4350 + }, + { + "epoch": 85.49, + "learning_rate": 0.00014981503520614415, + "loss": 0.6987, + "step": 4360 + }, + { + "epoch": 85.69, + "learning_rate": 0.000149812179842126, + "loss": 0.699, + "step": 4370 + }, + { + "epoch": 85.88, + "learning_rate": 0.00014980930263461985, + "loss": 0.7064, + "step": 4380 + }, + { + "epoch": 86.0, + "eval_loss": 0.7029294371604919, + "eval_runtime": 2.1186, + "eval_samples_per_second": 1075.695, + "eval_steps_per_second": 4.248, + "step": 4386 + }, + { + "epoch": 86.08, + "learning_rate": 0.00014980640358446573, + "loss": 0.6995, + "step": 4390 + }, + { + "epoch": 86.27, + "learning_rate": 0.0001498034826925101, + "loss": 0.7056, + "step": 4400 + }, + { + "epoch": 86.47, + "learning_rate": 0.0001498005399596058, + "loss": 0.7014, + "step": 4410 + }, + { + "epoch": 86.67, + "learning_rate": 0.00014979757538661204, + "loss": 0.6983, + "step": 4420 + }, + { + "epoch": 86.86, + "learning_rate": 0.00014979458897439442, + "loss": 0.6943, + "step": 4430 + }, + { + "epoch": 87.0, + "eval_loss": 0.7045770883560181, + "eval_runtime": 2.1001, + "eval_samples_per_second": 1085.211, + "eval_steps_per_second": 4.286, + "step": 4437 + }, + { + "epoch": 87.06, + "learning_rate": 0.0001497915807238249, + "loss": 0.6975, + "step": 4440 + }, + { + "epoch": 87.25, + "learning_rate": 0.00014978855063578183, + "loss": 0.7033, + "step": 4450 + }, + { + "epoch": 87.45, + "learning_rate": 0.00014978549871114992, + "loss": 0.6967, + "step": 4460 + }, + { + "epoch": 87.65, + "learning_rate": 0.00014978242495082024, + "loss": 0.7053, + "step": 4470 + }, + { + "epoch": 87.84, + "learning_rate": 0.00014977932935569032, + "loss": 0.7025, + "step": 4480 + }, + { + "epoch": 88.0, + "eval_loss": 0.7035739421844482, + "eval_runtime": 2.0997, + "eval_samples_per_second": 1085.417, + "eval_steps_per_second": 4.286, + "step": 4488 + }, + { + "epoch": 88.04, + "learning_rate": 0.00014977621192666395, + "loss": 0.7019, + "step": 4490 + }, + { + "epoch": 88.24, + "learning_rate": 0.00014977307266465139, + "loss": 0.7036, + "step": 4500 + }, + { + "epoch": 88.43, + "learning_rate": 0.00014976991157056917, + "loss": 0.7015, + "step": 4510 + }, + { + "epoch": 88.63, + "learning_rate": 0.00014976672864534034, + "loss": 0.6985, + "step": 4520 + }, + { + "epoch": 88.82, + "learning_rate": 0.0001497635238898942, + "loss": 0.6959, + "step": 4530 + }, + { + "epoch": 89.0, + "eval_loss": 0.7093929648399353, + "eval_runtime": 2.2076, + "eval_samples_per_second": 1032.357, + "eval_steps_per_second": 4.077, + "step": 4539 + }, + { + "epoch": 89.02, + "learning_rate": 0.00014976029730516646, + "loss": 0.7044, + "step": 4540 + }, + { + "epoch": 89.22, + "learning_rate": 0.00014975704889209923, + "loss": 0.7057, + "step": 4550 + }, + { + "epoch": 89.41, + "learning_rate": 0.00014975377865164097, + "loss": 0.7022, + "step": 4560 + }, + { + "epoch": 89.61, + "learning_rate": 0.00014975048658474652, + "loss": 0.695, + "step": 4570 + }, + { + "epoch": 89.8, + "learning_rate": 0.00014974717269237707, + "loss": 0.6957, + "step": 4580 + }, + { + "epoch": 90.0, + "learning_rate": 0.00014974383697550022, + "loss": 0.6988, + "step": 4590 + }, + { + "epoch": 90.0, + "eval_loss": 0.691733717918396, + "eval_runtime": 2.1923, + "eval_samples_per_second": 1039.543, + "eval_steps_per_second": 4.105, + "step": 4590 + }, + { + "epoch": 90.2, + "learning_rate": 0.00014974047943508995, + "loss": 0.6922, + "step": 4600 + }, + { + "epoch": 90.39, + "learning_rate": 0.00014973710007212652, + "loss": 0.6949, + "step": 4610 + }, + { + "epoch": 90.59, + "learning_rate": 0.00014973369888759668, + "loss": 0.693, + "step": 4620 + }, + { + "epoch": 90.78, + "learning_rate": 0.0001497302758824935, + "loss": 0.6976, + "step": 4630 + }, + { + "epoch": 90.98, + "learning_rate": 0.00014972683105781638, + "loss": 0.6912, + "step": 4640 + }, + { + "epoch": 91.0, + "eval_loss": 0.6925643682479858, + "eval_runtime": 2.18, + "eval_samples_per_second": 1045.407, + "eval_steps_per_second": 4.128, + "step": 4641 + }, + { + "epoch": 91.18, + "learning_rate": 0.00014972336441457118, + "loss": 0.6917, + "step": 4650 + }, + { + "epoch": 91.37, + "learning_rate": 0.00014971987595377008, + "loss": 0.6871, + "step": 4660 + }, + { + "epoch": 91.57, + "learning_rate": 0.00014971636567643161, + "loss": 0.6927, + "step": 4670 + }, + { + "epoch": 91.76, + "learning_rate": 0.0001497128335835807, + "loss": 0.6945, + "step": 4680 + }, + { + "epoch": 91.96, + "learning_rate": 0.00014970927967624864, + "loss": 0.689, + "step": 4690 + }, + { + "epoch": 92.0, + "eval_loss": 0.6880647540092468, + "eval_runtime": 2.0762, + "eval_samples_per_second": 1097.663, + "eval_steps_per_second": 4.335, + "step": 4692 + }, + { + "epoch": 92.16, + "learning_rate": 0.00014970570395547307, + "loss": 0.6964, + "step": 4700 + }, + { + "epoch": 92.35, + "learning_rate": 0.00014970210642229807, + "loss": 0.6949, + "step": 4710 + }, + { + "epoch": 92.55, + "learning_rate": 0.000149698487077774, + "loss": 0.69, + "step": 4720 + }, + { + "epoch": 92.75, + "learning_rate": 0.00014969484592295765, + "loss": 0.6878, + "step": 4730 + }, + { + "epoch": 92.94, + "learning_rate": 0.00014969118295891215, + "loss": 0.687, + "step": 4740 + }, + { + "epoch": 93.0, + "eval_loss": 0.6865754127502441, + "eval_runtime": 2.2287, + "eval_samples_per_second": 1022.574, + "eval_steps_per_second": 4.038, + "step": 4743 + }, + { + "epoch": 93.14, + "learning_rate": 0.000149687498186707, + "loss": 0.689, + "step": 4750 + }, + { + "epoch": 93.33, + "learning_rate": 0.00014968379160741805, + "loss": 0.6867, + "step": 4760 + }, + { + "epoch": 93.53, + "learning_rate": 0.00014968006322212758, + "loss": 0.6896, + "step": 4770 + }, + { + "epoch": 93.73, + "learning_rate": 0.00014967631303192417, + "loss": 0.6887, + "step": 4780 + }, + { + "epoch": 93.92, + "learning_rate": 0.0001496725410379028, + "loss": 0.6867, + "step": 4790 + }, + { + "epoch": 94.0, + "eval_loss": 0.6873466372489929, + "eval_runtime": 2.114, + "eval_samples_per_second": 1078.052, + "eval_steps_per_second": 4.257, + "step": 4794 + }, + { + "epoch": 94.12, + "learning_rate": 0.0001496687472411648, + "loss": 0.6861, + "step": 4800 + }, + { + "epoch": 94.31, + "learning_rate": 0.0001496649316428179, + "loss": 0.6911, + "step": 4810 + }, + { + "epoch": 94.51, + "learning_rate": 0.00014966109424397614, + "loss": 0.6865, + "step": 4820 + }, + { + "epoch": 94.71, + "learning_rate": 0.00014965723504575996, + "loss": 0.6789, + "step": 4830 + }, + { + "epoch": 94.9, + "learning_rate": 0.00014965335404929617, + "loss": 0.6832, + "step": 4840 + }, + { + "epoch": 95.0, + "eval_loss": 0.6820151805877686, + "eval_runtime": 2.1799, + "eval_samples_per_second": 1045.479, + "eval_steps_per_second": 4.129, + "step": 4845 + }, + { + "epoch": 95.1, + "learning_rate": 0.00014964945125571792, + "loss": 0.6821, + "step": 4850 + }, + { + "epoch": 95.29, + "learning_rate": 0.00014964552666616476, + "loss": 0.6875, + "step": 4860 + }, + { + "epoch": 95.49, + "learning_rate": 0.00014964158028178256, + "loss": 0.7012, + "step": 4870 + }, + { + "epoch": 95.69, + "learning_rate": 0.00014963761210372357, + "loss": 0.6934, + "step": 4880 + }, + { + "epoch": 95.88, + "learning_rate": 0.00014963362213314647, + "loss": 0.6863, + "step": 4890 + }, + { + "epoch": 96.0, + "eval_loss": 0.6808879375457764, + "eval_runtime": 2.1682, + "eval_samples_per_second": 1051.107, + "eval_steps_per_second": 4.151, + "step": 4896 + }, + { + "epoch": 96.08, + "learning_rate": 0.00014962961037121616, + "loss": 0.6835, + "step": 4900 + }, + { + "epoch": 96.27, + "learning_rate": 0.00014962557681910406, + "loss": 0.6891, + "step": 4910 + }, + { + "epoch": 96.47, + "learning_rate": 0.00014962152147798778, + "loss": 0.6867, + "step": 4920 + }, + { + "epoch": 96.67, + "learning_rate": 0.0001496174443490515, + "loss": 0.678, + "step": 4930 + }, + { + "epoch": 96.86, + "learning_rate": 0.0001496133454334856, + "loss": 0.6908, + "step": 4940 + }, + { + "epoch": 97.0, + "eval_loss": 0.6791673898696899, + "eval_runtime": 2.094, + "eval_samples_per_second": 1088.337, + "eval_steps_per_second": 4.298, + "step": 4947 + }, + { + "epoch": 97.06, + "learning_rate": 0.0001496092247324869, + "loss": 0.6815, + "step": 4950 + }, + { + "epoch": 97.25, + "learning_rate": 0.00014960508224725846, + "loss": 0.6837, + "step": 4960 + }, + { + "epoch": 97.45, + "learning_rate": 0.00014960091797900987, + "loss": 0.6784, + "step": 4970 + }, + { + "epoch": 97.65, + "learning_rate": 0.00014959673192895703, + "loss": 0.6837, + "step": 4980 + }, + { + "epoch": 97.84, + "learning_rate": 0.0001495925240983221, + "loss": 0.6891, + "step": 4990 + }, + { + "epoch": 98.0, + "eval_loss": 0.67962646484375, + "eval_runtime": 2.1402, + "eval_samples_per_second": 1064.861, + "eval_steps_per_second": 4.205, + "step": 4998 + }, + { + "epoch": 98.04, + "learning_rate": 0.0001495882944883337, + "loss": 0.6784, + "step": 5000 + }, + { + "epoch": 98.24, + "learning_rate": 0.00014958404310022683, + "loss": 0.68, + "step": 5010 + }, + { + "epoch": 98.43, + "learning_rate": 0.00014957976993524276, + "loss": 0.6756, + "step": 5020 + }, + { + "epoch": 98.63, + "learning_rate": 0.00014957547499462912, + "loss": 0.6841, + "step": 5030 + }, + { + "epoch": 98.82, + "learning_rate": 0.00014957115827964, + "loss": 0.6803, + "step": 5040 + }, + { + "epoch": 99.0, + "eval_loss": 0.6792941689491272, + "eval_runtime": 2.1861, + "eval_samples_per_second": 1042.487, + "eval_steps_per_second": 4.117, + "step": 5049 + }, + { + "epoch": 99.02, + "learning_rate": 0.00014956681979153577, + "loss": 0.682, + "step": 5050 + }, + { + "epoch": 99.22, + "learning_rate": 0.00014956245953158314, + "loss": 0.6795, + "step": 5060 + }, + { + "epoch": 99.41, + "learning_rate": 0.00014955807750105524, + "loss": 0.6753, + "step": 5070 + }, + { + "epoch": 99.61, + "learning_rate": 0.00014955367370123152, + "loss": 0.6752, + "step": 5080 + }, + { + "epoch": 99.8, + "learning_rate": 0.00014954924813339777, + "loss": 0.676, + "step": 5090 + }, + { + "epoch": 100.0, + "learning_rate": 0.0001495448007988462, + "loss": 0.6755, + "step": 5100 + }, + { + "epoch": 100.0, + "eval_loss": 0.6737886071205139, + "eval_runtime": 2.1396, + "eval_samples_per_second": 1065.176, + "eval_steps_per_second": 4.206, + "step": 5100 + }, + { + "epoch": 100.2, + "learning_rate": 0.0001495403316988753, + "loss": 0.6741, + "step": 5110 + }, + { + "epoch": 100.39, + "learning_rate": 0.00014953584083478993, + "loss": 0.6756, + "step": 5120 + }, + { + "epoch": 100.59, + "learning_rate": 0.0001495313282079014, + "loss": 0.6759, + "step": 5130 + }, + { + "epoch": 100.78, + "learning_rate": 0.00014952679381952718, + "loss": 0.6725, + "step": 5140 + }, + { + "epoch": 100.98, + "learning_rate": 0.0001495222376709913, + "loss": 0.6735, + "step": 5150 + }, + { + "epoch": 101.0, + "eval_loss": 0.6750496029853821, + "eval_runtime": 2.2168, + "eval_samples_per_second": 1028.06, + "eval_steps_per_second": 4.06, + "step": 5151 + }, + { + "epoch": 101.18, + "learning_rate": 0.00014951765976362405, + "loss": 0.6753, + "step": 5160 + }, + { + "epoch": 101.37, + "learning_rate": 0.00014951306009876203, + "loss": 0.6732, + "step": 5170 + }, + { + "epoch": 101.57, + "learning_rate": 0.00014950843867774828, + "loss": 0.6717, + "step": 5180 + }, + { + "epoch": 101.76, + "learning_rate": 0.00014950379550193212, + "loss": 0.6704, + "step": 5190 + }, + { + "epoch": 101.96, + "learning_rate": 0.00014949913057266928, + "loss": 0.6727, + "step": 5200 + }, + { + "epoch": 102.0, + "eval_loss": 0.672935962677002, + "eval_runtime": 2.1734, + "eval_samples_per_second": 1048.602, + "eval_steps_per_second": 4.141, + "step": 5202 + }, + { + "epoch": 102.16, + "learning_rate": 0.0001494944438913218, + "loss": 0.6736, + "step": 5210 + }, + { + "epoch": 102.35, + "learning_rate": 0.00014948973545925807, + "loss": 0.6732, + "step": 5220 + }, + { + "epoch": 102.55, + "learning_rate": 0.0001494850052778529, + "loss": 0.6703, + "step": 5230 + }, + { + "epoch": 102.75, + "learning_rate": 0.00014948025334848736, + "loss": 0.6721, + "step": 5240 + }, + { + "epoch": 102.94, + "learning_rate": 0.0001494754796725489, + "loss": 0.6695, + "step": 5250 + }, + { + "epoch": 103.0, + "eval_loss": 0.6733797192573547, + "eval_runtime": 2.1812, + "eval_samples_per_second": 1044.839, + "eval_steps_per_second": 4.126, + "step": 5253 + }, + { + "epoch": 103.14, + "learning_rate": 0.00014947068425143136, + "loss": 0.6728, + "step": 5260 + }, + { + "epoch": 103.33, + "learning_rate": 0.00014946586708653486, + "loss": 0.6684, + "step": 5270 + }, + { + "epoch": 103.53, + "learning_rate": 0.0001494610281792659, + "loss": 0.6702, + "step": 5280 + }, + { + "epoch": 103.73, + "learning_rate": 0.0001494561675310374, + "loss": 0.6705, + "step": 5290 + }, + { + "epoch": 103.92, + "learning_rate": 0.0001494512851432685, + "loss": 0.6678, + "step": 5300 + }, + { + "epoch": 104.0, + "eval_loss": 0.6701691746711731, + "eval_runtime": 2.1386, + "eval_samples_per_second": 1065.675, + "eval_steps_per_second": 4.208, + "step": 5304 + }, + { + "epoch": 104.12, + "learning_rate": 0.00014944638101738474, + "loss": 0.6684, + "step": 5310 + }, + { + "epoch": 104.31, + "learning_rate": 0.00014944145515481805, + "loss": 0.6648, + "step": 5320 + }, + { + "epoch": 104.51, + "learning_rate": 0.00014943650755700667, + "loss": 0.6702, + "step": 5330 + }, + { + "epoch": 104.71, + "learning_rate": 0.0001494315382253952, + "loss": 0.671, + "step": 5340 + }, + { + "epoch": 104.9, + "learning_rate": 0.0001494265471614345, + "loss": 0.671, + "step": 5350 + }, + { + "epoch": 105.0, + "eval_loss": 0.6720392107963562, + "eval_runtime": 2.1432, + "eval_samples_per_second": 1063.36, + "eval_steps_per_second": 4.199, + "step": 5355 + }, + { + "epoch": 105.1, + "learning_rate": 0.0001494215343665819, + "loss": 0.6686, + "step": 5360 + }, + { + "epoch": 105.29, + "learning_rate": 0.00014941649984230107, + "loss": 0.6669, + "step": 5370 + }, + { + "epoch": 105.49, + "learning_rate": 0.00014941144359006194, + "loss": 0.6654, + "step": 5380 + }, + { + "epoch": 105.69, + "learning_rate": 0.00014940636561134078, + "loss": 0.6645, + "step": 5390 + }, + { + "epoch": 105.88, + "learning_rate": 0.0001494012659076203, + "loss": 0.6654, + "step": 5400 + }, + { + "epoch": 106.0, + "eval_loss": 0.6686482429504395, + "eval_runtime": 2.244, + "eval_samples_per_second": 1015.59, + "eval_steps_per_second": 4.011, + "step": 5406 + }, + { + "epoch": 106.08, + "learning_rate": 0.00014939614448038948, + "loss": 0.6674, + "step": 5410 + }, + { + "epoch": 106.27, + "learning_rate": 0.00014939100133114368, + "loss": 0.6675, + "step": 5420 + }, + { + "epoch": 106.47, + "learning_rate": 0.00014938583646138457, + "loss": 0.6667, + "step": 5430 + }, + { + "epoch": 106.67, + "learning_rate": 0.00014938064987262016, + "loss": 0.6673, + "step": 5440 + }, + { + "epoch": 106.86, + "learning_rate": 0.00014937544156636484, + "loss": 0.669, + "step": 5450 + }, + { + "epoch": 107.0, + "eval_loss": 0.6682608723640442, + "eval_runtime": 2.2531, + "eval_samples_per_second": 1011.487, + "eval_steps_per_second": 3.994, + "step": 5457 + }, + { + "epoch": 107.06, + "learning_rate": 0.00014937021154413932, + "loss": 0.6636, + "step": 5460 + }, + { + "epoch": 107.25, + "learning_rate": 0.00014936495980747065, + "loss": 0.6646, + "step": 5470 + }, + { + "epoch": 107.45, + "learning_rate": 0.00014935968635789224, + "loss": 0.6677, + "step": 5480 + }, + { + "epoch": 107.65, + "learning_rate": 0.00014935439119694377, + "loss": 0.6651, + "step": 5490 + }, + { + "epoch": 107.84, + "learning_rate": 0.00014934907432617134, + "loss": 0.6628, + "step": 5500 + }, + { + "epoch": 108.0, + "eval_loss": 0.6639227867126465, + "eval_runtime": 2.2629, + "eval_samples_per_second": 1007.134, + "eval_steps_per_second": 3.977, + "step": 5508 + }, + { + "epoch": 108.04, + "learning_rate": 0.00014934373574712734, + "loss": 0.6611, + "step": 5510 + }, + { + "epoch": 108.24, + "learning_rate": 0.00014933837546137054, + "loss": 0.6629, + "step": 5520 + }, + { + "epoch": 108.43, + "learning_rate": 0.000149332993470466, + "loss": 0.6639, + "step": 5530 + }, + { + "epoch": 108.63, + "learning_rate": 0.00014932758977598514, + "loss": 0.6703, + "step": 5540 + }, + { + "epoch": 108.82, + "learning_rate": 0.0001493221643795058, + "loss": 0.6655, + "step": 5550 + }, + { + "epoch": 109.0, + "eval_loss": 0.6662523150444031, + "eval_runtime": 2.2626, + "eval_samples_per_second": 1007.256, + "eval_steps_per_second": 3.978, + "step": 5559 + }, + { + "epoch": 109.02, + "learning_rate": 0.00014931671728261195, + "loss": 0.6633, + "step": 5560 + }, + { + "epoch": 109.22, + "learning_rate": 0.00014931124848689407, + "loss": 0.6672, + "step": 5570 + }, + { + "epoch": 109.41, + "learning_rate": 0.00014930575799394893, + "loss": 0.6585, + "step": 5580 + }, + { + "epoch": 109.61, + "learning_rate": 0.00014930024580537968, + "loss": 0.6637, + "step": 5590 + }, + { + "epoch": 109.8, + "learning_rate": 0.00014929471192279568, + "loss": 0.6653, + "step": 5600 + }, + { + "epoch": 110.0, + "learning_rate": 0.00014928915634781272, + "loss": 0.6637, + "step": 5610 + }, + { + "epoch": 110.0, + "eval_loss": 0.6650734543800354, + "eval_runtime": 2.1716, + "eval_samples_per_second": 1049.462, + "eval_steps_per_second": 4.144, + "step": 5610 + }, + { + "epoch": 110.2, + "learning_rate": 0.00014928357908205295, + "loss": 0.6677, + "step": 5620 + }, + { + "epoch": 110.39, + "learning_rate": 0.00014927798012714477, + "loss": 0.665, + "step": 5630 + }, + { + "epoch": 110.59, + "learning_rate": 0.00014927235948472293, + "loss": 0.6645, + "step": 5640 + }, + { + "epoch": 110.78, + "learning_rate": 0.00014926671715642854, + "loss": 0.6612, + "step": 5650 + }, + { + "epoch": 110.98, + "learning_rate": 0.0001492610531439091, + "loss": 0.6643, + "step": 5660 + }, + { + "epoch": 111.0, + "eval_loss": 0.6638761162757874, + "eval_runtime": 2.1014, + "eval_samples_per_second": 1084.505, + "eval_steps_per_second": 4.283, + "step": 5661 + }, + { + "epoch": 111.18, + "learning_rate": 0.00014925536744881827, + "loss": 0.6655, + "step": 5670 + }, + { + "epoch": 111.37, + "learning_rate": 0.00014924966007281624, + "loss": 0.6648, + "step": 5680 + }, + { + "epoch": 111.57, + "learning_rate": 0.00014924393101756938, + "loss": 0.662, + "step": 5690 + }, + { + "epoch": 111.76, + "learning_rate": 0.00014923818028475045, + "loss": 0.6596, + "step": 5700 + }, + { + "epoch": 111.96, + "learning_rate": 0.0001492324078760386, + "loss": 0.6607, + "step": 5710 + }, + { + "epoch": 112.0, + "eval_loss": 0.656067967414856, + "eval_runtime": 2.1149, + "eval_samples_per_second": 1077.6, + "eval_steps_per_second": 4.256, + "step": 5712 + }, + { + "epoch": 112.16, + "learning_rate": 0.00014922661379311914, + "loss": 0.6633, + "step": 5720 + }, + { + "epoch": 112.35, + "learning_rate": 0.00014922079803768388, + "loss": 0.6635, + "step": 5730 + }, + { + "epoch": 112.55, + "learning_rate": 0.0001492149606114309, + "loss": 0.6634, + "step": 5740 + }, + { + "epoch": 112.75, + "learning_rate": 0.00014920910151606454, + "loss": 0.6635, + "step": 5750 + }, + { + "epoch": 112.94, + "learning_rate": 0.00014920322075329557, + "loss": 0.6598, + "step": 5760 + }, + { + "epoch": 113.0, + "eval_loss": 0.6590579748153687, + "eval_runtime": 2.1383, + "eval_samples_per_second": 1065.808, + "eval_steps_per_second": 4.209, + "step": 5763 + }, + { + "epoch": 113.14, + "learning_rate": 0.00014919731832484104, + "loss": 0.6608, + "step": 5770 + }, + { + "epoch": 113.33, + "learning_rate": 0.0001491913942324243, + "loss": 0.6601, + "step": 5780 + }, + { + "epoch": 113.53, + "learning_rate": 0.00014918544847777513, + "loss": 0.6647, + "step": 5790 + }, + { + "epoch": 113.73, + "learning_rate": 0.00014917948106262947, + "loss": 0.6605, + "step": 5800 + }, + { + "epoch": 113.92, + "learning_rate": 0.00014917349198872968, + "loss": 0.6589, + "step": 5810 + }, + { + "epoch": 114.0, + "eval_loss": 0.6609504818916321, + "eval_runtime": 2.1322, + "eval_samples_per_second": 1068.831, + "eval_steps_per_second": 4.221, + "step": 5814 + }, + { + "epoch": 114.12, + "learning_rate": 0.0001491674812578245, + "loss": 0.6534, + "step": 5820 + }, + { + "epoch": 114.31, + "learning_rate": 0.00014916144887166884, + "loss": 0.6593, + "step": 5830 + }, + { + "epoch": 114.51, + "learning_rate": 0.0001491553948320241, + "loss": 0.6607, + "step": 5840 + }, + { + "epoch": 114.71, + "learning_rate": 0.0001491493191406579, + "loss": 0.6595, + "step": 5850 + }, + { + "epoch": 114.9, + "learning_rate": 0.00014914322179934418, + "loss": 0.6566, + "step": 5860 + }, + { + "epoch": 115.0, + "eval_loss": 0.6566076874732971, + "eval_runtime": 2.1529, + "eval_samples_per_second": 1058.561, + "eval_steps_per_second": 4.18, + "step": 5865 + }, + { + "epoch": 115.1, + "learning_rate": 0.00014913710280986328, + "loss": 0.6598, + "step": 5870 + }, + { + "epoch": 115.29, + "learning_rate": 0.00014913096217400175, + "loss": 0.6645, + "step": 5880 + }, + { + "epoch": 115.49, + "learning_rate": 0.00014912479989355254, + "loss": 0.6665, + "step": 5890 + }, + { + "epoch": 115.69, + "learning_rate": 0.00014911861597031493, + "loss": 0.6694, + "step": 5900 + }, + { + "epoch": 115.88, + "learning_rate": 0.00014911241040609444, + "loss": 0.6706, + "step": 5910 + }, + { + "epoch": 116.0, + "eval_loss": 0.6748928427696228, + "eval_runtime": 2.1111, + "eval_samples_per_second": 1079.51, + "eval_steps_per_second": 4.263, + "step": 5916 + }, + { + "epoch": 116.08, + "learning_rate": 0.000149106183202703, + "loss": 0.6744, + "step": 5920 + }, + { + "epoch": 116.27, + "learning_rate": 0.0001490999343619588, + "loss": 0.665, + "step": 5930 + }, + { + "epoch": 116.47, + "learning_rate": 0.0001490936638856863, + "loss": 0.6654, + "step": 5940 + }, + { + "epoch": 116.67, + "learning_rate": 0.00014908737177571644, + "loss": 0.6701, + "step": 5950 + }, + { + "epoch": 116.86, + "learning_rate": 0.00014908105803388634, + "loss": 0.6688, + "step": 5960 + }, + { + "epoch": 117.0, + "eval_loss": 0.667015016078949, + "eval_runtime": 2.2347, + "eval_samples_per_second": 1019.841, + "eval_steps_per_second": 4.027, + "step": 5967 + }, + { + "epoch": 117.06, + "learning_rate": 0.00014907472266203944, + "loss": 0.6653, + "step": 5970 + }, + { + "epoch": 117.25, + "learning_rate": 0.00014906836566202554, + "loss": 0.6631, + "step": 5980 + }, + { + "epoch": 117.45, + "learning_rate": 0.00014906198703570076, + "loss": 0.6656, + "step": 5990 + }, + { + "epoch": 117.65, + "learning_rate": 0.0001490555867849275, + "loss": 0.6625, + "step": 6000 + }, + { + "epoch": 117.84, + "learning_rate": 0.00014904916491157452, + "loss": 0.6657, + "step": 6010 + }, + { + "epoch": 118.0, + "eval_loss": 0.6599269509315491, + "eval_runtime": 2.1838, + "eval_samples_per_second": 1043.574, + "eval_steps_per_second": 4.121, + "step": 6018 + }, + { + "epoch": 118.04, + "learning_rate": 0.00014904272141751684, + "loss": 0.661, + "step": 6020 + }, + { + "epoch": 118.24, + "learning_rate": 0.00014903625630463581, + "loss": 0.658, + "step": 6030 + }, + { + "epoch": 118.43, + "learning_rate": 0.0001490297695748191, + "loss": 0.6598, + "step": 6040 + }, + { + "epoch": 118.63, + "learning_rate": 0.0001490232612299607, + "loss": 0.6664, + "step": 6050 + }, + { + "epoch": 118.82, + "learning_rate": 0.00014901673127196092, + "loss": 0.6611, + "step": 6060 + }, + { + "epoch": 119.0, + "eval_loss": 0.6566824913024902, + "eval_runtime": 2.1748, + "eval_samples_per_second": 1047.897, + "eval_steps_per_second": 4.138, + "step": 6069 + }, + { + "epoch": 119.02, + "learning_rate": 0.00014901017970272634, + "loss": 0.6568, + "step": 6070 + }, + { + "epoch": 119.22, + "learning_rate": 0.00014900360652416987, + "loss": 0.6577, + "step": 6080 + }, + { + "epoch": 119.41, + "learning_rate": 0.00014899701173821071, + "loss": 0.6559, + "step": 6090 + }, + { + "epoch": 119.61, + "learning_rate": 0.00014899039534677444, + "loss": 0.6552, + "step": 6100 + }, + { + "epoch": 119.8, + "learning_rate": 0.00014898375735179287, + "loss": 0.6548, + "step": 6110 + }, + { + "epoch": 120.0, + "learning_rate": 0.00014897709775520417, + "loss": 0.6528, + "step": 6120 + }, + { + "epoch": 120.0, + "eval_loss": 0.6591194868087769, + "eval_runtime": 2.2152, + "eval_samples_per_second": 1028.818, + "eval_steps_per_second": 4.063, + "step": 6120 + }, + { + "epoch": 120.2, + "learning_rate": 0.00014897041655895276, + "loss": 0.6594, + "step": 6130 + }, + { + "epoch": 120.39, + "learning_rate": 0.0001489637137649894, + "loss": 0.6593, + "step": 6140 + }, + { + "epoch": 120.59, + "learning_rate": 0.0001489569893752712, + "loss": 0.6523, + "step": 6150 + }, + { + "epoch": 120.78, + "learning_rate": 0.00014895024339176149, + "loss": 0.657, + "step": 6160 + }, + { + "epoch": 120.98, + "learning_rate": 0.00014894347581642994, + "loss": 0.652, + "step": 6170 + }, + { + "epoch": 121.0, + "eval_loss": 0.6566243171691895, + "eval_runtime": 2.1086, + "eval_samples_per_second": 1080.825, + "eval_steps_per_second": 4.268, + "step": 6171 + }, + { + "epoch": 121.18, + "learning_rate": 0.00014893668665125258, + "loss": 0.6498, + "step": 6180 + }, + { + "epoch": 121.37, + "learning_rate": 0.00014892987589821164, + "loss": 0.652, + "step": 6190 + }, + { + "epoch": 121.57, + "learning_rate": 0.00014892304355929576, + "loss": 0.6503, + "step": 6200 + }, + { + "epoch": 121.76, + "learning_rate": 0.00014891618963649978, + "loss": 0.6514, + "step": 6210 + }, + { + "epoch": 121.96, + "learning_rate": 0.00014890931413182493, + "loss": 0.6488, + "step": 6220 + }, + { + "epoch": 122.0, + "eval_loss": 0.6527683734893799, + "eval_runtime": 2.162, + "eval_samples_per_second": 1054.109, + "eval_steps_per_second": 4.163, + "step": 6222 + }, + { + "epoch": 122.16, + "learning_rate": 0.0001489024170472787, + "loss": 0.6491, + "step": 6230 + }, + { + "epoch": 122.35, + "learning_rate": 0.00014889549838487486, + "loss": 0.6536, + "step": 6240 + }, + { + "epoch": 122.55, + "learning_rate": 0.00014888855814663355, + "loss": 0.6554, + "step": 6250 + }, + { + "epoch": 122.75, + "learning_rate": 0.0001488815963345811, + "loss": 0.6561, + "step": 6260 + }, + { + "epoch": 122.94, + "learning_rate": 0.00014887461295075025, + "loss": 0.6538, + "step": 6270 + }, + { + "epoch": 123.0, + "eval_loss": 0.6558259129524231, + "eval_runtime": 2.1735, + "eval_samples_per_second": 1048.543, + "eval_steps_per_second": 4.141, + "step": 6273 + }, + { + "epoch": 123.14, + "learning_rate": 0.00014886760799717998, + "loss": 0.6462, + "step": 6280 + }, + { + "epoch": 123.33, + "learning_rate": 0.0001488605814759156, + "loss": 0.6557, + "step": 6290 + }, + { + "epoch": 123.53, + "learning_rate": 0.00014885353338900867, + "loss": 0.6523, + "step": 6300 + }, + { + "epoch": 123.73, + "learning_rate": 0.00014884646373851707, + "loss": 0.6496, + "step": 6310 + }, + { + "epoch": 123.92, + "learning_rate": 0.000148839372526505, + "loss": 0.6457, + "step": 6320 + }, + { + "epoch": 124.0, + "eval_loss": 0.6509066820144653, + "eval_runtime": 2.2241, + "eval_samples_per_second": 1024.675, + "eval_steps_per_second": 4.047, + "step": 6324 + }, + { + "epoch": 124.12, + "learning_rate": 0.00014883225975504294, + "loss": 0.6514, + "step": 6330 + }, + { + "epoch": 124.31, + "learning_rate": 0.00014882512542620764, + "loss": 0.648, + "step": 6340 + }, + { + "epoch": 124.51, + "learning_rate": 0.0001488179695420822, + "loss": 0.6467, + "step": 6350 + }, + { + "epoch": 124.71, + "learning_rate": 0.00014881079210475593, + "loss": 0.6431, + "step": 6360 + }, + { + "epoch": 124.9, + "learning_rate": 0.00014880359311632453, + "loss": 0.643, + "step": 6370 + }, + { + "epoch": 125.0, + "eval_loss": 0.6462063789367676, + "eval_runtime": 2.2405, + "eval_samples_per_second": 1017.192, + "eval_steps_per_second": 4.017, + "step": 6375 + }, + { + "epoch": 125.1, + "learning_rate": 0.0001487963725788899, + "loss": 0.6473, + "step": 6380 + }, + { + "epoch": 125.29, + "learning_rate": 0.00014878913049456028, + "loss": 0.649, + "step": 6390 + }, + { + "epoch": 125.49, + "learning_rate": 0.00014878186686545026, + "loss": 0.6446, + "step": 6400 + }, + { + "epoch": 125.69, + "learning_rate": 0.00014877458169368056, + "loss": 0.6455, + "step": 6410 + }, + { + "epoch": 125.88, + "learning_rate": 0.00014876727498137836, + "loss": 0.6433, + "step": 6420 + }, + { + "epoch": 126.0, + "eval_loss": 0.645854651927948, + "eval_runtime": 2.0792, + "eval_samples_per_second": 1096.098, + "eval_steps_per_second": 4.329, + "step": 6426 + }, + { + "epoch": 126.08, + "learning_rate": 0.00014875994673067703, + "loss": 0.6448, + "step": 6430 + }, + { + "epoch": 126.27, + "learning_rate": 0.0001487525969437163, + "loss": 0.6438, + "step": 6440 + }, + { + "epoch": 126.47, + "learning_rate": 0.00014874522562264206, + "loss": 0.6462, + "step": 6450 + }, + { + "epoch": 126.67, + "learning_rate": 0.0001487378327696066, + "loss": 0.6419, + "step": 6460 + }, + { + "epoch": 126.86, + "learning_rate": 0.00014873041838676853, + "loss": 0.6451, + "step": 6470 + }, + { + "epoch": 127.0, + "eval_loss": 0.6453904509544373, + "eval_runtime": 2.1225, + "eval_samples_per_second": 1073.748, + "eval_steps_per_second": 4.24, + "step": 6477 + }, + { + "epoch": 127.06, + "learning_rate": 0.00014872298247629263, + "loss": 0.6439, + "step": 6480 + }, + { + "epoch": 127.25, + "learning_rate": 0.00014871552504035, + "loss": 0.6429, + "step": 6490 + }, + { + "epoch": 127.45, + "learning_rate": 0.00014870804608111812, + "loss": 0.6456, + "step": 6500 + }, + { + "epoch": 127.65, + "learning_rate": 0.00014870054560078057, + "loss": 0.646, + "step": 6510 + }, + { + "epoch": 127.84, + "learning_rate": 0.00014869302360152745, + "loss": 0.6413, + "step": 6520 + }, + { + "epoch": 128.0, + "eval_loss": 0.6441250443458557, + "eval_runtime": 2.2111, + "eval_samples_per_second": 1030.7, + "eval_steps_per_second": 4.07, + "step": 6528 + }, + { + "epoch": 128.04, + "learning_rate": 0.00014868548008555493, + "loss": 0.6426, + "step": 6530 + }, + { + "epoch": 128.24, + "learning_rate": 0.00014867791505506557, + "loss": 0.6458, + "step": 6540 + }, + { + "epoch": 128.43, + "learning_rate": 0.0001486703285122682, + "loss": 0.6405, + "step": 6550 + }, + { + "epoch": 128.63, + "learning_rate": 0.00014866272045937787, + "loss": 0.6432, + "step": 6560 + }, + { + "epoch": 128.82, + "learning_rate": 0.00014865509089861603, + "loss": 0.6407, + "step": 6570 + }, + { + "epoch": 129.0, + "eval_loss": 0.6409056782722473, + "eval_runtime": 2.2046, + "eval_samples_per_second": 1033.764, + "eval_steps_per_second": 4.082, + "step": 6579 + }, + { + "epoch": 129.02, + "learning_rate": 0.00014864743983221033, + "loss": 0.6424, + "step": 6580 + }, + { + "epoch": 129.22, + "learning_rate": 0.00014863976726239468, + "loss": 0.6441, + "step": 6590 + }, + { + "epoch": 129.41, + "learning_rate": 0.00014863207319140934, + "loss": 0.6386, + "step": 6600 + }, + { + "epoch": 129.61, + "learning_rate": 0.00014862435762150075, + "loss": 0.6421, + "step": 6610 + }, + { + "epoch": 129.8, + "learning_rate": 0.00014861662055492173, + "loss": 0.6409, + "step": 6620 + }, + { + "epoch": 130.0, + "learning_rate": 0.00014860886199393134, + "loss": 0.6381, + "step": 6630 + }, + { + "epoch": 130.0, + "eval_loss": 0.6422334909439087, + "eval_runtime": 2.2457, + "eval_samples_per_second": 1014.81, + "eval_steps_per_second": 4.008, + "step": 6630 + }, + { + "epoch": 130.2, + "learning_rate": 0.00014860108194079486, + "loss": 0.6419, + "step": 6640 + }, + { + "epoch": 130.39, + "learning_rate": 0.0001485932803977839, + "loss": 0.6369, + "step": 6650 + }, + { + "epoch": 130.59, + "learning_rate": 0.0001485854573671764, + "loss": 0.6383, + "step": 6660 + }, + { + "epoch": 130.78, + "learning_rate": 0.00014857761285125642, + "loss": 0.6396, + "step": 6670 + }, + { + "epoch": 130.98, + "learning_rate": 0.00014856974685231446, + "loss": 0.6408, + "step": 6680 + }, + { + "epoch": 131.0, + "eval_loss": 0.6432426571846008, + "eval_runtime": 2.2309, + "eval_samples_per_second": 1021.546, + "eval_steps_per_second": 4.034, + "step": 6681 + }, + { + "epoch": 131.18, + "learning_rate": 0.00014856185937264717, + "loss": 0.6373, + "step": 6690 + }, + { + "epoch": 131.37, + "learning_rate": 0.00014855395041455752, + "loss": 0.6412, + "step": 6700 + }, + { + "epoch": 131.57, + "learning_rate": 0.00014854601998035477, + "loss": 0.6391, + "step": 6710 + }, + { + "epoch": 131.76, + "learning_rate": 0.00014853806807235443, + "loss": 0.6364, + "step": 6720 + }, + { + "epoch": 131.96, + "learning_rate": 0.00014853009469287826, + "loss": 0.6404, + "step": 6730 + }, + { + "epoch": 132.0, + "eval_loss": 0.640821099281311, + "eval_runtime": 2.0959, + "eval_samples_per_second": 1087.358, + "eval_steps_per_second": 4.294, + "step": 6732 + }, + { + "epoch": 132.16, + "learning_rate": 0.00014852209984425434, + "loss": 0.6456, + "step": 6740 + }, + { + "epoch": 132.35, + "learning_rate": 0.00014851408352881693, + "loss": 0.6469, + "step": 6750 + }, + { + "epoch": 132.55, + "learning_rate": 0.0001485060457489067, + "loss": 0.6374, + "step": 6760 + }, + { + "epoch": 132.75, + "learning_rate": 0.0001484979865068704, + "loss": 0.6421, + "step": 6770 + }, + { + "epoch": 132.94, + "learning_rate": 0.00014848990580506124, + "loss": 0.6412, + "step": 6780 + }, + { + "epoch": 133.0, + "eval_loss": 0.635369598865509, + "eval_runtime": 2.1408, + "eval_samples_per_second": 1064.58, + "eval_steps_per_second": 4.204, + "step": 6783 + }, + { + "epoch": 133.14, + "learning_rate": 0.00014848180364583857, + "loss": 0.6387, + "step": 6790 + }, + { + "epoch": 133.33, + "learning_rate": 0.00014847368003156804, + "loss": 0.6349, + "step": 6800 + }, + { + "epoch": 133.53, + "learning_rate": 0.00014846553496462153, + "loss": 0.638, + "step": 6810 + }, + { + "epoch": 133.73, + "learning_rate": 0.00014845736844737727, + "loss": 0.6346, + "step": 6820 + }, + { + "epoch": 133.92, + "learning_rate": 0.0001484491804822197, + "loss": 0.6348, + "step": 6830 + }, + { + "epoch": 134.0, + "eval_loss": 0.6349772810935974, + "eval_runtime": 2.174, + "eval_samples_per_second": 1048.301, + "eval_steps_per_second": 4.14, + "step": 6834 + }, + { + "epoch": 134.12, + "learning_rate": 0.0001484409710715395, + "loss": 0.6331, + "step": 6840 + }, + { + "epoch": 134.31, + "learning_rate": 0.00014843274021773364, + "loss": 0.638, + "step": 6850 + }, + { + "epoch": 134.51, + "learning_rate": 0.00014842448792320534, + "loss": 0.638, + "step": 6860 + }, + { + "epoch": 134.71, + "learning_rate": 0.00014841621419036408, + "loss": 0.6346, + "step": 6870 + }, + { + "epoch": 134.9, + "learning_rate": 0.00014840791902162562, + "loss": 0.6307, + "step": 6880 + }, + { + "epoch": 135.0, + "eval_loss": 0.6389310956001282, + "eval_runtime": 2.1053, + "eval_samples_per_second": 1082.49, + "eval_steps_per_second": 4.275, + "step": 6885 + }, + { + "epoch": 135.1, + "learning_rate": 0.00014839960241941198, + "loss": 0.6345, + "step": 6890 + }, + { + "epoch": 135.29, + "learning_rate": 0.00014839126438615137, + "loss": 0.6352, + "step": 6900 + }, + { + "epoch": 135.49, + "learning_rate": 0.00014838290492427838, + "loss": 0.6297, + "step": 6910 + }, + { + "epoch": 135.69, + "learning_rate": 0.00014837452403623373, + "loss": 0.6367, + "step": 6920 + }, + { + "epoch": 135.88, + "learning_rate": 0.00014836612172446447, + "loss": 0.639, + "step": 6930 + }, + { + "epoch": 136.0, + "eval_loss": 0.6417258381843567, + "eval_runtime": 2.2192, + "eval_samples_per_second": 1026.924, + "eval_steps_per_second": 4.055, + "step": 6936 + }, + { + "epoch": 136.08, + "learning_rate": 0.00014835769799142389, + "loss": 0.6377, + "step": 6940 + }, + { + "epoch": 136.27, + "learning_rate": 0.00014834925283957153, + "loss": 0.6389, + "step": 6950 + }, + { + "epoch": 136.47, + "learning_rate": 0.0001483407862713732, + "loss": 0.6385, + "step": 6960 + }, + { + "epoch": 136.67, + "learning_rate": 0.00014833229828930094, + "loss": 0.6331, + "step": 6970 + }, + { + "epoch": 136.86, + "learning_rate": 0.00014832378889583303, + "loss": 0.6319, + "step": 6980 + }, + { + "epoch": 137.0, + "eval_loss": 0.6353015899658203, + "eval_runtime": 2.2601, + "eval_samples_per_second": 1008.349, + "eval_steps_per_second": 3.982, + "step": 6987 + }, + { + "epoch": 137.06, + "learning_rate": 0.00014831525809345404, + "loss": 0.6331, + "step": 6990 + }, + { + "epoch": 137.25, + "learning_rate": 0.0001483067058846548, + "loss": 0.634, + "step": 7000 + }, + { + "epoch": 137.45, + "learning_rate": 0.00014829813227193233, + "loss": 0.6321, + "step": 7010 + }, + { + "epoch": 137.65, + "learning_rate": 0.00014828953725778995, + "loss": 0.6313, + "step": 7020 + }, + { + "epoch": 137.84, + "learning_rate": 0.0001482809208447372, + "loss": 0.6306, + "step": 7030 + }, + { + "epoch": 138.0, + "eval_loss": 0.6385012269020081, + "eval_runtime": 2.1538, + "eval_samples_per_second": 1058.12, + "eval_steps_per_second": 4.179, + "step": 7038 + }, + { + "epoch": 138.04, + "learning_rate": 0.00014827228303528986, + "loss": 0.6387, + "step": 7040 + }, + { + "epoch": 138.24, + "learning_rate": 0.00014826362383197004, + "loss": 0.6332, + "step": 7050 + }, + { + "epoch": 138.43, + "learning_rate": 0.00014825494323730598, + "loss": 0.6369, + "step": 7060 + }, + { + "epoch": 138.63, + "learning_rate": 0.00014824624125383225, + "loss": 0.6319, + "step": 7070 + }, + { + "epoch": 138.82, + "learning_rate": 0.0001482375178840896, + "loss": 0.6307, + "step": 7080 + }, + { + "epoch": 139.0, + "eval_loss": 0.6411579251289368, + "eval_runtime": 2.2349, + "eval_samples_per_second": 1019.731, + "eval_steps_per_second": 4.027, + "step": 7089 + }, + { + "epoch": 139.02, + "learning_rate": 0.00014822877313062512, + "loss": 0.6336, + "step": 7090 + }, + { + "epoch": 139.22, + "learning_rate": 0.00014822000699599204, + "loss": 0.6396, + "step": 7100 + }, + { + "epoch": 139.41, + "learning_rate": 0.00014821121948274985, + "loss": 0.6287, + "step": 7110 + }, + { + "epoch": 139.61, + "learning_rate": 0.00014820241059346437, + "loss": 0.6303, + "step": 7120 + }, + { + "epoch": 139.8, + "learning_rate": 0.00014819358033070756, + "loss": 0.6306, + "step": 7130 + }, + { + "epoch": 140.0, + "learning_rate": 0.00014818472869705765, + "loss": 0.6343, + "step": 7140 + }, + { + "epoch": 140.0, + "eval_loss": 0.6308197379112244, + "eval_runtime": 2.1192, + "eval_samples_per_second": 1075.425, + "eval_steps_per_second": 4.247, + "step": 7140 + }, + { + "epoch": 140.2, + "learning_rate": 0.00014817585569509918, + "loss": 0.6315, + "step": 7150 + }, + { + "epoch": 140.39, + "learning_rate": 0.0001481669613274228, + "loss": 0.6305, + "step": 7160 + }, + { + "epoch": 140.59, + "learning_rate": 0.00014815804559662547, + "loss": 0.6292, + "step": 7170 + }, + { + "epoch": 140.78, + "learning_rate": 0.00014814910850531046, + "loss": 0.6275, + "step": 7180 + }, + { + "epoch": 140.98, + "learning_rate": 0.0001481401500560871, + "loss": 0.6289, + "step": 7190 + }, + { + "epoch": 141.0, + "eval_loss": 0.6336754560470581, + "eval_runtime": 2.185, + "eval_samples_per_second": 1043.027, + "eval_steps_per_second": 4.119, + "step": 7191 + }, + { + "epoch": 141.18, + "learning_rate": 0.00014813117025157114, + "loss": 0.6284, + "step": 7200 + }, + { + "epoch": 141.37, + "learning_rate": 0.00014812216909438443, + "loss": 0.6286, + "step": 7210 + }, + { + "epoch": 141.57, + "learning_rate": 0.00014811314658715516, + "loss": 0.6321, + "step": 7220 + }, + { + "epoch": 141.76, + "learning_rate": 0.00014810410273251762, + "loss": 0.6281, + "step": 7230 + }, + { + "epoch": 141.96, + "learning_rate": 0.00014809503753311252, + "loss": 0.6298, + "step": 7240 + }, + { + "epoch": 142.0, + "eval_loss": 0.6342359781265259, + "eval_runtime": 2.2299, + "eval_samples_per_second": 1022.019, + "eval_steps_per_second": 4.036, + "step": 7242 + }, + { + "epoch": 142.16, + "learning_rate": 0.0001480859509915866, + "loss": 0.6277, + "step": 7250 + }, + { + "epoch": 142.35, + "learning_rate": 0.00014807684311059297, + "loss": 0.6304, + "step": 7260 + }, + { + "epoch": 142.55, + "learning_rate": 0.0001480677138927909, + "loss": 0.6329, + "step": 7270 + }, + { + "epoch": 142.75, + "learning_rate": 0.000148058563340846, + "loss": 0.6289, + "step": 7280 + }, + { + "epoch": 142.94, + "learning_rate": 0.00014804939145742993, + "loss": 0.6284, + "step": 7290 + }, + { + "epoch": 143.0, + "eval_loss": 0.6286506056785583, + "eval_runtime": 2.2504, + "eval_samples_per_second": 1012.705, + "eval_steps_per_second": 3.999, + "step": 7293 + }, + { + "epoch": 143.14, + "learning_rate": 0.00014804019824522072, + "loss": 0.6306, + "step": 7300 + }, + { + "epoch": 143.33, + "learning_rate": 0.00014803098370690256, + "loss": 0.6258, + "step": 7310 + }, + { + "epoch": 143.53, + "learning_rate": 0.00014802174784516592, + "loss": 0.6242, + "step": 7320 + }, + { + "epoch": 143.73, + "learning_rate": 0.00014801249066270745, + "loss": 0.6251, + "step": 7330 + }, + { + "epoch": 143.92, + "learning_rate": 0.00014800321216223004, + "loss": 0.624, + "step": 7340 + }, + { + "epoch": 144.0, + "eval_loss": 0.6305137872695923, + "eval_runtime": 2.1855, + "eval_samples_per_second": 1042.771, + "eval_steps_per_second": 4.118, + "step": 7344 + }, + { + "epoch": 144.12, + "learning_rate": 0.00014799391234644282, + "loss": 0.6305, + "step": 7350 + }, + { + "epoch": 144.31, + "learning_rate": 0.00014798459121806112, + "loss": 0.6235, + "step": 7360 + }, + { + "epoch": 144.51, + "learning_rate": 0.0001479752487798065, + "loss": 0.6233, + "step": 7370 + }, + { + "epoch": 144.71, + "learning_rate": 0.00014796588503440675, + "loss": 0.6252, + "step": 7380 + }, + { + "epoch": 144.9, + "learning_rate": 0.00014795649998459583, + "loss": 0.6266, + "step": 7390 + }, + { + "epoch": 145.0, + "eval_loss": 0.6337530612945557, + "eval_runtime": 2.1245, + "eval_samples_per_second": 1072.699, + "eval_steps_per_second": 4.236, + "step": 7395 + }, + { + "epoch": 145.1, + "learning_rate": 0.000147947093633114, + "loss": 0.6315, + "step": 7400 + }, + { + "epoch": 145.29, + "learning_rate": 0.00014793766598270773, + "loss": 0.6313, + "step": 7410 + }, + { + "epoch": 145.49, + "learning_rate": 0.00014792821703612968, + "loss": 0.6256, + "step": 7420 + }, + { + "epoch": 145.69, + "learning_rate": 0.0001479187467961387, + "loss": 0.6237, + "step": 7430 + }, + { + "epoch": 145.88, + "learning_rate": 0.00014790925526549987, + "loss": 0.6253, + "step": 7440 + }, + { + "epoch": 146.0, + "eval_loss": 0.6281046271324158, + "eval_runtime": 2.2095, + "eval_samples_per_second": 1031.434, + "eval_steps_per_second": 4.073, + "step": 7446 + }, + { + "epoch": 146.08, + "learning_rate": 0.0001478997424469846, + "loss": 0.6263, + "step": 7450 + }, + { + "epoch": 146.27, + "learning_rate": 0.0001478902083433703, + "loss": 0.625, + "step": 7460 + }, + { + "epoch": 146.47, + "learning_rate": 0.0001478806529574408, + "loss": 0.6256, + "step": 7470 + }, + { + "epoch": 146.67, + "learning_rate": 0.00014787107629198601, + "loss": 0.6251, + "step": 7480 + }, + { + "epoch": 146.86, + "learning_rate": 0.00014786147834980218, + "loss": 0.6204, + "step": 7490 + }, + { + "epoch": 147.0, + "eval_loss": 0.624106764793396, + "eval_runtime": 2.317, + "eval_samples_per_second": 983.579, + "eval_steps_per_second": 3.884, + "step": 7497 + }, + { + "epoch": 147.06, + "learning_rate": 0.00014785185913369157, + "loss": 0.621, + "step": 7500 + }, + { + "epoch": 147.25, + "learning_rate": 0.00014784221864646289, + "loss": 0.6257, + "step": 7510 + }, + { + "epoch": 147.45, + "learning_rate": 0.0001478325568909309, + "loss": 0.6257, + "step": 7520 + }, + { + "epoch": 147.65, + "learning_rate": 0.0001478228738699166, + "loss": 0.6245, + "step": 7530 + }, + { + "epoch": 147.84, + "learning_rate": 0.00014781316958624726, + "loss": 0.6232, + "step": 7540 + }, + { + "epoch": 148.0, + "eval_loss": 0.6222416758537292, + "eval_runtime": 2.1817, + "eval_samples_per_second": 1044.619, + "eval_steps_per_second": 4.125, + "step": 7548 + }, + { + "epoch": 148.04, + "learning_rate": 0.00014780344404275627, + "loss": 0.6219, + "step": 7550 + }, + { + "epoch": 148.24, + "learning_rate": 0.00014779369724228332, + "loss": 0.6228, + "step": 7560 + }, + { + "epoch": 148.43, + "learning_rate": 0.00014778392918767422, + "loss": 0.626, + "step": 7570 + }, + { + "epoch": 148.63, + "learning_rate": 0.00014777413988178103, + "loss": 0.6252, + "step": 7580 + }, + { + "epoch": 148.82, + "learning_rate": 0.00014776432932746202, + "loss": 0.6213, + "step": 7590 + }, + { + "epoch": 149.0, + "eval_loss": 0.620071530342102, + "eval_runtime": 2.1112, + "eval_samples_per_second": 1079.463, + "eval_steps_per_second": 4.263, + "step": 7599 + }, + { + "epoch": 149.02, + "learning_rate": 0.00014775449752758166, + "loss": 0.6174, + "step": 7600 + }, + { + "epoch": 149.22, + "learning_rate": 0.00014774464448501058, + "loss": 0.6262, + "step": 7610 + }, + { + "epoch": 149.41, + "learning_rate": 0.0001477347702026257, + "loss": 0.6249, + "step": 7620 + }, + { + "epoch": 149.61, + "learning_rate": 0.00014772487468331006, + "loss": 0.6219, + "step": 7630 + }, + { + "epoch": 149.8, + "learning_rate": 0.00014771495792995293, + "loss": 0.6219, + "step": 7640 + }, + { + "epoch": 150.0, + "learning_rate": 0.00014770501994544976, + "loss": 0.6225, + "step": 7650 + }, + { + "epoch": 150.0, + "eval_loss": 0.6237109303474426, + "eval_runtime": 2.0993, + "eval_samples_per_second": 1085.616, + "eval_steps_per_second": 4.287, + "step": 7650 + }, + { + "epoch": 150.2, + "learning_rate": 0.00014769506073270228, + "loss": 0.6222, + "step": 7660 + }, + { + "epoch": 150.39, + "learning_rate": 0.0001476850802946183, + "loss": 0.626, + "step": 7670 + }, + { + "epoch": 150.59, + "learning_rate": 0.00014767507863411194, + "loss": 0.622, + "step": 7680 + }, + { + "epoch": 150.78, + "learning_rate": 0.0001476650557541034, + "loss": 0.6205, + "step": 7690 + }, + { + "epoch": 150.98, + "learning_rate": 0.00014765501165751917, + "loss": 0.6228, + "step": 7700 + }, + { + "epoch": 151.0, + "eval_loss": 0.6192641258239746, + "eval_runtime": 2.0691, + "eval_samples_per_second": 1101.464, + "eval_steps_per_second": 4.35, + "step": 7701 + }, + { + "epoch": 151.18, + "learning_rate": 0.0001476449463472919, + "loss": 0.6194, + "step": 7710 + }, + { + "epoch": 151.37, + "learning_rate": 0.00014763485982636045, + "loss": 0.6193, + "step": 7720 + }, + { + "epoch": 151.57, + "learning_rate": 0.00014762475209766985, + "loss": 0.6174, + "step": 7730 + }, + { + "epoch": 151.76, + "learning_rate": 0.00014761462316417132, + "loss": 0.6191, + "step": 7740 + }, + { + "epoch": 151.96, + "learning_rate": 0.00014760447302882227, + "loss": 0.6191, + "step": 7750 + }, + { + "epoch": 152.0, + "eval_loss": 0.6199597120285034, + "eval_runtime": 2.2126, + "eval_samples_per_second": 1029.988, + "eval_steps_per_second": 4.068, + "step": 7752 + }, + { + "epoch": 152.16, + "learning_rate": 0.00014759430169458636, + "loss": 0.6251, + "step": 7760 + }, + { + "epoch": 152.35, + "learning_rate": 0.00014758410916443333, + "loss": 0.6229, + "step": 7770 + }, + { + "epoch": 152.55, + "learning_rate": 0.00014757389544133926, + "loss": 0.6175, + "step": 7780 + }, + { + "epoch": 152.75, + "learning_rate": 0.00014756366052828622, + "loss": 0.6195, + "step": 7790 + }, + { + "epoch": 152.94, + "learning_rate": 0.00014755340442826266, + "loss": 0.6198, + "step": 7800 + }, + { + "epoch": 153.0, + "eval_loss": 0.6229197978973389, + "eval_runtime": 2.1172, + "eval_samples_per_second": 1076.425, + "eval_steps_per_second": 4.251, + "step": 7803 + }, + { + "epoch": 153.14, + "learning_rate": 0.0001475431271442631, + "loss": 0.6181, + "step": 7810 + }, + { + "epoch": 153.33, + "learning_rate": 0.0001475328286792883, + "loss": 0.6236, + "step": 7820 + }, + { + "epoch": 153.53, + "learning_rate": 0.00014752250903634514, + "loss": 0.621, + "step": 7830 + }, + { + "epoch": 153.73, + "learning_rate": 0.00014751216821844677, + "loss": 0.6167, + "step": 7840 + }, + { + "epoch": 153.92, + "learning_rate": 0.00014750180622861243, + "loss": 0.6183, + "step": 7850 + }, + { + "epoch": 154.0, + "eval_loss": 0.6212825179100037, + "eval_runtime": 2.2545, + "eval_samples_per_second": 1010.847, + "eval_steps_per_second": 3.992, + "step": 7854 + }, + { + "epoch": 154.12, + "learning_rate": 0.00014749142306986763, + "loss": 0.6227, + "step": 7860 + }, + { + "epoch": 154.31, + "learning_rate": 0.00014748101874524402, + "loss": 0.62, + "step": 7870 + }, + { + "epoch": 154.51, + "learning_rate": 0.00014747059325777943, + "loss": 0.6211, + "step": 7880 + }, + { + "epoch": 154.71, + "learning_rate": 0.00014746014661051784, + "loss": 0.6164, + "step": 7890 + }, + { + "epoch": 154.9, + "learning_rate": 0.00014744967880650947, + "loss": 0.6181, + "step": 7900 + }, + { + "epoch": 155.0, + "eval_loss": 0.6212865114212036, + "eval_runtime": 2.1579, + "eval_samples_per_second": 1056.104, + "eval_steps_per_second": 4.171, + "step": 7905 + }, + { + "epoch": 155.1, + "learning_rate": 0.00014743918984881066, + "loss": 0.6198, + "step": 7910 + }, + { + "epoch": 155.29, + "learning_rate": 0.000147428679740484, + "loss": 0.6242, + "step": 7920 + }, + { + "epoch": 155.49, + "learning_rate": 0.00014741814848459812, + "loss": 0.6169, + "step": 7930 + }, + { + "epoch": 155.69, + "learning_rate": 0.000147407596084228, + "loss": 0.6203, + "step": 7940 + }, + { + "epoch": 155.88, + "learning_rate": 0.00014739702254245465, + "loss": 0.6168, + "step": 7950 + }, + { + "epoch": 156.0, + "eval_loss": 0.6163668036460876, + "eval_runtime": 2.143, + "eval_samples_per_second": 1063.445, + "eval_steps_per_second": 4.2, + "step": 7956 + }, + { + "epoch": 156.08, + "learning_rate": 0.00014738642786236533, + "loss": 0.6208, + "step": 7960 + }, + { + "epoch": 156.27, + "learning_rate": 0.00014737581204705345, + "loss": 0.62, + "step": 7970 + }, + { + "epoch": 156.47, + "learning_rate": 0.0001473651750996186, + "loss": 0.6165, + "step": 7980 + }, + { + "epoch": 156.67, + "learning_rate": 0.0001473545170231665, + "loss": 0.6187, + "step": 7990 + }, + { + "epoch": 156.86, + "learning_rate": 0.00014734383782080912, + "loss": 0.6156, + "step": 8000 + }, + { + "epoch": 157.0, + "eval_loss": 0.6160290241241455, + "eval_runtime": 2.1391, + "eval_samples_per_second": 1065.42, + "eval_steps_per_second": 4.207, + "step": 8007 + }, + { + "epoch": 157.06, + "learning_rate": 0.0001473331374956645, + "loss": 0.619, + "step": 8010 + }, + { + "epoch": 157.25, + "learning_rate": 0.00014732241605085693, + "loss": 0.6117, + "step": 8020 + }, + { + "epoch": 157.45, + "learning_rate": 0.0001473116734895168, + "loss": 0.615, + "step": 8030 + }, + { + "epoch": 157.65, + "learning_rate": 0.00014730090981478075, + "loss": 0.6147, + "step": 8040 + }, + { + "epoch": 157.84, + "learning_rate": 0.00014729012502979146, + "loss": 0.6125, + "step": 8050 + }, + { + "epoch": 158.0, + "eval_loss": 0.6153013110160828, + "eval_runtime": 2.256, + "eval_samples_per_second": 1010.206, + "eval_steps_per_second": 3.989, + "step": 8058 + }, + { + "epoch": 158.04, + "learning_rate": 0.00014727931913769793, + "loss": 0.6186, + "step": 8060 + }, + { + "epoch": 158.24, + "learning_rate": 0.00014726849214165516, + "loss": 0.6138, + "step": 8070 + }, + { + "epoch": 158.43, + "learning_rate": 0.00014725764404482445, + "loss": 0.6207, + "step": 8080 + }, + { + "epoch": 158.63, + "learning_rate": 0.00014724677485037317, + "loss": 0.6155, + "step": 8090 + }, + { + "epoch": 158.82, + "learning_rate": 0.0001472358845614749, + "loss": 0.6126, + "step": 8100 + }, + { + "epoch": 159.0, + "eval_loss": 0.6150580048561096, + "eval_runtime": 2.1863, + "eval_samples_per_second": 1042.382, + "eval_steps_per_second": 4.116, + "step": 8109 + }, + { + "epoch": 159.02, + "learning_rate": 0.00014722497318130935, + "loss": 0.6151, + "step": 8110 + }, + { + "epoch": 159.22, + "learning_rate": 0.0001472140407130624, + "loss": 0.61, + "step": 8120 + }, + { + "epoch": 159.41, + "learning_rate": 0.00014720308715992613, + "loss": 0.6128, + "step": 8130 + }, + { + "epoch": 159.61, + "learning_rate": 0.00014719211252509865, + "loss": 0.6123, + "step": 8140 + }, + { + "epoch": 159.8, + "learning_rate": 0.00014718111681178437, + "loss": 0.6129, + "step": 8150 + }, + { + "epoch": 160.0, + "learning_rate": 0.00014717010002319376, + "loss": 0.6115, + "step": 8160 + }, + { + "epoch": 160.0, + "eval_loss": 0.6163375377655029, + "eval_runtime": 2.1353, + "eval_samples_per_second": 1067.303, + "eval_steps_per_second": 4.215, + "step": 8160 + }, + { + "epoch": 160.2, + "learning_rate": 0.0001471590621625435, + "loss": 0.6117, + "step": 8170 + }, + { + "epoch": 160.39, + "learning_rate": 0.00014714800323305642, + "loss": 0.6116, + "step": 8180 + }, + { + "epoch": 160.59, + "learning_rate": 0.00014713692323796142, + "loss": 0.6091, + "step": 8190 + }, + { + "epoch": 160.78, + "learning_rate": 0.00014712582218049365, + "loss": 0.6147, + "step": 8200 + }, + { + "epoch": 160.98, + "learning_rate": 0.00014711470006389436, + "loss": 0.611, + "step": 8210 + }, + { + "epoch": 161.0, + "eval_loss": 0.6166603565216064, + "eval_runtime": 2.1988, + "eval_samples_per_second": 1036.453, + "eval_steps_per_second": 4.093, + "step": 8211 + }, + { + "epoch": 161.18, + "learning_rate": 0.000147103556891411, + "loss": 0.6208, + "step": 8220 + }, + { + "epoch": 161.37, + "learning_rate": 0.00014709239266629705, + "loss": 0.6114, + "step": 8230 + }, + { + "epoch": 161.57, + "learning_rate": 0.00014708120739181227, + "loss": 0.612, + "step": 8240 + }, + { + "epoch": 161.76, + "learning_rate": 0.0001470700010712225, + "loss": 0.6107, + "step": 8250 + }, + { + "epoch": 161.96, + "learning_rate": 0.00014705877370779975, + "loss": 0.6099, + "step": 8260 + }, + { + "epoch": 162.0, + "eval_loss": 0.608273446559906, + "eval_runtime": 2.1536, + "eval_samples_per_second": 1058.242, + "eval_steps_per_second": 4.179, + "step": 8262 + }, + { + "epoch": 162.16, + "learning_rate": 0.00014704752530482213, + "loss": 0.6117, + "step": 8270 + }, + { + "epoch": 162.35, + "learning_rate": 0.00014703625586557392, + "loss": 0.6114, + "step": 8280 + }, + { + "epoch": 162.55, + "learning_rate": 0.00014702496539334558, + "loss": 0.6113, + "step": 8290 + }, + { + "epoch": 162.75, + "learning_rate": 0.00014701365389143366, + "loss": 0.6121, + "step": 8300 + }, + { + "epoch": 162.94, + "learning_rate": 0.00014700232136314085, + "loss": 0.6089, + "step": 8310 + }, + { + "epoch": 163.0, + "eval_loss": 0.6103874444961548, + "eval_runtime": 2.2403, + "eval_samples_per_second": 1017.264, + "eval_steps_per_second": 4.017, + "step": 8313 + }, + { + "epoch": 163.14, + "learning_rate": 0.00014699096781177603, + "loss": 0.6081, + "step": 8320 + }, + { + "epoch": 163.33, + "learning_rate": 0.00014697959324065415, + "loss": 0.6117, + "step": 8330 + }, + { + "epoch": 163.53, + "learning_rate": 0.00014696819765309637, + "loss": 0.6091, + "step": 8340 + }, + { + "epoch": 163.73, + "learning_rate": 0.0001469567810524299, + "loss": 0.6109, + "step": 8350 + }, + { + "epoch": 163.92, + "learning_rate": 0.00014694534344198814, + "loss": 0.6091, + "step": 8360 + }, + { + "epoch": 164.0, + "eval_loss": 0.6139717698097229, + "eval_runtime": 2.2739, + "eval_samples_per_second": 1002.243, + "eval_steps_per_second": 3.958, + "step": 8364 + }, + { + "epoch": 164.12, + "learning_rate": 0.00014693388482511067, + "loss": 0.6104, + "step": 8370 + }, + { + "epoch": 164.31, + "learning_rate": 0.00014692240520514308, + "loss": 0.6059, + "step": 8380 + }, + { + "epoch": 164.51, + "learning_rate": 0.0001469109045854372, + "loss": 0.6079, + "step": 8390 + }, + { + "epoch": 164.71, + "learning_rate": 0.00014689938296935095, + "loss": 0.6097, + "step": 8400 + }, + { + "epoch": 164.9, + "learning_rate": 0.0001468878403602484, + "loss": 0.6105, + "step": 8410 + }, + { + "epoch": 165.0, + "eval_loss": 0.6122100949287415, + "eval_runtime": 2.0978, + "eval_samples_per_second": 1086.385, + "eval_steps_per_second": 4.29, + "step": 8415 + }, + { + "epoch": 165.1, + "learning_rate": 0.0001468762767614997, + "loss": 0.6108, + "step": 8420 + }, + { + "epoch": 165.29, + "learning_rate": 0.0001468646921764812, + "loss": 0.6138, + "step": 8430 + }, + { + "epoch": 165.49, + "learning_rate": 0.0001468530866085753, + "loss": 0.6102, + "step": 8440 + }, + { + "epoch": 165.69, + "learning_rate": 0.0001468414600611706, + "loss": 0.6078, + "step": 8450 + }, + { + "epoch": 165.88, + "learning_rate": 0.0001468298125376618, + "loss": 0.61, + "step": 8460 + }, + { + "epoch": 166.0, + "eval_loss": 0.6105751395225525, + "eval_runtime": 2.2305, + "eval_samples_per_second": 1021.739, + "eval_steps_per_second": 4.035, + "step": 8466 + }, + { + "epoch": 166.08, + "learning_rate": 0.0001468181440414497, + "loss": 0.608, + "step": 8470 + }, + { + "epoch": 166.27, + "learning_rate": 0.0001468064545759412, + "loss": 0.6077, + "step": 8480 + }, + { + "epoch": 166.47, + "learning_rate": 0.00014679474414454942, + "loss": 0.6073, + "step": 8490 + }, + { + "epoch": 166.67, + "learning_rate": 0.00014678301275069353, + "loss": 0.6081, + "step": 8500 + }, + { + "epoch": 166.86, + "learning_rate": 0.00014677126039779886, + "loss": 0.6104, + "step": 8510 + }, + { + "epoch": 167.0, + "eval_loss": 0.6062180399894714, + "eval_runtime": 2.2377, + "eval_samples_per_second": 1018.474, + "eval_steps_per_second": 4.022, + "step": 8517 + }, + { + "epoch": 167.06, + "learning_rate": 0.0001467594870892968, + "loss": 0.61, + "step": 8520 + }, + { + "epoch": 167.25, + "learning_rate": 0.00014674769282862487, + "loss": 0.6098, + "step": 8530 + }, + { + "epoch": 167.45, + "learning_rate": 0.0001467358776192268, + "loss": 0.6099, + "step": 8540 + }, + { + "epoch": 167.65, + "learning_rate": 0.0001467240414645523, + "loss": 0.6036, + "step": 8550 + }, + { + "epoch": 167.84, + "learning_rate": 0.00014671218436805732, + "loss": 0.6067, + "step": 8560 + }, + { + "epoch": 168.0, + "eval_loss": 0.6094751358032227, + "eval_runtime": 2.1511, + "eval_samples_per_second": 1059.437, + "eval_steps_per_second": 4.184, + "step": 8568 + }, + { + "epoch": 168.04, + "learning_rate": 0.00014670030633320383, + "loss": 0.6032, + "step": 8570 + }, + { + "epoch": 168.24, + "learning_rate": 0.00014668840736345993, + "loss": 0.607, + "step": 8580 + }, + { + "epoch": 168.43, + "learning_rate": 0.00014667648746229993, + "loss": 0.6061, + "step": 8590 + }, + { + "epoch": 168.63, + "learning_rate": 0.0001466645466332041, + "loss": 0.608, + "step": 8600 + }, + { + "epoch": 168.82, + "learning_rate": 0.00014665258487965896, + "loss": 0.6056, + "step": 8610 + }, + { + "epoch": 169.0, + "eval_loss": 0.606715202331543, + "eval_runtime": 2.1396, + "eval_samples_per_second": 1065.129, + "eval_steps_per_second": 4.206, + "step": 8619 + }, + { + "epoch": 169.02, + "learning_rate": 0.00014664060220515704, + "loss": 0.6008, + "step": 8620 + }, + { + "epoch": 169.22, + "learning_rate": 0.00014662859861319698, + "loss": 0.6066, + "step": 8630 + }, + { + "epoch": 169.41, + "learning_rate": 0.00014661657410728365, + "loss": 0.604, + "step": 8640 + }, + { + "epoch": 169.61, + "learning_rate": 0.00014660452869092786, + "loss": 0.6058, + "step": 8650 + }, + { + "epoch": 169.8, + "learning_rate": 0.0001465924623676466, + "loss": 0.6037, + "step": 8660 + }, + { + "epoch": 170.0, + "learning_rate": 0.00014658037514096305, + "loss": 0.607, + "step": 8670 + }, + { + "epoch": 170.0, + "eval_loss": 0.6091219782829285, + "eval_runtime": 2.2153, + "eval_samples_per_second": 1028.74, + "eval_steps_per_second": 4.063, + "step": 8670 + }, + { + "epoch": 170.2, + "learning_rate": 0.00014656826701440632, + "loss": 0.6046, + "step": 8680 + }, + { + "epoch": 170.39, + "learning_rate": 0.00014655613799151177, + "loss": 0.6135, + "step": 8690 + }, + { + "epoch": 170.59, + "learning_rate": 0.00014654398807582074, + "loss": 0.6052, + "step": 8700 + }, + { + "epoch": 170.78, + "learning_rate": 0.00014653181727088084, + "loss": 0.6088, + "step": 8710 + }, + { + "epoch": 170.98, + "learning_rate": 0.00014651962558024558, + "loss": 0.6032, + "step": 8720 + }, + { + "epoch": 171.0, + "eval_loss": 0.6041246652603149, + "eval_runtime": 2.1575, + "eval_samples_per_second": 1056.316, + "eval_steps_per_second": 4.171, + "step": 8721 + }, + { + "epoch": 171.18, + "learning_rate": 0.00014650741300747467, + "loss": 0.6048, + "step": 8730 + }, + { + "epoch": 171.37, + "learning_rate": 0.00014649517955613397, + "loss": 0.6034, + "step": 8740 + }, + { + "epoch": 171.57, + "learning_rate": 0.0001464829252297953, + "loss": 0.6044, + "step": 8750 + }, + { + "epoch": 171.76, + "learning_rate": 0.00014647065003203674, + "loss": 0.6014, + "step": 8760 + }, + { + "epoch": 171.96, + "learning_rate": 0.00014645835396644228, + "loss": 0.6038, + "step": 8770 + }, + { + "epoch": 172.0, + "eval_loss": 0.610372006893158, + "eval_runtime": 2.2748, + "eval_samples_per_second": 1001.839, + "eval_steps_per_second": 3.956, + "step": 8772 + }, + { + "epoch": 172.16, + "learning_rate": 0.00014644603703660214, + "loss": 0.6019, + "step": 8780 + }, + { + "epoch": 172.35, + "learning_rate": 0.0001464336992461126, + "loss": 0.6056, + "step": 8790 + }, + { + "epoch": 172.55, + "learning_rate": 0.000146421340598576, + "loss": 0.6053, + "step": 8800 + }, + { + "epoch": 172.75, + "learning_rate": 0.0001464089610976008, + "loss": 0.6066, + "step": 8810 + }, + { + "epoch": 172.94, + "learning_rate": 0.0001463965607468015, + "loss": 0.605, + "step": 8820 + }, + { + "epoch": 173.0, + "eval_loss": 0.606823742389679, + "eval_runtime": 2.2188, + "eval_samples_per_second": 1027.155, + "eval_steps_per_second": 4.056, + "step": 8823 + }, + { + "epoch": 173.14, + "learning_rate": 0.0001463841395497988, + "loss": 0.6055, + "step": 8830 + }, + { + "epoch": 173.33, + "learning_rate": 0.00014637169751021938, + "loss": 0.602, + "step": 8840 + }, + { + "epoch": 173.53, + "learning_rate": 0.000146359234631696, + "loss": 0.6031, + "step": 8850 + }, + { + "epoch": 173.73, + "learning_rate": 0.00014634675091786757, + "loss": 0.6029, + "step": 8860 + }, + { + "epoch": 173.92, + "learning_rate": 0.0001463342463723791, + "loss": 0.6036, + "step": 8870 + }, + { + "epoch": 174.0, + "eval_loss": 0.6004641056060791, + "eval_runtime": 2.1823, + "eval_samples_per_second": 1044.294, + "eval_steps_per_second": 4.124, + "step": 8874 + }, + { + "epoch": 174.12, + "learning_rate": 0.00014632172099888157, + "loss": 0.6014, + "step": 8880 + }, + { + "epoch": 174.31, + "learning_rate": 0.00014630917480103214, + "loss": 0.5998, + "step": 8890 + }, + { + "epoch": 174.51, + "learning_rate": 0.000146296607782494, + "loss": 0.6006, + "step": 8900 + }, + { + "epoch": 174.71, + "learning_rate": 0.0001462840199469365, + "loss": 0.5982, + "step": 8910 + }, + { + "epoch": 174.9, + "learning_rate": 0.00014627141129803492, + "loss": 0.6035, + "step": 8920 + }, + { + "epoch": 175.0, + "eval_loss": 0.605520486831665, + "eval_runtime": 2.0997, + "eval_samples_per_second": 1085.379, + "eval_steps_per_second": 4.286, + "step": 8925 + }, + { + "epoch": 175.1, + "learning_rate": 0.00014625878183947076, + "loss": 0.6024, + "step": 8930 + }, + { + "epoch": 175.29, + "learning_rate": 0.00014624613157493153, + "loss": 0.6016, + "step": 8940 + }, + { + "epoch": 175.49, + "learning_rate": 0.00014623346050811085, + "loss": 0.6036, + "step": 8950 + }, + { + "epoch": 175.69, + "learning_rate": 0.0001462207686427083, + "loss": 0.6034, + "step": 8960 + }, + { + "epoch": 175.88, + "learning_rate": 0.0001462080559824297, + "loss": 0.6026, + "step": 8970 + }, + { + "epoch": 176.0, + "eval_loss": 0.6013907194137573, + "eval_runtime": 2.2617, + "eval_samples_per_second": 1007.639, + "eval_steps_per_second": 3.979, + "step": 8976 + }, + { + "epoch": 176.08, + "learning_rate": 0.0001461953225309869, + "loss": 0.5985, + "step": 8980 + }, + { + "epoch": 176.27, + "learning_rate": 0.0001461825682920977, + "loss": 0.6, + "step": 8990 + }, + { + "epoch": 176.47, + "learning_rate": 0.00014616979326948607, + "loss": 0.6012, + "step": 9000 + }, + { + "epoch": 176.67, + "learning_rate": 0.00014615699746688206, + "loss": 0.6005, + "step": 9010 + }, + { + "epoch": 176.86, + "learning_rate": 0.00014614418088802173, + "loss": 0.6012, + "step": 9020 + }, + { + "epoch": 177.0, + "eval_loss": 0.6028780341148376, + "eval_runtime": 2.2208, + "eval_samples_per_second": 1026.22, + "eval_steps_per_second": 4.053, + "step": 9027 + }, + { + "epoch": 177.06, + "learning_rate": 0.0001461313435366473, + "loss": 0.6033, + "step": 9030 + }, + { + "epoch": 177.25, + "learning_rate": 0.00014611848541650686, + "loss": 0.6005, + "step": 9040 + }, + { + "epoch": 177.45, + "learning_rate": 0.00014610560653135482, + "loss": 0.5972, + "step": 9050 + }, + { + "epoch": 177.65, + "learning_rate": 0.0001460927068849515, + "loss": 0.5979, + "step": 9060 + }, + { + "epoch": 177.84, + "learning_rate": 0.00014607978648106327, + "loss": 0.5945, + "step": 9070 + }, + { + "epoch": 178.0, + "eval_loss": 0.5966967344284058, + "eval_runtime": 2.2013, + "eval_samples_per_second": 1035.296, + "eval_steps_per_second": 4.088, + "step": 9078 + }, + { + "epoch": 178.04, + "learning_rate": 0.0001460668453234626, + "loss": 0.5976, + "step": 9080 + }, + { + "epoch": 178.24, + "learning_rate": 0.00014605388341592805, + "loss": 0.597, + "step": 9090 + }, + { + "epoch": 178.43, + "learning_rate": 0.00014604090076224423, + "loss": 0.5998, + "step": 9100 + }, + { + "epoch": 178.63, + "learning_rate": 0.0001460278973662017, + "loss": 0.5972, + "step": 9110 + }, + { + "epoch": 178.82, + "learning_rate": 0.00014601487323159728, + "loss": 0.6011, + "step": 9120 + }, + { + "epoch": 179.0, + "eval_loss": 0.5920745730400085, + "eval_runtime": 2.1356, + "eval_samples_per_second": 1067.143, + "eval_steps_per_second": 4.214, + "step": 9129 + }, + { + "epoch": 179.02, + "learning_rate": 0.0001460018283622336, + "loss": 0.6004, + "step": 9130 + }, + { + "epoch": 179.22, + "learning_rate": 0.00014598876276191957, + "loss": 0.597, + "step": 9140 + }, + { + "epoch": 179.41, + "learning_rate": 0.00014597567643447, + "loss": 0.5967, + "step": 9150 + }, + { + "epoch": 179.61, + "learning_rate": 0.00014596256938370584, + "loss": 0.5986, + "step": 9160 + }, + { + "epoch": 179.8, + "learning_rate": 0.00014594944161345404, + "loss": 0.5954, + "step": 9170 + }, + { + "epoch": 180.0, + "learning_rate": 0.00014593629312754756, + "loss": 0.5929, + "step": 9180 + }, + { + "epoch": 180.0, + "eval_loss": 0.5991100668907166, + "eval_runtime": 2.2402, + "eval_samples_per_second": 1017.3, + "eval_steps_per_second": 4.017, + "step": 9180 + }, + { + "epoch": 180.2, + "learning_rate": 0.00014592312392982558, + "loss": 0.5966, + "step": 9190 + }, + { + "epoch": 180.39, + "learning_rate": 0.00014590993402413313, + "loss": 0.5993, + "step": 9200 + }, + { + "epoch": 180.59, + "learning_rate": 0.0001458967234143214, + "loss": 0.6039, + "step": 9210 + }, + { + "epoch": 180.78, + "learning_rate": 0.00014588349210424757, + "loss": 0.5962, + "step": 9220 + }, + { + "epoch": 180.98, + "learning_rate": 0.00014587024009777492, + "loss": 0.5981, + "step": 9230 + }, + { + "epoch": 181.0, + "eval_loss": 0.5953816175460815, + "eval_runtime": 2.1845, + "eval_samples_per_second": 1043.242, + "eval_steps_per_second": 4.12, + "step": 9231 + }, + { + "epoch": 181.18, + "learning_rate": 0.0001458569673987727, + "loss": 0.5913, + "step": 9240 + }, + { + "epoch": 181.37, + "learning_rate": 0.00014584367401111628, + "loss": 0.5993, + "step": 9250 + }, + { + "epoch": 181.57, + "learning_rate": 0.00014583035993868701, + "loss": 0.5983, + "step": 9260 + }, + { + "epoch": 181.76, + "learning_rate": 0.0001458170251853723, + "loss": 0.5953, + "step": 9270 + }, + { + "epoch": 181.96, + "learning_rate": 0.00014580366975506563, + "loss": 0.6011, + "step": 9280 + }, + { + "epoch": 182.0, + "eval_loss": 0.6006762385368347, + "eval_runtime": 2.2479, + "eval_samples_per_second": 1013.832, + "eval_steps_per_second": 4.004, + "step": 9282 + }, + { + "epoch": 182.16, + "learning_rate": 0.00014579029365166646, + "loss": 0.6002, + "step": 9290 + }, + { + "epoch": 182.35, + "learning_rate": 0.0001457768968790803, + "loss": 0.596, + "step": 9300 + }, + { + "epoch": 182.55, + "learning_rate": 0.00014576347944121872, + "loss": 0.5964, + "step": 9310 + }, + { + "epoch": 182.75, + "learning_rate": 0.00014575004134199937, + "loss": 0.5929, + "step": 9320 + }, + { + "epoch": 182.94, + "learning_rate": 0.00014573658258534578, + "loss": 0.5977, + "step": 9330 + }, + { + "epoch": 183.0, + "eval_loss": 0.6013053059577942, + "eval_runtime": 2.1277, + "eval_samples_per_second": 1071.122, + "eval_steps_per_second": 4.23, + "step": 9333 + }, + { + "epoch": 183.14, + "learning_rate": 0.0001457231031751877, + "loss": 0.5967, + "step": 9340 + }, + { + "epoch": 183.33, + "learning_rate": 0.00014570960311546073, + "loss": 0.5993, + "step": 9350 + }, + { + "epoch": 183.53, + "learning_rate": 0.00014569608241010663, + "loss": 0.5945, + "step": 9360 + }, + { + "epoch": 183.73, + "learning_rate": 0.00014568254106307318, + "loss": 0.5928, + "step": 9370 + }, + { + "epoch": 183.92, + "learning_rate": 0.00014566897907831408, + "loss": 0.5947, + "step": 9380 + }, + { + "epoch": 184.0, + "eval_loss": 0.602336049079895, + "eval_runtime": 2.2459, + "eval_samples_per_second": 1014.742, + "eval_steps_per_second": 4.007, + "step": 9384 + }, + { + "epoch": 184.12, + "learning_rate": 0.00014565539645978917, + "loss": 0.5934, + "step": 9390 + }, + { + "epoch": 184.31, + "learning_rate": 0.00014564179321146427, + "loss": 0.5975, + "step": 9400 + }, + { + "epoch": 184.51, + "learning_rate": 0.0001456281693373112, + "loss": 0.5974, + "step": 9410 + }, + { + "epoch": 184.71, + "learning_rate": 0.00014561452484130786, + "loss": 0.5978, + "step": 9420 + }, + { + "epoch": 184.9, + "learning_rate": 0.0001456008597274381, + "loss": 0.59, + "step": 9430 + }, + { + "epoch": 185.0, + "eval_loss": 0.5967941284179688, + "eval_runtime": 2.1353, + "eval_samples_per_second": 1067.295, + "eval_steps_per_second": 4.215, + "step": 9435 + }, + { + "epoch": 185.1, + "learning_rate": 0.00014558717399969188, + "loss": 0.5959, + "step": 9440 + }, + { + "epoch": 185.29, + "learning_rate": 0.00014557346766206508, + "loss": 0.5968, + "step": 9450 + }, + { + "epoch": 185.49, + "learning_rate": 0.00014555974071855967, + "loss": 0.5948, + "step": 9460 + }, + { + "epoch": 185.69, + "learning_rate": 0.0001455459931731836, + "loss": 0.5959, + "step": 9470 + }, + { + "epoch": 185.88, + "learning_rate": 0.00014553222502995087, + "loss": 0.5924, + "step": 9480 + }, + { + "epoch": 186.0, + "eval_loss": 0.5987167954444885, + "eval_runtime": 2.2188, + "eval_samples_per_second": 1027.115, + "eval_steps_per_second": 4.056, + "step": 9486 + }, + { + "epoch": 186.08, + "learning_rate": 0.00014551843629288143, + "loss": 0.5948, + "step": 9490 + }, + { + "epoch": 186.27, + "learning_rate": 0.00014550462696600133, + "loss": 0.5946, + "step": 9500 + }, + { + "epoch": 186.47, + "learning_rate": 0.00014549079705334253, + "loss": 0.5933, + "step": 9510 + }, + { + "epoch": 186.67, + "learning_rate": 0.0001454769465589431, + "loss": 0.5963, + "step": 9520 + }, + { + "epoch": 186.86, + "learning_rate": 0.00014546307548684708, + "loss": 0.5906, + "step": 9530 + }, + { + "epoch": 187.0, + "eval_loss": 0.5914626121520996, + "eval_runtime": 2.1056, + "eval_samples_per_second": 1082.37, + "eval_steps_per_second": 4.274, + "step": 9537 + }, + { + "epoch": 187.06, + "learning_rate": 0.0001454491838411045, + "loss": 0.5908, + "step": 9540 + }, + { + "epoch": 187.25, + "learning_rate": 0.00014543527162577137, + "loss": 0.5945, + "step": 9550 + }, + { + "epoch": 187.45, + "learning_rate": 0.00014542133884490983, + "loss": 0.5961, + "step": 9560 + }, + { + "epoch": 187.65, + "learning_rate": 0.00014540738550258787, + "loss": 0.5961, + "step": 9570 + }, + { + "epoch": 187.84, + "learning_rate": 0.00014539341160287958, + "loss": 0.5928, + "step": 9580 + }, + { + "epoch": 188.0, + "eval_loss": 0.5877456665039062, + "eval_runtime": 2.2344, + "eval_samples_per_second": 1019.947, + "eval_steps_per_second": 4.028, + "step": 9588 + }, + { + "epoch": 188.04, + "learning_rate": 0.00014537941714986503, + "loss": 0.5913, + "step": 9590 + }, + { + "epoch": 188.24, + "learning_rate": 0.0001453654021476303, + "loss": 0.5938, + "step": 9600 + }, + { + "epoch": 188.43, + "learning_rate": 0.00014535136660026742, + "loss": 0.5895, + "step": 9610 + }, + { + "epoch": 188.63, + "learning_rate": 0.00014533731051187448, + "loss": 0.5888, + "step": 9620 + }, + { + "epoch": 188.82, + "learning_rate": 0.00014532323388655557, + "loss": 0.5849, + "step": 9630 + }, + { + "epoch": 189.0, + "eval_loss": 0.5910800695419312, + "eval_runtime": 2.1237, + "eval_samples_per_second": 1073.117, + "eval_steps_per_second": 4.238, + "step": 9639 + }, + { + "epoch": 189.02, + "learning_rate": 0.0001453091367284207, + "loss": 0.5896, + "step": 9640 + }, + { + "epoch": 189.22, + "learning_rate": 0.00014529501904158597, + "loss": 0.5961, + "step": 9650 + }, + { + "epoch": 189.41, + "learning_rate": 0.00014528088083017339, + "loss": 0.5835, + "step": 9660 + }, + { + "epoch": 189.61, + "learning_rate": 0.00014526672209831104, + "loss": 0.5849, + "step": 9670 + }, + { + "epoch": 189.8, + "learning_rate": 0.00014525254285013294, + "loss": 0.5893, + "step": 9680 + }, + { + "epoch": 190.0, + "learning_rate": 0.0001452383430897791, + "loss": 0.5913, + "step": 9690 + }, + { + "epoch": 190.0, + "eval_loss": 0.5954359769821167, + "eval_runtime": 2.2008, + "eval_samples_per_second": 1035.542, + "eval_steps_per_second": 4.089, + "step": 9690 + }, + { + "epoch": 190.2, + "learning_rate": 0.00014522412282139555, + "loss": 0.5899, + "step": 9700 + }, + { + "epoch": 190.39, + "learning_rate": 0.00014520988204913426, + "loss": 0.5912, + "step": 9710 + }, + { + "epoch": 190.59, + "learning_rate": 0.0001451956207771533, + "loss": 0.5932, + "step": 9720 + }, + { + "epoch": 190.78, + "learning_rate": 0.00014518133900961653, + "loss": 0.5892, + "step": 9730 + }, + { + "epoch": 190.98, + "learning_rate": 0.00014516703675069401, + "loss": 0.5863, + "step": 9740 + }, + { + "epoch": 191.0, + "eval_loss": 0.5905748009681702, + "eval_runtime": 2.1806, + "eval_samples_per_second": 1045.123, + "eval_steps_per_second": 4.127, + "step": 9741 + }, + { + "epoch": 191.18, + "learning_rate": 0.00014515271400456162, + "loss": 0.5901, + "step": 9750 + }, + { + "epoch": 191.37, + "learning_rate": 0.0001451383707754013, + "loss": 0.5891, + "step": 9760 + }, + { + "epoch": 191.57, + "learning_rate": 0.00014512400706740095, + "loss": 0.5917, + "step": 9770 + }, + { + "epoch": 191.76, + "learning_rate": 0.00014510962288475445, + "loss": 0.5881, + "step": 9780 + }, + { + "epoch": 191.96, + "learning_rate": 0.00014509521823166164, + "loss": 0.588, + "step": 9790 + }, + { + "epoch": 192.0, + "eval_loss": 0.5942443013191223, + "eval_runtime": 2.2206, + "eval_samples_per_second": 1026.291, + "eval_steps_per_second": 4.053, + "step": 9792 + }, + { + "epoch": 192.16, + "learning_rate": 0.00014508079311232843, + "loss": 0.5859, + "step": 9800 + }, + { + "epoch": 192.35, + "learning_rate": 0.00014506634753096655, + "loss": 0.5939, + "step": 9810 + }, + { + "epoch": 192.55, + "learning_rate": 0.00014505188149179383, + "loss": 0.5893, + "step": 9820 + }, + { + "epoch": 192.75, + "learning_rate": 0.00014503739499903404, + "loss": 0.5914, + "step": 9830 + }, + { + "epoch": 192.94, + "learning_rate": 0.00014502288805691687, + "loss": 0.5906, + "step": 9840 + }, + { + "epoch": 193.0, + "eval_loss": 0.5924288034439087, + "eval_runtime": 2.1443, + "eval_samples_per_second": 1062.813, + "eval_steps_per_second": 4.197, + "step": 9843 + }, + { + "epoch": 193.14, + "learning_rate": 0.00014500836066967806, + "loss": 0.5851, + "step": 9850 + }, + { + "epoch": 193.33, + "learning_rate": 0.00014499381284155928, + "loss": 0.5909, + "step": 9860 + }, + { + "epoch": 193.53, + "learning_rate": 0.00014497924457680813, + "loss": 0.5898, + "step": 9870 + }, + { + "epoch": 193.73, + "learning_rate": 0.00014496465587967832, + "loss": 0.591, + "step": 9880 + }, + { + "epoch": 193.92, + "learning_rate": 0.0001449500467544293, + "loss": 0.5927, + "step": 9890 + }, + { + "epoch": 194.0, + "eval_loss": 0.5911122560501099, + "eval_runtime": 2.1852, + "eval_samples_per_second": 1042.912, + "eval_steps_per_second": 4.119, + "step": 9894 + }, + { + "epoch": 194.12, + "learning_rate": 0.00014493541720532666, + "loss": 0.588, + "step": 9900 + }, + { + "epoch": 194.31, + "learning_rate": 0.00014492076723664194, + "loss": 0.5881, + "step": 9910 + }, + { + "epoch": 194.51, + "learning_rate": 0.00014490609685265256, + "loss": 0.5875, + "step": 9920 + }, + { + "epoch": 194.71, + "learning_rate": 0.00014489140605764196, + "loss": 0.5895, + "step": 9930 + }, + { + "epoch": 194.9, + "learning_rate": 0.00014487669485589952, + "loss": 0.5857, + "step": 9940 + }, + { + "epoch": 195.0, + "eval_loss": 0.5852168202400208, + "eval_runtime": 2.3015, + "eval_samples_per_second": 990.24, + "eval_steps_per_second": 3.911, + "step": 9945 + }, + { + "epoch": 195.1, + "learning_rate": 0.00014486196325172058, + "loss": 0.582, + "step": 9950 + }, + { + "epoch": 195.29, + "learning_rate": 0.00014484721124940643, + "loss": 0.5834, + "step": 9960 + }, + { + "epoch": 195.49, + "learning_rate": 0.00014483243885326437, + "loss": 0.5891, + "step": 9970 + }, + { + "epoch": 195.69, + "learning_rate": 0.00014481764606760755, + "loss": 0.589, + "step": 9980 + }, + { + "epoch": 195.88, + "learning_rate": 0.00014480283289675517, + "loss": 0.5859, + "step": 9990 + }, + { + "epoch": 196.0, + "eval_loss": 0.5909630060195923, + "eval_runtime": 2.2494, + "eval_samples_per_second": 1013.138, + "eval_steps_per_second": 4.001, + "step": 9996 + }, + { + "epoch": 196.08, + "learning_rate": 0.00014478799934503233, + "loss": 0.5859, + "step": 10000 + }, + { + "epoch": 196.27, + "learning_rate": 0.00014477314541677016, + "loss": 0.5885, + "step": 10010 + }, + { + "epoch": 196.47, + "learning_rate": 0.00014475827111630555, + "loss": 0.5799, + "step": 10020 + }, + { + "epoch": 196.67, + "learning_rate": 0.00014474337644798159, + "loss": 0.583, + "step": 10030 + }, + { + "epoch": 196.86, + "learning_rate": 0.00014472846141614707, + "loss": 0.5775, + "step": 10040 + }, + { + "epoch": 197.0, + "eval_loss": 0.5853325128555298, + "eval_runtime": 2.1195, + "eval_samples_per_second": 1075.278, + "eval_steps_per_second": 4.246, + "step": 10047 + }, + { + "epoch": 197.06, + "learning_rate": 0.00014471352602515695, + "loss": 0.5791, + "step": 10050 + }, + { + "epoch": 197.25, + "learning_rate": 0.000144698570279372, + "loss": 0.582, + "step": 10060 + }, + { + "epoch": 197.45, + "learning_rate": 0.00014468359418315894, + "loss": 0.5823, + "step": 10070 + }, + { + "epoch": 197.65, + "learning_rate": 0.0001446685977408905, + "loss": 0.5861, + "step": 10080 + }, + { + "epoch": 197.84, + "learning_rate": 0.00014465358095694527, + "loss": 0.586, + "step": 10090 + }, + { + "epoch": 198.0, + "eval_loss": 0.5876543521881104, + "eval_runtime": 2.0896, + "eval_samples_per_second": 1090.616, + "eval_steps_per_second": 4.307, + "step": 10098 + }, + { + "epoch": 198.04, + "learning_rate": 0.00014463854383570782, + "loss": 0.5862, + "step": 10100 + }, + { + "epoch": 198.24, + "learning_rate": 0.00014462348638156865, + "loss": 0.5902, + "step": 10110 + }, + { + "epoch": 198.43, + "learning_rate": 0.00014460840859892424, + "loss": 0.5845, + "step": 10120 + }, + { + "epoch": 198.63, + "learning_rate": 0.0001445933104921769, + "loss": 0.5818, + "step": 10130 + }, + { + "epoch": 198.82, + "learning_rate": 0.000144578192065735, + "loss": 0.5853, + "step": 10140 + }, + { + "epoch": 199.0, + "eval_loss": 0.5847514867782593, + "eval_runtime": 2.1429, + "eval_samples_per_second": 1063.513, + "eval_steps_per_second": 4.2, + "step": 10149 + }, + { + "epoch": 199.02, + "learning_rate": 0.00014456305332401277, + "loss": 0.5841, + "step": 10150 + }, + { + "epoch": 199.22, + "learning_rate": 0.00014454789427143034, + "loss": 0.5864, + "step": 10160 + }, + { + "epoch": 199.41, + "learning_rate": 0.0001445327149124139, + "loss": 0.582, + "step": 10170 + }, + { + "epoch": 199.61, + "learning_rate": 0.00014451751525139538, + "loss": 0.5867, + "step": 10180 + }, + { + "epoch": 199.8, + "learning_rate": 0.00014450229529281285, + "loss": 0.5813, + "step": 10190 + }, + { + "epoch": 200.0, + "learning_rate": 0.0001444870550411101, + "loss": 0.5824, + "step": 10200 + }, + { + "epoch": 200.0, + "eval_loss": 0.5854251384735107, + "eval_runtime": 2.1252, + "eval_samples_per_second": 1072.384, + "eval_steps_per_second": 4.235, + "step": 10200 + }, + { + "epoch": 200.2, + "learning_rate": 0.00014447179450073703, + "loss": 0.5841, + "step": 10210 + }, + { + "epoch": 200.39, + "learning_rate": 0.00014445651367614933, + "loss": 0.5785, + "step": 10220 + }, + { + "epoch": 200.59, + "learning_rate": 0.00014444121257180866, + "loss": 0.5868, + "step": 10230 + }, + { + "epoch": 200.78, + "learning_rate": 0.0001444258911921826, + "loss": 0.5842, + "step": 10240 + }, + { + "epoch": 200.98, + "learning_rate": 0.0001444105495417447, + "loss": 0.5797, + "step": 10250 + }, + { + "epoch": 201.0, + "eval_loss": 0.5834174752235413, + "eval_runtime": 2.1546, + "eval_samples_per_second": 1057.713, + "eval_steps_per_second": 4.177, + "step": 10251 + }, + { + "epoch": 201.18, + "learning_rate": 0.00014439518762497428, + "loss": 0.5814, + "step": 10260 + }, + { + "epoch": 201.37, + "learning_rate": 0.00014437980544635675, + "loss": 0.576, + "step": 10270 + }, + { + "epoch": 201.57, + "learning_rate": 0.00014436440301038335, + "loss": 0.5806, + "step": 10280 + }, + { + "epoch": 201.76, + "learning_rate": 0.00014434898032155127, + "loss": 0.5845, + "step": 10290 + }, + { + "epoch": 201.96, + "learning_rate": 0.00014433353738436359, + "loss": 0.5857, + "step": 10300 + }, + { + "epoch": 202.0, + "eval_loss": 0.5791680216789246, + "eval_runtime": 2.098, + "eval_samples_per_second": 1086.25, + "eval_steps_per_second": 4.29, + "step": 10302 + }, + { + "epoch": 202.16, + "learning_rate": 0.00014431807420332921, + "loss": 0.5806, + "step": 10310 + }, + { + "epoch": 202.35, + "learning_rate": 0.00014430259078296317, + "loss": 0.581, + "step": 10320 + }, + { + "epoch": 202.55, + "learning_rate": 0.00014428708712778618, + "loss": 0.579, + "step": 10330 + }, + { + "epoch": 202.75, + "learning_rate": 0.000144271563242325, + "loss": 0.5746, + "step": 10340 + }, + { + "epoch": 202.94, + "learning_rate": 0.0001442560191311123, + "loss": 0.5863, + "step": 10350 + }, + { + "epoch": 203.0, + "eval_loss": 0.5824128985404968, + "eval_runtime": 2.1448, + "eval_samples_per_second": 1062.581, + "eval_steps_per_second": 4.196, + "step": 10353 + }, + { + "epoch": 203.14, + "learning_rate": 0.00014424045479868655, + "loss": 0.5802, + "step": 10360 + }, + { + "epoch": 203.33, + "learning_rate": 0.0001442248702495922, + "loss": 0.5826, + "step": 10370 + }, + { + "epoch": 203.53, + "learning_rate": 0.00014420926548837959, + "loss": 0.5813, + "step": 10380 + }, + { + "epoch": 203.73, + "learning_rate": 0.000144193640519605, + "loss": 0.5836, + "step": 10390 + }, + { + "epoch": 203.92, + "learning_rate": 0.00014417799534783055, + "loss": 0.5826, + "step": 10400 + }, + { + "epoch": 204.0, + "eval_loss": 0.5838117003440857, + "eval_runtime": 2.1558, + "eval_samples_per_second": 1057.159, + "eval_steps_per_second": 4.175, + "step": 10404 + }, + { + "epoch": 204.12, + "learning_rate": 0.00014416232997762428, + "loss": 0.5853, + "step": 10410 + }, + { + "epoch": 204.31, + "learning_rate": 0.00014414664441356008, + "loss": 0.5776, + "step": 10420 + }, + { + "epoch": 204.51, + "learning_rate": 0.00014413093866021788, + "loss": 0.581, + "step": 10430 + }, + { + "epoch": 204.71, + "learning_rate": 0.00014411521272218335, + "loss": 0.5787, + "step": 10440 + }, + { + "epoch": 204.9, + "learning_rate": 0.0001440994666040481, + "loss": 0.579, + "step": 10450 + }, + { + "epoch": 205.0, + "eval_loss": 0.5807640552520752, + "eval_runtime": 2.1947, + "eval_samples_per_second": 1038.42, + "eval_steps_per_second": 4.101, + "step": 10455 + }, + { + "epoch": 205.1, + "learning_rate": 0.0001440837003104097, + "loss": 0.5808, + "step": 10460 + }, + { + "epoch": 205.29, + "learning_rate": 0.0001440679138458715, + "loss": 0.5786, + "step": 10470 + }, + { + "epoch": 205.49, + "learning_rate": 0.00014405210721504284, + "loss": 0.5807, + "step": 10480 + }, + { + "epoch": 205.69, + "learning_rate": 0.00014403628042253887, + "loss": 0.5794, + "step": 10490 + }, + { + "epoch": 205.88, + "learning_rate": 0.0001440204334729807, + "loss": 0.5758, + "step": 10500 + }, + { + "epoch": 206.0, + "eval_loss": 0.58095782995224, + "eval_runtime": 2.09, + "eval_samples_per_second": 1090.42, + "eval_steps_per_second": 4.306, + "step": 10506 + }, + { + "epoch": 206.08, + "learning_rate": 0.00014400456637099525, + "loss": 0.5798, + "step": 10510 + }, + { + "epoch": 206.27, + "learning_rate": 0.00014398867912121538, + "loss": 0.5726, + "step": 10520 + }, + { + "epoch": 206.47, + "learning_rate": 0.00014397277172827977, + "loss": 0.5783, + "step": 10530 + }, + { + "epoch": 206.67, + "learning_rate": 0.00014395684419683306, + "loss": 0.5765, + "step": 10540 + }, + { + "epoch": 206.86, + "learning_rate": 0.00014394089653152576, + "loss": 0.5798, + "step": 10550 + }, + { + "epoch": 207.0, + "eval_loss": 0.5782448649406433, + "eval_runtime": 2.1817, + "eval_samples_per_second": 1044.609, + "eval_steps_per_second": 4.125, + "step": 10557 + }, + { + "epoch": 207.06, + "learning_rate": 0.0001439249287370142, + "loss": 0.5779, + "step": 10560 + }, + { + "epoch": 207.25, + "learning_rate": 0.0001439089408179606, + "loss": 0.5839, + "step": 10570 + }, + { + "epoch": 207.45, + "learning_rate": 0.00014389293277903312, + "loss": 0.5778, + "step": 10580 + }, + { + "epoch": 207.65, + "learning_rate": 0.00014387690462490572, + "loss": 0.574, + "step": 10590 + }, + { + "epoch": 207.84, + "learning_rate": 0.00014386085636025828, + "loss": 0.576, + "step": 10600 + }, + { + "epoch": 208.0, + "eval_loss": 0.5818247199058533, + "eval_runtime": 2.2327, + "eval_samples_per_second": 1020.751, + "eval_steps_per_second": 4.031, + "step": 10608 + }, + { + "epoch": 208.04, + "learning_rate": 0.00014384478798977655, + "loss": 0.5776, + "step": 10610 + }, + { + "epoch": 208.24, + "learning_rate": 0.00014382869951815207, + "loss": 0.5749, + "step": 10620 + }, + { + "epoch": 208.43, + "learning_rate": 0.00014381259095008238, + "loss": 0.581, + "step": 10630 + }, + { + "epoch": 208.63, + "learning_rate": 0.0001437964622902708, + "loss": 0.5839, + "step": 10640 + }, + { + "epoch": 208.82, + "learning_rate": 0.00014378031354342656, + "loss": 0.5717, + "step": 10650 + }, + { + "epoch": 209.0, + "eval_loss": 0.5826478600502014, + "eval_runtime": 2.1857, + "eval_samples_per_second": 1042.691, + "eval_steps_per_second": 4.118, + "step": 10659 + }, + { + "epoch": 209.02, + "learning_rate": 0.00014376414471426472, + "loss": 0.5762, + "step": 10660 + }, + { + "epoch": 209.22, + "learning_rate": 0.00014374795580750617, + "loss": 0.5745, + "step": 10670 + }, + { + "epoch": 209.41, + "learning_rate": 0.0001437317468278778, + "loss": 0.5766, + "step": 10680 + }, + { + "epoch": 209.61, + "learning_rate": 0.00014371551778011218, + "loss": 0.5757, + "step": 10690 + }, + { + "epoch": 209.8, + "learning_rate": 0.0001436992686689479, + "loss": 0.5771, + "step": 10700 + }, + { + "epoch": 210.0, + "learning_rate": 0.00014368299949912928, + "loss": 0.5774, + "step": 10710 + }, + { + "epoch": 210.0, + "eval_loss": 0.5800108313560486, + "eval_runtime": 2.1291, + "eval_samples_per_second": 1070.401, + "eval_steps_per_second": 4.227, + "step": 10710 + }, + { + "epoch": 210.2, + "learning_rate": 0.0001436667102754066, + "loss": 0.5743, + "step": 10720 + }, + { + "epoch": 210.39, + "learning_rate": 0.00014365040100253597, + "loss": 0.5747, + "step": 10730 + }, + { + "epoch": 210.59, + "learning_rate": 0.00014363407168527928, + "loss": 0.5781, + "step": 10740 + }, + { + "epoch": 210.78, + "learning_rate": 0.00014361772232840433, + "loss": 0.5735, + "step": 10750 + }, + { + "epoch": 210.98, + "learning_rate": 0.0001436013529366848, + "loss": 0.5724, + "step": 10760 + }, + { + "epoch": 211.0, + "eval_loss": 0.5812757611274719, + "eval_runtime": 2.1877, + "eval_samples_per_second": 1041.728, + "eval_steps_per_second": 4.114, + "step": 10761 + }, + { + "epoch": 211.18, + "learning_rate": 0.00014358496351490015, + "loss": 0.5782, + "step": 10770 + }, + { + "epoch": 211.37, + "learning_rate": 0.00014356855406783578, + "loss": 0.5756, + "step": 10780 + }, + { + "epoch": 211.57, + "learning_rate": 0.00014355212460028283, + "loss": 0.5755, + "step": 10790 + }, + { + "epoch": 211.76, + "learning_rate": 0.00014353567511703836, + "loss": 0.5736, + "step": 10800 + }, + { + "epoch": 211.96, + "learning_rate": 0.00014351920562290525, + "loss": 0.5706, + "step": 10810 + }, + { + "epoch": 212.0, + "eval_loss": 0.575522243976593, + "eval_runtime": 2.1831, + "eval_samples_per_second": 1043.938, + "eval_steps_per_second": 4.123, + "step": 10812 + }, + { + "epoch": 212.16, + "learning_rate": 0.00014350271612269223, + "loss": 0.5736, + "step": 10820 + }, + { + "epoch": 212.35, + "learning_rate": 0.00014348620662121386, + "loss": 0.5775, + "step": 10830 + }, + { + "epoch": 212.55, + "learning_rate": 0.00014346967712329053, + "loss": 0.573, + "step": 10840 + }, + { + "epoch": 212.75, + "learning_rate": 0.00014345312763374852, + "loss": 0.5774, + "step": 10850 + }, + { + "epoch": 212.94, + "learning_rate": 0.00014343655815741987, + "loss": 0.5737, + "step": 10860 + }, + { + "epoch": 213.0, + "eval_loss": 0.5787567496299744, + "eval_runtime": 2.1398, + "eval_samples_per_second": 1065.048, + "eval_steps_per_second": 4.206, + "step": 10863 + }, + { + "epoch": 213.14, + "learning_rate": 0.00014341996869914254, + "loss": 0.5767, + "step": 10870 + }, + { + "epoch": 213.33, + "learning_rate": 0.00014340335926376027, + "loss": 0.5779, + "step": 10880 + }, + { + "epoch": 213.53, + "learning_rate": 0.00014338672985612263, + "loss": 0.5761, + "step": 10890 + }, + { + "epoch": 213.73, + "learning_rate": 0.00014337008048108504, + "loss": 0.5742, + "step": 10900 + }, + { + "epoch": 213.92, + "learning_rate": 0.00014335341114350876, + "loss": 0.5791, + "step": 10910 + }, + { + "epoch": 214.0, + "eval_loss": 0.576858639717102, + "eval_runtime": 2.1295, + "eval_samples_per_second": 1070.195, + "eval_steps_per_second": 4.226, + "step": 10914 + }, + { + "epoch": 214.12, + "learning_rate": 0.00014333672184826086, + "loss": 0.5749, + "step": 10920 + }, + { + "epoch": 214.31, + "learning_rate": 0.00014332001260021422, + "loss": 0.574, + "step": 10930 + }, + { + "epoch": 214.51, + "learning_rate": 0.00014330328340424759, + "loss": 0.5756, + "step": 10940 + }, + { + "epoch": 214.71, + "learning_rate": 0.0001432865342652455, + "loss": 0.5759, + "step": 10950 + }, + { + "epoch": 214.9, + "learning_rate": 0.00014326976518809836, + "loss": 0.5712, + "step": 10960 + }, + { + "epoch": 215.0, + "eval_loss": 0.576650083065033, + "eval_runtime": 2.2593, + "eval_samples_per_second": 1008.739, + "eval_steps_per_second": 3.984, + "step": 10965 + }, + { + "epoch": 215.1, + "learning_rate": 0.00014325297617770238, + "loss": 0.5725, + "step": 10970 + }, + { + "epoch": 215.29, + "learning_rate": 0.00014323616723895953, + "loss": 0.5747, + "step": 10980 + }, + { + "epoch": 215.49, + "learning_rate": 0.00014321933837677762, + "loss": 0.5712, + "step": 10990 + }, + { + "epoch": 215.69, + "learning_rate": 0.00014320248959607038, + "loss": 0.5758, + "step": 11000 + }, + { + "epoch": 215.88, + "learning_rate": 0.00014318562090175722, + "loss": 0.567, + "step": 11010 + }, + { + "epoch": 216.0, + "eval_loss": 0.5790178179740906, + "eval_runtime": 2.1116, + "eval_samples_per_second": 1079.269, + "eval_steps_per_second": 4.262, + "step": 11016 + }, + { + "epoch": 216.08, + "learning_rate": 0.00014316873229876345, + "loss": 0.5795, + "step": 11020 + }, + { + "epoch": 216.27, + "learning_rate": 0.00014315182379202017, + "loss": 0.5833, + "step": 11030 + }, + { + "epoch": 216.47, + "learning_rate": 0.00014313489538646425, + "loss": 0.579, + "step": 11040 + }, + { + "epoch": 216.67, + "learning_rate": 0.00014311794708703847, + "loss": 0.5771, + "step": 11050 + }, + { + "epoch": 216.86, + "learning_rate": 0.00014310097889869128, + "loss": 0.5671, + "step": 11060 + }, + { + "epoch": 217.0, + "eval_loss": 0.5734152793884277, + "eval_runtime": 2.2121, + "eval_samples_per_second": 1030.244, + "eval_steps_per_second": 4.069, + "step": 11067 + }, + { + "epoch": 217.06, + "learning_rate": 0.0001430839908263771, + "loss": 0.5713, + "step": 11070 + }, + { + "epoch": 217.25, + "learning_rate": 0.00014306698287505596, + "loss": 0.576, + "step": 11080 + }, + { + "epoch": 217.45, + "learning_rate": 0.00014304995504969392, + "loss": 0.5701, + "step": 11090 + }, + { + "epoch": 217.65, + "learning_rate": 0.00014303290735526262, + "loss": 0.5727, + "step": 11100 + }, + { + "epoch": 217.84, + "learning_rate": 0.00014301583979673966, + "loss": 0.5733, + "step": 11110 + }, + { + "epoch": 218.0, + "eval_loss": 0.5721628665924072, + "eval_runtime": 2.1562, + "eval_samples_per_second": 1056.94, + "eval_steps_per_second": 4.174, + "step": 11118 + }, + { + "epoch": 218.04, + "learning_rate": 0.0001429987523791084, + "loss": 0.5717, + "step": 11120 + }, + { + "epoch": 218.24, + "learning_rate": 0.00014298164510735795, + "loss": 0.5692, + "step": 11130 + }, + { + "epoch": 218.43, + "learning_rate": 0.00014296451798648328, + "loss": 0.5677, + "step": 11140 + }, + { + "epoch": 218.63, + "learning_rate": 0.0001429473710214851, + "loss": 0.5708, + "step": 11150 + }, + { + "epoch": 218.82, + "learning_rate": 0.00014293020421736997, + "loss": 0.5673, + "step": 11160 + }, + { + "epoch": 219.0, + "eval_loss": 0.5806319117546082, + "eval_runtime": 2.2602, + "eval_samples_per_second": 1008.303, + "eval_steps_per_second": 3.982, + "step": 11169 + }, + { + "epoch": 219.02, + "learning_rate": 0.0001429130175791502, + "loss": 0.5765, + "step": 11170 + }, + { + "epoch": 219.22, + "learning_rate": 0.00014289581111184388, + "loss": 0.5706, + "step": 11180 + }, + { + "epoch": 219.41, + "learning_rate": 0.00014287858482047493, + "loss": 0.5729, + "step": 11190 + }, + { + "epoch": 219.61, + "learning_rate": 0.0001428613387100731, + "loss": 0.5661, + "step": 11200 + }, + { + "epoch": 219.8, + "learning_rate": 0.0001428440727856738, + "loss": 0.5713, + "step": 11210 + }, + { + "epoch": 220.0, + "learning_rate": 0.00014282678705231832, + "loss": 0.5713, + "step": 11220 + }, + { + "epoch": 220.0, + "eval_loss": 0.5764245986938477, + "eval_runtime": 2.235, + "eval_samples_per_second": 1019.686, + "eval_steps_per_second": 4.027, + "step": 11220 + }, + { + "epoch": 220.2, + "learning_rate": 0.00014280948151505367, + "loss": 0.5748, + "step": 11230 + }, + { + "epoch": 220.39, + "learning_rate": 0.00014279215617893275, + "loss": 0.5672, + "step": 11240 + }, + { + "epoch": 220.59, + "learning_rate": 0.00014277481104901413, + "loss": 0.5696, + "step": 11250 + }, + { + "epoch": 220.78, + "learning_rate": 0.00014275744613036223, + "loss": 0.5736, + "step": 11260 + }, + { + "epoch": 220.98, + "learning_rate": 0.00014274006142804714, + "loss": 0.5669, + "step": 11270 + }, + { + "epoch": 221.0, + "eval_loss": 0.5693748593330383, + "eval_runtime": 2.174, + "eval_samples_per_second": 1048.299, + "eval_steps_per_second": 4.14, + "step": 11271 + }, + { + "epoch": 221.18, + "learning_rate": 0.00014272265694714492, + "loss": 0.5725, + "step": 11280 + }, + { + "epoch": 221.37, + "learning_rate": 0.0001427052326927372, + "loss": 0.5697, + "step": 11290 + }, + { + "epoch": 221.57, + "learning_rate": 0.0001426877886699115, + "loss": 0.5718, + "step": 11300 + }, + { + "epoch": 221.76, + "learning_rate": 0.00014267032488376113, + "loss": 0.5724, + "step": 11310 + }, + { + "epoch": 221.96, + "learning_rate": 0.00014265284133938507, + "loss": 0.5669, + "step": 11320 + }, + { + "epoch": 222.0, + "eval_loss": 0.5748663544654846, + "eval_runtime": 2.1119, + "eval_samples_per_second": 1079.109, + "eval_steps_per_second": 4.262, + "step": 11322 + }, + { + "epoch": 222.16, + "learning_rate": 0.00014263533804188813, + "loss": 0.5645, + "step": 11330 + }, + { + "epoch": 222.35, + "learning_rate": 0.00014261781499638092, + "loss": 0.5696, + "step": 11340 + }, + { + "epoch": 222.55, + "learning_rate": 0.00014260027220797976, + "loss": 0.5726, + "step": 11350 + }, + { + "epoch": 222.75, + "learning_rate": 0.00014258270968180674, + "loss": 0.5702, + "step": 11360 + }, + { + "epoch": 222.94, + "learning_rate": 0.0001425651274229897, + "loss": 0.5665, + "step": 11370 + }, + { + "epoch": 223.0, + "eval_loss": 0.573235273361206, + "eval_runtime": 2.1302, + "eval_samples_per_second": 1069.829, + "eval_steps_per_second": 4.225, + "step": 11373 + }, + { + "epoch": 223.14, + "learning_rate": 0.00014254752543666234, + "loss": 0.5678, + "step": 11380 + }, + { + "epoch": 223.33, + "learning_rate": 0.000142529903727964, + "loss": 0.5682, + "step": 11390 + }, + { + "epoch": 223.53, + "learning_rate": 0.00014251226230203984, + "loss": 0.5727, + "step": 11400 + }, + { + "epoch": 223.73, + "learning_rate": 0.00014249460116404073, + "loss": 0.5643, + "step": 11410 + }, + { + "epoch": 223.92, + "learning_rate": 0.0001424769203191234, + "loss": 0.5676, + "step": 11420 + }, + { + "epoch": 224.0, + "eval_loss": 0.5675996541976929, + "eval_runtime": 2.2583, + "eval_samples_per_second": 1009.18, + "eval_steps_per_second": 3.985, + "step": 11424 + }, + { + "epoch": 224.12, + "learning_rate": 0.00014245921977245018, + "loss": 0.5733, + "step": 11430 + }, + { + "epoch": 224.31, + "learning_rate": 0.00014244149952918927, + "loss": 0.5716, + "step": 11440 + }, + { + "epoch": 224.51, + "learning_rate": 0.00014242375959451462, + "loss": 0.5697, + "step": 11450 + }, + { + "epoch": 224.71, + "learning_rate": 0.00014240599997360583, + "loss": 0.5662, + "step": 11460 + }, + { + "epoch": 224.9, + "learning_rate": 0.00014238822067164837, + "loss": 0.5621, + "step": 11470 + }, + { + "epoch": 225.0, + "eval_loss": 0.5676630735397339, + "eval_runtime": 2.1447, + "eval_samples_per_second": 1062.606, + "eval_steps_per_second": 4.196, + "step": 11475 + }, + { + "epoch": 225.1, + "learning_rate": 0.00014237042169383337, + "loss": 0.5671, + "step": 11480 + }, + { + "epoch": 225.29, + "learning_rate": 0.00014235260304535776, + "loss": 0.5671, + "step": 11490 + }, + { + "epoch": 225.49, + "learning_rate": 0.00014233476473142414, + "loss": 0.5673, + "step": 11500 + }, + { + "epoch": 225.69, + "learning_rate": 0.00014231690675724096, + "loss": 0.5709, + "step": 11510 + }, + { + "epoch": 225.88, + "learning_rate": 0.0001422990291280223, + "loss": 0.5623, + "step": 11520 + }, + { + "epoch": 226.0, + "eval_loss": 0.5714594125747681, + "eval_runtime": 2.2157, + "eval_samples_per_second": 1028.587, + "eval_steps_per_second": 4.062, + "step": 11526 + }, + { + "epoch": 226.08, + "learning_rate": 0.00014228113184898804, + "loss": 0.5665, + "step": 11530 + }, + { + "epoch": 226.27, + "learning_rate": 0.0001422632149253638, + "loss": 0.5607, + "step": 11540 + }, + { + "epoch": 226.47, + "learning_rate": 0.00014224527836238093, + "loss": 0.57, + "step": 11550 + }, + { + "epoch": 226.67, + "learning_rate": 0.0001422273221652765, + "loss": 0.5676, + "step": 11560 + }, + { + "epoch": 226.86, + "learning_rate": 0.0001422093463392933, + "loss": 0.5695, + "step": 11570 + }, + { + "epoch": 227.0, + "eval_loss": 0.5675697326660156, + "eval_runtime": 2.2269, + "eval_samples_per_second": 1023.399, + "eval_steps_per_second": 4.042, + "step": 11577 + }, + { + "epoch": 227.06, + "learning_rate": 0.00014219135088967987, + "loss": 0.5649, + "step": 11580 + }, + { + "epoch": 227.25, + "learning_rate": 0.00014217333582169052, + "loss": 0.5683, + "step": 11590 + }, + { + "epoch": 227.45, + "learning_rate": 0.00014215530114058522, + "loss": 0.5651, + "step": 11600 + }, + { + "epoch": 227.65, + "learning_rate": 0.00014213724685162968, + "loss": 0.5641, + "step": 11610 + }, + { + "epoch": 227.84, + "learning_rate": 0.00014211917296009534, + "loss": 0.5657, + "step": 11620 + }, + { + "epoch": 228.0, + "eval_loss": 0.5667091608047485, + "eval_runtime": 2.1477, + "eval_samples_per_second": 1061.131, + "eval_steps_per_second": 4.191, + "step": 11628 + }, + { + "epoch": 228.04, + "learning_rate": 0.00014210107947125943, + "loss": 0.5616, + "step": 11630 + }, + { + "epoch": 228.24, + "learning_rate": 0.00014208296639040482, + "loss": 0.5638, + "step": 11640 + }, + { + "epoch": 228.43, + "learning_rate": 0.0001420648337228201, + "loss": 0.5682, + "step": 11650 + }, + { + "epoch": 228.63, + "learning_rate": 0.00014204668147379962, + "loss": 0.5694, + "step": 11660 + }, + { + "epoch": 228.82, + "learning_rate": 0.00014202850964864348, + "loss": 0.565, + "step": 11670 + }, + { + "epoch": 229.0, + "eval_loss": 0.5644382238388062, + "eval_runtime": 2.1257, + "eval_samples_per_second": 1072.111, + "eval_steps_per_second": 4.234, + "step": 11679 + }, + { + "epoch": 229.02, + "learning_rate": 0.00014201031825265736, + "loss": 0.5655, + "step": 11680 + }, + { + "epoch": 229.22, + "learning_rate": 0.0001419921072911528, + "loss": 0.5682, + "step": 11690 + }, + { + "epoch": 229.41, + "learning_rate": 0.00014197387676944697, + "loss": 0.5641, + "step": 11700 + }, + { + "epoch": 229.61, + "learning_rate": 0.00014195562669286278, + "loss": 0.5655, + "step": 11710 + }, + { + "epoch": 229.8, + "learning_rate": 0.00014193735706672888, + "loss": 0.5598, + "step": 11720 + }, + { + "epoch": 230.0, + "learning_rate": 0.00014191906789637955, + "loss": 0.5617, + "step": 11730 + }, + { + "epoch": 230.0, + "eval_loss": 0.5650487542152405, + "eval_runtime": 2.122, + "eval_samples_per_second": 1074.007, + "eval_steps_per_second": 4.241, + "step": 11730 + }, + { + "epoch": 230.2, + "learning_rate": 0.00014190075918715483, + "loss": 0.5645, + "step": 11740 + }, + { + "epoch": 230.39, + "learning_rate": 0.00014188243094440047, + "loss": 0.5655, + "step": 11750 + }, + { + "epoch": 230.59, + "learning_rate": 0.00014186408317346788, + "loss": 0.563, + "step": 11760 + }, + { + "epoch": 230.78, + "learning_rate": 0.00014184571587971424, + "loss": 0.564, + "step": 11770 + }, + { + "epoch": 230.98, + "learning_rate": 0.00014182732906850234, + "loss": 0.5587, + "step": 11780 + }, + { + "epoch": 231.0, + "eval_loss": 0.5637187957763672, + "eval_runtime": 2.1964, + "eval_samples_per_second": 1037.624, + "eval_steps_per_second": 4.098, + "step": 11781 + }, + { + "epoch": 231.18, + "learning_rate": 0.00014180892274520075, + "loss": 0.5592, + "step": 11790 + }, + { + "epoch": 231.37, + "learning_rate": 0.0001417904969151837, + "loss": 0.563, + "step": 11800 + }, + { + "epoch": 231.57, + "learning_rate": 0.00014177205158383114, + "loss": 0.5652, + "step": 11810 + }, + { + "epoch": 231.76, + "learning_rate": 0.00014175358675652867, + "loss": 0.5653, + "step": 11820 + }, + { + "epoch": 231.96, + "learning_rate": 0.00014173510243866764, + "loss": 0.5591, + "step": 11830 + }, + { + "epoch": 232.0, + "eval_loss": 0.5652225017547607, + "eval_runtime": 2.1694, + "eval_samples_per_second": 1050.52, + "eval_steps_per_second": 4.149, + "step": 11832 + }, + { + "epoch": 232.16, + "learning_rate": 0.000141716598635645, + "loss": 0.5669, + "step": 11840 + }, + { + "epoch": 232.35, + "learning_rate": 0.0001416980753528635, + "loss": 0.5572, + "step": 11850 + }, + { + "epoch": 232.55, + "learning_rate": 0.0001416795325957315, + "loss": 0.563, + "step": 11860 + }, + { + "epoch": 232.75, + "learning_rate": 0.0001416609703696631, + "loss": 0.5621, + "step": 11870 + }, + { + "epoch": 232.94, + "learning_rate": 0.00014164238868007801, + "loss": 0.5607, + "step": 11880 + }, + { + "epoch": 233.0, + "eval_loss": 0.5647706985473633, + "eval_runtime": 2.2045, + "eval_samples_per_second": 1033.785, + "eval_steps_per_second": 4.083, + "step": 11883 + }, + { + "epoch": 233.14, + "learning_rate": 0.00014162378753240171, + "loss": 0.5612, + "step": 11890 + }, + { + "epoch": 233.33, + "learning_rate": 0.0001416051669320653, + "loss": 0.5632, + "step": 11900 + }, + { + "epoch": 233.53, + "learning_rate": 0.00014158652688450558, + "loss": 0.568, + "step": 11910 + }, + { + "epoch": 233.73, + "learning_rate": 0.00014156786739516505, + "loss": 0.5595, + "step": 11920 + }, + { + "epoch": 233.92, + "learning_rate": 0.00014154918846949184, + "loss": 0.559, + "step": 11930 + }, + { + "epoch": 234.0, + "eval_loss": 0.5681033730506897, + "eval_runtime": 2.1383, + "eval_samples_per_second": 1065.785, + "eval_steps_per_second": 4.209, + "step": 11934 + }, + { + "epoch": 234.12, + "learning_rate": 0.0001415304901129398, + "loss": 0.5634, + "step": 11940 + }, + { + "epoch": 234.31, + "learning_rate": 0.0001415117723309684, + "loss": 0.5601, + "step": 11950 + }, + { + "epoch": 234.51, + "learning_rate": 0.00014149303512904284, + "loss": 0.5612, + "step": 11960 + }, + { + "epoch": 234.71, + "learning_rate": 0.00014147427851263398, + "loss": 0.5619, + "step": 11970 + }, + { + "epoch": 234.9, + "learning_rate": 0.00014145550248721828, + "loss": 0.5601, + "step": 11980 + }, + { + "epoch": 235.0, + "eval_loss": 0.5636653304100037, + "eval_runtime": 2.2295, + "eval_samples_per_second": 1022.208, + "eval_steps_per_second": 4.037, + "step": 11985 + }, + { + "epoch": 235.1, + "learning_rate": 0.00014143670705827797, + "loss": 0.5592, + "step": 11990 + }, + { + "epoch": 235.29, + "learning_rate": 0.00014141789223130088, + "loss": 0.5628, + "step": 12000 + }, + { + "epoch": 235.49, + "learning_rate": 0.00014139905801178055, + "loss": 0.5556, + "step": 12010 + }, + { + "epoch": 235.69, + "learning_rate": 0.0001413802044052161, + "loss": 0.5593, + "step": 12020 + }, + { + "epoch": 235.88, + "learning_rate": 0.00014136133141711237, + "loss": 0.5605, + "step": 12030 + }, + { + "epoch": 236.0, + "eval_loss": 0.5697084069252014, + "eval_runtime": 2.1042, + "eval_samples_per_second": 1083.049, + "eval_steps_per_second": 4.277, + "step": 12036 + }, + { + "epoch": 236.08, + "learning_rate": 0.0001413424390529799, + "loss": 0.5606, + "step": 12040 + }, + { + "epoch": 236.27, + "learning_rate": 0.00014132352731833478, + "loss": 0.5586, + "step": 12050 + }, + { + "epoch": 236.47, + "learning_rate": 0.00014130459621869884, + "loss": 0.5618, + "step": 12060 + }, + { + "epoch": 236.67, + "learning_rate": 0.00014128564575959957, + "loss": 0.5602, + "step": 12070 + }, + { + "epoch": 236.86, + "learning_rate": 0.00014126667594657, + "loss": 0.5555, + "step": 12080 + }, + { + "epoch": 237.0, + "eval_loss": 0.5593078136444092, + "eval_runtime": 2.1583, + "eval_samples_per_second": 1055.931, + "eval_steps_per_second": 4.17, + "step": 12087 + }, + { + "epoch": 237.06, + "learning_rate": 0.000141247686785149, + "loss": 0.5566, + "step": 12090 + }, + { + "epoch": 237.25, + "learning_rate": 0.0001412286782808809, + "loss": 0.5645, + "step": 12100 + }, + { + "epoch": 237.45, + "learning_rate": 0.0001412096504393158, + "loss": 0.5599, + "step": 12110 + }, + { + "epoch": 237.65, + "learning_rate": 0.00014119060326600938, + "loss": 0.5636, + "step": 12120 + }, + { + "epoch": 237.84, + "learning_rate": 0.000141171536766523, + "loss": 0.5602, + "step": 12130 + }, + { + "epoch": 238.0, + "eval_loss": 0.5682786107063293, + "eval_runtime": 2.2249, + "eval_samples_per_second": 1024.331, + "eval_steps_per_second": 4.045, + "step": 12138 + }, + { + "epoch": 238.04, + "learning_rate": 0.00014115245094642364, + "loss": 0.5597, + "step": 12140 + }, + { + "epoch": 238.24, + "learning_rate": 0.00014113334581128395, + "loss": 0.5586, + "step": 12150 + }, + { + "epoch": 238.43, + "learning_rate": 0.00014111422136668222, + "loss": 0.559, + "step": 12160 + }, + { + "epoch": 238.63, + "learning_rate": 0.00014109507761820233, + "loss": 0.5573, + "step": 12170 + }, + { + "epoch": 238.82, + "learning_rate": 0.00014107591457143383, + "loss": 0.5647, + "step": 12180 + }, + { + "epoch": 239.0, + "eval_loss": 0.562912106513977, + "eval_runtime": 2.2937, + "eval_samples_per_second": 993.61, + "eval_steps_per_second": 3.924, + "step": 12189 + }, + { + "epoch": 239.02, + "learning_rate": 0.00014105673223197191, + "loss": 0.5602, + "step": 12190 + }, + { + "epoch": 239.22, + "learning_rate": 0.0001410375306054174, + "loss": 0.5584, + "step": 12200 + }, + { + "epoch": 239.41, + "learning_rate": 0.00014101830969737674, + "loss": 0.5558, + "step": 12210 + }, + { + "epoch": 239.61, + "learning_rate": 0.00014099906951346196, + "loss": 0.5594, + "step": 12220 + }, + { + "epoch": 239.8, + "learning_rate": 0.00014097981005929087, + "loss": 0.5571, + "step": 12230 + }, + { + "epoch": 240.0, + "learning_rate": 0.00014096053134048667, + "loss": 0.5575, + "step": 12240 + }, + { + "epoch": 240.0, + "eval_loss": 0.5610710978507996, + "eval_runtime": 2.1604, + "eval_samples_per_second": 1054.903, + "eval_steps_per_second": 4.166, + "step": 12240 + }, + { + "epoch": 240.2, + "learning_rate": 0.00014094123336267842, + "loss": 0.5567, + "step": 12250 + }, + { + "epoch": 240.39, + "learning_rate": 0.00014092191613150062, + "loss": 0.5612, + "step": 12260 + }, + { + "epoch": 240.59, + "learning_rate": 0.00014090257965259357, + "loss": 0.5601, + "step": 12270 + }, + { + "epoch": 240.78, + "learning_rate": 0.00014088322393160298, + "loss": 0.5577, + "step": 12280 + }, + { + "epoch": 240.98, + "learning_rate": 0.00014086384897418037, + "loss": 0.5577, + "step": 12290 + }, + { + "epoch": 241.0, + "eval_loss": 0.5588154792785645, + "eval_runtime": 2.1478, + "eval_samples_per_second": 1061.078, + "eval_steps_per_second": 4.19, + "step": 12291 + }, + { + "epoch": 241.18, + "learning_rate": 0.00014084445478598274, + "loss": 0.5576, + "step": 12300 + }, + { + "epoch": 241.37, + "learning_rate": 0.00014082504137267283, + "loss": 0.5548, + "step": 12310 + }, + { + "epoch": 241.57, + "learning_rate": 0.00014080560873991883, + "loss": 0.5584, + "step": 12320 + }, + { + "epoch": 241.76, + "learning_rate": 0.0001407861568933947, + "loss": 0.556, + "step": 12330 + }, + { + "epoch": 241.96, + "learning_rate": 0.00014076668583877993, + "loss": 0.5514, + "step": 12340 + }, + { + "epoch": 242.0, + "eval_loss": 0.5583884119987488, + "eval_runtime": 2.1247, + "eval_samples_per_second": 1072.62, + "eval_steps_per_second": 4.236, + "step": 12342 + }, + { + "epoch": 242.16, + "learning_rate": 0.00014074719558175968, + "loss": 0.5522, + "step": 12350 + }, + { + "epoch": 242.35, + "learning_rate": 0.0001407276861280246, + "loss": 0.5528, + "step": 12360 + }, + { + "epoch": 242.55, + "learning_rate": 0.000140708157483271, + "loss": 0.5548, + "step": 12370 + }, + { + "epoch": 242.75, + "learning_rate": 0.0001406886096532009, + "loss": 0.5559, + "step": 12380 + }, + { + "epoch": 242.94, + "learning_rate": 0.00014066904264352175, + "loss": 0.5581, + "step": 12390 + }, + { + "epoch": 243.0, + "eval_loss": 0.5565963983535767, + "eval_runtime": 2.1477, + "eval_samples_per_second": 1061.154, + "eval_steps_per_second": 4.191, + "step": 12393 + }, + { + "epoch": 243.14, + "learning_rate": 0.0001406494564599467, + "loss": 0.5556, + "step": 12400 + }, + { + "epoch": 243.33, + "learning_rate": 0.00014062985110819453, + "loss": 0.5603, + "step": 12410 + }, + { + "epoch": 243.53, + "learning_rate": 0.0001406102265939895, + "loss": 0.5578, + "step": 12420 + }, + { + "epoch": 243.73, + "learning_rate": 0.00014059058292306155, + "loss": 0.5555, + "step": 12430 + }, + { + "epoch": 243.92, + "learning_rate": 0.0001405709201011462, + "loss": 0.555, + "step": 12440 + }, + { + "epoch": 244.0, + "eval_loss": 0.5562523007392883, + "eval_runtime": 2.1073, + "eval_samples_per_second": 1081.465, + "eval_steps_per_second": 4.271, + "step": 12444 + }, + { + "epoch": 244.12, + "learning_rate": 0.00014055123813398455, + "loss": 0.5546, + "step": 12450 + }, + { + "epoch": 244.31, + "learning_rate": 0.00014053153702732333, + "loss": 0.5548, + "step": 12460 + }, + { + "epoch": 244.51, + "learning_rate": 0.00014051181678691475, + "loss": 0.5543, + "step": 12470 + }, + { + "epoch": 244.71, + "learning_rate": 0.00014049207741851676, + "loss": 0.5547, + "step": 12480 + }, + { + "epoch": 244.9, + "learning_rate": 0.00014047231892789274, + "loss": 0.5571, + "step": 12490 + }, + { + "epoch": 245.0, + "eval_loss": 0.5540693402290344, + "eval_runtime": 2.2162, + "eval_samples_per_second": 1028.327, + "eval_steps_per_second": 4.061, + "step": 12495 + }, + { + "epoch": 245.1, + "learning_rate": 0.0001404525413208118, + "loss": 0.5528, + "step": 12500 + }, + { + "epoch": 245.29, + "learning_rate": 0.0001404327446030485, + "loss": 0.5579, + "step": 12510 + }, + { + "epoch": 245.49, + "learning_rate": 0.00014041292878038308, + "loss": 0.557, + "step": 12520 + }, + { + "epoch": 245.69, + "learning_rate": 0.00014039309385860133, + "loss": 0.5521, + "step": 12530 + }, + { + "epoch": 245.88, + "learning_rate": 0.00014037323984349454, + "loss": 0.5549, + "step": 12540 + }, + { + "epoch": 246.0, + "eval_loss": 0.5541282296180725, + "eval_runtime": 2.1726, + "eval_samples_per_second": 1048.955, + "eval_steps_per_second": 4.142, + "step": 12546 + }, + { + "epoch": 246.08, + "learning_rate": 0.00014035336674085973, + "loss": 0.5601, + "step": 12550 + }, + { + "epoch": 246.27, + "learning_rate": 0.0001403334745564993, + "loss": 0.5522, + "step": 12560 + }, + { + "epoch": 246.47, + "learning_rate": 0.00014031356329622142, + "loss": 0.5521, + "step": 12570 + }, + { + "epoch": 246.67, + "learning_rate": 0.00014029363296583967, + "loss": 0.5543, + "step": 12580 + }, + { + "epoch": 246.86, + "learning_rate": 0.00014027368357117327, + "loss": 0.5521, + "step": 12590 + }, + { + "epoch": 247.0, + "eval_loss": 0.5520634651184082, + "eval_runtime": 2.1385, + "eval_samples_per_second": 1065.694, + "eval_steps_per_second": 4.209, + "step": 12597 + }, + { + "epoch": 247.06, + "learning_rate": 0.00014025371511804704, + "loss": 0.5545, + "step": 12600 + }, + { + "epoch": 247.25, + "learning_rate": 0.0001402337276122913, + "loss": 0.5564, + "step": 12610 + }, + { + "epoch": 247.45, + "learning_rate": 0.00014021372105974192, + "loss": 0.5577, + "step": 12620 + }, + { + "epoch": 247.65, + "learning_rate": 0.00014019369546624041, + "loss": 0.5555, + "step": 12630 + }, + { + "epoch": 247.84, + "learning_rate": 0.0001401736508376338, + "loss": 0.55, + "step": 12640 + }, + { + "epoch": 248.0, + "eval_loss": 0.5567444562911987, + "eval_runtime": 2.1464, + "eval_samples_per_second": 1061.759, + "eval_steps_per_second": 4.193, + "step": 12648 + }, + { + "epoch": 248.04, + "learning_rate": 0.00014015358717977462, + "loss": 0.5494, + "step": 12650 + }, + { + "epoch": 248.24, + "learning_rate": 0.00014013350449852108, + "loss": 0.5543, + "step": 12660 + }, + { + "epoch": 248.43, + "learning_rate": 0.00014011340279973685, + "loss": 0.556, + "step": 12670 + }, + { + "epoch": 248.63, + "learning_rate": 0.00014009328208929115, + "loss": 0.5497, + "step": 12680 + }, + { + "epoch": 248.82, + "learning_rate": 0.00014007314237305882, + "loss": 0.5518, + "step": 12690 + }, + { + "epoch": 249.0, + "eval_loss": 0.5559237599372864, + "eval_runtime": 2.164, + "eval_samples_per_second": 1053.148, + "eval_steps_per_second": 4.159, + "step": 12699 + }, + { + "epoch": 249.02, + "learning_rate": 0.00014005298365692018, + "loss": 0.5504, + "step": 12700 + }, + { + "epoch": 249.22, + "learning_rate": 0.00014003280594676113, + "loss": 0.5543, + "step": 12710 + }, + { + "epoch": 249.41, + "learning_rate": 0.00014001260924847314, + "loss": 0.5519, + "step": 12720 + }, + { + "epoch": 249.61, + "learning_rate": 0.00013999239356795315, + "loss": 0.55, + "step": 12730 + }, + { + "epoch": 249.8, + "learning_rate": 0.0001399721589111037, + "loss": 0.5529, + "step": 12740 + }, + { + "epoch": 250.0, + "learning_rate": 0.0001399519052838329, + "loss": 0.5522, + "step": 12750 + }, + { + "epoch": 250.0, + "eval_loss": 0.5535538792610168, + "eval_runtime": 2.0888, + "eval_samples_per_second": 1091.067, + "eval_steps_per_second": 4.309, + "step": 12750 + }, + { + "epoch": 250.2, + "learning_rate": 0.00013993163269205428, + "loss": 0.555, + "step": 12760 + }, + { + "epoch": 250.39, + "learning_rate": 0.00013991134114168708, + "loss": 0.5497, + "step": 12770 + }, + { + "epoch": 250.59, + "learning_rate": 0.0001398910306386559, + "loss": 0.5566, + "step": 12780 + }, + { + "epoch": 250.78, + "learning_rate": 0.000139870701188891, + "loss": 0.5536, + "step": 12790 + }, + { + "epoch": 250.98, + "learning_rate": 0.00013985035279832808, + "loss": 0.5481, + "step": 12800 + }, + { + "epoch": 251.0, + "eval_loss": 0.5503749251365662, + "eval_runtime": 2.191, + "eval_samples_per_second": 1040.186, + "eval_steps_per_second": 4.108, + "step": 12801 + }, + { + "epoch": 251.18, + "learning_rate": 0.00013982998547290847, + "loss": 0.5522, + "step": 12810 + }, + { + "epoch": 251.37, + "learning_rate": 0.00013980959921857893, + "loss": 0.5494, + "step": 12820 + }, + { + "epoch": 251.57, + "learning_rate": 0.00013978919404129185, + "loss": 0.5526, + "step": 12830 + }, + { + "epoch": 251.76, + "learning_rate": 0.00013976876994700502, + "loss": 0.5524, + "step": 12840 + }, + { + "epoch": 251.96, + "learning_rate": 0.00013974832694168188, + "loss": 0.5516, + "step": 12850 + }, + { + "epoch": 252.0, + "eval_loss": 0.5562964081764221, + "eval_runtime": 2.1875, + "eval_samples_per_second": 1041.847, + "eval_steps_per_second": 4.114, + "step": 12852 + }, + { + "epoch": 252.16, + "learning_rate": 0.00013972786503129125, + "loss": 0.5528, + "step": 12860 + }, + { + "epoch": 252.35, + "learning_rate": 0.00013970738422180765, + "loss": 0.5506, + "step": 12870 + }, + { + "epoch": 252.55, + "learning_rate": 0.00013968688451921094, + "loss": 0.5546, + "step": 12880 + }, + { + "epoch": 252.75, + "learning_rate": 0.00013966636592948662, + "loss": 0.5459, + "step": 12890 + }, + { + "epoch": 252.94, + "learning_rate": 0.00013964582845862566, + "loss": 0.5524, + "step": 12900 + }, + { + "epoch": 253.0, + "eval_loss": 0.5502599477767944, + "eval_runtime": 2.2087, + "eval_samples_per_second": 1031.838, + "eval_steps_per_second": 4.075, + "step": 12903 + }, + { + "epoch": 253.14, + "learning_rate": 0.0001396252721126245, + "loss": 0.5477, + "step": 12910 + }, + { + "epoch": 253.33, + "learning_rate": 0.0001396046968974852, + "loss": 0.5459, + "step": 12920 + }, + { + "epoch": 253.53, + "learning_rate": 0.00013958410281921522, + "loss": 0.5495, + "step": 12930 + }, + { + "epoch": 253.73, + "learning_rate": 0.00013956348988382756, + "loss": 0.55, + "step": 12940 + }, + { + "epoch": 253.92, + "learning_rate": 0.00013954285809734078, + "loss": 0.5582, + "step": 12950 + }, + { + "epoch": 254.0, + "eval_loss": 0.5519425272941589, + "eval_runtime": 2.2043, + "eval_samples_per_second": 1033.891, + "eval_steps_per_second": 4.083, + "step": 12954 + }, + { + "epoch": 254.12, + "learning_rate": 0.00013952220746577887, + "loss": 0.5544, + "step": 12960 + }, + { + "epoch": 254.31, + "learning_rate": 0.0001395015379951714, + "loss": 0.5509, + "step": 12970 + }, + { + "epoch": 254.51, + "learning_rate": 0.00013948084969155332, + "loss": 0.554, + "step": 12980 + }, + { + "epoch": 254.71, + "learning_rate": 0.00013946014256096523, + "loss": 0.5538, + "step": 12990 + }, + { + "epoch": 254.9, + "learning_rate": 0.0001394394166094531, + "loss": 0.5514, + "step": 13000 + }, + { + "epoch": 255.0, + "eval_loss": 0.5504211187362671, + "eval_runtime": 2.1612, + "eval_samples_per_second": 1054.513, + "eval_steps_per_second": 4.164, + "step": 13005 + }, + { + "epoch": 255.1, + "learning_rate": 0.00013941867184306841, + "loss": 0.5475, + "step": 13010 + }, + { + "epoch": 255.29, + "learning_rate": 0.00013939790826786826, + "loss": 0.5481, + "step": 13020 + }, + { + "epoch": 255.49, + "learning_rate": 0.00013937712588991513, + "loss": 0.5661, + "step": 13030 + }, + { + "epoch": 255.69, + "learning_rate": 0.00013935632471527692, + "loss": 0.5604, + "step": 13040 + }, + { + "epoch": 255.88, + "learning_rate": 0.00013933550475002724, + "loss": 0.5498, + "step": 13050 + }, + { + "epoch": 256.0, + "eval_loss": 0.5519892573356628, + "eval_runtime": 2.1905, + "eval_samples_per_second": 1040.423, + "eval_steps_per_second": 4.109, + "step": 13056 + }, + { + "epoch": 256.08, + "learning_rate": 0.00013931466600024497, + "loss": 0.5487, + "step": 13060 + }, + { + "epoch": 256.27, + "learning_rate": 0.00013929380847201462, + "loss": 0.5514, + "step": 13070 + }, + { + "epoch": 256.47, + "learning_rate": 0.00013927293217142603, + "loss": 0.5507, + "step": 13080 + }, + { + "epoch": 256.67, + "learning_rate": 0.0001392520371045747, + "loss": 0.5525, + "step": 13090 + }, + { + "epoch": 256.86, + "learning_rate": 0.0001392311232775615, + "loss": 0.5481, + "step": 13100 + }, + { + "epoch": 257.0, + "eval_loss": 0.5540376305580139, + "eval_runtime": 2.2252, + "eval_samples_per_second": 1024.175, + "eval_steps_per_second": 4.045, + "step": 13107 + }, + { + "epoch": 257.06, + "learning_rate": 0.0001392101906964928, + "loss": 0.5545, + "step": 13110 + }, + { + "epoch": 257.25, + "learning_rate": 0.00013918923936748044, + "loss": 0.5499, + "step": 13120 + }, + { + "epoch": 257.45, + "learning_rate": 0.00013916826929664171, + "loss": 0.5492, + "step": 13130 + }, + { + "epoch": 257.65, + "learning_rate": 0.0001391472804900995, + "loss": 0.5522, + "step": 13140 + }, + { + "epoch": 257.84, + "learning_rate": 0.00013912627295398195, + "loss": 0.551, + "step": 13150 + }, + { + "epoch": 258.0, + "eval_loss": 0.5503237843513489, + "eval_runtime": 2.1524, + "eval_samples_per_second": 1058.821, + "eval_steps_per_second": 4.181, + "step": 13158 + }, + { + "epoch": 258.04, + "learning_rate": 0.00013910524669442288, + "loss": 0.5414, + "step": 13160 + }, + { + "epoch": 258.24, + "learning_rate": 0.00013908420171756145, + "loss": 0.5502, + "step": 13170 + }, + { + "epoch": 258.43, + "learning_rate": 0.00013906313802954234, + "loss": 0.5495, + "step": 13180 + }, + { + "epoch": 258.63, + "learning_rate": 0.00013904205563651566, + "loss": 0.5498, + "step": 13190 + }, + { + "epoch": 258.82, + "learning_rate": 0.00013902095454463705, + "loss": 0.5495, + "step": 13200 + }, + { + "epoch": 259.0, + "eval_loss": 0.5490508079528809, + "eval_runtime": 2.1616, + "eval_samples_per_second": 1054.332, + "eval_steps_per_second": 4.164, + "step": 13209 + }, + { + "epoch": 259.02, + "learning_rate": 0.0001389998347600675, + "loss": 0.5475, + "step": 13210 + }, + { + "epoch": 259.22, + "learning_rate": 0.0001389786962889735, + "loss": 0.5538, + "step": 13220 + }, + { + "epoch": 259.41, + "learning_rate": 0.0001389575391375271, + "loss": 0.5466, + "step": 13230 + }, + { + "epoch": 259.61, + "learning_rate": 0.00013893636331190564, + "loss": 0.542, + "step": 13240 + }, + { + "epoch": 259.8, + "learning_rate": 0.00013891516881829198, + "loss": 0.5466, + "step": 13250 + }, + { + "epoch": 260.0, + "learning_rate": 0.0001388939556628745, + "loss": 0.5483, + "step": 13260 + }, + { + "epoch": 260.0, + "eval_loss": 0.5461385846138, + "eval_runtime": 2.2603, + "eval_samples_per_second": 1008.293, + "eval_steps_per_second": 3.982, + "step": 13260 + }, + { + "epoch": 260.2, + "learning_rate": 0.00013887272385184696, + "loss": 0.5471, + "step": 13270 + }, + { + "epoch": 260.39, + "learning_rate": 0.00013885147339140854, + "loss": 0.5463, + "step": 13280 + }, + { + "epoch": 260.59, + "learning_rate": 0.00013883020428776392, + "loss": 0.5462, + "step": 13290 + }, + { + "epoch": 260.78, + "learning_rate": 0.00013880891654712317, + "loss": 0.542, + "step": 13300 + }, + { + "epoch": 260.98, + "learning_rate": 0.0001387876101757019, + "loss": 0.5468, + "step": 13310 + }, + { + "epoch": 261.0, + "eval_loss": 0.5586115121841431, + "eval_runtime": 2.2776, + "eval_samples_per_second": 1000.623, + "eval_steps_per_second": 3.952, + "step": 13311 + }, + { + "epoch": 261.18, + "learning_rate": 0.00013876628517972106, + "loss": 0.5529, + "step": 13320 + }, + { + "epoch": 261.37, + "learning_rate": 0.00013874494156540707, + "loss": 0.5473, + "step": 13330 + }, + { + "epoch": 261.57, + "learning_rate": 0.00013872357933899176, + "loss": 0.5472, + "step": 13340 + }, + { + "epoch": 261.76, + "learning_rate": 0.0001387021985067125, + "loss": 0.5433, + "step": 13350 + }, + { + "epoch": 261.96, + "learning_rate": 0.00013868079907481196, + "loss": 0.5454, + "step": 13360 + }, + { + "epoch": 262.0, + "eval_loss": 0.5494788885116577, + "eval_runtime": 2.2512, + "eval_samples_per_second": 1012.37, + "eval_steps_per_second": 3.998, + "step": 13362 + }, + { + "epoch": 262.16, + "learning_rate": 0.0001386593810495383, + "loss": 0.5498, + "step": 13370 + }, + { + "epoch": 262.35, + "learning_rate": 0.0001386379444371451, + "loss": 0.5504, + "step": 13380 + }, + { + "epoch": 262.55, + "learning_rate": 0.00013861648924389143, + "loss": 0.5442, + "step": 13390 + }, + { + "epoch": 262.75, + "learning_rate": 0.00013859501547604166, + "loss": 0.5405, + "step": 13400 + }, + { + "epoch": 262.94, + "learning_rate": 0.00013857352313986567, + "loss": 0.5447, + "step": 13410 + }, + { + "epoch": 263.0, + "eval_loss": 0.5454888939857483, + "eval_runtime": 2.1676, + "eval_samples_per_second": 1051.378, + "eval_steps_per_second": 4.152, + "step": 13413 + }, + { + "epoch": 263.14, + "learning_rate": 0.00013855201224163876, + "loss": 0.5415, + "step": 13420 + }, + { + "epoch": 263.33, + "learning_rate": 0.00013853048278764164, + "loss": 0.5447, + "step": 13430 + }, + { + "epoch": 263.53, + "learning_rate": 0.0001385089347841604, + "loss": 0.5452, + "step": 13440 + }, + { + "epoch": 263.73, + "learning_rate": 0.00013848736823748658, + "loss": 0.5429, + "step": 13450 + }, + { + "epoch": 263.92, + "learning_rate": 0.00013846578315391715, + "loss": 0.5475, + "step": 13460 + }, + { + "epoch": 264.0, + "eval_loss": 0.5510943531990051, + "eval_runtime": 2.2148, + "eval_samples_per_second": 1028.991, + "eval_steps_per_second": 4.064, + "step": 13464 + }, + { + "epoch": 264.12, + "learning_rate": 0.00013844417953975445, + "loss": 0.5442, + "step": 13470 + }, + { + "epoch": 264.31, + "learning_rate": 0.0001384225574013063, + "loss": 0.5478, + "step": 13480 + }, + { + "epoch": 264.51, + "learning_rate": 0.0001384009167448858, + "loss": 0.5402, + "step": 13490 + }, + { + "epoch": 264.71, + "learning_rate": 0.00013837925757681163, + "loss": 0.5466, + "step": 13500 + }, + { + "epoch": 264.9, + "learning_rate": 0.00013835757990340774, + "loss": 0.5439, + "step": 13510 + }, + { + "epoch": 265.0, + "eval_loss": 0.5452569723129272, + "eval_runtime": 2.2132, + "eval_samples_per_second": 1029.718, + "eval_steps_per_second": 4.066, + "step": 13515 + }, + { + "epoch": 265.1, + "learning_rate": 0.0001383358837310035, + "loss": 0.5433, + "step": 13520 + }, + { + "epoch": 265.29, + "learning_rate": 0.00013831416906593376, + "loss": 0.5426, + "step": 13530 + }, + { + "epoch": 265.49, + "learning_rate": 0.0001382924359145387, + "loss": 0.5445, + "step": 13540 + }, + { + "epoch": 265.69, + "learning_rate": 0.0001382706842831639, + "loss": 0.5431, + "step": 13550 + }, + { + "epoch": 265.88, + "learning_rate": 0.00013824891417816036, + "loss": 0.542, + "step": 13560 + }, + { + "epoch": 266.0, + "eval_loss": 0.5477101802825928, + "eval_runtime": 2.1889, + "eval_samples_per_second": 1041.148, + "eval_steps_per_second": 4.112, + "step": 13566 + }, + { + "epoch": 266.08, + "learning_rate": 0.0001382271256058845, + "loss": 0.5452, + "step": 13570 + }, + { + "epoch": 266.27, + "learning_rate": 0.000138205318572698, + "loss": 0.5451, + "step": 13580 + }, + { + "epoch": 266.47, + "learning_rate": 0.00013818349308496812, + "loss": 0.5472, + "step": 13590 + }, + { + "epoch": 266.67, + "learning_rate": 0.0001381616491490674, + "loss": 0.5468, + "step": 13600 + }, + { + "epoch": 266.86, + "learning_rate": 0.00013813978677137379, + "loss": 0.5437, + "step": 13610 + }, + { + "epoch": 267.0, + "eval_loss": 0.5501764416694641, + "eval_runtime": 2.2879, + "eval_samples_per_second": 996.126, + "eval_steps_per_second": 3.934, + "step": 13617 + }, + { + "epoch": 267.06, + "learning_rate": 0.00013811790595827058, + "loss": 0.5383, + "step": 13620 + }, + { + "epoch": 267.25, + "learning_rate": 0.00013809600671614648, + "loss": 0.5438, + "step": 13630 + }, + { + "epoch": 267.45, + "learning_rate": 0.00013807408905139562, + "loss": 0.5413, + "step": 13640 + }, + { + "epoch": 267.65, + "learning_rate": 0.00013805215297041742, + "loss": 0.5441, + "step": 13650 + }, + { + "epoch": 267.84, + "learning_rate": 0.00013803019847961675, + "loss": 0.5452, + "step": 13660 + }, + { + "epoch": 268.0, + "eval_loss": 0.5432447195053101, + "eval_runtime": 2.2193, + "eval_samples_per_second": 1026.904, + "eval_steps_per_second": 4.055, + "step": 13668 + }, + { + "epoch": 268.04, + "learning_rate": 0.00013800822558540386, + "loss": 0.544, + "step": 13670 + }, + { + "epoch": 268.24, + "learning_rate": 0.0001379862342941943, + "loss": 0.5393, + "step": 13680 + }, + { + "epoch": 268.43, + "learning_rate": 0.00013796422461240907, + "loss": 0.5409, + "step": 13690 + }, + { + "epoch": 268.63, + "learning_rate": 0.0001379421965464745, + "loss": 0.5421, + "step": 13700 + }, + { + "epoch": 268.82, + "learning_rate": 0.00013792015010282227, + "loss": 0.5397, + "step": 13710 + }, + { + "epoch": 269.0, + "eval_loss": 0.5443356037139893, + "eval_runtime": 2.1268, + "eval_samples_per_second": 1071.567, + "eval_steps_per_second": 4.232, + "step": 13719 + }, + { + "epoch": 269.02, + "learning_rate": 0.00013789808528788945, + "loss": 0.5451, + "step": 13720 + }, + { + "epoch": 269.22, + "learning_rate": 0.00013787600210811852, + "loss": 0.5417, + "step": 13730 + }, + { + "epoch": 269.41, + "learning_rate": 0.0001378539005699572, + "loss": 0.5466, + "step": 13740 + }, + { + "epoch": 269.61, + "learning_rate": 0.00013783178067985875, + "loss": 0.5429, + "step": 13750 + }, + { + "epoch": 269.8, + "learning_rate": 0.0001378096424442816, + "loss": 0.5416, + "step": 13760 + }, + { + "epoch": 270.0, + "learning_rate": 0.00013778748586968962, + "loss": 0.5424, + "step": 13770 + }, + { + "epoch": 270.0, + "eval_loss": 0.5410163998603821, + "eval_runtime": 2.2841, + "eval_samples_per_second": 997.745, + "eval_steps_per_second": 3.94, + "step": 13770 + }, + { + "epoch": 270.2, + "learning_rate": 0.0001377653109625521, + "loss": 0.5391, + "step": 13780 + }, + { + "epoch": 270.39, + "learning_rate": 0.00013774311772934357, + "loss": 0.5427, + "step": 13790 + }, + { + "epoch": 270.59, + "learning_rate": 0.000137720906176544, + "loss": 0.5395, + "step": 13800 + }, + { + "epoch": 270.78, + "learning_rate": 0.00013769867631063858, + "loss": 0.5391, + "step": 13810 + }, + { + "epoch": 270.98, + "learning_rate": 0.00013767642813811802, + "loss": 0.5391, + "step": 13820 + }, + { + "epoch": 271.0, + "eval_loss": 0.5419728755950928, + "eval_runtime": 2.1784, + "eval_samples_per_second": 1046.187, + "eval_steps_per_second": 4.131, + "step": 13821 + }, + { + "epoch": 271.18, + "learning_rate": 0.00013765416166547825, + "loss": 0.5435, + "step": 13830 + }, + { + "epoch": 271.37, + "learning_rate": 0.00013763187689922062, + "loss": 0.5401, + "step": 13840 + }, + { + "epoch": 271.57, + "learning_rate": 0.00013760957384585174, + "loss": 0.5355, + "step": 13850 + }, + { + "epoch": 271.76, + "learning_rate": 0.00013758725251188366, + "loss": 0.5373, + "step": 13860 + }, + { + "epoch": 271.96, + "learning_rate": 0.00013756491290383365, + "loss": 0.5368, + "step": 13870 + }, + { + "epoch": 272.0, + "eval_loss": 0.5402165651321411, + "eval_runtime": 2.2021, + "eval_samples_per_second": 1034.911, + "eval_steps_per_second": 4.087, + "step": 13872 + }, + { + "epoch": 272.16, + "learning_rate": 0.0001375425550282244, + "loss": 0.5368, + "step": 13880 + }, + { + "epoch": 272.35, + "learning_rate": 0.00013752017889158394, + "loss": 0.5368, + "step": 13890 + }, + { + "epoch": 272.55, + "learning_rate": 0.00013749778450044558, + "loss": 0.5407, + "step": 13900 + }, + { + "epoch": 272.75, + "learning_rate": 0.00013747537186134797, + "loss": 0.545, + "step": 13910 + }, + { + "epoch": 272.94, + "learning_rate": 0.0001374529409808351, + "loss": 0.5387, + "step": 13920 + }, + { + "epoch": 273.0, + "eval_loss": 0.5400860905647278, + "eval_runtime": 2.1991, + "eval_samples_per_second": 1036.352, + "eval_steps_per_second": 4.093, + "step": 13923 + }, + { + "epoch": 273.14, + "learning_rate": 0.00013743049186545631, + "loss": 0.5387, + "step": 13930 + }, + { + "epoch": 273.33, + "learning_rate": 0.00013740802452176626, + "loss": 0.5415, + "step": 13940 + }, + { + "epoch": 273.53, + "learning_rate": 0.00013738553895632484, + "loss": 0.5341, + "step": 13950 + }, + { + "epoch": 273.73, + "learning_rate": 0.0001373630351756974, + "loss": 0.5385, + "step": 13960 + }, + { + "epoch": 273.92, + "learning_rate": 0.00013734051318645452, + "loss": 0.5362, + "step": 13970 + }, + { + "epoch": 274.0, + "eval_loss": 0.5413815975189209, + "eval_runtime": 2.1276, + "eval_samples_per_second": 1071.167, + "eval_steps_per_second": 4.23, + "step": 13974 + }, + { + "epoch": 274.12, + "learning_rate": 0.0001373179729951721, + "loss": 0.5348, + "step": 13980 + }, + { + "epoch": 274.31, + "learning_rate": 0.0001372954146084314, + "loss": 0.5412, + "step": 13990 + }, + { + "epoch": 274.51, + "learning_rate": 0.00013727283803281894, + "loss": 0.5411, + "step": 14000 + }, + { + "epoch": 274.71, + "learning_rate": 0.00013725024327492663, + "loss": 0.537, + "step": 14010 + }, + { + "epoch": 274.9, + "learning_rate": 0.00013722763034135156, + "loss": 0.5374, + "step": 14020 + }, + { + "epoch": 275.0, + "eval_loss": 0.5417589545249939, + "eval_runtime": 2.2039, + "eval_samples_per_second": 1034.085, + "eval_steps_per_second": 4.084, + "step": 14025 + }, + { + "epoch": 275.1, + "learning_rate": 0.00013720499923869627, + "loss": 0.5404, + "step": 14030 + }, + { + "epoch": 275.29, + "learning_rate": 0.0001371823499735685, + "loss": 0.5364, + "step": 14040 + }, + { + "epoch": 275.49, + "learning_rate": 0.00013715968255258136, + "loss": 0.5415, + "step": 14050 + }, + { + "epoch": 275.69, + "learning_rate": 0.0001371369969823532, + "loss": 0.5358, + "step": 14060 + }, + { + "epoch": 275.88, + "learning_rate": 0.0001371142932695077, + "loss": 0.5375, + "step": 14070 + }, + { + "epoch": 276.0, + "eval_loss": 0.5415284633636475, + "eval_runtime": 2.3244, + "eval_samples_per_second": 980.457, + "eval_steps_per_second": 3.872, + "step": 14076 + }, + { + "epoch": 276.08, + "learning_rate": 0.00013709157142067382, + "loss": 0.5416, + "step": 14080 + }, + { + "epoch": 276.27, + "learning_rate": 0.0001370688314424859, + "loss": 0.5305, + "step": 14090 + }, + { + "epoch": 276.47, + "learning_rate": 0.00013704607334158347, + "loss": 0.534, + "step": 14100 + }, + { + "epoch": 276.67, + "learning_rate": 0.00013702329712461135, + "loss": 0.5341, + "step": 14110 + }, + { + "epoch": 276.86, + "learning_rate": 0.00013700050279821975, + "loss": 0.5427, + "step": 14120 + }, + { + "epoch": 277.0, + "eval_loss": 0.5435522794723511, + "eval_runtime": 2.2092, + "eval_samples_per_second": 1031.595, + "eval_steps_per_second": 4.074, + "step": 14127 + }, + { + "epoch": 277.06, + "learning_rate": 0.0001369776903690641, + "loss": 0.5404, + "step": 14130 + }, + { + "epoch": 277.25, + "learning_rate": 0.00013695485984380505, + "loss": 0.5369, + "step": 14140 + }, + { + "epoch": 277.45, + "learning_rate": 0.0001369320112291087, + "loss": 0.5361, + "step": 14150 + }, + { + "epoch": 277.65, + "learning_rate": 0.00013690914453164625, + "loss": 0.5347, + "step": 14160 + }, + { + "epoch": 277.84, + "learning_rate": 0.0001368862597580943, + "loss": 0.5382, + "step": 14170 + }, + { + "epoch": 278.0, + "eval_loss": 0.5365801453590393, + "eval_runtime": 2.2436, + "eval_samples_per_second": 1015.78, + "eval_steps_per_second": 4.011, + "step": 14178 + }, + { + "epoch": 278.04, + "learning_rate": 0.00013686335691513474, + "loss": 0.5329, + "step": 14180 + }, + { + "epoch": 278.24, + "learning_rate": 0.0001368404360094546, + "loss": 0.5299, + "step": 14190 + }, + { + "epoch": 278.43, + "learning_rate": 0.00013681749704774637, + "loss": 0.5368, + "step": 14200 + }, + { + "epoch": 278.63, + "learning_rate": 0.00013679454003670766, + "loss": 0.5429, + "step": 14210 + }, + { + "epoch": 278.82, + "learning_rate": 0.00013677156498304136, + "loss": 0.5341, + "step": 14220 + }, + { + "epoch": 279.0, + "eval_loss": 0.5410821437835693, + "eval_runtime": 2.2163, + "eval_samples_per_second": 1028.301, + "eval_steps_per_second": 4.061, + "step": 14229 + }, + { + "epoch": 279.02, + "learning_rate": 0.00013674857189345578, + "loss": 0.5361, + "step": 14230 + }, + { + "epoch": 279.22, + "learning_rate": 0.0001367255607746643, + "loss": 0.5367, + "step": 14240 + }, + { + "epoch": 279.41, + "learning_rate": 0.00013670253163338572, + "loss": 0.5382, + "step": 14250 + }, + { + "epoch": 279.61, + "learning_rate": 0.00013667948447634398, + "loss": 0.5391, + "step": 14260 + }, + { + "epoch": 279.8, + "learning_rate": 0.00013665641931026837, + "loss": 0.5336, + "step": 14270 + }, + { + "epoch": 280.0, + "learning_rate": 0.00013663333614189336, + "loss": 0.5348, + "step": 14280 + }, + { + "epoch": 280.0, + "eval_loss": 0.5377461314201355, + "eval_runtime": 2.1689, + "eval_samples_per_second": 1050.764, + "eval_steps_per_second": 4.15, + "step": 14280 + }, + { + "epoch": 280.2, + "learning_rate": 0.00013661023497795878, + "loss": 0.5394, + "step": 14290 + }, + { + "epoch": 280.39, + "learning_rate": 0.00013658711582520964, + "loss": 0.5338, + "step": 14300 + }, + { + "epoch": 280.59, + "learning_rate": 0.0001365639786903962, + "loss": 0.5384, + "step": 14310 + }, + { + "epoch": 280.78, + "learning_rate": 0.00013654082358027398, + "loss": 0.5359, + "step": 14320 + }, + { + "epoch": 280.98, + "learning_rate": 0.00013651765050160376, + "loss": 0.5339, + "step": 14330 + }, + { + "epoch": 281.0, + "eval_loss": 0.5393053293228149, + "eval_runtime": 2.1543, + "eval_samples_per_second": 1057.88, + "eval_steps_per_second": 4.178, + "step": 14331 + }, + { + "epoch": 281.18, + "learning_rate": 0.0001364944594611516, + "loss": 0.5356, + "step": 14340 + }, + { + "epoch": 281.37, + "learning_rate": 0.0001364712504656887, + "loss": 0.5362, + "step": 14350 + }, + { + "epoch": 281.57, + "learning_rate": 0.00013644802352199165, + "loss": 0.5325, + "step": 14360 + }, + { + "epoch": 281.76, + "learning_rate": 0.0001364247786368421, + "loss": 0.5318, + "step": 14370 + }, + { + "epoch": 281.96, + "learning_rate": 0.00013640151581702716, + "loss": 0.5359, + "step": 14380 + }, + { + "epoch": 282.0, + "eval_loss": 0.5359378457069397, + "eval_runtime": 2.2586, + "eval_samples_per_second": 1009.032, + "eval_steps_per_second": 3.985, + "step": 14382 + }, + { + "epoch": 282.16, + "learning_rate": 0.00013637823506933893, + "loss": 0.5393, + "step": 14390 + }, + { + "epoch": 282.35, + "learning_rate": 0.00013635493640057496, + "loss": 0.5329, + "step": 14400 + }, + { + "epoch": 282.55, + "learning_rate": 0.0001363316198175379, + "loss": 0.5317, + "step": 14410 + }, + { + "epoch": 282.75, + "learning_rate": 0.00013630828532703568, + "loss": 0.5279, + "step": 14420 + }, + { + "epoch": 282.94, + "learning_rate": 0.00013628493293588143, + "loss": 0.536, + "step": 14430 + }, + { + "epoch": 283.0, + "eval_loss": 0.5368289947509766, + "eval_runtime": 2.2197, + "eval_samples_per_second": 1026.704, + "eval_steps_per_second": 4.055, + "step": 14433 + }, + { + "epoch": 283.14, + "learning_rate": 0.00013626156265089358, + "loss": 0.5332, + "step": 14440 + }, + { + "epoch": 283.33, + "learning_rate": 0.00013623817447889572, + "loss": 0.5327, + "step": 14450 + }, + { + "epoch": 283.53, + "learning_rate": 0.00013621476842671663, + "loss": 0.5371, + "step": 14460 + }, + { + "epoch": 283.73, + "learning_rate": 0.00013619134450119035, + "loss": 0.5358, + "step": 14470 + }, + { + "epoch": 283.92, + "learning_rate": 0.00013616790270915623, + "loss": 0.5362, + "step": 14480 + }, + { + "epoch": 284.0, + "eval_loss": 0.5383955240249634, + "eval_runtime": 2.221, + "eval_samples_per_second": 1026.116, + "eval_steps_per_second": 4.052, + "step": 14484 + }, + { + "epoch": 284.12, + "learning_rate": 0.00013614444305745866, + "loss": 0.5362, + "step": 14490 + }, + { + "epoch": 284.31, + "learning_rate": 0.00013612096555294737, + "loss": 0.536, + "step": 14500 + }, + { + "epoch": 284.51, + "learning_rate": 0.00013609747020247728, + "loss": 0.5365, + "step": 14510 + }, + { + "epoch": 284.71, + "learning_rate": 0.00013607395701290852, + "loss": 0.5349, + "step": 14520 + }, + { + "epoch": 284.9, + "learning_rate": 0.00013605042599110635, + "loss": 0.532, + "step": 14530 + }, + { + "epoch": 285.0, + "eval_loss": 0.5345928072929382, + "eval_runtime": 2.2377, + "eval_samples_per_second": 1018.459, + "eval_steps_per_second": 4.022, + "step": 14535 + }, + { + "epoch": 285.1, + "learning_rate": 0.00013602687714394138, + "loss": 0.5355, + "step": 14540 + }, + { + "epoch": 285.29, + "learning_rate": 0.00013600331047828928, + "loss": 0.537, + "step": 14550 + }, + { + "epoch": 285.49, + "learning_rate": 0.00013597972600103107, + "loss": 0.5363, + "step": 14560 + }, + { + "epoch": 285.69, + "learning_rate": 0.00013595612371905284, + "loss": 0.536, + "step": 14570 + }, + { + "epoch": 285.88, + "learning_rate": 0.0001359325036392459, + "loss": 0.5298, + "step": 14580 + }, + { + "epoch": 286.0, + "eval_loss": 0.5376359820365906, + "eval_runtime": 2.2303, + "eval_samples_per_second": 1021.823, + "eval_steps_per_second": 4.035, + "step": 14586 + }, + { + "epoch": 286.08, + "learning_rate": 0.00013590886576850684, + "loss": 0.5338, + "step": 14590 + }, + { + "epoch": 286.27, + "learning_rate": 0.0001358852101137374, + "loss": 0.5338, + "step": 14600 + }, + { + "epoch": 286.47, + "learning_rate": 0.00013586153668184445, + "loss": 0.5345, + "step": 14610 + }, + { + "epoch": 286.67, + "learning_rate": 0.00013583784547974015, + "loss": 0.5332, + "step": 14620 + }, + { + "epoch": 286.86, + "learning_rate": 0.0001358141365143418, + "loss": 0.5352, + "step": 14630 + }, + { + "epoch": 287.0, + "eval_loss": 0.5373082160949707, + "eval_runtime": 2.3172, + "eval_samples_per_second": 983.516, + "eval_steps_per_second": 3.884, + "step": 14637 + }, + { + "epoch": 287.06, + "learning_rate": 0.00013579040979257184, + "loss": 0.5325, + "step": 14640 + }, + { + "epoch": 287.25, + "learning_rate": 0.000135766665321358, + "loss": 0.5268, + "step": 14650 + }, + { + "epoch": 287.45, + "learning_rate": 0.0001357429031076331, + "loss": 0.5323, + "step": 14660 + }, + { + "epoch": 287.65, + "learning_rate": 0.0001357191231583352, + "loss": 0.5298, + "step": 14670 + }, + { + "epoch": 287.84, + "learning_rate": 0.0001356953254804075, + "loss": 0.5344, + "step": 14680 + }, + { + "epoch": 288.0, + "eval_loss": 0.5358995199203491, + "eval_runtime": 2.3331, + "eval_samples_per_second": 976.806, + "eval_steps_per_second": 3.858, + "step": 14688 + }, + { + "epoch": 288.04, + "learning_rate": 0.0001356715100807984, + "loss": 0.5339, + "step": 14690 + }, + { + "epoch": 288.24, + "learning_rate": 0.00013564767696646148, + "loss": 0.5311, + "step": 14700 + }, + { + "epoch": 288.43, + "learning_rate": 0.00013562382614435543, + "loss": 0.5301, + "step": 14710 + }, + { + "epoch": 288.63, + "learning_rate": 0.00013559995762144422, + "loss": 0.5341, + "step": 14720 + }, + { + "epoch": 288.82, + "learning_rate": 0.00013557607140469687, + "loss": 0.5399, + "step": 14730 + }, + { + "epoch": 289.0, + "eval_loss": 0.5426952838897705, + "eval_runtime": 2.2284, + "eval_samples_per_second": 1022.697, + "eval_steps_per_second": 4.039, + "step": 14739 + }, + { + "epoch": 289.02, + "learning_rate": 0.00013555216750108767, + "loss": 0.5402, + "step": 14740 + }, + { + "epoch": 289.22, + "learning_rate": 0.00013552824591759596, + "loss": 0.5383, + "step": 14750 + }, + { + "epoch": 289.41, + "learning_rate": 0.00013550430666120638, + "loss": 0.5334, + "step": 14760 + }, + { + "epoch": 289.61, + "learning_rate": 0.00013548034973890865, + "loss": 0.5359, + "step": 14770 + }, + { + "epoch": 289.8, + "learning_rate": 0.0001354563751576976, + "loss": 0.5325, + "step": 14780 + }, + { + "epoch": 290.0, + "learning_rate": 0.00013543238292457334, + "loss": 0.5329, + "step": 14790 + }, + { + "epoch": 290.0, + "eval_loss": 0.5349271297454834, + "eval_runtime": 2.1752, + "eval_samples_per_second": 1047.702, + "eval_steps_per_second": 4.137, + "step": 14790 + }, + { + "epoch": 290.2, + "learning_rate": 0.00013540837304654103, + "loss": 0.5372, + "step": 14800 + }, + { + "epoch": 290.39, + "learning_rate": 0.00013538434553061104, + "loss": 0.5305, + "step": 14810 + }, + { + "epoch": 290.59, + "learning_rate": 0.00013536030038379884, + "loss": 0.5318, + "step": 14820 + }, + { + "epoch": 290.78, + "learning_rate": 0.00013533623761312512, + "loss": 0.5301, + "step": 14830 + }, + { + "epoch": 290.98, + "learning_rate": 0.00013531215722561562, + "loss": 0.531, + "step": 14840 + }, + { + "epoch": 291.0, + "eval_loss": 0.5320532321929932, + "eval_runtime": 2.0985, + "eval_samples_per_second": 1086.033, + "eval_steps_per_second": 4.289, + "step": 14841 + }, + { + "epoch": 291.18, + "learning_rate": 0.0001352880592283013, + "loss": 0.529, + "step": 14850 + }, + { + "epoch": 291.37, + "learning_rate": 0.00013526394362821826, + "loss": 0.5338, + "step": 14860 + }, + { + "epoch": 291.57, + "learning_rate": 0.0001352398104324077, + "loss": 0.5283, + "step": 14870 + }, + { + "epoch": 291.76, + "learning_rate": 0.00013521565964791593, + "loss": 0.5317, + "step": 14880 + }, + { + "epoch": 291.96, + "learning_rate": 0.00013519149128179452, + "loss": 0.5317, + "step": 14890 + }, + { + "epoch": 292.0, + "eval_loss": 0.5360802412033081, + "eval_runtime": 2.2127, + "eval_samples_per_second": 1029.986, + "eval_steps_per_second": 4.068, + "step": 14892 + }, + { + "epoch": 292.16, + "learning_rate": 0.00013516730534110004, + "loss": 0.5329, + "step": 14900 + }, + { + "epoch": 292.35, + "learning_rate": 0.00013514310183289425, + "loss": 0.5334, + "step": 14910 + }, + { + "epoch": 292.55, + "learning_rate": 0.00013511888076424408, + "loss": 0.5318, + "step": 14920 + }, + { + "epoch": 292.75, + "learning_rate": 0.0001350946421422215, + "loss": 0.5329, + "step": 14930 + }, + { + "epoch": 292.94, + "learning_rate": 0.00013507038597390363, + "loss": 0.5303, + "step": 14940 + }, + { + "epoch": 293.0, + "eval_loss": 0.5295526385307312, + "eval_runtime": 2.2128, + "eval_samples_per_second": 1029.894, + "eval_steps_per_second": 4.067, + "step": 14943 + }, + { + "epoch": 293.14, + "learning_rate": 0.0001350461122663728, + "loss": 0.5312, + "step": 14950 + }, + { + "epoch": 293.33, + "learning_rate": 0.0001350218210267163, + "loss": 0.5275, + "step": 14960 + }, + { + "epoch": 293.53, + "learning_rate": 0.0001349975122620267, + "loss": 0.5299, + "step": 14970 + }, + { + "epoch": 293.73, + "learning_rate": 0.00013497318597940157, + "loss": 0.532, + "step": 14980 + }, + { + "epoch": 293.92, + "learning_rate": 0.00013494884218594367, + "loss": 0.5291, + "step": 14990 + }, + { + "epoch": 294.0, + "eval_loss": 0.5311741828918457, + "eval_runtime": 2.198, + "eval_samples_per_second": 1036.87, + "eval_steps_per_second": 4.095, + "step": 14994 + }, + { + "epoch": 294.12, + "learning_rate": 0.00013492448088876088, + "loss": 0.5217, + "step": 15000 + }, + { + "epoch": 294.31, + "learning_rate": 0.00013490010209496608, + "loss": 0.5287, + "step": 15010 + }, + { + "epoch": 294.51, + "learning_rate": 0.0001348757058116774, + "loss": 0.5309, + "step": 15020 + }, + { + "epoch": 294.71, + "learning_rate": 0.00013485129204601797, + "loss": 0.5265, + "step": 15030 + }, + { + "epoch": 294.9, + "learning_rate": 0.00013482686080511604, + "loss": 0.5335, + "step": 15040 + }, + { + "epoch": 295.0, + "eval_loss": 0.5243921875953674, + "eval_runtime": 2.1979, + "eval_samples_per_second": 1036.922, + "eval_steps_per_second": 4.095, + "step": 15045 + }, + { + "epoch": 295.1, + "learning_rate": 0.00013480241209610504, + "loss": 0.5291, + "step": 15050 + }, + { + "epoch": 295.29, + "learning_rate": 0.00013477794592612347, + "loss": 0.5267, + "step": 15060 + }, + { + "epoch": 295.49, + "learning_rate": 0.00013475346230231483, + "loss": 0.5312, + "step": 15070 + }, + { + "epoch": 295.69, + "learning_rate": 0.00013472896123182783, + "loss": 0.5315, + "step": 15080 + }, + { + "epoch": 295.88, + "learning_rate": 0.00013470444272181624, + "loss": 0.5309, + "step": 15090 + }, + { + "epoch": 296.0, + "eval_loss": 0.5251594185829163, + "eval_runtime": 2.2406, + "eval_samples_per_second": 1017.139, + "eval_steps_per_second": 4.017, + "step": 15096 + }, + { + "epoch": 296.08, + "learning_rate": 0.00013467990677943893, + "loss": 0.5258, + "step": 15100 + }, + { + "epoch": 296.27, + "learning_rate": 0.00013465535341185983, + "loss": 0.5265, + "step": 15110 + }, + { + "epoch": 296.47, + "learning_rate": 0.00013463078262624796, + "loss": 0.5241, + "step": 15120 + }, + { + "epoch": 296.67, + "learning_rate": 0.00013460619442977746, + "loss": 0.5273, + "step": 15130 + }, + { + "epoch": 296.86, + "learning_rate": 0.00013458158882962754, + "loss": 0.5251, + "step": 15140 + }, + { + "epoch": 297.0, + "eval_loss": 0.531033992767334, + "eval_runtime": 2.2049, + "eval_samples_per_second": 1033.623, + "eval_steps_per_second": 4.082, + "step": 15147 + }, + { + "epoch": 297.06, + "learning_rate": 0.0001345569658329825, + "loss": 0.5324, + "step": 15150 + }, + { + "epoch": 297.25, + "learning_rate": 0.00013453232544703163, + "loss": 0.5246, + "step": 15160 + }, + { + "epoch": 297.45, + "learning_rate": 0.00013450766767896948, + "loss": 0.5249, + "step": 15170 + }, + { + "epoch": 297.65, + "learning_rate": 0.0001344829925359955, + "loss": 0.5332, + "step": 15180 + }, + { + "epoch": 297.84, + "learning_rate": 0.0001344583000253143, + "loss": 0.5266, + "step": 15190 + }, + { + "epoch": 298.0, + "eval_loss": 0.5300943851470947, + "eval_runtime": 2.2014, + "eval_samples_per_second": 1035.249, + "eval_steps_per_second": 4.088, + "step": 15198 + }, + { + "epoch": 298.04, + "learning_rate": 0.00013443359015413554, + "loss": 0.5276, + "step": 15200 + }, + { + "epoch": 298.24, + "learning_rate": 0.00013440886292967396, + "loss": 0.5231, + "step": 15210 + }, + { + "epoch": 298.43, + "learning_rate": 0.00013438411835914934, + "loss": 0.5227, + "step": 15220 + }, + { + "epoch": 298.63, + "learning_rate": 0.00013435935644978656, + "loss": 0.5226, + "step": 15230 + }, + { + "epoch": 298.82, + "learning_rate": 0.00013433457720881555, + "loss": 0.5279, + "step": 15240 + }, + { + "epoch": 299.0, + "eval_loss": 0.530785322189331, + "eval_runtime": 2.2538, + "eval_samples_per_second": 1011.201, + "eval_steps_per_second": 3.993, + "step": 15249 + }, + { + "epoch": 299.02, + "learning_rate": 0.00013430978064347127, + "loss": 0.5257, + "step": 15250 + }, + { + "epoch": 299.22, + "learning_rate": 0.00013428496676099377, + "loss": 0.529, + "step": 15260 + }, + { + "epoch": 299.41, + "learning_rate": 0.0001342601355686282, + "loss": 0.5249, + "step": 15270 + }, + { + "epoch": 299.61, + "learning_rate": 0.00013423528707362463, + "loss": 0.5313, + "step": 15280 + }, + { + "epoch": 299.8, + "learning_rate": 0.0001342104212832383, + "loss": 0.528, + "step": 15290 + }, + { + "epoch": 300.0, + "learning_rate": 0.00013418553820472953, + "loss": 0.5261, + "step": 15300 + }, + { + "epoch": 300.0, + "eval_loss": 0.5249952077865601, + "eval_runtime": 2.3137, + "eval_samples_per_second": 985.006, + "eval_steps_per_second": 3.89, + "step": 15300 + }, + { + "epoch": 300.2, + "learning_rate": 0.00013416063784536353, + "loss": 0.5222, + "step": 15310 + }, + { + "epoch": 300.39, + "learning_rate": 0.00013413572021241067, + "loss": 0.5292, + "step": 15320 + }, + { + "epoch": 300.59, + "learning_rate": 0.0001341107853131464, + "loss": 0.5291, + "step": 15330 + }, + { + "epoch": 300.78, + "learning_rate": 0.00013408583315485111, + "loss": 0.5241, + "step": 15340 + }, + { + "epoch": 300.98, + "learning_rate": 0.0001340608637448103, + "loss": 0.5214, + "step": 15350 + }, + { + "epoch": 301.0, + "eval_loss": 0.5252038836479187, + "eval_runtime": 2.2486, + "eval_samples_per_second": 1013.503, + "eval_steps_per_second": 4.002, + "step": 15351 + }, + { + "epoch": 301.18, + "learning_rate": 0.00013403587709031443, + "loss": 0.5175, + "step": 15360 + }, + { + "epoch": 301.37, + "learning_rate": 0.00013401087319865908, + "loss": 0.5245, + "step": 15370 + }, + { + "epoch": 301.57, + "learning_rate": 0.00013398585207714483, + "loss": 0.5325, + "step": 15380 + }, + { + "epoch": 301.76, + "learning_rate": 0.00013396081373307728, + "loss": 0.5292, + "step": 15390 + }, + { + "epoch": 301.96, + "learning_rate": 0.0001339357581737671, + "loss": 0.5269, + "step": 15400 + }, + { + "epoch": 302.0, + "eval_loss": 0.5306328535079956, + "eval_runtime": 2.1462, + "eval_samples_per_second": 1061.875, + "eval_steps_per_second": 4.193, + "step": 15402 + }, + { + "epoch": 302.16, + "learning_rate": 0.0001339106854065299, + "loss": 0.5247, + "step": 15410 + }, + { + "epoch": 302.35, + "learning_rate": 0.00013388559543868643, + "loss": 0.5235, + "step": 15420 + }, + { + "epoch": 302.55, + "learning_rate": 0.0001338604882775623, + "loss": 0.5239, + "step": 15430 + }, + { + "epoch": 302.75, + "learning_rate": 0.00013383536393048837, + "loss": 0.5259, + "step": 15440 + }, + { + "epoch": 302.94, + "learning_rate": 0.00013381022240480033, + "loss": 0.5229, + "step": 15450 + }, + { + "epoch": 303.0, + "eval_loss": 0.5264057517051697, + "eval_runtime": 2.2349, + "eval_samples_per_second": 1019.731, + "eval_steps_per_second": 4.027, + "step": 15453 + }, + { + "epoch": 303.14, + "learning_rate": 0.00013378506370783892, + "loss": 0.5252, + "step": 15460 + }, + { + "epoch": 303.33, + "learning_rate": 0.00013375988784694994, + "loss": 0.5258, + "step": 15470 + }, + { + "epoch": 303.53, + "learning_rate": 0.0001337346948294842, + "loss": 0.526, + "step": 15480 + }, + { + "epoch": 303.73, + "learning_rate": 0.00013370948466279747, + "loss": 0.5235, + "step": 15490 + }, + { + "epoch": 303.92, + "learning_rate": 0.00013368425735425056, + "loss": 0.5234, + "step": 15500 + }, + { + "epoch": 304.0, + "eval_loss": 0.526339590549469, + "eval_runtime": 2.245, + "eval_samples_per_second": 1015.135, + "eval_steps_per_second": 4.009, + "step": 15504 + }, + { + "epoch": 304.12, + "learning_rate": 0.0001336590129112093, + "loss": 0.5267, + "step": 15510 + }, + { + "epoch": 304.31, + "learning_rate": 0.0001336337513410445, + "loss": 0.5271, + "step": 15520 + }, + { + "epoch": 304.51, + "learning_rate": 0.00013360847265113195, + "loss": 0.5247, + "step": 15530 + }, + { + "epoch": 304.71, + "learning_rate": 0.00013358317684885248, + "loss": 0.53, + "step": 15540 + }, + { + "epoch": 304.9, + "learning_rate": 0.00013355786394159193, + "loss": 0.5271, + "step": 15550 + }, + { + "epoch": 305.0, + "eval_loss": 0.5279854536056519, + "eval_runtime": 2.2065, + "eval_samples_per_second": 1032.864, + "eval_steps_per_second": 4.079, + "step": 15555 + }, + { + "epoch": 305.1, + "learning_rate": 0.00013353253393674105, + "loss": 0.5284, + "step": 15560 + }, + { + "epoch": 305.29, + "learning_rate": 0.00013350718684169567, + "loss": 0.5224, + "step": 15570 + }, + { + "epoch": 305.49, + "learning_rate": 0.00013348182266385654, + "loss": 0.5236, + "step": 15580 + }, + { + "epoch": 305.69, + "learning_rate": 0.0001334564414106295, + "loss": 0.5206, + "step": 15590 + }, + { + "epoch": 305.88, + "learning_rate": 0.00013343104308942527, + "loss": 0.525, + "step": 15600 + }, + { + "epoch": 306.0, + "eval_loss": 0.5233384370803833, + "eval_runtime": 2.1412, + "eval_samples_per_second": 1064.366, + "eval_steps_per_second": 4.203, + "step": 15606 + }, + { + "epoch": 306.08, + "learning_rate": 0.0001334056277076596, + "loss": 0.5251, + "step": 15610 + }, + { + "epoch": 306.27, + "learning_rate": 0.00013338019527275318, + "loss": 0.5208, + "step": 15620 + }, + { + "epoch": 306.47, + "learning_rate": 0.0001333547457921318, + "loss": 0.5224, + "step": 15630 + }, + { + "epoch": 306.67, + "learning_rate": 0.00013332927927322603, + "loss": 0.5226, + "step": 15640 + }, + { + "epoch": 306.86, + "learning_rate": 0.0001333037957234716, + "loss": 0.5216, + "step": 15650 + }, + { + "epoch": 307.0, + "eval_loss": 0.5210602879524231, + "eval_runtime": 2.2572, + "eval_samples_per_second": 1009.641, + "eval_steps_per_second": 3.987, + "step": 15657 + }, + { + "epoch": 307.06, + "learning_rate": 0.00013327829515030913, + "loss": 0.5253, + "step": 15660 + }, + { + "epoch": 307.25, + "learning_rate": 0.0001332527775611842, + "loss": 0.5211, + "step": 15670 + }, + { + "epoch": 307.45, + "learning_rate": 0.0001332272429635474, + "loss": 0.5272, + "step": 15680 + }, + { + "epoch": 307.65, + "learning_rate": 0.00013320169136485423, + "loss": 0.5209, + "step": 15690 + }, + { + "epoch": 307.84, + "learning_rate": 0.0001331761227725652, + "loss": 0.5247, + "step": 15700 + }, + { + "epoch": 308.0, + "eval_loss": 0.5245583653450012, + "eval_runtime": 2.2788, + "eval_samples_per_second": 1000.088, + "eval_steps_per_second": 3.949, + "step": 15708 + }, + { + "epoch": 308.04, + "learning_rate": 0.00013315053719414579, + "loss": 0.5215, + "step": 15710 + }, + { + "epoch": 308.24, + "learning_rate": 0.0001331249346370664, + "loss": 0.5243, + "step": 15720 + }, + { + "epoch": 308.43, + "learning_rate": 0.0001330993151088024, + "loss": 0.5232, + "step": 15730 + }, + { + "epoch": 308.63, + "learning_rate": 0.00013307367861683413, + "loss": 0.5242, + "step": 15740 + }, + { + "epoch": 308.82, + "learning_rate": 0.00013304802516864687, + "loss": 0.5203, + "step": 15750 + }, + { + "epoch": 309.0, + "eval_loss": 0.5279257893562317, + "eval_runtime": 2.2285, + "eval_samples_per_second": 1022.663, + "eval_steps_per_second": 4.039, + "step": 15759 + }, + { + "epoch": 309.02, + "learning_rate": 0.00013302235477173087, + "loss": 0.5277, + "step": 15760 + }, + { + "epoch": 309.22, + "learning_rate": 0.0001329966674335813, + "loss": 0.5237, + "step": 15770 + }, + { + "epoch": 309.41, + "learning_rate": 0.0001329709631616983, + "loss": 0.5241, + "step": 15780 + }, + { + "epoch": 309.61, + "learning_rate": 0.0001329452419635869, + "loss": 0.521, + "step": 15790 + }, + { + "epoch": 309.8, + "learning_rate": 0.00013291950384675718, + "loss": 0.5215, + "step": 15800 + }, + { + "epoch": 310.0, + "learning_rate": 0.00013289374881872404, + "loss": 0.5201, + "step": 15810 + }, + { + "epoch": 310.0, + "eval_loss": 0.5245955586433411, + "eval_runtime": 2.1278, + "eval_samples_per_second": 1071.041, + "eval_steps_per_second": 4.23, + "step": 15810 + }, + { + "epoch": 310.2, + "learning_rate": 0.00013286797688700743, + "loss": 0.5191, + "step": 15820 + }, + { + "epoch": 310.39, + "learning_rate": 0.00013284218805913214, + "loss": 0.5204, + "step": 15830 + }, + { + "epoch": 310.59, + "learning_rate": 0.00013281638234262795, + "loss": 0.5198, + "step": 15840 + }, + { + "epoch": 310.78, + "learning_rate": 0.00013279055974502952, + "loss": 0.5225, + "step": 15850 + }, + { + "epoch": 310.98, + "learning_rate": 0.00013276472027387652, + "loss": 0.5254, + "step": 15860 + }, + { + "epoch": 311.0, + "eval_loss": 0.5306283235549927, + "eval_runtime": 2.1978, + "eval_samples_per_second": 1036.929, + "eval_steps_per_second": 4.095, + "step": 15861 + }, + { + "epoch": 311.18, + "learning_rate": 0.00013273886393671348, + "loss": 0.5232, + "step": 15870 + }, + { + "epoch": 311.37, + "learning_rate": 0.00013271299074108986, + "loss": 0.518, + "step": 15880 + }, + { + "epoch": 311.57, + "learning_rate": 0.00013268710069456007, + "loss": 0.5248, + "step": 15890 + }, + { + "epoch": 311.76, + "learning_rate": 0.00013266119380468344, + "loss": 0.5186, + "step": 15900 + }, + { + "epoch": 311.96, + "learning_rate": 0.00013263527007902417, + "loss": 0.5166, + "step": 15910 + }, + { + "epoch": 312.0, + "eval_loss": 0.5223502516746521, + "eval_runtime": 2.1971, + "eval_samples_per_second": 1037.273, + "eval_steps_per_second": 4.096, + "step": 15912 + }, + { + "epoch": 312.16, + "learning_rate": 0.00013260932952515145, + "loss": 0.5217, + "step": 15920 + }, + { + "epoch": 312.35, + "learning_rate": 0.0001325833721506393, + "loss": 0.5172, + "step": 15930 + }, + { + "epoch": 312.55, + "learning_rate": 0.00013255739796306671, + "loss": 0.5131, + "step": 15940 + }, + { + "epoch": 312.75, + "learning_rate": 0.0001325314069700176, + "loss": 0.522, + "step": 15950 + }, + { + "epoch": 312.94, + "learning_rate": 0.0001325053991790807, + "loss": 0.525, + "step": 15960 + }, + { + "epoch": 313.0, + "eval_loss": 0.5192234516143799, + "eval_runtime": 2.1577, + "eval_samples_per_second": 1056.214, + "eval_steps_per_second": 4.171, + "step": 15963 + }, + { + "epoch": 313.14, + "learning_rate": 0.00013247937459784975, + "loss": 0.5198, + "step": 15970 + }, + { + "epoch": 313.33, + "learning_rate": 0.00013245333323392333, + "loss": 0.5172, + "step": 15980 + }, + { + "epoch": 313.53, + "learning_rate": 0.00013242727509490496, + "loss": 0.5195, + "step": 15990 + }, + { + "epoch": 313.73, + "learning_rate": 0.000132401200188403, + "loss": 0.5243, + "step": 16000 + }, + { + "epoch": 313.92, + "learning_rate": 0.00013237510852203072, + "loss": 0.5224, + "step": 16010 + }, + { + "epoch": 314.0, + "eval_loss": 0.5246869921684265, + "eval_runtime": 2.2265, + "eval_samples_per_second": 1023.575, + "eval_steps_per_second": 4.042, + "step": 16014 + }, + { + "epoch": 314.12, + "learning_rate": 0.00013234900010340638, + "loss": 0.527, + "step": 16020 + }, + { + "epoch": 314.31, + "learning_rate": 0.000132322874940153, + "loss": 0.5218, + "step": 16030 + }, + { + "epoch": 314.51, + "learning_rate": 0.00013229673303989857, + "loss": 0.5198, + "step": 16040 + }, + { + "epoch": 314.71, + "learning_rate": 0.00013227057441027594, + "loss": 0.518, + "step": 16050 + }, + { + "epoch": 314.9, + "learning_rate": 0.00013224439905892282, + "loss": 0.5195, + "step": 16060 + }, + { + "epoch": 315.0, + "eval_loss": 0.5229699015617371, + "eval_runtime": 2.3109, + "eval_samples_per_second": 986.188, + "eval_steps_per_second": 3.895, + "step": 16065 + }, + { + "epoch": 315.1, + "learning_rate": 0.00013221820699348187, + "loss": 0.5143, + "step": 16070 + }, + { + "epoch": 315.29, + "learning_rate": 0.00013219199822160058, + "loss": 0.5203, + "step": 16080 + }, + { + "epoch": 315.49, + "learning_rate": 0.00013216577275093126, + "loss": 0.5183, + "step": 16090 + }, + { + "epoch": 315.69, + "learning_rate": 0.00013213953058913126, + "loss": 0.5175, + "step": 16100 + }, + { + "epoch": 315.88, + "learning_rate": 0.00013211327174386266, + "loss": 0.5189, + "step": 16110 + }, + { + "epoch": 316.0, + "eval_loss": 0.523881733417511, + "eval_runtime": 2.3072, + "eval_samples_per_second": 987.77, + "eval_steps_per_second": 3.901, + "step": 16116 + }, + { + "epoch": 316.08, + "learning_rate": 0.00013208699622279247, + "loss": 0.5161, + "step": 16120 + }, + { + "epoch": 316.27, + "learning_rate": 0.0001320607040335925, + "loss": 0.523, + "step": 16130 + }, + { + "epoch": 316.47, + "learning_rate": 0.00013203439518393956, + "loss": 0.5188, + "step": 16140 + }, + { + "epoch": 316.67, + "learning_rate": 0.00013200806968151522, + "loss": 0.5192, + "step": 16150 + }, + { + "epoch": 316.86, + "learning_rate": 0.00013198172753400595, + "loss": 0.5226, + "step": 16160 + }, + { + "epoch": 317.0, + "eval_loss": 0.5179664492607117, + "eval_runtime": 2.3076, + "eval_samples_per_second": 987.623, + "eval_steps_per_second": 3.9, + "step": 16167 + }, + { + "epoch": 317.06, + "learning_rate": 0.00013195536874910304, + "loss": 0.515, + "step": 16170 + }, + { + "epoch": 317.25, + "learning_rate": 0.00013192899333450264, + "loss": 0.5194, + "step": 16180 + }, + { + "epoch": 317.45, + "learning_rate": 0.0001319026012979059, + "loss": 0.5164, + "step": 16190 + }, + { + "epoch": 317.65, + "learning_rate": 0.0001318761926470186, + "loss": 0.5135, + "step": 16200 + }, + { + "epoch": 317.84, + "learning_rate": 0.0001318497673895515, + "loss": 0.5166, + "step": 16210 + }, + { + "epoch": 318.0, + "eval_loss": 0.5197197794914246, + "eval_runtime": 2.3294, + "eval_samples_per_second": 978.381, + "eval_steps_per_second": 3.864, + "step": 16218 + }, + { + "epoch": 318.04, + "learning_rate": 0.00013182332553322021, + "loss": 0.5186, + "step": 16220 + }, + { + "epoch": 318.24, + "learning_rate": 0.00013179686708574513, + "loss": 0.5179, + "step": 16230 + }, + { + "epoch": 318.43, + "learning_rate": 0.00013177039205485158, + "loss": 0.5234, + "step": 16240 + }, + { + "epoch": 318.63, + "learning_rate": 0.00013174390044826963, + "loss": 0.5198, + "step": 16250 + }, + { + "epoch": 318.82, + "learning_rate": 0.00013171739227373427, + "loss": 0.5159, + "step": 16260 + }, + { + "epoch": 319.0, + "eval_loss": 0.5156288743019104, + "eval_runtime": 2.1651, + "eval_samples_per_second": 1052.585, + "eval_steps_per_second": 4.157, + "step": 16269 + }, + { + "epoch": 319.02, + "learning_rate": 0.0001316908675389853, + "loss": 0.516, + "step": 16270 + }, + { + "epoch": 319.22, + "learning_rate": 0.0001316643262517673, + "loss": 0.5215, + "step": 16280 + }, + { + "epoch": 319.41, + "learning_rate": 0.00013163776841982981, + "loss": 0.5148, + "step": 16290 + }, + { + "epoch": 319.61, + "learning_rate": 0.00013161119405092708, + "loss": 0.5153, + "step": 16300 + }, + { + "epoch": 319.8, + "learning_rate": 0.0001315846031528182, + "loss": 0.5167, + "step": 16310 + }, + { + "epoch": 320.0, + "learning_rate": 0.00013155799573326722, + "loss": 0.5156, + "step": 16320 + }, + { + "epoch": 320.0, + "eval_loss": 0.5203654170036316, + "eval_runtime": 2.301, + "eval_samples_per_second": 990.435, + "eval_steps_per_second": 3.911, + "step": 16320 + }, + { + "epoch": 320.2, + "learning_rate": 0.00013153137180004282, + "loss": 0.522, + "step": 16330 + }, + { + "epoch": 320.39, + "learning_rate": 0.0001315047313609186, + "loss": 0.5152, + "step": 16340 + }, + { + "epoch": 320.59, + "learning_rate": 0.00013147807442367304, + "loss": 0.5187, + "step": 16350 + }, + { + "epoch": 320.78, + "learning_rate": 0.00013145140099608932, + "loss": 0.5196, + "step": 16360 + }, + { + "epoch": 320.98, + "learning_rate": 0.00013142471108595552, + "loss": 0.5179, + "step": 16370 + }, + { + "epoch": 321.0, + "eval_loss": 0.5215443968772888, + "eval_runtime": 2.1498, + "eval_samples_per_second": 1060.099, + "eval_steps_per_second": 4.186, + "step": 16371 + }, + { + "epoch": 321.18, + "learning_rate": 0.00013139800470106443, + "loss": 0.5163, + "step": 16380 + }, + { + "epoch": 321.37, + "learning_rate": 0.00013137128184921378, + "loss": 0.5231, + "step": 16390 + }, + { + "epoch": 321.57, + "learning_rate": 0.00013134454253820605, + "loss": 0.5158, + "step": 16400 + }, + { + "epoch": 321.76, + "learning_rate": 0.0001313177867758485, + "loss": 0.5173, + "step": 16410 + }, + { + "epoch": 321.96, + "learning_rate": 0.0001312910145699532, + "loss": 0.5194, + "step": 16420 + }, + { + "epoch": 322.0, + "eval_loss": 0.5210621953010559, + "eval_runtime": 2.2088, + "eval_samples_per_second": 1031.76, + "eval_steps_per_second": 4.075, + "step": 16422 + }, + { + "epoch": 322.16, + "learning_rate": 0.0001312642259283371, + "loss": 0.5175, + "step": 16430 + }, + { + "epoch": 322.35, + "learning_rate": 0.0001312374208588218, + "loss": 0.5196, + "step": 16440 + }, + { + "epoch": 322.55, + "learning_rate": 0.00013121059936923384, + "loss": 0.5194, + "step": 16450 + }, + { + "epoch": 322.75, + "learning_rate": 0.00013118376146740448, + "loss": 0.5213, + "step": 16460 + }, + { + "epoch": 322.94, + "learning_rate": 0.00013115690716116978, + "loss": 0.519, + "step": 16470 + }, + { + "epoch": 323.0, + "eval_loss": 0.5211889147758484, + "eval_runtime": 2.1133, + "eval_samples_per_second": 1078.407, + "eval_steps_per_second": 4.259, + "step": 16473 + }, + { + "epoch": 323.14, + "learning_rate": 0.00013113003645837064, + "loss": 0.5173, + "step": 16480 + }, + { + "epoch": 323.33, + "learning_rate": 0.00013110314936685264, + "loss": 0.5228, + "step": 16490 + }, + { + "epoch": 323.53, + "learning_rate": 0.00013107624589446626, + "loss": 0.5124, + "step": 16500 + }, + { + "epoch": 323.73, + "learning_rate": 0.0001310493260490667, + "loss": 0.5142, + "step": 16510 + }, + { + "epoch": 323.92, + "learning_rate": 0.00013102238983851396, + "loss": 0.5112, + "step": 16520 + }, + { + "epoch": 324.0, + "eval_loss": 0.5174744725227356, + "eval_runtime": 2.2342, + "eval_samples_per_second": 1020.039, + "eval_steps_per_second": 4.028, + "step": 16524 + }, + { + "epoch": 324.12, + "learning_rate": 0.0001309954372706728, + "loss": 0.5118, + "step": 16530 + }, + { + "epoch": 324.31, + "learning_rate": 0.0001309684683534128, + "loss": 0.5161, + "step": 16540 + }, + { + "epoch": 324.51, + "learning_rate": 0.00013094148309460824, + "loss": 0.5108, + "step": 16550 + }, + { + "epoch": 324.71, + "learning_rate": 0.00013091448150213825, + "loss": 0.5161, + "step": 16560 + }, + { + "epoch": 324.9, + "learning_rate": 0.00013088746358388666, + "loss": 0.5163, + "step": 16570 + }, + { + "epoch": 325.0, + "eval_loss": 0.522521436214447, + "eval_runtime": 2.273, + "eval_samples_per_second": 1002.657, + "eval_steps_per_second": 3.96, + "step": 16575 + }, + { + "epoch": 325.1, + "learning_rate": 0.0001308604293477421, + "loss": 0.5235, + "step": 16580 + }, + { + "epoch": 325.29, + "learning_rate": 0.00013083337880159798, + "loss": 0.5158, + "step": 16590 + }, + { + "epoch": 325.49, + "learning_rate": 0.0001308063119533525, + "loss": 0.5184, + "step": 16600 + }, + { + "epoch": 325.69, + "learning_rate": 0.00013077922881090848, + "loss": 0.5217, + "step": 16610 + }, + { + "epoch": 325.88, + "learning_rate": 0.00013075212938217366, + "loss": 0.5165, + "step": 16620 + }, + { + "epoch": 326.0, + "eval_loss": 0.5172427296638489, + "eval_runtime": 2.2537, + "eval_samples_per_second": 1011.23, + "eval_steps_per_second": 3.993, + "step": 16626 + }, + { + "epoch": 326.08, + "learning_rate": 0.00013072501367506045, + "loss": 0.5107, + "step": 16630 + }, + { + "epoch": 326.27, + "learning_rate": 0.00013069788169748606, + "loss": 0.5164, + "step": 16640 + }, + { + "epoch": 326.47, + "learning_rate": 0.00013067073345737236, + "loss": 0.5147, + "step": 16650 + }, + { + "epoch": 326.67, + "learning_rate": 0.0001306435689626461, + "loss": 0.5154, + "step": 16660 + }, + { + "epoch": 326.86, + "learning_rate": 0.00013061638822123867, + "loss": 0.5104, + "step": 16670 + }, + { + "epoch": 327.0, + "eval_loss": 0.5200419425964355, + "eval_runtime": 2.266, + "eval_samples_per_second": 1005.748, + "eval_steps_per_second": 3.972, + "step": 16677 + }, + { + "epoch": 327.06, + "learning_rate": 0.00013058919124108625, + "loss": 0.515, + "step": 16680 + }, + { + "epoch": 327.25, + "learning_rate": 0.00013056197803012972, + "loss": 0.5194, + "step": 16690 + }, + { + "epoch": 327.45, + "learning_rate": 0.0001305347485963148, + "loss": 0.5123, + "step": 16700 + }, + { + "epoch": 327.65, + "learning_rate": 0.00013050750294759178, + "loss": 0.5114, + "step": 16710 + }, + { + "epoch": 327.84, + "learning_rate": 0.00013048024109191587, + "loss": 0.51, + "step": 16720 + }, + { + "epoch": 328.0, + "eval_loss": 0.5156450271606445, + "eval_runtime": 2.1441, + "eval_samples_per_second": 1062.933, + "eval_steps_per_second": 4.198, + "step": 16728 + }, + { + "epoch": 328.04, + "learning_rate": 0.0001304529630372469, + "loss": 0.5106, + "step": 16730 + }, + { + "epoch": 328.24, + "learning_rate": 0.00013042566879154942, + "loss": 0.5113, + "step": 16740 + }, + { + "epoch": 328.43, + "learning_rate": 0.00013039835836279278, + "loss": 0.5138, + "step": 16750 + }, + { + "epoch": 328.63, + "learning_rate": 0.000130371031758951, + "loss": 0.5174, + "step": 16760 + }, + { + "epoch": 328.82, + "learning_rate": 0.00013034368898800282, + "loss": 0.5129, + "step": 16770 + }, + { + "epoch": 329.0, + "eval_loss": 0.5160460472106934, + "eval_runtime": 2.1772, + "eval_samples_per_second": 1046.767, + "eval_steps_per_second": 4.134, + "step": 16779 + }, + { + "epoch": 329.02, + "learning_rate": 0.00013031633005793175, + "loss": 0.5099, + "step": 16780 + }, + { + "epoch": 329.22, + "learning_rate": 0.000130288954976726, + "loss": 0.5173, + "step": 16790 + }, + { + "epoch": 329.41, + "learning_rate": 0.00013026156375237844, + "loss": 0.5143, + "step": 16800 + }, + { + "epoch": 329.61, + "learning_rate": 0.00013023415639288675, + "loss": 0.5119, + "step": 16810 + }, + { + "epoch": 329.8, + "learning_rate": 0.0001302067329062532, + "loss": 0.5138, + "step": 16820 + }, + { + "epoch": 330.0, + "learning_rate": 0.00013017929330048485, + "loss": 0.5084, + "step": 16830 + }, + { + "epoch": 330.0, + "eval_loss": 0.5207294821739197, + "eval_runtime": 2.1796, + "eval_samples_per_second": 1045.587, + "eval_steps_per_second": 4.129, + "step": 16830 + }, + { + "epoch": 330.2, + "learning_rate": 0.00013015183758359353, + "loss": 0.5211, + "step": 16840 + }, + { + "epoch": 330.39, + "learning_rate": 0.0001301243657635956, + "loss": 0.5176, + "step": 16850 + }, + { + "epoch": 330.59, + "learning_rate": 0.00013009687784851226, + "loss": 0.5126, + "step": 16860 + }, + { + "epoch": 330.78, + "learning_rate": 0.00013006937384636938, + "loss": 0.5135, + "step": 16870 + }, + { + "epoch": 330.98, + "learning_rate": 0.0001300418537651975, + "loss": 0.5159, + "step": 16880 + }, + { + "epoch": 331.0, + "eval_loss": 0.5146752595901489, + "eval_runtime": 2.3002, + "eval_samples_per_second": 990.782, + "eval_steps_per_second": 3.913, + "step": 16881 + }, + { + "epoch": 331.18, + "learning_rate": 0.00013001431761303187, + "loss": 0.512, + "step": 16890 + }, + { + "epoch": 331.37, + "learning_rate": 0.00012998676539791246, + "loss": 0.5153, + "step": 16900 + }, + { + "epoch": 331.57, + "learning_rate": 0.00012995919712788383, + "loss": 0.5155, + "step": 16910 + }, + { + "epoch": 331.76, + "learning_rate": 0.00012993161281099538, + "loss": 0.5066, + "step": 16920 + }, + { + "epoch": 331.96, + "learning_rate": 0.00012990401245530108, + "loss": 0.5126, + "step": 16930 + }, + { + "epoch": 332.0, + "eval_loss": 0.5158648490905762, + "eval_runtime": 2.2177, + "eval_samples_per_second": 1027.619, + "eval_steps_per_second": 4.058, + "step": 16932 + }, + { + "epoch": 332.16, + "learning_rate": 0.00012987639606885964, + "loss": 0.5167, + "step": 16940 + }, + { + "epoch": 332.35, + "learning_rate": 0.0001298487636597344, + "loss": 0.5125, + "step": 16950 + }, + { + "epoch": 332.55, + "learning_rate": 0.0001298211152359934, + "loss": 0.5113, + "step": 16960 + }, + { + "epoch": 332.75, + "learning_rate": 0.0001297934508057094, + "loss": 0.5142, + "step": 16970 + }, + { + "epoch": 332.94, + "learning_rate": 0.00012976577037695974, + "loss": 0.5132, + "step": 16980 + }, + { + "epoch": 333.0, + "eval_loss": 0.5156267881393433, + "eval_runtime": 2.1373, + "eval_samples_per_second": 1066.28, + "eval_steps_per_second": 4.211, + "step": 16983 + }, + { + "epoch": 333.14, + "learning_rate": 0.00012973807395782655, + "loss": 0.5137, + "step": 16990 + }, + { + "epoch": 333.33, + "learning_rate": 0.00012971036155639656, + "loss": 0.5127, + "step": 17000 + }, + { + "epoch": 333.53, + "learning_rate": 0.00012968263318076113, + "loss": 0.5107, + "step": 17010 + }, + { + "epoch": 333.73, + "learning_rate": 0.00012965488883901635, + "loss": 0.5133, + "step": 17020 + }, + { + "epoch": 333.92, + "learning_rate": 0.00012962712853926297, + "loss": 0.5092, + "step": 17030 + }, + { + "epoch": 334.0, + "eval_loss": 0.5151438117027283, + "eval_runtime": 2.2763, + "eval_samples_per_second": 1001.176, + "eval_steps_per_second": 3.954, + "step": 17034 + }, + { + "epoch": 334.12, + "learning_rate": 0.00012959935228960636, + "loss": 0.5148, + "step": 17040 + }, + { + "epoch": 334.31, + "learning_rate": 0.00012957156009815656, + "loss": 0.5087, + "step": 17050 + }, + { + "epoch": 334.51, + "learning_rate": 0.00012954375197302826, + "loss": 0.5124, + "step": 17060 + }, + { + "epoch": 334.71, + "learning_rate": 0.00012951592792234085, + "loss": 0.5121, + "step": 17070 + }, + { + "epoch": 334.9, + "learning_rate": 0.00012948808795421827, + "loss": 0.5116, + "step": 17080 + }, + { + "epoch": 335.0, + "eval_loss": 0.514667809009552, + "eval_runtime": 2.1506, + "eval_samples_per_second": 1059.712, + "eval_steps_per_second": 4.185, + "step": 17085 + }, + { + "epoch": 335.1, + "learning_rate": 0.00012946023207678926, + "loss": 0.5167, + "step": 17090 + }, + { + "epoch": 335.29, + "learning_rate": 0.00012943236029818703, + "loss": 0.5145, + "step": 17100 + }, + { + "epoch": 335.49, + "learning_rate": 0.0001294044726265496, + "loss": 0.5099, + "step": 17110 + }, + { + "epoch": 335.69, + "learning_rate": 0.00012937656907001944, + "loss": 0.5061, + "step": 17120 + }, + { + "epoch": 335.88, + "learning_rate": 0.00012934864963674386, + "loss": 0.5113, + "step": 17130 + }, + { + "epoch": 336.0, + "eval_loss": 0.5120841264724731, + "eval_runtime": 2.2524, + "eval_samples_per_second": 1011.8, + "eval_steps_per_second": 3.996, + "step": 17136 + }, + { + "epoch": 336.08, + "learning_rate": 0.00012932071433487466, + "loss": 0.5057, + "step": 17140 + }, + { + "epoch": 336.27, + "learning_rate": 0.00012929276317256836, + "loss": 0.5119, + "step": 17150 + }, + { + "epoch": 336.47, + "learning_rate": 0.00012926479615798606, + "loss": 0.516, + "step": 17160 + }, + { + "epoch": 336.67, + "learning_rate": 0.0001292368132992935, + "loss": 0.5067, + "step": 17170 + }, + { + "epoch": 336.86, + "learning_rate": 0.00012920881460466106, + "loss": 0.5076, + "step": 17180 + }, + { + "epoch": 337.0, + "eval_loss": 0.5100632309913635, + "eval_runtime": 2.2378, + "eval_samples_per_second": 1018.414, + "eval_steps_per_second": 4.022, + "step": 17187 + }, + { + "epoch": 337.06, + "learning_rate": 0.0001291808000822637, + "loss": 0.5091, + "step": 17190 + }, + { + "epoch": 337.25, + "learning_rate": 0.0001291527697402811, + "loss": 0.5122, + "step": 17200 + }, + { + "epoch": 337.45, + "learning_rate": 0.00012912472358689745, + "loss": 0.5101, + "step": 17210 + }, + { + "epoch": 337.65, + "learning_rate": 0.0001290966616303016, + "loss": 0.5114, + "step": 17220 + }, + { + "epoch": 337.84, + "learning_rate": 0.00012906858387868705, + "loss": 0.5106, + "step": 17230 + }, + { + "epoch": 338.0, + "eval_loss": 0.5111255645751953, + "eval_runtime": 2.1515, + "eval_samples_per_second": 1059.276, + "eval_steps_per_second": 4.183, + "step": 17238 + }, + { + "epoch": 338.04, + "learning_rate": 0.00012904049034025183, + "loss": 0.5066, + "step": 17240 + }, + { + "epoch": 338.24, + "learning_rate": 0.0001290123810231987, + "loss": 0.506, + "step": 17250 + }, + { + "epoch": 338.43, + "learning_rate": 0.00012898425593573483, + "loss": 0.5117, + "step": 17260 + }, + { + "epoch": 338.63, + "learning_rate": 0.0001289561150860722, + "loss": 0.5159, + "step": 17270 + }, + { + "epoch": 338.82, + "learning_rate": 0.00012892795848242736, + "loss": 0.5117, + "step": 17280 + }, + { + "epoch": 339.0, + "eval_loss": 0.5093927383422852, + "eval_runtime": 2.2305, + "eval_samples_per_second": 1021.723, + "eval_steps_per_second": 4.035, + "step": 17289 + }, + { + "epoch": 339.02, + "learning_rate": 0.0001288997861330213, + "loss": 0.5084, + "step": 17290 + }, + { + "epoch": 339.22, + "learning_rate": 0.00012887159804607983, + "loss": 0.5062, + "step": 17300 + }, + { + "epoch": 339.41, + "learning_rate": 0.00012884339422983314, + "loss": 0.5113, + "step": 17310 + }, + { + "epoch": 339.61, + "learning_rate": 0.00012881517469251616, + "loss": 0.5081, + "step": 17320 + }, + { + "epoch": 339.8, + "learning_rate": 0.00012878693944236836, + "loss": 0.5099, + "step": 17330 + }, + { + "epoch": 340.0, + "learning_rate": 0.00012875868848763385, + "loss": 0.5086, + "step": 17340 + }, + { + "epoch": 340.0, + "eval_loss": 0.513230562210083, + "eval_runtime": 2.2424, + "eval_samples_per_second": 1016.333, + "eval_steps_per_second": 4.014, + "step": 17340 + }, + { + "epoch": 340.2, + "learning_rate": 0.0001287304218365612, + "loss": 0.5062, + "step": 17350 + }, + { + "epoch": 340.39, + "learning_rate": 0.0001287021394974037, + "loss": 0.5102, + "step": 17360 + }, + { + "epoch": 340.59, + "learning_rate": 0.0001286738414784191, + "loss": 0.5076, + "step": 17370 + }, + { + "epoch": 340.78, + "learning_rate": 0.00012864552778786984, + "loss": 0.5017, + "step": 17380 + }, + { + "epoch": 340.98, + "learning_rate": 0.0001286171984340229, + "loss": 0.5034, + "step": 17390 + }, + { + "epoch": 341.0, + "eval_loss": 0.5161563754081726, + "eval_runtime": 2.1498, + "eval_samples_per_second": 1060.082, + "eval_steps_per_second": 4.186, + "step": 17391 + }, + { + "epoch": 341.18, + "learning_rate": 0.0001285888534251498, + "loss": 0.5104, + "step": 17400 + }, + { + "epoch": 341.37, + "learning_rate": 0.00012856049276952663, + "loss": 0.509, + "step": 17410 + }, + { + "epoch": 341.57, + "learning_rate": 0.0001285321164754341, + "loss": 0.5104, + "step": 17420 + }, + { + "epoch": 341.76, + "learning_rate": 0.00012850372455115746, + "loss": 0.5075, + "step": 17430 + }, + { + "epoch": 341.96, + "learning_rate": 0.00012847531700498646, + "loss": 0.5061, + "step": 17440 + }, + { + "epoch": 342.0, + "eval_loss": 0.5142490267753601, + "eval_runtime": 2.2582, + "eval_samples_per_second": 1009.211, + "eval_steps_per_second": 3.985, + "step": 17442 + }, + { + "epoch": 342.16, + "learning_rate": 0.00012844689384521553, + "loss": 0.5063, + "step": 17450 + }, + { + "epoch": 342.35, + "learning_rate": 0.00012841845508014356, + "loss": 0.5092, + "step": 17460 + }, + { + "epoch": 342.55, + "learning_rate": 0.00012839000071807407, + "loss": 0.5075, + "step": 17470 + }, + { + "epoch": 342.75, + "learning_rate": 0.0001283615307673151, + "loss": 0.5106, + "step": 17480 + }, + { + "epoch": 342.94, + "learning_rate": 0.0001283330452361792, + "loss": 0.5101, + "step": 17490 + }, + { + "epoch": 343.0, + "eval_loss": 0.5135881304740906, + "eval_runtime": 2.3064, + "eval_samples_per_second": 988.109, + "eval_steps_per_second": 3.902, + "step": 17493 + }, + { + "epoch": 343.14, + "learning_rate": 0.00012830454413298353, + "loss": 0.5114, + "step": 17500 + }, + { + "epoch": 343.33, + "learning_rate": 0.00012827602746604978, + "loss": 0.5087, + "step": 17510 + }, + { + "epoch": 343.53, + "learning_rate": 0.0001282474952437042, + "loss": 0.5092, + "step": 17520 + }, + { + "epoch": 343.73, + "learning_rate": 0.00012821894747427754, + "loss": 0.507, + "step": 17530 + }, + { + "epoch": 343.92, + "learning_rate": 0.0001281903841661051, + "loss": 0.5042, + "step": 17540 + }, + { + "epoch": 344.0, + "eval_loss": 0.5135248899459839, + "eval_runtime": 2.2046, + "eval_samples_per_second": 1033.765, + "eval_steps_per_second": 4.082, + "step": 17544 + }, + { + "epoch": 344.12, + "learning_rate": 0.00012816180532752676, + "loss": 0.5102, + "step": 17550 + }, + { + "epoch": 344.31, + "learning_rate": 0.0001281332109668869, + "loss": 0.5064, + "step": 17560 + }, + { + "epoch": 344.51, + "learning_rate": 0.00012810460109253437, + "loss": 0.506, + "step": 17570 + }, + { + "epoch": 344.71, + "learning_rate": 0.00012807597571282272, + "loss": 0.5015, + "step": 17580 + }, + { + "epoch": 344.9, + "learning_rate": 0.00012804733483610982, + "loss": 0.5091, + "step": 17590 + }, + { + "epoch": 345.0, + "eval_loss": 0.5083193182945251, + "eval_runtime": 2.1876, + "eval_samples_per_second": 1041.783, + "eval_steps_per_second": 4.114, + "step": 17595 + }, + { + "epoch": 345.1, + "learning_rate": 0.00012801867847075826, + "loss": 0.5072, + "step": 17600 + }, + { + "epoch": 345.29, + "learning_rate": 0.000127990006625135, + "loss": 0.5105, + "step": 17610 + }, + { + "epoch": 345.49, + "learning_rate": 0.0001279613193076116, + "loss": 0.508, + "step": 17620 + }, + { + "epoch": 345.69, + "learning_rate": 0.00012793261652656413, + "loss": 0.5034, + "step": 17630 + }, + { + "epoch": 345.88, + "learning_rate": 0.00012790389829037314, + "loss": 0.5095, + "step": 17640 + }, + { + "epoch": 346.0, + "eval_loss": 0.5112407803535461, + "eval_runtime": 2.2322, + "eval_samples_per_second": 1020.968, + "eval_steps_per_second": 4.032, + "step": 17646 + }, + { + "epoch": 346.08, + "learning_rate": 0.00012787516460742372, + "loss": 0.5101, + "step": 17650 + }, + { + "epoch": 346.27, + "learning_rate": 0.00012784641548610546, + "loss": 0.5126, + "step": 17660 + }, + { + "epoch": 346.47, + "learning_rate": 0.00012781765093481247, + "loss": 0.5, + "step": 17670 + }, + { + "epoch": 346.67, + "learning_rate": 0.00012778887096194334, + "loss": 0.5035, + "step": 17680 + }, + { + "epoch": 346.86, + "learning_rate": 0.00012776007557590123, + "loss": 0.5058, + "step": 17690 + }, + { + "epoch": 347.0, + "eval_loss": 0.5121301412582397, + "eval_runtime": 2.2108, + "eval_samples_per_second": 1030.829, + "eval_steps_per_second": 4.071, + "step": 17697 + }, + { + "epoch": 347.06, + "learning_rate": 0.00012773126478509369, + "loss": 0.5002, + "step": 17700 + }, + { + "epoch": 347.25, + "learning_rate": 0.00012770243859793284, + "loss": 0.5069, + "step": 17710 + }, + { + "epoch": 347.45, + "learning_rate": 0.00012767359702283533, + "loss": 0.5106, + "step": 17720 + }, + { + "epoch": 347.65, + "learning_rate": 0.00012764474006822223, + "loss": 0.5048, + "step": 17730 + }, + { + "epoch": 347.84, + "learning_rate": 0.0001276158677425191, + "loss": 0.504, + "step": 17740 + }, + { + "epoch": 348.0, + "eval_loss": 0.508244514465332, + "eval_runtime": 2.1836, + "eval_samples_per_second": 1043.712, + "eval_steps_per_second": 4.122, + "step": 17748 + }, + { + "epoch": 348.04, + "learning_rate": 0.00012758698005415603, + "loss": 0.505, + "step": 17750 + }, + { + "epoch": 348.24, + "learning_rate": 0.0001275580770115676, + "loss": 0.5107, + "step": 17760 + }, + { + "epoch": 348.43, + "learning_rate": 0.00012752915862319285, + "loss": 0.5037, + "step": 17770 + }, + { + "epoch": 348.63, + "learning_rate": 0.00012750022489747527, + "loss": 0.5065, + "step": 17780 + }, + { + "epoch": 348.82, + "learning_rate": 0.0001274712758428629, + "loss": 0.5016, + "step": 17790 + }, + { + "epoch": 349.0, + "eval_loss": 0.5075437426567078, + "eval_runtime": 2.1943, + "eval_samples_per_second": 1038.611, + "eval_steps_per_second": 4.102, + "step": 17799 + }, + { + "epoch": 349.02, + "learning_rate": 0.00012744231146780821, + "loss": 0.5055, + "step": 17800 + }, + { + "epoch": 349.22, + "learning_rate": 0.00012741333178076816, + "loss": 0.5014, + "step": 17810 + }, + { + "epoch": 349.41, + "learning_rate": 0.00012738433679020412, + "loss": 0.5062, + "step": 17820 + }, + { + "epoch": 349.61, + "learning_rate": 0.00012735532650458208, + "loss": 0.5094, + "step": 17830 + }, + { + "epoch": 349.8, + "learning_rate": 0.0001273263009323723, + "loss": 0.506, + "step": 17840 + }, + { + "epoch": 350.0, + "learning_rate": 0.00012729726008204963, + "loss": 0.5042, + "step": 17850 + }, + { + "epoch": 350.0, + "eval_loss": 0.5090273022651672, + "eval_runtime": 2.2206, + "eval_samples_per_second": 1026.319, + "eval_steps_per_second": 4.053, + "step": 17850 + }, + { + "epoch": 350.2, + "learning_rate": 0.0001272682039620934, + "loss": 0.5059, + "step": 17860 + }, + { + "epoch": 350.39, + "learning_rate": 0.00012723913258098728, + "loss": 0.5087, + "step": 17870 + }, + { + "epoch": 350.59, + "learning_rate": 0.0001272100459472195, + "loss": 0.5047, + "step": 17880 + }, + { + "epoch": 350.78, + "learning_rate": 0.0001271809440692827, + "loss": 0.5051, + "step": 17890 + }, + { + "epoch": 350.98, + "learning_rate": 0.00012715182695567396, + "loss": 0.5036, + "step": 17900 + }, + { + "epoch": 351.0, + "eval_loss": 0.5089225172996521, + "eval_runtime": 2.2425, + "eval_samples_per_second": 1016.26, + "eval_steps_per_second": 4.013, + "step": 17901 + }, + { + "epoch": 351.18, + "learning_rate": 0.00012712269461489487, + "loss": 0.5095, + "step": 17910 + }, + { + "epoch": 351.37, + "learning_rate": 0.00012709354705545136, + "loss": 0.5009, + "step": 17920 + }, + { + "epoch": 351.57, + "learning_rate": 0.00012706438428585395, + "loss": 0.5073, + "step": 17930 + }, + { + "epoch": 351.76, + "learning_rate": 0.00012703520631461747, + "loss": 0.5043, + "step": 17940 + }, + { + "epoch": 351.96, + "learning_rate": 0.00012700601315026124, + "loss": 0.5045, + "step": 17950 + }, + { + "epoch": 352.0, + "eval_loss": 0.5094715356826782, + "eval_runtime": 2.2171, + "eval_samples_per_second": 1027.922, + "eval_steps_per_second": 4.059, + "step": 17952 + }, + { + "epoch": 352.16, + "learning_rate": 0.00012697680480130904, + "loss": 0.5054, + "step": 17960 + }, + { + "epoch": 352.35, + "learning_rate": 0.000126947581276289, + "loss": 0.5047, + "step": 17970 + }, + { + "epoch": 352.55, + "learning_rate": 0.0001269183425837338, + "loss": 0.508, + "step": 17980 + }, + { + "epoch": 352.75, + "learning_rate": 0.00012688908873218044, + "loss": 0.4993, + "step": 17990 + }, + { + "epoch": 352.94, + "learning_rate": 0.00012685981973017038, + "loss": 0.5067, + "step": 18000 + }, + { + "epoch": 353.0, + "eval_loss": 0.5087113976478577, + "eval_runtime": 2.19, + "eval_samples_per_second": 1040.626, + "eval_steps_per_second": 4.11, + "step": 18003 + }, + { + "epoch": 353.14, + "learning_rate": 0.0001268305355862496, + "loss": 0.5036, + "step": 18010 + }, + { + "epoch": 353.33, + "learning_rate": 0.0001268012363089683, + "loss": 0.5057, + "step": 18020 + }, + { + "epoch": 353.53, + "learning_rate": 0.00012677192190688134, + "loss": 0.5041, + "step": 18030 + }, + { + "epoch": 353.73, + "learning_rate": 0.00012674259238854778, + "loss": 0.5021, + "step": 18040 + }, + { + "epoch": 353.92, + "learning_rate": 0.00012671324776253123, + "loss": 0.5026, + "step": 18050 + }, + { + "epoch": 354.0, + "eval_loss": 0.5063843727111816, + "eval_runtime": 2.1337, + "eval_samples_per_second": 1068.082, + "eval_steps_per_second": 4.218, + "step": 18054 + }, + { + "epoch": 354.12, + "learning_rate": 0.00012668388803739963, + "loss": 0.5051, + "step": 18060 + }, + { + "epoch": 354.31, + "learning_rate": 0.0001266545132217254, + "loss": 0.5015, + "step": 18070 + }, + { + "epoch": 354.51, + "learning_rate": 0.00012662512332408532, + "loss": 0.5058, + "step": 18080 + }, + { + "epoch": 354.71, + "learning_rate": 0.00012659571835306057, + "loss": 0.5064, + "step": 18090 + }, + { + "epoch": 354.9, + "learning_rate": 0.00012656629831723674, + "loss": 0.5001, + "step": 18100 + }, + { + "epoch": 355.0, + "eval_loss": 0.5055447220802307, + "eval_runtime": 2.2709, + "eval_samples_per_second": 1003.563, + "eval_steps_per_second": 3.963, + "step": 18105 + }, + { + "epoch": 355.1, + "learning_rate": 0.00012653686322520387, + "loss": 0.5007, + "step": 18110 + }, + { + "epoch": 355.29, + "learning_rate": 0.0001265074130855563, + "loss": 0.5039, + "step": 18120 + }, + { + "epoch": 355.49, + "learning_rate": 0.00012647794790689285, + "loss": 0.501, + "step": 18130 + }, + { + "epoch": 355.69, + "learning_rate": 0.00012644846769781668, + "loss": 0.5054, + "step": 18140 + }, + { + "epoch": 355.88, + "learning_rate": 0.00012641897246693534, + "loss": 0.5036, + "step": 18150 + }, + { + "epoch": 356.0, + "eval_loss": 0.5056591629981995, + "eval_runtime": 2.1488, + "eval_samples_per_second": 1060.609, + "eval_steps_per_second": 4.188, + "step": 18156 + }, + { + "epoch": 356.08, + "learning_rate": 0.00012638946222286082, + "loss": 0.4965, + "step": 18160 + }, + { + "epoch": 356.27, + "learning_rate": 0.00012635993697420942, + "loss": 0.499, + "step": 18170 + }, + { + "epoch": 356.47, + "learning_rate": 0.00012633039672960183, + "loss": 0.5056, + "step": 18180 + }, + { + "epoch": 356.67, + "learning_rate": 0.00012630084149766322, + "loss": 0.5045, + "step": 18190 + }, + { + "epoch": 356.86, + "learning_rate": 0.000126271271287023, + "loss": 0.5012, + "step": 18200 + }, + { + "epoch": 357.0, + "eval_loss": 0.508310079574585, + "eval_runtime": 2.2643, + "eval_samples_per_second": 1006.498, + "eval_steps_per_second": 3.975, + "step": 18207 + }, + { + "epoch": 357.06, + "learning_rate": 0.00012624168610631502, + "loss": 0.501, + "step": 18210 + }, + { + "epoch": 357.25, + "learning_rate": 0.00012621208596417748, + "loss": 0.4977, + "step": 18220 + }, + { + "epoch": 357.45, + "learning_rate": 0.00012618247086925298, + "loss": 0.4988, + "step": 18230 + }, + { + "epoch": 357.65, + "learning_rate": 0.0001261528408301885, + "loss": 0.5085, + "step": 18240 + }, + { + "epoch": 357.84, + "learning_rate": 0.0001261231958556353, + "loss": 0.5031, + "step": 18250 + }, + { + "epoch": 358.0, + "eval_loss": 0.5109713077545166, + "eval_runtime": 2.2405, + "eval_samples_per_second": 1017.183, + "eval_steps_per_second": 4.017, + "step": 18258 + }, + { + "epoch": 358.04, + "learning_rate": 0.00012609353595424905, + "loss": 0.506, + "step": 18260 + }, + { + "epoch": 358.24, + "learning_rate": 0.0001260638611346898, + "loss": 0.5048, + "step": 18270 + }, + { + "epoch": 358.43, + "learning_rate": 0.00012603417140562195, + "loss": 0.5056, + "step": 18280 + }, + { + "epoch": 358.63, + "learning_rate": 0.00012600446677571423, + "loss": 0.5023, + "step": 18290 + }, + { + "epoch": 358.82, + "learning_rate": 0.0001259747472536397, + "loss": 0.5021, + "step": 18300 + }, + { + "epoch": 359.0, + "eval_loss": 0.5127790570259094, + "eval_runtime": 2.1395, + "eval_samples_per_second": 1065.184, + "eval_steps_per_second": 4.207, + "step": 18309 + }, + { + "epoch": 359.02, + "learning_rate": 0.00012594501284807582, + "loss": 0.503, + "step": 18310 + }, + { + "epoch": 359.22, + "learning_rate": 0.00012591526356770438, + "loss": 0.5041, + "step": 18320 + }, + { + "epoch": 359.41, + "learning_rate": 0.0001258854994212115, + "loss": 0.5002, + "step": 18330 + }, + { + "epoch": 359.61, + "learning_rate": 0.00012585572041728764, + "loss": 0.5071, + "step": 18340 + }, + { + "epoch": 359.8, + "learning_rate": 0.00012582592656462763, + "loss": 0.5027, + "step": 18350 + }, + { + "epoch": 360.0, + "learning_rate": 0.00012579611787193057, + "loss": 0.4973, + "step": 18360 + }, + { + "epoch": 360.0, + "eval_loss": 0.501369833946228, + "eval_runtime": 2.3093, + "eval_samples_per_second": 986.882, + "eval_steps_per_second": 3.897, + "step": 18360 + }, + { + "epoch": 360.2, + "learning_rate": 0.00012576629434789995, + "loss": 0.4983, + "step": 18370 + }, + { + "epoch": 360.39, + "learning_rate": 0.0001257364560012436, + "loss": 0.5008, + "step": 18380 + }, + { + "epoch": 360.59, + "learning_rate": 0.00012570660284067363, + "loss": 0.5024, + "step": 18390 + }, + { + "epoch": 360.78, + "learning_rate": 0.00012567673487490647, + "loss": 0.5037, + "step": 18400 + }, + { + "epoch": 360.98, + "learning_rate": 0.00012564685211266294, + "loss": 0.4988, + "step": 18410 + }, + { + "epoch": 361.0, + "eval_loss": 0.5028321743011475, + "eval_runtime": 2.297, + "eval_samples_per_second": 992.185, + "eval_steps_per_second": 3.918, + "step": 18411 + }, + { + "epoch": 361.18, + "learning_rate": 0.00012561695456266817, + "loss": 0.5011, + "step": 18420 + }, + { + "epoch": 361.37, + "learning_rate": 0.00012558704223365147, + "loss": 0.5029, + "step": 18430 + }, + { + "epoch": 361.57, + "learning_rate": 0.00012555711513434668, + "loss": 0.5038, + "step": 18440 + }, + { + "epoch": 361.76, + "learning_rate": 0.00012552717327349178, + "loss": 0.5029, + "step": 18450 + }, + { + "epoch": 361.96, + "learning_rate": 0.00012549721665982915, + "loss": 0.5013, + "step": 18460 + }, + { + "epoch": 362.0, + "eval_loss": 0.5034978985786438, + "eval_runtime": 2.1156, + "eval_samples_per_second": 1077.26, + "eval_steps_per_second": 4.254, + "step": 18462 + }, + { + "epoch": 362.16, + "learning_rate": 0.00012546724530210546, + "loss": 0.4995, + "step": 18470 + }, + { + "epoch": 362.35, + "learning_rate": 0.00012543725920907169, + "loss": 0.498, + "step": 18480 + }, + { + "epoch": 362.55, + "learning_rate": 0.00012540725838948308, + "loss": 0.5007, + "step": 18490 + }, + { + "epoch": 362.75, + "learning_rate": 0.0001253772428520992, + "loss": 0.4995, + "step": 18500 + }, + { + "epoch": 362.94, + "learning_rate": 0.00012534721260568392, + "loss": 0.5001, + "step": 18510 + }, + { + "epoch": 363.0, + "eval_loss": 0.5039771199226379, + "eval_runtime": 2.2553, + "eval_samples_per_second": 1010.502, + "eval_steps_per_second": 3.991, + "step": 18513 + }, + { + "epoch": 363.14, + "learning_rate": 0.00012531716765900545, + "loss": 0.4966, + "step": 18520 + }, + { + "epoch": 363.33, + "learning_rate": 0.00012528710802083617, + "loss": 0.5034, + "step": 18530 + }, + { + "epoch": 363.53, + "learning_rate": 0.00012525703369995286, + "loss": 0.4964, + "step": 18540 + }, + { + "epoch": 363.73, + "learning_rate": 0.00012522694470513658, + "loss": 0.5035, + "step": 18550 + }, + { + "epoch": 363.92, + "learning_rate": 0.00012519684104517258, + "loss": 0.4972, + "step": 18560 + }, + { + "epoch": 364.0, + "eval_loss": 0.5055744051933289, + "eval_runtime": 2.1783, + "eval_samples_per_second": 1046.212, + "eval_steps_per_second": 4.132, + "step": 18564 + }, + { + "epoch": 364.12, + "learning_rate": 0.0001251667227288505, + "loss": 0.5019, + "step": 18570 + }, + { + "epoch": 364.31, + "learning_rate": 0.00012513658976496424, + "loss": 0.4983, + "step": 18580 + }, + { + "epoch": 364.51, + "learning_rate": 0.00012510644216231188, + "loss": 0.4998, + "step": 18590 + }, + { + "epoch": 364.71, + "learning_rate": 0.0001250762799296959, + "loss": 0.4977, + "step": 18600 + }, + { + "epoch": 364.9, + "learning_rate": 0.000125046103075923, + "loss": 0.4994, + "step": 18610 + }, + { + "epoch": 365.0, + "eval_loss": 0.5070434808731079, + "eval_runtime": 2.2145, + "eval_samples_per_second": 1029.113, + "eval_steps_per_second": 4.064, + "step": 18615 + }, + { + "epoch": 365.1, + "learning_rate": 0.00012501591160980416, + "loss": 0.4925, + "step": 18620 + }, + { + "epoch": 365.29, + "learning_rate": 0.00012498570554015458, + "loss": 0.5013, + "step": 18630 + }, + { + "epoch": 365.49, + "learning_rate": 0.00012495548487579376, + "loss": 0.4956, + "step": 18640 + }, + { + "epoch": 365.69, + "learning_rate": 0.00012492524962554548, + "loss": 0.4945, + "step": 18650 + }, + { + "epoch": 365.88, + "learning_rate": 0.00012489499979823773, + "loss": 0.5005, + "step": 18660 + }, + { + "epoch": 366.0, + "eval_loss": 0.5070408582687378, + "eval_runtime": 2.2321, + "eval_samples_per_second": 1021.03, + "eval_steps_per_second": 4.032, + "step": 18666 + }, + { + "epoch": 366.08, + "learning_rate": 0.00012486473540270282, + "loss": 0.4997, + "step": 18670 + }, + { + "epoch": 366.27, + "learning_rate": 0.00012483445644777727, + "loss": 0.4993, + "step": 18680 + }, + { + "epoch": 366.47, + "learning_rate": 0.00012480416294230186, + "loss": 0.497, + "step": 18690 + }, + { + "epoch": 366.67, + "learning_rate": 0.00012477385489512158, + "loss": 0.497, + "step": 18700 + }, + { + "epoch": 366.86, + "learning_rate": 0.00012474353231508578, + "loss": 0.4993, + "step": 18710 + }, + { + "epoch": 367.0, + "eval_loss": 0.505254864692688, + "eval_runtime": 2.3073, + "eval_samples_per_second": 987.722, + "eval_steps_per_second": 3.901, + "step": 18717 + }, + { + "epoch": 367.06, + "learning_rate": 0.00012471319521104788, + "loss": 0.5011, + "step": 18720 + }, + { + "epoch": 367.25, + "learning_rate": 0.00012468284359186575, + "loss": 0.5013, + "step": 18730 + }, + { + "epoch": 367.45, + "learning_rate": 0.00012465247746640127, + "loss": 0.5033, + "step": 18740 + }, + { + "epoch": 367.65, + "learning_rate": 0.00012462209684352077, + "loss": 0.5043, + "step": 18750 + }, + { + "epoch": 367.84, + "learning_rate": 0.00012459170173209467, + "loss": 0.4975, + "step": 18760 + }, + { + "epoch": 368.0, + "eval_loss": 0.5035672187805176, + "eval_runtime": 2.1149, + "eval_samples_per_second": 1077.607, + "eval_steps_per_second": 4.256, + "step": 18768 + }, + { + "epoch": 368.04, + "learning_rate": 0.00012456129214099762, + "loss": 0.5014, + "step": 18770 + }, + { + "epoch": 368.24, + "learning_rate": 0.00012453086807910862, + "loss": 0.5, + "step": 18780 + }, + { + "epoch": 368.43, + "learning_rate": 0.0001245004295553108, + "loss": 0.4988, + "step": 18790 + }, + { + "epoch": 368.63, + "learning_rate": 0.0001244699765784915, + "loss": 0.501, + "step": 18800 + }, + { + "epoch": 368.82, + "learning_rate": 0.00012443950915754233, + "loss": 0.4967, + "step": 18810 + }, + { + "epoch": 369.0, + "eval_loss": 0.5026499629020691, + "eval_runtime": 2.1523, + "eval_samples_per_second": 1058.877, + "eval_steps_per_second": 4.182, + "step": 18819 + }, + { + "epoch": 369.02, + "learning_rate": 0.00012440902730135908, + "loss": 0.5034, + "step": 18820 + }, + { + "epoch": 369.22, + "learning_rate": 0.00012437853101884182, + "loss": 0.5019, + "step": 18830 + }, + { + "epoch": 369.41, + "learning_rate": 0.00012434802031889474, + "loss": 0.5041, + "step": 18840 + }, + { + "epoch": 369.61, + "learning_rate": 0.00012431749521042628, + "loss": 0.5081, + "step": 18850 + }, + { + "epoch": 369.8, + "learning_rate": 0.00012428695570234908, + "loss": 0.5021, + "step": 18860 + }, + { + "epoch": 370.0, + "learning_rate": 0.00012425640180358007, + "loss": 0.4968, + "step": 18870 + }, + { + "epoch": 370.0, + "eval_loss": 0.5011078119277954, + "eval_runtime": 2.1561, + "eval_samples_per_second": 1057.002, + "eval_steps_per_second": 4.174, + "step": 18870 + }, + { + "epoch": 370.2, + "learning_rate": 0.00012422583352304025, + "loss": 0.5004, + "step": 18880 + }, + { + "epoch": 370.39, + "learning_rate": 0.00012419525086965487, + "loss": 0.4996, + "step": 18890 + }, + { + "epoch": 370.59, + "learning_rate": 0.0001241646538523534, + "loss": 0.4972, + "step": 18900 + }, + { + "epoch": 370.78, + "learning_rate": 0.00012413404248006946, + "loss": 0.4946, + "step": 18910 + }, + { + "epoch": 370.98, + "learning_rate": 0.00012410341676174095, + "loss": 0.498, + "step": 18920 + }, + { + "epoch": 371.0, + "eval_loss": 0.4990316927433014, + "eval_runtime": 2.3201, + "eval_samples_per_second": 982.269, + "eval_steps_per_second": 3.879, + "step": 18921 + }, + { + "epoch": 371.18, + "learning_rate": 0.00012407277670630984, + "loss": 0.4921, + "step": 18930 + }, + { + "epoch": 371.37, + "learning_rate": 0.00012404212232272236, + "loss": 0.5002, + "step": 18940 + }, + { + "epoch": 371.57, + "learning_rate": 0.0001240114536199289, + "loss": 0.4923, + "step": 18950 + }, + { + "epoch": 371.76, + "learning_rate": 0.00012398077060688407, + "loss": 0.4992, + "step": 18960 + }, + { + "epoch": 371.96, + "learning_rate": 0.0001239500732925466, + "loss": 0.5022, + "step": 18970 + }, + { + "epoch": 372.0, + "eval_loss": 0.5031718611717224, + "eval_runtime": 2.2742, + "eval_samples_per_second": 1002.091, + "eval_steps_per_second": 3.957, + "step": 18972 + }, + { + "epoch": 372.16, + "learning_rate": 0.00012391936168587938, + "loss": 0.4934, + "step": 18980 + }, + { + "epoch": 372.35, + "learning_rate": 0.0001238886357958496, + "loss": 0.4988, + "step": 18990 + }, + { + "epoch": 372.55, + "learning_rate": 0.00012385789563142848, + "loss": 0.4995, + "step": 19000 + }, + { + "epoch": 372.75, + "learning_rate": 0.00012382714120159143, + "loss": 0.4984, + "step": 19010 + }, + { + "epoch": 372.94, + "learning_rate": 0.00012379637251531814, + "loss": 0.4959, + "step": 19020 + }, + { + "epoch": 373.0, + "eval_loss": 0.4971892833709717, + "eval_runtime": 2.1232, + "eval_samples_per_second": 1073.368, + "eval_steps_per_second": 4.239, + "step": 19023 + }, + { + "epoch": 373.14, + "learning_rate": 0.00012376558958159233, + "loss": 0.497, + "step": 19030 + }, + { + "epoch": 373.33, + "learning_rate": 0.00012373479240940198, + "loss": 0.4931, + "step": 19040 + }, + { + "epoch": 373.53, + "learning_rate": 0.0001237039810077391, + "loss": 0.4958, + "step": 19050 + }, + { + "epoch": 373.73, + "learning_rate": 0.00012367315538559996, + "loss": 0.4988, + "step": 19060 + }, + { + "epoch": 373.92, + "learning_rate": 0.00012364231555198497, + "loss": 0.4921, + "step": 19070 + }, + { + "epoch": 374.0, + "eval_loss": 0.4967401623725891, + "eval_runtime": 2.1905, + "eval_samples_per_second": 1040.405, + "eval_steps_per_second": 4.109, + "step": 19074 + }, + { + "epoch": 374.12, + "learning_rate": 0.00012361146151589866, + "loss": 0.4956, + "step": 19080 + }, + { + "epoch": 374.31, + "learning_rate": 0.00012358059328634974, + "loss": 0.498, + "step": 19090 + }, + { + "epoch": 374.51, + "learning_rate": 0.00012354971087235106, + "loss": 0.4947, + "step": 19100 + }, + { + "epoch": 374.71, + "learning_rate": 0.00012351881428291953, + "loss": 0.4924, + "step": 19110 + }, + { + "epoch": 374.9, + "learning_rate": 0.00012348790352707632, + "loss": 0.4936, + "step": 19120 + }, + { + "epoch": 375.0, + "eval_loss": 0.49671775102615356, + "eval_runtime": 2.1863, + "eval_samples_per_second": 1042.405, + "eval_steps_per_second": 4.117, + "step": 19125 + }, + { + "epoch": 375.1, + "learning_rate": 0.00012345697861384667, + "loss": 0.4902, + "step": 19130 + }, + { + "epoch": 375.29, + "learning_rate": 0.00012342603955225995, + "loss": 0.5001, + "step": 19140 + }, + { + "epoch": 375.49, + "learning_rate": 0.0001233950863513497, + "loss": 0.4989, + "step": 19150 + }, + { + "epoch": 375.69, + "learning_rate": 0.0001233641190201535, + "loss": 0.4882, + "step": 19160 + }, + { + "epoch": 375.88, + "learning_rate": 0.00012333313756771324, + "loss": 0.496, + "step": 19170 + }, + { + "epoch": 376.0, + "eval_loss": 0.5000470280647278, + "eval_runtime": 2.1974, + "eval_samples_per_second": 1037.127, + "eval_steps_per_second": 4.096, + "step": 19176 + }, + { + "epoch": 376.08, + "learning_rate": 0.0001233021420030747, + "loss": 0.4939, + "step": 19180 + }, + { + "epoch": 376.27, + "learning_rate": 0.00012327113233528796, + "loss": 0.4955, + "step": 19190 + }, + { + "epoch": 376.47, + "learning_rate": 0.00012324010857340712, + "loss": 0.502, + "step": 19200 + }, + { + "epoch": 376.67, + "learning_rate": 0.00012320907072649044, + "loss": 0.4931, + "step": 19210 + }, + { + "epoch": 376.86, + "learning_rate": 0.00012317801880360027, + "loss": 0.4941, + "step": 19220 + }, + { + "epoch": 377.0, + "eval_loss": 0.49797841906547546, + "eval_runtime": 2.17, + "eval_samples_per_second": 1050.218, + "eval_steps_per_second": 4.147, + "step": 19227 + }, + { + "epoch": 377.06, + "learning_rate": 0.00012314695281380307, + "loss": 0.4961, + "step": 19230 + }, + { + "epoch": 377.25, + "learning_rate": 0.00012311587276616945, + "loss": 0.4972, + "step": 19240 + }, + { + "epoch": 377.45, + "learning_rate": 0.0001230847786697741, + "loss": 0.4962, + "step": 19250 + }, + { + "epoch": 377.65, + "learning_rate": 0.0001230536705336957, + "loss": 0.498, + "step": 19260 + }, + { + "epoch": 377.84, + "learning_rate": 0.00012302254836701724, + "loss": 0.4937, + "step": 19270 + }, + { + "epoch": 378.0, + "eval_loss": 0.49754011631011963, + "eval_runtime": 2.1141, + "eval_samples_per_second": 1077.991, + "eval_steps_per_second": 4.257, + "step": 19278 + }, + { + "epoch": 378.04, + "learning_rate": 0.00012299141217882569, + "loss": 0.4955, + "step": 19280 + }, + { + "epoch": 378.24, + "learning_rate": 0.00012296026197821205, + "loss": 0.4938, + "step": 19290 + }, + { + "epoch": 378.43, + "learning_rate": 0.0001229290977742716, + "loss": 0.4989, + "step": 19300 + }, + { + "epoch": 378.63, + "learning_rate": 0.00012289791957610343, + "loss": 0.4934, + "step": 19310 + }, + { + "epoch": 378.82, + "learning_rate": 0.000122866727392811, + "loss": 0.4979, + "step": 19320 + }, + { + "epoch": 379.0, + "eval_loss": 0.49748286604881287, + "eval_runtime": 2.2523, + "eval_samples_per_second": 1011.858, + "eval_steps_per_second": 3.996, + "step": 19329 + }, + { + "epoch": 379.02, + "learning_rate": 0.00012283552123350174, + "loss": 0.4992, + "step": 19330 + }, + { + "epoch": 379.22, + "learning_rate": 0.0001228043011072871, + "loss": 0.4946, + "step": 19340 + }, + { + "epoch": 379.41, + "learning_rate": 0.00012277306702328266, + "loss": 0.495, + "step": 19350 + }, + { + "epoch": 379.61, + "learning_rate": 0.0001227418189906081, + "loss": 0.4932, + "step": 19360 + }, + { + "epoch": 379.8, + "learning_rate": 0.00012271055701838714, + "loss": 0.5027, + "step": 19370 + }, + { + "epoch": 380.0, + "learning_rate": 0.00012267928111574762, + "loss": 0.4996, + "step": 19380 + }, + { + "epoch": 380.0, + "eval_loss": 0.4932139813899994, + "eval_runtime": 2.2597, + "eval_samples_per_second": 1008.552, + "eval_steps_per_second": 3.983, + "step": 19380 + }, + { + "epoch": 380.2, + "learning_rate": 0.0001226479912918213, + "loss": 0.4889, + "step": 19390 + }, + { + "epoch": 380.39, + "learning_rate": 0.0001226166875557442, + "loss": 0.4946, + "step": 19400 + }, + { + "epoch": 380.59, + "learning_rate": 0.00012258536991665629, + "loss": 0.4995, + "step": 19410 + }, + { + "epoch": 380.78, + "learning_rate": 0.00012255403838370163, + "loss": 0.4914, + "step": 19420 + }, + { + "epoch": 380.98, + "learning_rate": 0.0001225226929660283, + "loss": 0.4961, + "step": 19430 + }, + { + "epoch": 381.0, + "eval_loss": 0.49828577041625977, + "eval_runtime": 2.1395, + "eval_samples_per_second": 1065.208, + "eval_steps_per_second": 4.207, + "step": 19431 + }, + { + "epoch": 381.18, + "learning_rate": 0.0001224913336727885, + "loss": 0.4935, + "step": 19440 + }, + { + "epoch": 381.37, + "learning_rate": 0.00012245996051313843, + "loss": 0.4931, + "step": 19450 + }, + { + "epoch": 381.57, + "learning_rate": 0.00012242857349623835, + "loss": 0.49, + "step": 19460 + }, + { + "epoch": 381.76, + "learning_rate": 0.00012239717263125256, + "loss": 0.4936, + "step": 19470 + }, + { + "epoch": 381.96, + "learning_rate": 0.00012236575792734942, + "loss": 0.4903, + "step": 19480 + }, + { + "epoch": 382.0, + "eval_loss": 0.497437059879303, + "eval_runtime": 2.2133, + "eval_samples_per_second": 1029.705, + "eval_steps_per_second": 4.066, + "step": 19482 + }, + { + "epoch": 382.16, + "learning_rate": 0.00012233432939370132, + "loss": 0.4949, + "step": 19490 + }, + { + "epoch": 382.35, + "learning_rate": 0.0001223028870394847, + "loss": 0.4926, + "step": 19500 + }, + { + "epoch": 382.55, + "learning_rate": 0.00012227143087388003, + "loss": 0.4876, + "step": 19510 + }, + { + "epoch": 382.75, + "learning_rate": 0.0001222399609060718, + "loss": 0.4913, + "step": 19520 + }, + { + "epoch": 382.94, + "learning_rate": 0.00012220847714524853, + "loss": 0.4899, + "step": 19530 + }, + { + "epoch": 383.0, + "eval_loss": 0.49533191323280334, + "eval_runtime": 2.2011, + "eval_samples_per_second": 1035.378, + "eval_steps_per_second": 4.089, + "step": 19533 + }, + { + "epoch": 383.14, + "learning_rate": 0.00012217697960060277, + "loss": 0.4923, + "step": 19540 + }, + { + "epoch": 383.33, + "learning_rate": 0.00012214546828133113, + "loss": 0.49, + "step": 19550 + }, + { + "epoch": 383.53, + "learning_rate": 0.00012211394319663421, + "loss": 0.4957, + "step": 19560 + }, + { + "epoch": 383.73, + "learning_rate": 0.00012208240435571664, + "loss": 0.4883, + "step": 19570 + }, + { + "epoch": 383.92, + "learning_rate": 0.00012205085176778698, + "loss": 0.4924, + "step": 19580 + }, + { + "epoch": 384.0, + "eval_loss": 0.4952709674835205, + "eval_runtime": 2.2127, + "eval_samples_per_second": 1029.974, + "eval_steps_per_second": 4.067, + "step": 19584 + }, + { + "epoch": 384.12, + "learning_rate": 0.000122019285442058, + "loss": 0.4929, + "step": 19590 + }, + { + "epoch": 384.31, + "learning_rate": 0.00012198770538774624, + "loss": 0.4935, + "step": 19600 + }, + { + "epoch": 384.51, + "learning_rate": 0.00012195611161407247, + "loss": 0.4876, + "step": 19610 + }, + { + "epoch": 384.71, + "learning_rate": 0.00012192450413026132, + "loss": 0.4953, + "step": 19620 + }, + { + "epoch": 384.9, + "learning_rate": 0.00012189288294554149, + "loss": 0.4895, + "step": 19630 + }, + { + "epoch": 385.0, + "eval_loss": 0.49638909101486206, + "eval_runtime": 2.3077, + "eval_samples_per_second": 987.57, + "eval_steps_per_second": 3.9, + "step": 19635 + }, + { + "epoch": 385.1, + "learning_rate": 0.00012186124806914566, + "loss": 0.4886, + "step": 19640 + }, + { + "epoch": 385.29, + "learning_rate": 0.00012182959951031048, + "loss": 0.4964, + "step": 19650 + }, + { + "epoch": 385.49, + "learning_rate": 0.00012179793727827667, + "loss": 0.4962, + "step": 19660 + }, + { + "epoch": 385.69, + "learning_rate": 0.00012176626138228886, + "loss": 0.4935, + "step": 19670 + }, + { + "epoch": 385.88, + "learning_rate": 0.0001217345718315957, + "loss": 0.4965, + "step": 19680 + }, + { + "epoch": 386.0, + "eval_loss": 0.5006343722343445, + "eval_runtime": 2.1604, + "eval_samples_per_second": 1054.911, + "eval_steps_per_second": 4.166, + "step": 19686 + }, + { + "epoch": 386.08, + "learning_rate": 0.00012170286863544986, + "loss": 0.4929, + "step": 19690 + }, + { + "epoch": 386.27, + "learning_rate": 0.00012167115180310793, + "loss": 0.495, + "step": 19700 + }, + { + "epoch": 386.47, + "learning_rate": 0.0001216394213438306, + "loss": 0.4947, + "step": 19710 + }, + { + "epoch": 386.67, + "learning_rate": 0.00012160767726688234, + "loss": 0.4911, + "step": 19720 + }, + { + "epoch": 386.86, + "learning_rate": 0.00012157591958153181, + "loss": 0.4896, + "step": 19730 + }, + { + "epoch": 387.0, + "eval_loss": 0.49377307295799255, + "eval_runtime": 2.2645, + "eval_samples_per_second": 1006.384, + "eval_steps_per_second": 3.974, + "step": 19737 + }, + { + "epoch": 387.06, + "learning_rate": 0.00012154414829705148, + "loss": 0.4905, + "step": 19740 + }, + { + "epoch": 387.25, + "learning_rate": 0.00012151236342271788, + "loss": 0.4938, + "step": 19750 + }, + { + "epoch": 387.45, + "learning_rate": 0.0001214805649678115, + "loss": 0.4934, + "step": 19760 + }, + { + "epoch": 387.65, + "learning_rate": 0.00012144875294161676, + "loss": 0.4969, + "step": 19770 + }, + { + "epoch": 387.84, + "learning_rate": 0.00012141692735342209, + "loss": 0.497, + "step": 19780 + }, + { + "epoch": 388.0, + "eval_loss": 0.4956132471561432, + "eval_runtime": 2.1319, + "eval_samples_per_second": 1069.001, + "eval_steps_per_second": 4.222, + "step": 19788 + }, + { + "epoch": 388.04, + "learning_rate": 0.0001213850882125198, + "loss": 0.4898, + "step": 19790 + }, + { + "epoch": 388.24, + "learning_rate": 0.00012135323552820626, + "loss": 0.4936, + "step": 19800 + }, + { + "epoch": 388.43, + "learning_rate": 0.00012132136930978172, + "loss": 0.491, + "step": 19810 + }, + { + "epoch": 388.63, + "learning_rate": 0.00012128948956655038, + "loss": 0.4929, + "step": 19820 + }, + { + "epoch": 388.82, + "learning_rate": 0.00012125759630782047, + "loss": 0.4924, + "step": 19830 + }, + { + "epoch": 389.0, + "eval_loss": 0.4960061311721802, + "eval_runtime": 2.2224, + "eval_samples_per_second": 1025.454, + "eval_steps_per_second": 4.05, + "step": 19839 + }, + { + "epoch": 389.02, + "learning_rate": 0.00012122568954290409, + "loss": 0.4914, + "step": 19840 + }, + { + "epoch": 389.22, + "learning_rate": 0.00012119376928111729, + "loss": 0.4931, + "step": 19850 + }, + { + "epoch": 389.41, + "learning_rate": 0.00012116183553178008, + "loss": 0.4938, + "step": 19860 + }, + { + "epoch": 389.61, + "learning_rate": 0.00012112988830421638, + "loss": 0.4899, + "step": 19870 + }, + { + "epoch": 389.8, + "learning_rate": 0.00012109792760775413, + "loss": 0.4897, + "step": 19880 + }, + { + "epoch": 390.0, + "learning_rate": 0.00012106595345172509, + "loss": 0.4904, + "step": 19890 + }, + { + "epoch": 390.0, + "eval_loss": 0.49724245071411133, + "eval_runtime": 2.2173, + "eval_samples_per_second": 1027.831, + "eval_steps_per_second": 4.059, + "step": 19890 + }, + { + "epoch": 390.2, + "learning_rate": 0.00012103396584546499, + "loss": 0.4905, + "step": 19900 + }, + { + "epoch": 390.39, + "learning_rate": 0.00012100196479831355, + "loss": 0.4975, + "step": 19910 + }, + { + "epoch": 390.59, + "learning_rate": 0.00012096995031961432, + "loss": 0.5014, + "step": 19920 + }, + { + "epoch": 390.78, + "learning_rate": 0.00012093792241871481, + "loss": 0.4979, + "step": 19930 + }, + { + "epoch": 390.98, + "learning_rate": 0.00012090588110496649, + "loss": 0.5, + "step": 19940 + }, + { + "epoch": 391.0, + "eval_loss": 0.4958445131778717, + "eval_runtime": 2.2084, + "eval_samples_per_second": 1031.971, + "eval_steps_per_second": 4.075, + "step": 19941 + }, + { + "epoch": 391.18, + "learning_rate": 0.00012087382638772467, + "loss": 0.4953, + "step": 19950 + }, + { + "epoch": 391.37, + "learning_rate": 0.00012084175827634866, + "loss": 0.4903, + "step": 19960 + }, + { + "epoch": 391.57, + "learning_rate": 0.00012080967678020158, + "loss": 0.4878, + "step": 19970 + }, + { + "epoch": 391.76, + "learning_rate": 0.00012077758190865055, + "loss": 0.491, + "step": 19980 + }, + { + "epoch": 391.96, + "learning_rate": 0.00012074547367106652, + "loss": 0.4961, + "step": 19990 + }, + { + "epoch": 392.0, + "eval_loss": 0.4906347990036011, + "eval_runtime": 2.203, + "eval_samples_per_second": 1034.495, + "eval_steps_per_second": 4.085, + "step": 19992 + }, + { + "epoch": 392.16, + "learning_rate": 0.00012071335207682442, + "loss": 0.4875, + "step": 20000 + }, + { + "epoch": 392.35, + "learning_rate": 0.00012068121713530302, + "loss": 0.4917, + "step": 20010 + }, + { + "epoch": 392.55, + "learning_rate": 0.00012064906885588497, + "loss": 0.4949, + "step": 20020 + }, + { + "epoch": 392.75, + "learning_rate": 0.00012061690724795693, + "loss": 0.492, + "step": 20030 + }, + { + "epoch": 392.94, + "learning_rate": 0.0001205847323209093, + "loss": 0.491, + "step": 20040 + }, + { + "epoch": 393.0, + "eval_loss": 0.49177783727645874, + "eval_runtime": 2.2703, + "eval_samples_per_second": 1003.851, + "eval_steps_per_second": 3.964, + "step": 20043 + }, + { + "epoch": 393.14, + "learning_rate": 0.00012055254408413652, + "loss": 0.4926, + "step": 20050 + }, + { + "epoch": 393.33, + "learning_rate": 0.00012052034254703675, + "loss": 0.4889, + "step": 20060 + }, + { + "epoch": 393.53, + "learning_rate": 0.00012048812771901217, + "loss": 0.4897, + "step": 20070 + }, + { + "epoch": 393.73, + "learning_rate": 0.00012045589960946876, + "loss": 0.4903, + "step": 20080 + }, + { + "epoch": 393.92, + "learning_rate": 0.00012042365822781646, + "loss": 0.4878, + "step": 20090 + }, + { + "epoch": 394.0, + "eval_loss": 0.49539539217948914, + "eval_runtime": 2.1871, + "eval_samples_per_second": 1042.041, + "eval_steps_per_second": 4.115, + "step": 20094 + }, + { + "epoch": 394.12, + "learning_rate": 0.00012039140358346896, + "loss": 0.4904, + "step": 20100 + }, + { + "epoch": 394.31, + "learning_rate": 0.00012035913568584398, + "loss": 0.4898, + "step": 20110 + }, + { + "epoch": 394.51, + "learning_rate": 0.00012032685454436298, + "loss": 0.4889, + "step": 20120 + }, + { + "epoch": 394.71, + "learning_rate": 0.00012029456016845132, + "loss": 0.4908, + "step": 20130 + }, + { + "epoch": 394.9, + "learning_rate": 0.00012026225256753828, + "loss": 0.4881, + "step": 20140 + }, + { + "epoch": 395.0, + "eval_loss": 0.4915597438812256, + "eval_runtime": 2.2004, + "eval_samples_per_second": 1035.724, + "eval_steps_per_second": 4.09, + "step": 20145 + }, + { + "epoch": 395.1, + "learning_rate": 0.00012022993175105693, + "loss": 0.49, + "step": 20150 + }, + { + "epoch": 395.29, + "learning_rate": 0.00012019759772844423, + "loss": 0.489, + "step": 20160 + }, + { + "epoch": 395.49, + "learning_rate": 0.00012016525050914098, + "loss": 0.4957, + "step": 20170 + }, + { + "epoch": 395.69, + "learning_rate": 0.0001201328901025919, + "loss": 0.4914, + "step": 20180 + }, + { + "epoch": 395.88, + "learning_rate": 0.00012010051651824546, + "loss": 0.49, + "step": 20190 + }, + { + "epoch": 396.0, + "eval_loss": 0.4946361482143402, + "eval_runtime": 2.1514, + "eval_samples_per_second": 1059.324, + "eval_steps_per_second": 4.183, + "step": 20196 + }, + { + "epoch": 396.08, + "learning_rate": 0.000120068129765554, + "loss": 0.4896, + "step": 20200 + }, + { + "epoch": 396.27, + "learning_rate": 0.00012003572985397382, + "loss": 0.4901, + "step": 20210 + }, + { + "epoch": 396.47, + "learning_rate": 0.00012000331679296488, + "loss": 0.4894, + "step": 20220 + }, + { + "epoch": 396.67, + "learning_rate": 0.00011997089059199112, + "loss": 0.4914, + "step": 20230 + }, + { + "epoch": 396.86, + "learning_rate": 0.00011993845126052025, + "loss": 0.4881, + "step": 20240 + }, + { + "epoch": 397.0, + "eval_loss": 0.49236392974853516, + "eval_runtime": 2.1606, + "eval_samples_per_second": 1054.789, + "eval_steps_per_second": 4.165, + "step": 20247 + }, + { + "epoch": 397.06, + "learning_rate": 0.00011990599880802382, + "loss": 0.4938, + "step": 20250 + }, + { + "epoch": 397.25, + "learning_rate": 0.00011987353324397729, + "loss": 0.4934, + "step": 20260 + }, + { + "epoch": 397.45, + "learning_rate": 0.0001198410545778598, + "loss": 0.4952, + "step": 20270 + }, + { + "epoch": 397.65, + "learning_rate": 0.00011980856281915442, + "loss": 0.4858, + "step": 20280 + }, + { + "epoch": 397.84, + "learning_rate": 0.00011977605797734803, + "loss": 0.4871, + "step": 20290 + }, + { + "epoch": 398.0, + "eval_loss": 0.49587199091911316, + "eval_runtime": 2.305, + "eval_samples_per_second": 988.741, + "eval_steps_per_second": 3.905, + "step": 20298 + }, + { + "epoch": 398.04, + "learning_rate": 0.00011974354006193131, + "loss": 0.4878, + "step": 20300 + }, + { + "epoch": 398.24, + "learning_rate": 0.00011971100908239877, + "loss": 0.4952, + "step": 20310 + }, + { + "epoch": 398.43, + "learning_rate": 0.00011967846504824876, + "loss": 0.4868, + "step": 20320 + }, + { + "epoch": 398.63, + "learning_rate": 0.00011964590796898333, + "loss": 0.4938, + "step": 20330 + }, + { + "epoch": 398.82, + "learning_rate": 0.00011961333785410852, + "loss": 0.492, + "step": 20340 + }, + { + "epoch": 399.0, + "eval_loss": 0.48673364520072937, + "eval_runtime": 2.1177, + "eval_samples_per_second": 1076.163, + "eval_steps_per_second": 4.25, + "step": 20349 + }, + { + "epoch": 399.02, + "learning_rate": 0.00011958075471313399, + "loss": 0.4877, + "step": 20350 + }, + { + "epoch": 399.22, + "learning_rate": 0.00011954815855557338, + "loss": 0.4819, + "step": 20360 + }, + { + "epoch": 399.41, + "learning_rate": 0.00011951554939094395, + "loss": 0.4908, + "step": 20370 + }, + { + "epoch": 399.61, + "learning_rate": 0.0001194829272287669, + "loss": 0.4934, + "step": 20380 + }, + { + "epoch": 399.8, + "learning_rate": 0.00011945029207856717, + "loss": 0.4891, + "step": 20390 + }, + { + "epoch": 400.0, + "learning_rate": 0.0001194176439498735, + "loss": 0.4883, + "step": 20400 + }, + { + "epoch": 400.0, + "eval_loss": 0.4891131818294525, + "eval_runtime": 2.2177, + "eval_samples_per_second": 1027.662, + "eval_steps_per_second": 4.058, + "step": 20400 + }, + { + "epoch": 400.2, + "learning_rate": 0.0001193849828522184, + "loss": 0.49, + "step": 20410 + }, + { + "epoch": 400.39, + "learning_rate": 0.00011935230879513817, + "loss": 0.4854, + "step": 20420 + }, + { + "epoch": 400.59, + "learning_rate": 0.00011931962178817295, + "loss": 0.4883, + "step": 20430 + }, + { + "epoch": 400.78, + "learning_rate": 0.00011928692184086658, + "loss": 0.487, + "step": 20440 + }, + { + "epoch": 400.98, + "learning_rate": 0.00011925420896276673, + "loss": 0.4864, + "step": 20450 + }, + { + "epoch": 401.0, + "eval_loss": 0.49455273151397705, + "eval_runtime": 2.1545, + "eval_samples_per_second": 1057.788, + "eval_steps_per_second": 4.177, + "step": 20451 + }, + { + "epoch": 401.18, + "learning_rate": 0.00011922148316342483, + "loss": 0.4938, + "step": 20460 + }, + { + "epoch": 401.37, + "learning_rate": 0.00011918874445239606, + "loss": 0.4971, + "step": 20470 + }, + { + "epoch": 401.57, + "learning_rate": 0.00011915599283923944, + "loss": 0.4905, + "step": 20480 + }, + { + "epoch": 401.76, + "learning_rate": 0.00011912322833351768, + "loss": 0.489, + "step": 20490 + }, + { + "epoch": 401.96, + "learning_rate": 0.00011909045094479726, + "loss": 0.4898, + "step": 20500 + }, + { + "epoch": 402.0, + "eval_loss": 0.49220773577690125, + "eval_runtime": 2.1981, + "eval_samples_per_second": 1036.809, + "eval_steps_per_second": 4.094, + "step": 20502 + }, + { + "epoch": 402.16, + "learning_rate": 0.0001190576606826485, + "loss": 0.4885, + "step": 20510 + }, + { + "epoch": 402.35, + "learning_rate": 0.00011902485755664542, + "loss": 0.489, + "step": 20520 + }, + { + "epoch": 402.55, + "learning_rate": 0.00011899204157636577, + "loss": 0.4877, + "step": 20530 + }, + { + "epoch": 402.75, + "learning_rate": 0.00011895921275139109, + "loss": 0.4857, + "step": 20540 + }, + { + "epoch": 402.94, + "learning_rate": 0.00011892637109130667, + "loss": 0.4841, + "step": 20550 + }, + { + "epoch": 403.0, + "eval_loss": 0.49017834663391113, + "eval_runtime": 2.2994, + "eval_samples_per_second": 991.128, + "eval_steps_per_second": 3.914, + "step": 20553 + }, + { + "epoch": 403.14, + "learning_rate": 0.00011889351660570155, + "loss": 0.4864, + "step": 20560 + }, + { + "epoch": 403.33, + "learning_rate": 0.00011886064930416852, + "loss": 0.487, + "step": 20570 + }, + { + "epoch": 403.53, + "learning_rate": 0.00011882776919630406, + "loss": 0.4857, + "step": 20580 + }, + { + "epoch": 403.73, + "learning_rate": 0.00011879487629170845, + "loss": 0.4874, + "step": 20590 + }, + { + "epoch": 403.92, + "learning_rate": 0.00011876197059998569, + "loss": 0.4879, + "step": 20600 + }, + { + "epoch": 404.0, + "eval_loss": 0.49214035272598267, + "eval_runtime": 2.1629, + "eval_samples_per_second": 1053.659, + "eval_steps_per_second": 4.161, + "step": 20604 + }, + { + "epoch": 404.12, + "learning_rate": 0.00011872905213074348, + "loss": 0.4914, + "step": 20610 + }, + { + "epoch": 404.31, + "learning_rate": 0.00011869612089359333, + "loss": 0.4845, + "step": 20620 + }, + { + "epoch": 404.51, + "learning_rate": 0.0001186631768981504, + "loss": 0.4848, + "step": 20630 + }, + { + "epoch": 404.71, + "learning_rate": 0.00011863022015403356, + "loss": 0.4905, + "step": 20640 + }, + { + "epoch": 404.9, + "learning_rate": 0.00011859725067086551, + "loss": 0.4801, + "step": 20650 + }, + { + "epoch": 405.0, + "eval_loss": 0.49142250418663025, + "eval_runtime": 2.2732, + "eval_samples_per_second": 1002.561, + "eval_steps_per_second": 3.959, + "step": 20655 + }, + { + "epoch": 405.1, + "learning_rate": 0.00011856426845827259, + "loss": 0.4882, + "step": 20660 + }, + { + "epoch": 405.29, + "learning_rate": 0.00011853127352588484, + "loss": 0.4859, + "step": 20670 + }, + { + "epoch": 405.49, + "learning_rate": 0.00011849826588333606, + "loss": 0.4866, + "step": 20680 + }, + { + "epoch": 405.69, + "learning_rate": 0.00011846524554026375, + "loss": 0.4828, + "step": 20690 + }, + { + "epoch": 405.88, + "learning_rate": 0.00011843221250630909, + "loss": 0.4877, + "step": 20700 + }, + { + "epoch": 406.0, + "eval_loss": 0.48823556303977966, + "eval_runtime": 2.2635, + "eval_samples_per_second": 1006.835, + "eval_steps_per_second": 3.976, + "step": 20706 + }, + { + "epoch": 406.08, + "learning_rate": 0.00011839916679111705, + "loss": 0.4797, + "step": 20710 + }, + { + "epoch": 406.27, + "learning_rate": 0.00011836610840433619, + "loss": 0.4864, + "step": 20720 + }, + { + "epoch": 406.47, + "learning_rate": 0.00011833303735561884, + "loss": 0.487, + "step": 20730 + }, + { + "epoch": 406.67, + "learning_rate": 0.00011829995365462098, + "loss": 0.4845, + "step": 20740 + }, + { + "epoch": 406.86, + "learning_rate": 0.00011826685731100235, + "loss": 0.4858, + "step": 20750 + }, + { + "epoch": 407.0, + "eval_loss": 0.48820948600769043, + "eval_runtime": 2.3072, + "eval_samples_per_second": 987.76, + "eval_steps_per_second": 3.901, + "step": 20757 + }, + { + "epoch": 407.06, + "learning_rate": 0.00011823374833442632, + "loss": 0.4859, + "step": 20760 + }, + { + "epoch": 407.25, + "learning_rate": 0.00011820062673455999, + "loss": 0.4768, + "step": 20770 + }, + { + "epoch": 407.45, + "learning_rate": 0.00011816749252107412, + "loss": 0.4855, + "step": 20780 + }, + { + "epoch": 407.65, + "learning_rate": 0.00011813434570364315, + "loss": 0.4875, + "step": 20790 + }, + { + "epoch": 407.84, + "learning_rate": 0.00011810118629194525, + "loss": 0.4856, + "step": 20800 + }, + { + "epoch": 408.0, + "eval_loss": 0.48716312646865845, + "eval_runtime": 2.1662, + "eval_samples_per_second": 1052.052, + "eval_steps_per_second": 4.155, + "step": 20808 + }, + { + "epoch": 408.04, + "learning_rate": 0.00011806801429566218, + "loss": 0.4819, + "step": 20810 + }, + { + "epoch": 408.24, + "learning_rate": 0.00011803482972447946, + "loss": 0.4848, + "step": 20820 + }, + { + "epoch": 408.43, + "learning_rate": 0.00011800163258808624, + "loss": 0.4851, + "step": 20830 + }, + { + "epoch": 408.63, + "learning_rate": 0.00011796842289617532, + "loss": 0.4844, + "step": 20840 + }, + { + "epoch": 408.82, + "learning_rate": 0.00011793520065844319, + "loss": 0.4825, + "step": 20850 + }, + { + "epoch": 409.0, + "eval_loss": 0.4870782792568207, + "eval_runtime": 2.1607, + "eval_samples_per_second": 1054.746, + "eval_steps_per_second": 4.165, + "step": 20859 + }, + { + "epoch": 409.02, + "learning_rate": 0.00011790196588459002, + "loss": 0.4766, + "step": 20860 + }, + { + "epoch": 409.22, + "learning_rate": 0.00011786871858431966, + "loss": 0.4798, + "step": 20870 + }, + { + "epoch": 409.41, + "learning_rate": 0.00011783545876733949, + "loss": 0.4842, + "step": 20880 + }, + { + "epoch": 409.61, + "learning_rate": 0.0001178021864433607, + "loss": 0.483, + "step": 20890 + }, + { + "epoch": 409.8, + "learning_rate": 0.00011776890162209804, + "loss": 0.4919, + "step": 20900 + }, + { + "epoch": 410.0, + "learning_rate": 0.00011773560431326995, + "loss": 0.4865, + "step": 20910 + }, + { + "epoch": 410.0, + "eval_loss": 0.4852657616138458, + "eval_runtime": 2.2662, + "eval_samples_per_second": 1005.667, + "eval_steps_per_second": 3.971, + "step": 20910 + }, + { + "epoch": 410.2, + "learning_rate": 0.00011770229452659851, + "loss": 0.4851, + "step": 20920 + }, + { + "epoch": 410.39, + "learning_rate": 0.00011766897227180941, + "loss": 0.4838, + "step": 20930 + }, + { + "epoch": 410.59, + "learning_rate": 0.00011763563755863201, + "loss": 0.485, + "step": 20940 + }, + { + "epoch": 410.78, + "learning_rate": 0.00011760229039679933, + "loss": 0.4842, + "step": 20950 + }, + { + "epoch": 410.98, + "learning_rate": 0.00011756893079604795, + "loss": 0.4834, + "step": 20960 + }, + { + "epoch": 411.0, + "eval_loss": 0.4907666742801666, + "eval_runtime": 2.2355, + "eval_samples_per_second": 1019.481, + "eval_steps_per_second": 4.026, + "step": 20961 + }, + { + "epoch": 411.18, + "learning_rate": 0.00011753555876611818, + "loss": 0.4817, + "step": 20970 + }, + { + "epoch": 411.37, + "learning_rate": 0.00011750217431675389, + "loss": 0.486, + "step": 20980 + }, + { + "epoch": 411.57, + "learning_rate": 0.00011746877745770258, + "loss": 0.4862, + "step": 20990 + }, + { + "epoch": 411.76, + "learning_rate": 0.00011743536819871539, + "loss": 0.4838, + "step": 21000 + }, + { + "epoch": 411.96, + "learning_rate": 0.0001174019465495471, + "loss": 0.4815, + "step": 21010 + }, + { + "epoch": 412.0, + "eval_loss": 0.4847215414047241, + "eval_runtime": 2.1743, + "eval_samples_per_second": 1048.158, + "eval_steps_per_second": 4.139, + "step": 21012 + }, + { + "epoch": 412.16, + "learning_rate": 0.00011736851251995606, + "loss": 0.4849, + "step": 21020 + }, + { + "epoch": 412.35, + "learning_rate": 0.00011733506611970429, + "loss": 0.4762, + "step": 21030 + }, + { + "epoch": 412.55, + "learning_rate": 0.00011730160735855736, + "loss": 0.4838, + "step": 21040 + }, + { + "epoch": 412.75, + "learning_rate": 0.00011726813624628451, + "loss": 0.48, + "step": 21050 + }, + { + "epoch": 412.94, + "learning_rate": 0.00011723465279265853, + "loss": 0.4828, + "step": 21060 + }, + { + "epoch": 413.0, + "eval_loss": 0.49191340804100037, + "eval_runtime": 2.3099, + "eval_samples_per_second": 986.634, + "eval_steps_per_second": 3.896, + "step": 21063 + }, + { + "epoch": 413.14, + "learning_rate": 0.00011720115700745588, + "loss": 0.4873, + "step": 21070 + }, + { + "epoch": 413.33, + "learning_rate": 0.00011716764890045656, + "loss": 0.486, + "step": 21080 + }, + { + "epoch": 413.53, + "learning_rate": 0.00011713412848144419, + "loss": 0.4863, + "step": 21090 + }, + { + "epoch": 413.73, + "learning_rate": 0.00011710059576020595, + "loss": 0.4863, + "step": 21100 + }, + { + "epoch": 413.92, + "learning_rate": 0.00011706705074653273, + "loss": 0.487, + "step": 21110 + }, + { + "epoch": 414.0, + "eval_loss": 0.4898955821990967, + "eval_runtime": 2.2311, + "eval_samples_per_second": 1021.471, + "eval_steps_per_second": 4.034, + "step": 21114 + }, + { + "epoch": 414.12, + "learning_rate": 0.00011703349345021887, + "loss": 0.4859, + "step": 21120 + }, + { + "epoch": 414.31, + "learning_rate": 0.00011699992388106235, + "loss": 0.4833, + "step": 21130 + }, + { + "epoch": 414.51, + "learning_rate": 0.00011696634204886474, + "loss": 0.4849, + "step": 21140 + }, + { + "epoch": 414.71, + "learning_rate": 0.00011693274796343119, + "loss": 0.4819, + "step": 21150 + }, + { + "epoch": 414.9, + "learning_rate": 0.00011689914163457044, + "loss": 0.4842, + "step": 21160 + }, + { + "epoch": 415.0, + "eval_loss": 0.48760807514190674, + "eval_runtime": 2.207, + "eval_samples_per_second": 1032.62, + "eval_steps_per_second": 4.078, + "step": 21165 + }, + { + "epoch": 415.1, + "learning_rate": 0.00011686552307209477, + "loss": 0.4853, + "step": 21170 + }, + { + "epoch": 415.29, + "learning_rate": 0.00011683189228582005, + "loss": 0.4834, + "step": 21180 + }, + { + "epoch": 415.49, + "learning_rate": 0.00011679824928556573, + "loss": 0.4834, + "step": 21190 + }, + { + "epoch": 415.69, + "learning_rate": 0.00011676459408115479, + "loss": 0.4787, + "step": 21200 + }, + { + "epoch": 415.88, + "learning_rate": 0.00011673092668241384, + "loss": 0.4902, + "step": 21210 + }, + { + "epoch": 416.0, + "eval_loss": 0.48727309703826904, + "eval_runtime": 2.1481, + "eval_samples_per_second": 1060.942, + "eval_steps_per_second": 4.19, + "step": 21216 + }, + { + "epoch": 416.08, + "learning_rate": 0.000116697247099173, + "loss": 0.4826, + "step": 21220 + }, + { + "epoch": 416.27, + "learning_rate": 0.00011666355534126592, + "loss": 0.4854, + "step": 21230 + }, + { + "epoch": 416.47, + "learning_rate": 0.00011662985141852987, + "loss": 0.4849, + "step": 21240 + }, + { + "epoch": 416.67, + "learning_rate": 0.00011659613534080564, + "loss": 0.4814, + "step": 21250 + }, + { + "epoch": 416.86, + "learning_rate": 0.00011656240711793759, + "loss": 0.4809, + "step": 21260 + }, + { + "epoch": 417.0, + "eval_loss": 0.49133971333503723, + "eval_runtime": 2.1325, + "eval_samples_per_second": 1068.686, + "eval_steps_per_second": 4.22, + "step": 21267 + }, + { + "epoch": 417.06, + "learning_rate": 0.0001165286667597736, + "loss": 0.4785, + "step": 21270 + }, + { + "epoch": 417.25, + "learning_rate": 0.00011649491427616508, + "loss": 0.4871, + "step": 21280 + }, + { + "epoch": 417.45, + "learning_rate": 0.00011646114967696701, + "loss": 0.4778, + "step": 21290 + }, + { + "epoch": 417.65, + "learning_rate": 0.00011642737297203793, + "loss": 0.4808, + "step": 21300 + }, + { + "epoch": 417.84, + "learning_rate": 0.00011639358417123985, + "loss": 0.4825, + "step": 21310 + }, + { + "epoch": 418.0, + "eval_loss": 0.48316019773483276, + "eval_runtime": 2.1954, + "eval_samples_per_second": 1038.08, + "eval_steps_per_second": 4.099, + "step": 21318 + }, + { + "epoch": 418.04, + "learning_rate": 0.00011635978328443837, + "loss": 0.4785, + "step": 21320 + }, + { + "epoch": 418.24, + "learning_rate": 0.00011632597032150254, + "loss": 0.4855, + "step": 21330 + }, + { + "epoch": 418.43, + "learning_rate": 0.0001162921452923051, + "loss": 0.4795, + "step": 21340 + }, + { + "epoch": 418.63, + "learning_rate": 0.0001162583082067221, + "loss": 0.4841, + "step": 21350 + }, + { + "epoch": 418.82, + "learning_rate": 0.00011622445907463325, + "loss": 0.4797, + "step": 21360 + }, + { + "epoch": 419.0, + "eval_loss": 0.4872037172317505, + "eval_runtime": 2.1935, + "eval_samples_per_second": 1038.999, + "eval_steps_per_second": 4.103, + "step": 21369 + }, + { + "epoch": 419.02, + "learning_rate": 0.00011619059790592175, + "loss": 0.483, + "step": 21370 + }, + { + "epoch": 419.22, + "learning_rate": 0.0001161567247104743, + "loss": 0.4835, + "step": 21380 + }, + { + "epoch": 419.41, + "learning_rate": 0.00011612283949818115, + "loss": 0.4799, + "step": 21390 + }, + { + "epoch": 419.61, + "learning_rate": 0.00011608894227893595, + "loss": 0.4868, + "step": 21400 + }, + { + "epoch": 419.8, + "learning_rate": 0.00011605503306263599, + "loss": 0.4777, + "step": 21410 + }, + { + "epoch": 420.0, + "learning_rate": 0.00011602111185918203, + "loss": 0.4852, + "step": 21420 + }, + { + "epoch": 420.0, + "eval_loss": 0.4868069887161255, + "eval_runtime": 2.1839, + "eval_samples_per_second": 1043.561, + "eval_steps_per_second": 4.121, + "step": 21420 + }, + { + "epoch": 420.2, + "learning_rate": 0.00011598717867847822, + "loss": 0.4811, + "step": 21430 + }, + { + "epoch": 420.39, + "learning_rate": 0.00011595323353043236, + "loss": 0.4864, + "step": 21440 + }, + { + "epoch": 420.59, + "learning_rate": 0.00011591927642495564, + "loss": 0.4823, + "step": 21450 + }, + { + "epoch": 420.78, + "learning_rate": 0.0001158853073719628, + "loss": 0.4804, + "step": 21460 + }, + { + "epoch": 420.98, + "learning_rate": 0.00011585132638137203, + "loss": 0.4879, + "step": 21470 + }, + { + "epoch": 421.0, + "eval_loss": 0.48333823680877686, + "eval_runtime": 2.2861, + "eval_samples_per_second": 996.882, + "eval_steps_per_second": 3.937, + "step": 21471 + }, + { + "epoch": 421.18, + "learning_rate": 0.00011581733346310504, + "loss": 0.4874, + "step": 21480 + }, + { + "epoch": 421.37, + "learning_rate": 0.000115783328627087, + "loss": 0.4826, + "step": 21490 + }, + { + "epoch": 421.57, + "learning_rate": 0.00011574931188324656, + "loss": 0.4757, + "step": 21500 + }, + { + "epoch": 421.76, + "learning_rate": 0.00011571528324151581, + "loss": 0.481, + "step": 21510 + }, + { + "epoch": 421.96, + "learning_rate": 0.00011568124271183042, + "loss": 0.4823, + "step": 21520 + }, + { + "epoch": 422.0, + "eval_loss": 0.4823528826236725, + "eval_runtime": 2.3957, + "eval_samples_per_second": 951.291, + "eval_steps_per_second": 3.757, + "step": 21522 + }, + { + "epoch": 422.16, + "learning_rate": 0.00011564719030412944, + "loss": 0.4827, + "step": 21530 + }, + { + "epoch": 422.35, + "learning_rate": 0.00011561312602835541, + "loss": 0.4851, + "step": 21540 + }, + { + "epoch": 422.55, + "learning_rate": 0.00011557904989445434, + "loss": 0.4831, + "step": 21550 + }, + { + "epoch": 422.75, + "learning_rate": 0.0001155449619123757, + "loss": 0.478, + "step": 21560 + }, + { + "epoch": 422.94, + "learning_rate": 0.00011551086209207242, + "loss": 0.4729, + "step": 21570 + }, + { + "epoch": 423.0, + "eval_loss": 0.47928565740585327, + "eval_runtime": 2.2457, + "eval_samples_per_second": 1014.839, + "eval_steps_per_second": 4.008, + "step": 21573 + }, + { + "epoch": 423.14, + "learning_rate": 0.0001154767504435009, + "loss": 0.4828, + "step": 21580 + }, + { + "epoch": 423.33, + "learning_rate": 0.00011544262697662093, + "loss": 0.4811, + "step": 21590 + }, + { + "epoch": 423.53, + "learning_rate": 0.00011540849170139588, + "loss": 0.4806, + "step": 21600 + }, + { + "epoch": 423.73, + "learning_rate": 0.00011537434462779246, + "loss": 0.4799, + "step": 21610 + }, + { + "epoch": 423.92, + "learning_rate": 0.00011534018576578084, + "loss": 0.4825, + "step": 21620 + }, + { + "epoch": 424.0, + "eval_loss": 0.4812348186969757, + "eval_runtime": 2.2984, + "eval_samples_per_second": 991.549, + "eval_steps_per_second": 3.916, + "step": 21624 + }, + { + "epoch": 424.12, + "learning_rate": 0.00011530601512533462, + "loss": 0.4796, + "step": 21630 + }, + { + "epoch": 424.31, + "learning_rate": 0.00011527183271643091, + "loss": 0.4806, + "step": 21640 + }, + { + "epoch": 424.51, + "learning_rate": 0.0001152376385490502, + "loss": 0.4799, + "step": 21650 + }, + { + "epoch": 424.71, + "learning_rate": 0.00011520343263317641, + "loss": 0.4829, + "step": 21660 + }, + { + "epoch": 424.9, + "learning_rate": 0.00011516921497879693, + "loss": 0.4739, + "step": 21670 + }, + { + "epoch": 425.0, + "eval_loss": 0.4831399917602539, + "eval_runtime": 2.2851, + "eval_samples_per_second": 997.312, + "eval_steps_per_second": 3.938, + "step": 21675 + }, + { + "epoch": 425.1, + "learning_rate": 0.00011513498559590251, + "loss": 0.4753, + "step": 21680 + }, + { + "epoch": 425.29, + "learning_rate": 0.00011510074449448743, + "loss": 0.4875, + "step": 21690 + }, + { + "epoch": 425.49, + "learning_rate": 0.00011506649168454926, + "loss": 0.4773, + "step": 21700 + }, + { + "epoch": 425.69, + "learning_rate": 0.0001150322271760891, + "loss": 0.478, + "step": 21710 + }, + { + "epoch": 425.88, + "learning_rate": 0.00011499795097911141, + "loss": 0.4767, + "step": 21720 + }, + { + "epoch": 426.0, + "eval_loss": 0.4847799241542816, + "eval_runtime": 2.3019, + "eval_samples_per_second": 990.034, + "eval_steps_per_second": 3.91, + "step": 21726 + }, + { + "epoch": 426.08, + "learning_rate": 0.00011496366310362408, + "loss": 0.4833, + "step": 21730 + }, + { + "epoch": 426.27, + "learning_rate": 0.00011492936355963839, + "loss": 0.4844, + "step": 21740 + }, + { + "epoch": 426.47, + "learning_rate": 0.00011489505235716906, + "loss": 0.4805, + "step": 21750 + }, + { + "epoch": 426.67, + "learning_rate": 0.00011486072950623418, + "loss": 0.4808, + "step": 21760 + }, + { + "epoch": 426.86, + "learning_rate": 0.00011482639501685529, + "loss": 0.4806, + "step": 21770 + }, + { + "epoch": 427.0, + "eval_loss": 0.48580941557884216, + "eval_runtime": 2.3158, + "eval_samples_per_second": 984.13, + "eval_steps_per_second": 3.886, + "step": 21777 + }, + { + "epoch": 427.06, + "learning_rate": 0.00011479204889905722, + "loss": 0.4783, + "step": 21780 + }, + { + "epoch": 427.25, + "learning_rate": 0.00011475769116286837, + "loss": 0.4786, + "step": 21790 + }, + { + "epoch": 427.45, + "learning_rate": 0.00011472332181832034, + "loss": 0.4772, + "step": 21800 + }, + { + "epoch": 427.65, + "learning_rate": 0.00011468894087544828, + "loss": 0.4754, + "step": 21810 + }, + { + "epoch": 427.84, + "learning_rate": 0.00011465454834429066, + "loss": 0.4736, + "step": 21820 + }, + { + "epoch": 428.0, + "eval_loss": 0.48313280940055847, + "eval_runtime": 2.1586, + "eval_samples_per_second": 1055.778, + "eval_steps_per_second": 4.169, + "step": 21828 + }, + { + "epoch": 428.04, + "learning_rate": 0.00011462014423488926, + "loss": 0.4773, + "step": 21830 + }, + { + "epoch": 428.24, + "learning_rate": 0.00011458572855728937, + "loss": 0.4773, + "step": 21840 + }, + { + "epoch": 428.43, + "learning_rate": 0.00011455130132153959, + "loss": 0.4773, + "step": 21850 + }, + { + "epoch": 428.63, + "learning_rate": 0.00011451686253769192, + "loss": 0.4753, + "step": 21860 + }, + { + "epoch": 428.82, + "learning_rate": 0.00011448241221580167, + "loss": 0.4857, + "step": 21870 + }, + { + "epoch": 429.0, + "eval_loss": 0.4785289764404297, + "eval_runtime": 2.2717, + "eval_samples_per_second": 1003.196, + "eval_steps_per_second": 3.962, + "step": 21879 + }, + { + "epoch": 429.02, + "learning_rate": 0.00011444795036592761, + "loss": 0.4738, + "step": 21880 + }, + { + "epoch": 429.22, + "learning_rate": 0.00011441347699813185, + "loss": 0.4752, + "step": 21890 + }, + { + "epoch": 429.41, + "learning_rate": 0.00011437899212247977, + "loss": 0.4768, + "step": 21900 + }, + { + "epoch": 429.61, + "learning_rate": 0.00011434449574904024, + "loss": 0.4774, + "step": 21910 + }, + { + "epoch": 429.8, + "learning_rate": 0.0001143099878878854, + "loss": 0.4768, + "step": 21920 + }, + { + "epoch": 430.0, + "learning_rate": 0.00011427546854909084, + "loss": 0.4819, + "step": 21930 + }, + { + "epoch": 430.0, + "eval_loss": 0.48050355911254883, + "eval_runtime": 2.1897, + "eval_samples_per_second": 1040.776, + "eval_steps_per_second": 4.11, + "step": 21930 + }, + { + "epoch": 430.2, + "learning_rate": 0.00011424093774273535, + "loss": 0.477, + "step": 21940 + }, + { + "epoch": 430.39, + "learning_rate": 0.00011420639547890122, + "loss": 0.4774, + "step": 21950 + }, + { + "epoch": 430.59, + "learning_rate": 0.000114171841767674, + "loss": 0.4751, + "step": 21960 + }, + { + "epoch": 430.78, + "learning_rate": 0.00011413727661914259, + "loss": 0.4773, + "step": 21970 + }, + { + "epoch": 430.98, + "learning_rate": 0.00011410270004339924, + "loss": 0.4767, + "step": 21980 + }, + { + "epoch": 431.0, + "eval_loss": 0.48454973101615906, + "eval_runtime": 2.2681, + "eval_samples_per_second": 1004.809, + "eval_steps_per_second": 3.968, + "step": 21981 + }, + { + "epoch": 431.18, + "learning_rate": 0.00011406811205053956, + "loss": 0.4798, + "step": 21990 + }, + { + "epoch": 431.37, + "learning_rate": 0.00011403351265066249, + "loss": 0.4791, + "step": 22000 + }, + { + "epoch": 431.57, + "learning_rate": 0.00011399890185387023, + "loss": 0.4769, + "step": 22010 + }, + { + "epoch": 431.76, + "learning_rate": 0.0001139642796702684, + "loss": 0.4805, + "step": 22020 + }, + { + "epoch": 431.96, + "learning_rate": 0.0001139296461099659, + "loss": 0.4765, + "step": 22030 + }, + { + "epoch": 432.0, + "eval_loss": 0.4803260564804077, + "eval_runtime": 2.1961, + "eval_samples_per_second": 1037.757, + "eval_steps_per_second": 4.098, + "step": 22032 + }, + { + "epoch": 432.16, + "learning_rate": 0.00011389500118307494, + "loss": 0.4787, + "step": 22040 + }, + { + "epoch": 432.35, + "learning_rate": 0.00011386034489971108, + "loss": 0.473, + "step": 22050 + }, + { + "epoch": 432.55, + "learning_rate": 0.00011382567726999318, + "loss": 0.4783, + "step": 22060 + }, + { + "epoch": 432.75, + "learning_rate": 0.00011379099830404341, + "loss": 0.4726, + "step": 22070 + }, + { + "epoch": 432.94, + "learning_rate": 0.00011375630801198725, + "loss": 0.4785, + "step": 22080 + }, + { + "epoch": 433.0, + "eval_loss": 0.4825577139854431, + "eval_runtime": 2.2219, + "eval_samples_per_second": 1025.707, + "eval_steps_per_second": 4.051, + "step": 22083 + }, + { + "epoch": 433.14, + "learning_rate": 0.00011372160640395352, + "loss": 0.4733, + "step": 22090 + }, + { + "epoch": 433.33, + "learning_rate": 0.0001136868934900743, + "loss": 0.4737, + "step": 22100 + }, + { + "epoch": 433.53, + "learning_rate": 0.00011365216928048498, + "loss": 0.4766, + "step": 22110 + }, + { + "epoch": 433.73, + "learning_rate": 0.00011361743378532422, + "loss": 0.4767, + "step": 22120 + }, + { + "epoch": 433.92, + "learning_rate": 0.00011358268701473408, + "loss": 0.4758, + "step": 22130 + }, + { + "epoch": 434.0, + "eval_loss": 0.48143434524536133, + "eval_runtime": 2.319, + "eval_samples_per_second": 982.761, + "eval_steps_per_second": 3.881, + "step": 22134 + }, + { + "epoch": 434.12, + "learning_rate": 0.00011354792897885981, + "loss": 0.4729, + "step": 22140 + }, + { + "epoch": 434.31, + "learning_rate": 0.00011351315968784996, + "loss": 0.4798, + "step": 22150 + }, + { + "epoch": 434.51, + "learning_rate": 0.00011347837915185645, + "loss": 0.4768, + "step": 22160 + }, + { + "epoch": 434.71, + "learning_rate": 0.00011344358738103432, + "loss": 0.4777, + "step": 22170 + }, + { + "epoch": 434.9, + "learning_rate": 0.0001134087843855421, + "loss": 0.4677, + "step": 22180 + }, + { + "epoch": 435.0, + "eval_loss": 0.4814690947532654, + "eval_runtime": 2.3045, + "eval_samples_per_second": 988.926, + "eval_steps_per_second": 3.905, + "step": 22185 + }, + { + "epoch": 435.1, + "learning_rate": 0.00011337397017554141, + "loss": 0.4754, + "step": 22190 + }, + { + "epoch": 435.29, + "learning_rate": 0.00011333914476119726, + "loss": 0.4744, + "step": 22200 + }, + { + "epoch": 435.49, + "learning_rate": 0.00011330430815267787, + "loss": 0.4691, + "step": 22210 + }, + { + "epoch": 435.69, + "learning_rate": 0.0001132694603601548, + "loss": 0.4759, + "step": 22220 + }, + { + "epoch": 435.88, + "learning_rate": 0.00011323460139380279, + "loss": 0.4735, + "step": 22230 + }, + { + "epoch": 436.0, + "eval_loss": 0.48106876015663147, + "eval_runtime": 2.1513, + "eval_samples_per_second": 1059.348, + "eval_steps_per_second": 4.183, + "step": 22236 + }, + { + "epoch": 436.08, + "learning_rate": 0.00011319973126379986, + "loss": 0.4761, + "step": 22240 + }, + { + "epoch": 436.27, + "learning_rate": 0.00011316484998032736, + "loss": 0.4761, + "step": 22250 + }, + { + "epoch": 436.47, + "learning_rate": 0.00011312995755356982, + "loss": 0.476, + "step": 22260 + }, + { + "epoch": 436.67, + "learning_rate": 0.00011309505399371506, + "loss": 0.4783, + "step": 22270 + }, + { + "epoch": 436.86, + "learning_rate": 0.00011306013931095412, + "loss": 0.4764, + "step": 22280 + }, + { + "epoch": 437.0, + "eval_loss": 0.47487953305244446, + "eval_runtime": 2.3105, + "eval_samples_per_second": 986.353, + "eval_steps_per_second": 3.895, + "step": 22287 + }, + { + "epoch": 437.06, + "learning_rate": 0.00011302521351548133, + "loss": 0.4741, + "step": 22290 + }, + { + "epoch": 437.25, + "learning_rate": 0.00011299027661749425, + "loss": 0.4758, + "step": 22300 + }, + { + "epoch": 437.45, + "learning_rate": 0.00011295532862719366, + "loss": 0.4735, + "step": 22310 + }, + { + "epoch": 437.65, + "learning_rate": 0.00011292036955478361, + "loss": 0.4778, + "step": 22320 + }, + { + "epoch": 437.84, + "learning_rate": 0.0001128853994104713, + "loss": 0.4743, + "step": 22330 + }, + { + "epoch": 438.0, + "eval_loss": 0.4845726490020752, + "eval_runtime": 2.2198, + "eval_samples_per_second": 1026.683, + "eval_steps_per_second": 4.054, + "step": 22338 + }, + { + "epoch": 438.04, + "learning_rate": 0.00011285041820446735, + "loss": 0.4728, + "step": 22340 + }, + { + "epoch": 438.24, + "learning_rate": 0.0001128154259469854, + "loss": 0.479, + "step": 22350 + }, + { + "epoch": 438.43, + "learning_rate": 0.00011278042264824247, + "loss": 0.4758, + "step": 22360 + }, + { + "epoch": 438.63, + "learning_rate": 0.0001127454083184587, + "loss": 0.4799, + "step": 22370 + }, + { + "epoch": 438.82, + "learning_rate": 0.00011271038296785748, + "loss": 0.4736, + "step": 22380 + }, + { + "epoch": 439.0, + "eval_loss": 0.4824729263782501, + "eval_runtime": 2.1493, + "eval_samples_per_second": 1060.367, + "eval_steps_per_second": 4.187, + "step": 22389 + }, + { + "epoch": 439.02, + "learning_rate": 0.00011267534660666548, + "loss": 0.474, + "step": 22390 + }, + { + "epoch": 439.22, + "learning_rate": 0.0001126402992451125, + "loss": 0.4794, + "step": 22400 + }, + { + "epoch": 439.41, + "learning_rate": 0.0001126052408934316, + "loss": 0.4767, + "step": 22410 + }, + { + "epoch": 439.61, + "learning_rate": 0.00011257017156185904, + "loss": 0.4762, + "step": 22420 + }, + { + "epoch": 439.8, + "learning_rate": 0.00011253509126063428, + "loss": 0.4721, + "step": 22430 + }, + { + "epoch": 440.0, + "learning_rate": 0.0001125, + "loss": 0.4732, + "step": 22440 + }, + { + "epoch": 440.0, + "eval_loss": 0.47832006216049194, + "eval_runtime": 2.181, + "eval_samples_per_second": 1044.937, + "eval_steps_per_second": 4.127, + "step": 22440 + }, + { + "epoch": 440.2, + "learning_rate": 0.00011246489779020203, + "loss": 0.4754, + "step": 22450 + }, + { + "epoch": 440.39, + "learning_rate": 0.00011242978464148945, + "loss": 0.4796, + "step": 22460 + }, + { + "epoch": 440.59, + "learning_rate": 0.00011239466056411455, + "loss": 0.4794, + "step": 22470 + }, + { + "epoch": 440.78, + "learning_rate": 0.00011235952556833274, + "loss": 0.4781, + "step": 22480 + }, + { + "epoch": 440.98, + "learning_rate": 0.00011232437966440264, + "loss": 0.4706, + "step": 22490 + }, + { + "epoch": 441.0, + "eval_loss": 0.48102495074272156, + "eval_runtime": 2.2838, + "eval_samples_per_second": 997.887, + "eval_steps_per_second": 3.941, + "step": 22491 + }, + { + "epoch": 441.18, + "learning_rate": 0.00011228922286258613, + "loss": 0.472, + "step": 22500 + }, + { + "epoch": 441.37, + "learning_rate": 0.00011225405517314813, + "loss": 0.4749, + "step": 22510 + }, + { + "epoch": 441.57, + "learning_rate": 0.00011221887660635688, + "loss": 0.479, + "step": 22520 + }, + { + "epoch": 441.76, + "learning_rate": 0.00011218368717248373, + "loss": 0.4701, + "step": 22530 + }, + { + "epoch": 441.96, + "learning_rate": 0.0001121484868818032, + "loss": 0.4735, + "step": 22540 + }, + { + "epoch": 442.0, + "eval_loss": 0.477983683347702, + "eval_runtime": 2.2047, + "eval_samples_per_second": 1033.722, + "eval_steps_per_second": 4.082, + "step": 22542 + }, + { + "epoch": 442.16, + "learning_rate": 0.000112113275744593, + "loss": 0.4774, + "step": 22550 + }, + { + "epoch": 442.35, + "learning_rate": 0.00011207805377113397, + "loss": 0.4765, + "step": 22560 + }, + { + "epoch": 442.55, + "learning_rate": 0.00011204282097171016, + "loss": 0.4727, + "step": 22570 + }, + { + "epoch": 442.75, + "learning_rate": 0.0001120075773566088, + "loss": 0.4713, + "step": 22580 + }, + { + "epoch": 442.94, + "learning_rate": 0.00011197232293612015, + "loss": 0.4796, + "step": 22590 + }, + { + "epoch": 443.0, + "eval_loss": 0.4880768954753876, + "eval_runtime": 2.2972, + "eval_samples_per_second": 992.057, + "eval_steps_per_second": 3.918, + "step": 22593 + }, + { + "epoch": 443.14, + "learning_rate": 0.0001119370577205378, + "loss": 0.4796, + "step": 22600 + }, + { + "epoch": 443.33, + "learning_rate": 0.00011190178172015837, + "loss": 0.4784, + "step": 22610 + }, + { + "epoch": 443.53, + "learning_rate": 0.00011186649494528165, + "loss": 0.4766, + "step": 22620 + }, + { + "epoch": 443.73, + "learning_rate": 0.00011183119740621062, + "loss": 0.4778, + "step": 22630 + }, + { + "epoch": 443.92, + "learning_rate": 0.00011179588911325136, + "loss": 0.4724, + "step": 22640 + }, + { + "epoch": 444.0, + "eval_loss": 0.4784562289714813, + "eval_runtime": 2.3023, + "eval_samples_per_second": 989.881, + "eval_steps_per_second": 3.909, + "step": 22644 + }, + { + "epoch": 444.12, + "learning_rate": 0.0001117605700767131, + "loss": 0.4807, + "step": 22650 + }, + { + "epoch": 444.31, + "learning_rate": 0.00011172524030690823, + "loss": 0.4827, + "step": 22660 + }, + { + "epoch": 444.51, + "learning_rate": 0.00011168989981415223, + "loss": 0.4754, + "step": 22670 + }, + { + "epoch": 444.71, + "learning_rate": 0.00011165454860876375, + "loss": 0.4823, + "step": 22680 + }, + { + "epoch": 444.9, + "learning_rate": 0.00011161918670106455, + "loss": 0.4701, + "step": 22690 + }, + { + "epoch": 445.0, + "eval_loss": 0.47529175877571106, + "eval_runtime": 2.1777, + "eval_samples_per_second": 1046.517, + "eval_steps_per_second": 4.133, + "step": 22695 + }, + { + "epoch": 445.1, + "learning_rate": 0.00011158381410137952, + "loss": 0.4754, + "step": 22700 + }, + { + "epoch": 445.29, + "learning_rate": 0.00011154843082003669, + "loss": 0.4733, + "step": 22710 + }, + { + "epoch": 445.49, + "learning_rate": 0.00011151303686736717, + "loss": 0.4736, + "step": 22720 + }, + { + "epoch": 445.69, + "learning_rate": 0.00011147763225370518, + "loss": 0.4716, + "step": 22730 + }, + { + "epoch": 445.88, + "learning_rate": 0.00011144221698938812, + "loss": 0.4764, + "step": 22740 + }, + { + "epoch": 446.0, + "eval_loss": 0.47874537110328674, + "eval_runtime": 2.1189, + "eval_samples_per_second": 1075.539, + "eval_steps_per_second": 4.247, + "step": 22746 + }, + { + "epoch": 446.08, + "learning_rate": 0.00011140679108475641, + "loss": 0.4709, + "step": 22750 + }, + { + "epoch": 446.27, + "learning_rate": 0.0001113713545501537, + "loss": 0.4714, + "step": 22760 + }, + { + "epoch": 446.47, + "learning_rate": 0.0001113359073959266, + "loss": 0.4686, + "step": 22770 + }, + { + "epoch": 446.67, + "learning_rate": 0.00011130044963242492, + "loss": 0.4723, + "step": 22780 + }, + { + "epoch": 446.86, + "learning_rate": 0.0001112649812700015, + "loss": 0.4729, + "step": 22790 + }, + { + "epoch": 447.0, + "eval_loss": 0.48238447308540344, + "eval_runtime": 2.236, + "eval_samples_per_second": 1019.217, + "eval_steps_per_second": 4.025, + "step": 22797 + }, + { + "epoch": 447.06, + "learning_rate": 0.00011122950231901234, + "loss": 0.4718, + "step": 22800 + }, + { + "epoch": 447.25, + "learning_rate": 0.00011119401278981652, + "loss": 0.4718, + "step": 22810 + }, + { + "epoch": 447.45, + "learning_rate": 0.00011115851269277615, + "loss": 0.4731, + "step": 22820 + }, + { + "epoch": 447.65, + "learning_rate": 0.00011112300203825649, + "loss": 0.4734, + "step": 22830 + }, + { + "epoch": 447.84, + "learning_rate": 0.00011108748083662589, + "loss": 0.4726, + "step": 22840 + }, + { + "epoch": 448.0, + "eval_loss": 0.47418108582496643, + "eval_runtime": 2.2463, + "eval_samples_per_second": 1014.566, + "eval_steps_per_second": 4.007, + "step": 22848 + }, + { + "epoch": 448.04, + "learning_rate": 0.00011105194909825568, + "loss": 0.4732, + "step": 22850 + }, + { + "epoch": 448.24, + "learning_rate": 0.00011101640683352039, + "loss": 0.4746, + "step": 22860 + }, + { + "epoch": 448.43, + "learning_rate": 0.00011098085405279753, + "loss": 0.4708, + "step": 22870 + }, + { + "epoch": 448.63, + "learning_rate": 0.00011094529076646774, + "loss": 0.4745, + "step": 22880 + }, + { + "epoch": 448.82, + "learning_rate": 0.0001109097169849147, + "loss": 0.4736, + "step": 22890 + }, + { + "epoch": 449.0, + "eval_loss": 0.47750285267829895, + "eval_runtime": 2.1452, + "eval_samples_per_second": 1062.363, + "eval_steps_per_second": 4.195, + "step": 22899 + }, + { + "epoch": 449.02, + "learning_rate": 0.00011087413271852517, + "loss": 0.4703, + "step": 22900 + }, + { + "epoch": 449.22, + "learning_rate": 0.00011083853797768895, + "loss": 0.4719, + "step": 22910 + }, + { + "epoch": 449.41, + "learning_rate": 0.00011080293277279894, + "loss": 0.4737, + "step": 22920 + }, + { + "epoch": 449.61, + "learning_rate": 0.00011076731711425101, + "loss": 0.4706, + "step": 22930 + }, + { + "epoch": 449.8, + "learning_rate": 0.00011073169101244421, + "loss": 0.4715, + "step": 22940 + }, + { + "epoch": 450.0, + "learning_rate": 0.00011069605447778052, + "loss": 0.4764, + "step": 22950 + }, + { + "epoch": 450.0, + "eval_loss": 0.47553837299346924, + "eval_runtime": 2.1747, + "eval_samples_per_second": 1047.979, + "eval_steps_per_second": 4.139, + "step": 22950 + }, + { + "epoch": 450.2, + "learning_rate": 0.00011066040752066499, + "loss": 0.4765, + "step": 22960 + }, + { + "epoch": 450.39, + "learning_rate": 0.0001106247501515058, + "loss": 0.4758, + "step": 22970 + }, + { + "epoch": 450.59, + "learning_rate": 0.00011058908238071406, + "loss": 0.4674, + "step": 22980 + }, + { + "epoch": 450.78, + "learning_rate": 0.000110553404218704, + "loss": 0.4684, + "step": 22990 + }, + { + "epoch": 450.98, + "learning_rate": 0.0001105177156758928, + "loss": 0.4701, + "step": 23000 + }, + { + "epoch": 451.0, + "eval_loss": 0.47549954056739807, + "eval_runtime": 2.2025, + "eval_samples_per_second": 1034.721, + "eval_steps_per_second": 4.086, + "step": 23001 + }, + { + "epoch": 451.18, + "learning_rate": 0.00011048201676270076, + "loss": 0.4771, + "step": 23010 + }, + { + "epoch": 451.37, + "learning_rate": 0.00011044630748955113, + "loss": 0.4733, + "step": 23020 + }, + { + "epoch": 451.57, + "learning_rate": 0.00011041058786687028, + "loss": 0.4733, + "step": 23030 + }, + { + "epoch": 451.76, + "learning_rate": 0.00011037485790508745, + "loss": 0.4714, + "step": 23040 + }, + { + "epoch": 451.96, + "learning_rate": 0.0001103391176146351, + "loss": 0.4746, + "step": 23050 + }, + { + "epoch": 452.0, + "eval_loss": 0.4750150740146637, + "eval_runtime": 2.2323, + "eval_samples_per_second": 1020.924, + "eval_steps_per_second": 4.032, + "step": 23052 + }, + { + "epoch": 452.16, + "learning_rate": 0.00011030336700594852, + "loss": 0.4654, + "step": 23060 + }, + { + "epoch": 452.35, + "learning_rate": 0.00011026760608946611, + "loss": 0.4708, + "step": 23070 + }, + { + "epoch": 452.55, + "learning_rate": 0.00011023183487562929, + "loss": 0.4726, + "step": 23080 + }, + { + "epoch": 452.75, + "learning_rate": 0.00011019605337488241, + "loss": 0.4665, + "step": 23090 + }, + { + "epoch": 452.94, + "learning_rate": 0.0001101602615976729, + "loss": 0.4727, + "step": 23100 + }, + { + "epoch": 453.0, + "eval_loss": 0.47314518690109253, + "eval_runtime": 2.1703, + "eval_samples_per_second": 1050.085, + "eval_steps_per_second": 4.147, + "step": 23103 + }, + { + "epoch": 453.14, + "learning_rate": 0.00011012445955445117, + "loss": 0.4679, + "step": 23110 + }, + { + "epoch": 453.33, + "learning_rate": 0.00011008864725567059, + "loss": 0.4682, + "step": 23120 + }, + { + "epoch": 453.53, + "learning_rate": 0.00011005282471178757, + "loss": 0.4684, + "step": 23130 + }, + { + "epoch": 453.73, + "learning_rate": 0.00011001699193326147, + "loss": 0.4692, + "step": 23140 + }, + { + "epoch": 453.92, + "learning_rate": 0.00010998114893055469, + "loss": 0.4691, + "step": 23150 + }, + { + "epoch": 454.0, + "eval_loss": 0.4686477482318878, + "eval_runtime": 2.3412, + "eval_samples_per_second": 973.429, + "eval_steps_per_second": 3.844, + "step": 23154 + }, + { + "epoch": 454.12, + "learning_rate": 0.00010994529571413258, + "loss": 0.4665, + "step": 23160 + }, + { + "epoch": 454.31, + "learning_rate": 0.00010990943229446346, + "loss": 0.466, + "step": 23170 + }, + { + "epoch": 454.51, + "learning_rate": 0.0001098735586820187, + "loss": 0.47, + "step": 23180 + }, + { + "epoch": 454.71, + "learning_rate": 0.00010983767488727253, + "loss": 0.4683, + "step": 23190 + }, + { + "epoch": 454.9, + "learning_rate": 0.00010980178092070225, + "loss": 0.4673, + "step": 23200 + }, + { + "epoch": 455.0, + "eval_loss": 0.4761298596858978, + "eval_runtime": 2.277, + "eval_samples_per_second": 1000.862, + "eval_steps_per_second": 3.953, + "step": 23205 + }, + { + "epoch": 455.1, + "learning_rate": 0.00010976587679278812, + "loss": 0.4718, + "step": 23210 + }, + { + "epoch": 455.29, + "learning_rate": 0.00010972996251401328, + "loss": 0.4687, + "step": 23220 + }, + { + "epoch": 455.49, + "learning_rate": 0.00010969403809486397, + "loss": 0.4687, + "step": 23230 + }, + { + "epoch": 455.69, + "learning_rate": 0.0001096581035458293, + "loss": 0.468, + "step": 23240 + }, + { + "epoch": 455.88, + "learning_rate": 0.00010962215887740132, + "loss": 0.4726, + "step": 23250 + }, + { + "epoch": 456.0, + "eval_loss": 0.4763098955154419, + "eval_runtime": 2.1556, + "eval_samples_per_second": 1057.242, + "eval_steps_per_second": 4.175, + "step": 23256 + }, + { + "epoch": 456.08, + "learning_rate": 0.00010958620410007513, + "loss": 0.4706, + "step": 23260 + }, + { + "epoch": 456.27, + "learning_rate": 0.00010955023922434864, + "loss": 0.4695, + "step": 23270 + }, + { + "epoch": 456.47, + "learning_rate": 0.00010951426426072286, + "loss": 0.4676, + "step": 23280 + }, + { + "epoch": 456.67, + "learning_rate": 0.00010947827921970169, + "loss": 0.4688, + "step": 23290 + }, + { + "epoch": 456.86, + "learning_rate": 0.00010944228411179189, + "loss": 0.4726, + "step": 23300 + }, + { + "epoch": 457.0, + "eval_loss": 0.4806825816631317, + "eval_runtime": 2.1723, + "eval_samples_per_second": 1049.096, + "eval_steps_per_second": 4.143, + "step": 23307 + }, + { + "epoch": 457.06, + "learning_rate": 0.00010940627894750328, + "loss": 0.4692, + "step": 23310 + }, + { + "epoch": 457.25, + "learning_rate": 0.00010937026373734856, + "loss": 0.478, + "step": 23320 + }, + { + "epoch": 457.45, + "learning_rate": 0.00010933423849184336, + "loss": 0.4758, + "step": 23330 + }, + { + "epoch": 457.65, + "learning_rate": 0.00010929820322150624, + "loss": 0.4698, + "step": 23340 + }, + { + "epoch": 457.84, + "learning_rate": 0.00010926215793685869, + "loss": 0.4696, + "step": 23350 + }, + { + "epoch": 458.0, + "eval_loss": 0.4738100469112396, + "eval_runtime": 2.3236, + "eval_samples_per_second": 980.81, + "eval_steps_per_second": 3.873, + "step": 23358 + }, + { + "epoch": 458.04, + "learning_rate": 0.00010922610264842516, + "loss": 0.4709, + "step": 23360 + }, + { + "epoch": 458.24, + "learning_rate": 0.00010919003736673297, + "loss": 0.4675, + "step": 23370 + }, + { + "epoch": 458.43, + "learning_rate": 0.00010915396210231239, + "loss": 0.4716, + "step": 23380 + }, + { + "epoch": 458.63, + "learning_rate": 0.00010911787686569658, + "loss": 0.4712, + "step": 23390 + }, + { + "epoch": 458.82, + "learning_rate": 0.00010908178166742161, + "loss": 0.4689, + "step": 23400 + }, + { + "epoch": 459.0, + "eval_loss": 0.4727371335029602, + "eval_runtime": 2.3226, + "eval_samples_per_second": 981.246, + "eval_steps_per_second": 3.875, + "step": 23409 + }, + { + "epoch": 459.02, + "learning_rate": 0.0001090456765180265, + "loss": 0.4678, + "step": 23410 + }, + { + "epoch": 459.22, + "learning_rate": 0.00010900956142805315, + "loss": 0.4697, + "step": 23420 + }, + { + "epoch": 459.41, + "learning_rate": 0.00010897343640804634, + "loss": 0.472, + "step": 23430 + }, + { + "epoch": 459.61, + "learning_rate": 0.00010893730146855378, + "loss": 0.4685, + "step": 23440 + }, + { + "epoch": 459.8, + "learning_rate": 0.00010890115662012607, + "loss": 0.4681, + "step": 23450 + }, + { + "epoch": 460.0, + "learning_rate": 0.0001088650018733167, + "loss": 0.4702, + "step": 23460 + }, + { + "epoch": 460.0, + "eval_loss": 0.479326456785202, + "eval_runtime": 2.2444, + "eval_samples_per_second": 1015.425, + "eval_steps_per_second": 4.01, + "step": 23460 + }, + { + "epoch": 460.2, + "learning_rate": 0.00010882883723868205, + "loss": 0.4758, + "step": 23470 + }, + { + "epoch": 460.39, + "learning_rate": 0.00010879266272678136, + "loss": 0.4722, + "step": 23480 + }, + { + "epoch": 460.59, + "learning_rate": 0.00010875647834817681, + "loss": 0.4707, + "step": 23490 + }, + { + "epoch": 460.78, + "learning_rate": 0.00010872028411343344, + "loss": 0.4692, + "step": 23500 + }, + { + "epoch": 460.98, + "learning_rate": 0.00010868408003311912, + "loss": 0.4692, + "step": 23510 + }, + { + "epoch": 461.0, + "eval_loss": 0.4696498513221741, + "eval_runtime": 2.2495, + "eval_samples_per_second": 1013.127, + "eval_steps_per_second": 4.001, + "step": 23511 + }, + { + "epoch": 461.18, + "learning_rate": 0.00010864786611780469, + "loss": 0.4652, + "step": 23520 + }, + { + "epoch": 461.37, + "learning_rate": 0.00010861164237806375, + "loss": 0.468, + "step": 23530 + }, + { + "epoch": 461.57, + "learning_rate": 0.00010857540882447286, + "loss": 0.4651, + "step": 23540 + }, + { + "epoch": 461.76, + "learning_rate": 0.0001085391654676114, + "loss": 0.4701, + "step": 23550 + }, + { + "epoch": 461.96, + "learning_rate": 0.00010850291231806159, + "loss": 0.4694, + "step": 23560 + }, + { + "epoch": 462.0, + "eval_loss": 0.47131288051605225, + "eval_runtime": 2.2683, + "eval_samples_per_second": 1004.726, + "eval_steps_per_second": 3.968, + "step": 23562 + }, + { + "epoch": 462.16, + "learning_rate": 0.00010846664938640861, + "loss": 0.4661, + "step": 23570 + }, + { + "epoch": 462.35, + "learning_rate": 0.00010843037668324038, + "loss": 0.4732, + "step": 23580 + }, + { + "epoch": 462.55, + "learning_rate": 0.00010839409421914771, + "loss": 0.4709, + "step": 23590 + }, + { + "epoch": 462.75, + "learning_rate": 0.00010835780200472429, + "loss": 0.4654, + "step": 23600 + }, + { + "epoch": 462.94, + "learning_rate": 0.00010832150005056665, + "loss": 0.4628, + "step": 23610 + }, + { + "epoch": 463.0, + "eval_loss": 0.47472110390663147, + "eval_runtime": 2.2214, + "eval_samples_per_second": 1025.941, + "eval_steps_per_second": 4.052, + "step": 23613 + }, + { + "epoch": 463.14, + "learning_rate": 0.00010828518836727413, + "loss": 0.4711, + "step": 23620 + }, + { + "epoch": 463.33, + "learning_rate": 0.00010824886696544895, + "loss": 0.4662, + "step": 23630 + }, + { + "epoch": 463.53, + "learning_rate": 0.00010821253585569609, + "loss": 0.471, + "step": 23640 + }, + { + "epoch": 463.73, + "learning_rate": 0.00010817619504862352, + "loss": 0.4736, + "step": 23650 + }, + { + "epoch": 463.92, + "learning_rate": 0.00010813984455484189, + "loss": 0.4677, + "step": 23660 + }, + { + "epoch": 464.0, + "eval_loss": 0.4787036180496216, + "eval_runtime": 2.1739, + "eval_samples_per_second": 1048.334, + "eval_steps_per_second": 4.14, + "step": 23664 + }, + { + "epoch": 464.12, + "learning_rate": 0.00010810348438496473, + "loss": 0.472, + "step": 23670 + }, + { + "epoch": 464.31, + "learning_rate": 0.00010806711454960843, + "loss": 0.4654, + "step": 23680 + }, + { + "epoch": 464.51, + "learning_rate": 0.00010803073505939212, + "loss": 0.4696, + "step": 23690 + }, + { + "epoch": 464.71, + "learning_rate": 0.00010799434592493785, + "loss": 0.467, + "step": 23700 + }, + { + "epoch": 464.9, + "learning_rate": 0.0001079579471568704, + "loss": 0.4673, + "step": 23710 + }, + { + "epoch": 465.0, + "eval_loss": 0.4681728184223175, + "eval_runtime": 2.1469, + "eval_samples_per_second": 1061.532, + "eval_steps_per_second": 4.192, + "step": 23715 + }, + { + "epoch": 465.1, + "learning_rate": 0.00010792153876581743, + "loss": 0.4626, + "step": 23720 + }, + { + "epoch": 465.29, + "learning_rate": 0.00010788512076240935, + "loss": 0.4646, + "step": 23730 + }, + { + "epoch": 465.49, + "learning_rate": 0.00010784869315727942, + "loss": 0.4706, + "step": 23740 + }, + { + "epoch": 465.69, + "learning_rate": 0.0001078122559610637, + "loss": 0.4601, + "step": 23750 + }, + { + "epoch": 465.88, + "learning_rate": 0.000107775809184401, + "loss": 0.4709, + "step": 23760 + }, + { + "epoch": 466.0, + "eval_loss": 0.4692438542842865, + "eval_runtime": 2.3021, + "eval_samples_per_second": 989.952, + "eval_steps_per_second": 3.909, + "step": 23766 + }, + { + "epoch": 466.08, + "learning_rate": 0.00010773935283793298, + "loss": 0.4682, + "step": 23770 + }, + { + "epoch": 466.27, + "learning_rate": 0.00010770288693230411, + "loss": 0.4682, + "step": 23780 + }, + { + "epoch": 466.47, + "learning_rate": 0.00010766641147816161, + "loss": 0.4669, + "step": 23790 + }, + { + "epoch": 466.67, + "learning_rate": 0.00010762992648615548, + "loss": 0.4654, + "step": 23800 + }, + { + "epoch": 466.86, + "learning_rate": 0.00010759343196693854, + "loss": 0.463, + "step": 23810 + }, + { + "epoch": 467.0, + "eval_loss": 0.46763309836387634, + "eval_runtime": 2.1636, + "eval_samples_per_second": 1053.34, + "eval_steps_per_second": 4.16, + "step": 23817 + }, + { + "epoch": 467.06, + "learning_rate": 0.00010755692793116637, + "loss": 0.4643, + "step": 23820 + }, + { + "epoch": 467.25, + "learning_rate": 0.00010752041438949733, + "loss": 0.4676, + "step": 23830 + }, + { + "epoch": 467.45, + "learning_rate": 0.00010748389135259255, + "loss": 0.4628, + "step": 23840 + }, + { + "epoch": 467.65, + "learning_rate": 0.00010744735883111596, + "loss": 0.4687, + "step": 23850 + }, + { + "epoch": 467.84, + "learning_rate": 0.00010741081683573427, + "loss": 0.4654, + "step": 23860 + }, + { + "epoch": 468.0, + "eval_loss": 0.4696432054042816, + "eval_runtime": 2.3135, + "eval_samples_per_second": 985.094, + "eval_steps_per_second": 3.89, + "step": 23868 + }, + { + "epoch": 468.04, + "learning_rate": 0.00010737426537711687, + "loss": 0.4669, + "step": 23870 + }, + { + "epoch": 468.24, + "learning_rate": 0.00010733770446593599, + "loss": 0.4703, + "step": 23880 + }, + { + "epoch": 468.43, + "learning_rate": 0.00010730113411286661, + "loss": 0.4674, + "step": 23890 + }, + { + "epoch": 468.63, + "learning_rate": 0.00010726455432858645, + "loss": 0.4677, + "step": 23900 + }, + { + "epoch": 468.82, + "learning_rate": 0.000107227965123776, + "loss": 0.4648, + "step": 23910 + }, + { + "epoch": 469.0, + "eval_loss": 0.46745070815086365, + "eval_runtime": 2.136, + "eval_samples_per_second": 1066.943, + "eval_steps_per_second": 4.213, + "step": 23919 + }, + { + "epoch": 469.02, + "learning_rate": 0.0001071913665091185, + "loss": 0.4628, + "step": 23920 + }, + { + "epoch": 469.22, + "learning_rate": 0.0001071547584952999, + "loss": 0.4594, + "step": 23930 + }, + { + "epoch": 469.41, + "learning_rate": 0.00010711814109300897, + "loss": 0.4666, + "step": 23940 + }, + { + "epoch": 469.61, + "learning_rate": 0.0001070815143129371, + "loss": 0.4666, + "step": 23950 + }, + { + "epoch": 469.8, + "learning_rate": 0.00010704487816577857, + "loss": 0.462, + "step": 23960 + }, + { + "epoch": 470.0, + "learning_rate": 0.00010700823266223026, + "loss": 0.4642, + "step": 23970 + }, + { + "epoch": 470.0, + "eval_loss": 0.4700300395488739, + "eval_runtime": 2.281, + "eval_samples_per_second": 999.12, + "eval_steps_per_second": 3.946, + "step": 23970 + }, + { + "epoch": 470.2, + "learning_rate": 0.00010697157781299187, + "loss": 0.4698, + "step": 23980 + }, + { + "epoch": 470.39, + "learning_rate": 0.00010693491362876583, + "loss": 0.4675, + "step": 23990 + }, + { + "epoch": 470.59, + "learning_rate": 0.0001068982401202572, + "loss": 0.4652, + "step": 24000 + }, + { + "epoch": 470.78, + "learning_rate": 0.00010686155729817386, + "loss": 0.4582, + "step": 24010 + }, + { + "epoch": 470.98, + "learning_rate": 0.00010682486517322637, + "loss": 0.4687, + "step": 24020 + }, + { + "epoch": 471.0, + "eval_loss": 0.46906590461730957, + "eval_runtime": 2.2917, + "eval_samples_per_second": 994.474, + "eval_steps_per_second": 3.927, + "step": 24021 + }, + { + "epoch": 471.18, + "learning_rate": 0.000106788163756128, + "loss": 0.4654, + "step": 24030 + }, + { + "epoch": 471.37, + "learning_rate": 0.00010675145305759477, + "loss": 0.4646, + "step": 24040 + }, + { + "epoch": 471.57, + "learning_rate": 0.00010671473308834538, + "loss": 0.4708, + "step": 24050 + }, + { + "epoch": 471.76, + "learning_rate": 0.00010667800385910123, + "loss": 0.4675, + "step": 24060 + }, + { + "epoch": 471.96, + "learning_rate": 0.00010664126538058645, + "loss": 0.469, + "step": 24070 + }, + { + "epoch": 472.0, + "eval_loss": 0.4749109745025635, + "eval_runtime": 2.2452, + "eval_samples_per_second": 1015.044, + "eval_steps_per_second": 4.009, + "step": 24072 + }, + { + "epoch": 472.16, + "learning_rate": 0.0001066045176635278, + "loss": 0.4687, + "step": 24080 + }, + { + "epoch": 472.35, + "learning_rate": 0.0001065677607186549, + "loss": 0.4688, + "step": 24090 + }, + { + "epoch": 472.55, + "learning_rate": 0.00010653099455669988, + "loss": 0.4732, + "step": 24100 + }, + { + "epoch": 472.75, + "learning_rate": 0.00010649421918839764, + "loss": 0.4664, + "step": 24110 + }, + { + "epoch": 472.94, + "learning_rate": 0.0001064574346244858, + "loss": 0.4692, + "step": 24120 + }, + { + "epoch": 473.0, + "eval_loss": 0.4672113358974457, + "eval_runtime": 2.1455, + "eval_samples_per_second": 1062.232, + "eval_steps_per_second": 4.195, + "step": 24123 + }, + { + "epoch": 473.14, + "learning_rate": 0.00010642064087570464, + "loss": 0.4617, + "step": 24130 + }, + { + "epoch": 473.33, + "learning_rate": 0.00010638383795279706, + "loss": 0.4674, + "step": 24140 + }, + { + "epoch": 473.53, + "learning_rate": 0.00010634702586650875, + "loss": 0.4667, + "step": 24150 + }, + { + "epoch": 473.73, + "learning_rate": 0.00010631020462758798, + "loss": 0.4658, + "step": 24160 + }, + { + "epoch": 473.92, + "learning_rate": 0.00010627337424678576, + "loss": 0.4635, + "step": 24170 + }, + { + "epoch": 474.0, + "eval_loss": 0.4706786870956421, + "eval_runtime": 2.1628, + "eval_samples_per_second": 1053.708, + "eval_steps_per_second": 4.161, + "step": 24174 + }, + { + "epoch": 474.12, + "learning_rate": 0.0001062365347348557, + "loss": 0.4611, + "step": 24180 + }, + { + "epoch": 474.31, + "learning_rate": 0.00010619968610255416, + "loss": 0.4698, + "step": 24190 + }, + { + "epoch": 474.51, + "learning_rate": 0.00010616282836064008, + "loss": 0.4638, + "step": 24200 + }, + { + "epoch": 474.71, + "learning_rate": 0.00010612596151987513, + "loss": 0.4641, + "step": 24210 + }, + { + "epoch": 474.9, + "learning_rate": 0.00010608908559102359, + "loss": 0.4635, + "step": 24220 + }, + { + "epoch": 475.0, + "eval_loss": 0.46961140632629395, + "eval_runtime": 2.3372, + "eval_samples_per_second": 975.091, + "eval_steps_per_second": 3.851, + "step": 24225 + }, + { + "epoch": 475.1, + "learning_rate": 0.0001060522005848524, + "loss": 0.4676, + "step": 24230 + }, + { + "epoch": 475.29, + "learning_rate": 0.00010601530651213118, + "loss": 0.4638, + "step": 24240 + }, + { + "epoch": 475.49, + "learning_rate": 0.00010597840338363216, + "loss": 0.4637, + "step": 24250 + }, + { + "epoch": 475.69, + "learning_rate": 0.00010594149121013026, + "loss": 0.4719, + "step": 24260 + }, + { + "epoch": 475.88, + "learning_rate": 0.00010590457000240298, + "loss": 0.4655, + "step": 24270 + }, + { + "epoch": 476.0, + "eval_loss": 0.46518537402153015, + "eval_runtime": 2.3011, + "eval_samples_per_second": 990.409, + "eval_steps_per_second": 3.911, + "step": 24276 + }, + { + "epoch": 476.08, + "learning_rate": 0.0001058676397712305, + "loss": 0.4668, + "step": 24280 + }, + { + "epoch": 476.27, + "learning_rate": 0.00010583070052739558, + "loss": 0.4618, + "step": 24290 + }, + { + "epoch": 476.47, + "learning_rate": 0.00010579375228168375, + "loss": 0.471, + "step": 24300 + }, + { + "epoch": 476.67, + "learning_rate": 0.000105756795044883, + "loss": 0.4621, + "step": 24310 + }, + { + "epoch": 476.86, + "learning_rate": 0.00010571982882778404, + "loss": 0.4633, + "step": 24320 + }, + { + "epoch": 477.0, + "eval_loss": 0.47023797035217285, + "eval_runtime": 2.2428, + "eval_samples_per_second": 1016.149, + "eval_steps_per_second": 4.013, + "step": 24327 + }, + { + "epoch": 477.06, + "learning_rate": 0.00010568285364118019, + "loss": 0.4672, + "step": 24330 + }, + { + "epoch": 477.25, + "learning_rate": 0.00010564586949586735, + "loss": 0.4653, + "step": 24340 + }, + { + "epoch": 477.45, + "learning_rate": 0.00010560887640264411, + "loss": 0.4642, + "step": 24350 + }, + { + "epoch": 477.65, + "learning_rate": 0.0001055718743723116, + "loss": 0.463, + "step": 24360 + }, + { + "epoch": 477.84, + "learning_rate": 0.00010553486341567358, + "loss": 0.4622, + "step": 24370 + }, + { + "epoch": 478.0, + "eval_loss": 0.46373993158340454, + "eval_runtime": 2.2432, + "eval_samples_per_second": 1015.964, + "eval_steps_per_second": 4.012, + "step": 24378 + }, + { + "epoch": 478.04, + "learning_rate": 0.00010549784354353645, + "loss": 0.4653, + "step": 24380 + }, + { + "epoch": 478.24, + "learning_rate": 0.00010546081476670916, + "loss": 0.4636, + "step": 24390 + }, + { + "epoch": 478.43, + "learning_rate": 0.0001054237770960033, + "loss": 0.4671, + "step": 24400 + }, + { + "epoch": 478.63, + "learning_rate": 0.00010538673054223307, + "loss": 0.4628, + "step": 24410 + }, + { + "epoch": 478.82, + "learning_rate": 0.00010534967511621517, + "loss": 0.4571, + "step": 24420 + }, + { + "epoch": 479.0, + "eval_loss": 0.4678489565849304, + "eval_runtime": 2.1558, + "eval_samples_per_second": 1057.143, + "eval_steps_per_second": 4.175, + "step": 24429 + }, + { + "epoch": 479.02, + "learning_rate": 0.00010531261082876903, + "loss": 0.4718, + "step": 24430 + }, + { + "epoch": 479.22, + "learning_rate": 0.00010527553769071657, + "loss": 0.4683, + "step": 24440 + }, + { + "epoch": 479.41, + "learning_rate": 0.00010523845571288229, + "loss": 0.4668, + "step": 24450 + }, + { + "epoch": 479.61, + "learning_rate": 0.00010520136490609335, + "loss": 0.4611, + "step": 24460 + }, + { + "epoch": 479.8, + "learning_rate": 0.00010516426528117939, + "loss": 0.4606, + "step": 24470 + }, + { + "epoch": 480.0, + "learning_rate": 0.0001051271568489727, + "loss": 0.4645, + "step": 24480 + }, + { + "epoch": 480.0, + "eval_loss": 0.46348774433135986, + "eval_runtime": 2.2724, + "eval_samples_per_second": 1002.887, + "eval_steps_per_second": 3.961, + "step": 24480 + }, + { + "epoch": 480.2, + "learning_rate": 0.00010509003962030813, + "loss": 0.4638, + "step": 24490 + }, + { + "epoch": 480.39, + "learning_rate": 0.00010505291360602302, + "loss": 0.4716, + "step": 24500 + }, + { + "epoch": 480.59, + "learning_rate": 0.00010501577881695744, + "loss": 0.465, + "step": 24510 + }, + { + "epoch": 480.78, + "learning_rate": 0.00010497863526395384, + "loss": 0.459, + "step": 24520 + }, + { + "epoch": 480.98, + "learning_rate": 0.0001049414829578573, + "loss": 0.4654, + "step": 24530 + }, + { + "epoch": 481.0, + "eval_loss": 0.4655218720436096, + "eval_runtime": 2.1945, + "eval_samples_per_second": 1038.497, + "eval_steps_per_second": 4.101, + "step": 24531 + }, + { + "epoch": 481.18, + "learning_rate": 0.00010490432190951555, + "loss": 0.4653, + "step": 24540 + }, + { + "epoch": 481.37, + "learning_rate": 0.00010486715212977869, + "loss": 0.4632, + "step": 24550 + }, + { + "epoch": 481.57, + "learning_rate": 0.00010482997362949951, + "loss": 0.46, + "step": 24560 + }, + { + "epoch": 481.76, + "learning_rate": 0.00010479278641953334, + "loss": 0.4667, + "step": 24570 + }, + { + "epoch": 481.96, + "learning_rate": 0.00010475559051073795, + "loss": 0.4588, + "step": 24580 + }, + { + "epoch": 482.0, + "eval_loss": 0.4688310921192169, + "eval_runtime": 2.2239, + "eval_samples_per_second": 1024.76, + "eval_steps_per_second": 4.047, + "step": 24582 + }, + { + "epoch": 482.16, + "learning_rate": 0.00010471838591397375, + "loss": 0.4668, + "step": 24590 + }, + { + "epoch": 482.35, + "learning_rate": 0.00010468117264010365, + "loss": 0.461, + "step": 24600 + }, + { + "epoch": 482.55, + "learning_rate": 0.0001046439506999931, + "loss": 0.4644, + "step": 24610 + }, + { + "epoch": 482.75, + "learning_rate": 0.00010460672010451007, + "loss": 0.4635, + "step": 24620 + }, + { + "epoch": 482.94, + "learning_rate": 0.00010456948086452506, + "loss": 0.4608, + "step": 24630 + }, + { + "epoch": 483.0, + "eval_loss": 0.4639376103878021, + "eval_runtime": 2.1825, + "eval_samples_per_second": 1044.2, + "eval_steps_per_second": 4.124, + "step": 24633 + }, + { + "epoch": 483.14, + "learning_rate": 0.00010453223299091109, + "loss": 0.4648, + "step": 24640 + }, + { + "epoch": 483.33, + "learning_rate": 0.00010449497649454372, + "loss": 0.4604, + "step": 24650 + }, + { + "epoch": 483.53, + "learning_rate": 0.00010445771138630103, + "loss": 0.4682, + "step": 24660 + }, + { + "epoch": 483.73, + "learning_rate": 0.00010442043767706357, + "loss": 0.4593, + "step": 24670 + }, + { + "epoch": 483.92, + "learning_rate": 0.00010438315537771447, + "loss": 0.4606, + "step": 24680 + }, + { + "epoch": 484.0, + "eval_loss": 0.4653979241847992, + "eval_runtime": 2.1706, + "eval_samples_per_second": 1049.951, + "eval_steps_per_second": 4.146, + "step": 24684 + }, + { + "epoch": 484.12, + "learning_rate": 0.0001043458644991393, + "loss": 0.4604, + "step": 24690 + }, + { + "epoch": 484.31, + "learning_rate": 0.00010430856505222615, + "loss": 0.4633, + "step": 24700 + }, + { + "epoch": 484.51, + "learning_rate": 0.00010427125704786566, + "loss": 0.4568, + "step": 24710 + }, + { + "epoch": 484.71, + "learning_rate": 0.00010423394049695094, + "loss": 0.4643, + "step": 24720 + }, + { + "epoch": 484.9, + "learning_rate": 0.00010419661541037757, + "loss": 0.4624, + "step": 24730 + }, + { + "epoch": 485.0, + "eval_loss": 0.46611276268959045, + "eval_runtime": 2.1771, + "eval_samples_per_second": 1046.812, + "eval_steps_per_second": 4.134, + "step": 24735 + }, + { + "epoch": 485.1, + "learning_rate": 0.00010415928179904363, + "loss": 0.4584, + "step": 24740 + }, + { + "epoch": 485.29, + "learning_rate": 0.00010412193967384975, + "loss": 0.4598, + "step": 24750 + }, + { + "epoch": 485.49, + "learning_rate": 0.00010408458904569895, + "loss": 0.4652, + "step": 24760 + }, + { + "epoch": 485.69, + "learning_rate": 0.00010404722992549679, + "loss": 0.4618, + "step": 24770 + }, + { + "epoch": 485.88, + "learning_rate": 0.00010400986232415133, + "loss": 0.4612, + "step": 24780 + }, + { + "epoch": 486.0, + "eval_loss": 0.4668976664543152, + "eval_runtime": 2.2071, + "eval_samples_per_second": 1032.556, + "eval_steps_per_second": 4.078, + "step": 24786 + }, + { + "epoch": 486.08, + "learning_rate": 0.00010397248625257304, + "loss": 0.4547, + "step": 24790 + }, + { + "epoch": 486.27, + "learning_rate": 0.0001039351017216749, + "loss": 0.4597, + "step": 24800 + }, + { + "epoch": 486.47, + "learning_rate": 0.00010389770874237239, + "loss": 0.4615, + "step": 24810 + }, + { + "epoch": 486.67, + "learning_rate": 0.00010386030732558342, + "loss": 0.4635, + "step": 24820 + }, + { + "epoch": 486.86, + "learning_rate": 0.00010382289748222834, + "loss": 0.46, + "step": 24830 + }, + { + "epoch": 487.0, + "eval_loss": 0.4653010666370392, + "eval_runtime": 2.233, + "eval_samples_per_second": 1020.607, + "eval_steps_per_second": 4.03, + "step": 24837 + }, + { + "epoch": 487.06, + "learning_rate": 0.00010378547922323, + "loss": 0.4602, + "step": 24840 + }, + { + "epoch": 487.25, + "learning_rate": 0.00010374805255951372, + "loss": 0.4638, + "step": 24850 + }, + { + "epoch": 487.45, + "learning_rate": 0.00010371061750200723, + "loss": 0.4647, + "step": 24860 + }, + { + "epoch": 487.65, + "learning_rate": 0.00010367317406164075, + "loss": 0.4633, + "step": 24870 + }, + { + "epoch": 487.84, + "learning_rate": 0.00010363572224934692, + "loss": 0.4623, + "step": 24880 + }, + { + "epoch": 488.0, + "eval_loss": 0.468781054019928, + "eval_runtime": 2.2652, + "eval_samples_per_second": 1006.101, + "eval_steps_per_second": 3.973, + "step": 24888 + }, + { + "epoch": 488.04, + "learning_rate": 0.00010359826207606081, + "loss": 0.4618, + "step": 24890 + }, + { + "epoch": 488.24, + "learning_rate": 0.00010356079355272, + "loss": 0.4628, + "step": 24900 + }, + { + "epoch": 488.43, + "learning_rate": 0.00010352331669026443, + "loss": 0.4608, + "step": 24910 + }, + { + "epoch": 488.63, + "learning_rate": 0.0001034858314996365, + "loss": 0.4598, + "step": 24920 + }, + { + "epoch": 488.82, + "learning_rate": 0.00010344833799178109, + "loss": 0.4648, + "step": 24930 + }, + { + "epoch": 489.0, + "eval_loss": 0.464847594499588, + "eval_runtime": 2.2742, + "eval_samples_per_second": 1002.105, + "eval_steps_per_second": 3.957, + "step": 24939 + }, + { + "epoch": 489.02, + "learning_rate": 0.00010341083617764545, + "loss": 0.4605, + "step": 24940 + }, + { + "epoch": 489.22, + "learning_rate": 0.00010337332606817925, + "loss": 0.4586, + "step": 24950 + }, + { + "epoch": 489.41, + "learning_rate": 0.00010333580767433465, + "loss": 0.4569, + "step": 24960 + }, + { + "epoch": 489.61, + "learning_rate": 0.00010329828100706613, + "loss": 0.4621, + "step": 24970 + }, + { + "epoch": 489.8, + "learning_rate": 0.00010326074607733068, + "loss": 0.4627, + "step": 24980 + }, + { + "epoch": 490.0, + "learning_rate": 0.00010322320289608766, + "loss": 0.4602, + "step": 24990 + }, + { + "epoch": 490.0, + "eval_loss": 0.46202248334884644, + "eval_runtime": 2.2413, + "eval_samples_per_second": 1016.833, + "eval_steps_per_second": 4.016, + "step": 24990 + }, + { + "epoch": 490.2, + "learning_rate": 0.0001031856514742988, + "loss": 0.4667, + "step": 25000 + }, + { + "epoch": 490.39, + "learning_rate": 0.00010314809182292835, + "loss": 0.4642, + "step": 25010 + }, + { + "epoch": 490.59, + "learning_rate": 0.00010311052395294285, + "loss": 0.4599, + "step": 25020 + }, + { + "epoch": 490.78, + "learning_rate": 0.00010307294787531127, + "loss": 0.4623, + "step": 25030 + }, + { + "epoch": 490.98, + "learning_rate": 0.00010303536360100501, + "loss": 0.4587, + "step": 25040 + }, + { + "epoch": 491.0, + "eval_loss": 0.46522802114486694, + "eval_runtime": 2.2035, + "eval_samples_per_second": 1034.275, + "eval_steps_per_second": 4.084, + "step": 25041 + }, + { + "epoch": 491.18, + "learning_rate": 0.0001029977711409978, + "loss": 0.4634, + "step": 25050 + }, + { + "epoch": 491.37, + "learning_rate": 0.00010296017050626583, + "loss": 0.4635, + "step": 25060 + }, + { + "epoch": 491.57, + "learning_rate": 0.00010292256170778768, + "loss": 0.46, + "step": 25070 + }, + { + "epoch": 491.76, + "learning_rate": 0.0001028849447565442, + "loss": 0.4615, + "step": 25080 + }, + { + "epoch": 491.96, + "learning_rate": 0.00010284731966351879, + "loss": 0.4627, + "step": 25090 + }, + { + "epoch": 492.0, + "eval_loss": 0.46937766671180725, + "eval_runtime": 2.2693, + "eval_samples_per_second": 1004.287, + "eval_steps_per_second": 3.966, + "step": 25092 + }, + { + "epoch": 492.16, + "learning_rate": 0.00010280968643969705, + "loss": 0.4662, + "step": 25100 + }, + { + "epoch": 492.35, + "learning_rate": 0.00010277204509606712, + "loss": 0.4631, + "step": 25110 + }, + { + "epoch": 492.55, + "learning_rate": 0.0001027343956436194, + "loss": 0.4627, + "step": 25120 + }, + { + "epoch": 492.75, + "learning_rate": 0.00010269673809334665, + "loss": 0.4622, + "step": 25130 + }, + { + "epoch": 492.94, + "learning_rate": 0.00010265907245624411, + "loss": 0.4638, + "step": 25140 + }, + { + "epoch": 493.0, + "eval_loss": 0.4619758725166321, + "eval_runtime": 2.1822, + "eval_samples_per_second": 1044.355, + "eval_steps_per_second": 4.124, + "step": 25143 + }, + { + "epoch": 493.14, + "learning_rate": 0.00010262139874330926, + "loss": 0.4589, + "step": 25150 + }, + { + "epoch": 493.33, + "learning_rate": 0.00010258371696554199, + "loss": 0.4604, + "step": 25160 + }, + { + "epoch": 493.53, + "learning_rate": 0.00010254602713394455, + "loss": 0.4613, + "step": 25170 + }, + { + "epoch": 493.73, + "learning_rate": 0.0001025083292595215, + "loss": 0.4568, + "step": 25180 + }, + { + "epoch": 493.92, + "learning_rate": 0.00010247062335327983, + "loss": 0.4565, + "step": 25190 + }, + { + "epoch": 494.0, + "eval_loss": 0.4652526378631592, + "eval_runtime": 2.2708, + "eval_samples_per_second": 1003.596, + "eval_steps_per_second": 3.963, + "step": 25194 + }, + { + "epoch": 494.12, + "learning_rate": 0.00010243290942622879, + "loss": 0.462, + "step": 25200 + }, + { + "epoch": 494.31, + "learning_rate": 0.00010239518748937999, + "loss": 0.4635, + "step": 25210 + }, + { + "epoch": 494.51, + "learning_rate": 0.00010235745755374745, + "loss": 0.4616, + "step": 25220 + }, + { + "epoch": 494.71, + "learning_rate": 0.0001023197196303474, + "loss": 0.4571, + "step": 25230 + }, + { + "epoch": 494.9, + "learning_rate": 0.00010228197373019853, + "loss": 0.4588, + "step": 25240 + }, + { + "epoch": 495.0, + "eval_loss": 0.45982059836387634, + "eval_runtime": 2.3244, + "eval_samples_per_second": 980.453, + "eval_steps_per_second": 3.872, + "step": 25245 + }, + { + "epoch": 495.1, + "learning_rate": 0.00010224421986432178, + "loss": 0.4594, + "step": 25250 + }, + { + "epoch": 495.29, + "learning_rate": 0.0001022064580437404, + "loss": 0.4618, + "step": 25260 + }, + { + "epoch": 495.49, + "learning_rate": 0.00010216868827948008, + "loss": 0.4567, + "step": 25270 + }, + { + "epoch": 495.69, + "learning_rate": 0.00010213091058256868, + "loss": 0.4546, + "step": 25280 + }, + { + "epoch": 495.88, + "learning_rate": 0.00010209312496403647, + "loss": 0.4568, + "step": 25290 + }, + { + "epoch": 496.0, + "eval_loss": 0.461697518825531, + "eval_runtime": 2.2304, + "eval_samples_per_second": 1021.781, + "eval_steps_per_second": 4.035, + "step": 25296 + }, + { + "epoch": 496.08, + "learning_rate": 0.00010205533143491601, + "loss": 0.4572, + "step": 25300 + }, + { + "epoch": 496.27, + "learning_rate": 0.00010201753000624215, + "loss": 0.4592, + "step": 25310 + }, + { + "epoch": 496.47, + "learning_rate": 0.00010197972068905208, + "loss": 0.4641, + "step": 25320 + }, + { + "epoch": 496.67, + "learning_rate": 0.0001019419034943853, + "loss": 0.4587, + "step": 25330 + }, + { + "epoch": 496.86, + "learning_rate": 0.00010190407843328351, + "loss": 0.4524, + "step": 25340 + }, + { + "epoch": 497.0, + "eval_loss": 0.4631481468677521, + "eval_runtime": 2.1613, + "eval_samples_per_second": 1054.478, + "eval_steps_per_second": 4.164, + "step": 25347 + }, + { + "epoch": 497.06, + "learning_rate": 0.00010186624551679089, + "loss": 0.4544, + "step": 25350 + }, + { + "epoch": 497.25, + "learning_rate": 0.00010182840475595374, + "loss": 0.462, + "step": 25360 + }, + { + "epoch": 497.45, + "learning_rate": 0.00010179055616182074, + "loss": 0.4558, + "step": 25370 + }, + { + "epoch": 497.65, + "learning_rate": 0.00010175269974544281, + "loss": 0.4606, + "step": 25380 + }, + { + "epoch": 497.84, + "learning_rate": 0.0001017148355178732, + "loss": 0.4635, + "step": 25390 + }, + { + "epoch": 498.0, + "eval_loss": 0.4639947712421417, + "eval_runtime": 2.2201, + "eval_samples_per_second": 1026.549, + "eval_steps_per_second": 4.054, + "step": 25398 + }, + { + "epoch": 498.04, + "learning_rate": 0.00010167696349016742, + "loss": 0.4639, + "step": 25400 + }, + { + "epoch": 498.24, + "learning_rate": 0.00010163908367338325, + "loss": 0.463, + "step": 25410 + }, + { + "epoch": 498.43, + "learning_rate": 0.00010160119607858076, + "loss": 0.4628, + "step": 25420 + }, + { + "epoch": 498.63, + "learning_rate": 0.0001015633007168223, + "loss": 0.4606, + "step": 25430 + }, + { + "epoch": 498.82, + "learning_rate": 0.00010152539759917242, + "loss": 0.4534, + "step": 25440 + }, + { + "epoch": 499.0, + "eval_loss": 0.4642672538757324, + "eval_runtime": 2.2526, + "eval_samples_per_second": 1011.722, + "eval_steps_per_second": 3.995, + "step": 25449 + }, + { + "epoch": 499.02, + "learning_rate": 0.00010148748673669804, + "loss": 0.4633, + "step": 25450 + }, + { + "epoch": 499.22, + "learning_rate": 0.00010144956814046823, + "loss": 0.4603, + "step": 25460 + }, + { + "epoch": 499.41, + "learning_rate": 0.00010141164182155442, + "loss": 0.4532, + "step": 25470 + }, + { + "epoch": 499.61, + "learning_rate": 0.00010137370779103025, + "loss": 0.4535, + "step": 25480 + }, + { + "epoch": 499.8, + "learning_rate": 0.00010133576605997158, + "loss": 0.4539, + "step": 25490 + }, + { + "epoch": 500.0, + "learning_rate": 0.00010129781663945658, + "loss": 0.4599, + "step": 25500 + }, + { + "epoch": 500.0, + "eval_loss": 0.46625402569770813, + "eval_runtime": 2.1508, + "eval_samples_per_second": 1059.606, + "eval_steps_per_second": 4.184, + "step": 25500 + }, + { + "epoch": 500.2, + "learning_rate": 0.00010125985954056561, + "loss": 0.4591, + "step": 25510 + }, + { + "epoch": 500.39, + "learning_rate": 0.0001012218947743813, + "loss": 0.4557, + "step": 25520 + }, + { + "epoch": 500.59, + "learning_rate": 0.00010118392235198851, + "loss": 0.4556, + "step": 25530 + }, + { + "epoch": 500.78, + "learning_rate": 0.00010114594228447439, + "loss": 0.4598, + "step": 25540 + }, + { + "epoch": 500.98, + "learning_rate": 0.0001011079545829282, + "loss": 0.4549, + "step": 25550 + }, + { + "epoch": 501.0, + "eval_loss": 0.4588215947151184, + "eval_runtime": 2.1877, + "eval_samples_per_second": 1041.737, + "eval_steps_per_second": 4.114, + "step": 25551 + }, + { + "epoch": 501.18, + "learning_rate": 0.00010106995925844154, + "loss": 0.4607, + "step": 25560 + }, + { + "epoch": 501.37, + "learning_rate": 0.0001010319563221082, + "loss": 0.4562, + "step": 25570 + }, + { + "epoch": 501.57, + "learning_rate": 0.00010099394578502419, + "loss": 0.4575, + "step": 25580 + }, + { + "epoch": 501.76, + "learning_rate": 0.00010095592765828774, + "loss": 0.46, + "step": 25590 + }, + { + "epoch": 501.96, + "learning_rate": 0.00010091790195299925, + "loss": 0.4595, + "step": 25600 + }, + { + "epoch": 502.0, + "eval_loss": 0.46614253520965576, + "eval_runtime": 2.3019, + "eval_samples_per_second": 990.035, + "eval_steps_per_second": 3.91, + "step": 25602 + }, + { + "epoch": 502.16, + "learning_rate": 0.00010087986868026144, + "loss": 0.4567, + "step": 25610 + }, + { + "epoch": 502.35, + "learning_rate": 0.00010084182785117916, + "loss": 0.4569, + "step": 25620 + }, + { + "epoch": 502.55, + "learning_rate": 0.00010080377947685946, + "loss": 0.4602, + "step": 25630 + }, + { + "epoch": 502.75, + "learning_rate": 0.00010076572356841164, + "loss": 0.461, + "step": 25640 + }, + { + "epoch": 502.94, + "learning_rate": 0.00010072766013694715, + "loss": 0.46, + "step": 25650 + }, + { + "epoch": 503.0, + "eval_loss": 0.4626482427120209, + "eval_runtime": 2.185, + "eval_samples_per_second": 1043.005, + "eval_steps_per_second": 4.119, + "step": 25653 + }, + { + "epoch": 503.14, + "learning_rate": 0.0001006895891935797, + "loss": 0.4623, + "step": 25660 + }, + { + "epoch": 503.33, + "learning_rate": 0.00010065151074942516, + "loss": 0.4602, + "step": 25670 + }, + { + "epoch": 503.53, + "learning_rate": 0.00010061342481560151, + "loss": 0.457, + "step": 25680 + }, + { + "epoch": 503.73, + "learning_rate": 0.0001005753314032291, + "loss": 0.4564, + "step": 25690 + }, + { + "epoch": 503.92, + "learning_rate": 0.0001005372305234303, + "loss": 0.4504, + "step": 25700 + }, + { + "epoch": 504.0, + "eval_loss": 0.4590928256511688, + "eval_runtime": 2.299, + "eval_samples_per_second": 991.307, + "eval_steps_per_second": 3.915, + "step": 25704 + }, + { + "epoch": 504.12, + "learning_rate": 0.00010049912218732971, + "loss": 0.4553, + "step": 25710 + }, + { + "epoch": 504.31, + "learning_rate": 0.00010046100640605413, + "loss": 0.4599, + "step": 25720 + }, + { + "epoch": 504.51, + "learning_rate": 0.0001004228831907325, + "loss": 0.456, + "step": 25730 + }, + { + "epoch": 504.71, + "learning_rate": 0.00010038475255249597, + "loss": 0.4505, + "step": 25740 + }, + { + "epoch": 504.9, + "learning_rate": 0.00010034661450247785, + "loss": 0.459, + "step": 25750 + }, + { + "epoch": 505.0, + "eval_loss": 0.46226629614830017, + "eval_runtime": 2.2506, + "eval_samples_per_second": 1012.612, + "eval_steps_per_second": 3.999, + "step": 25755 + }, + { + "epoch": 505.1, + "learning_rate": 0.00010030846905181356, + "loss": 0.456, + "step": 25760 + }, + { + "epoch": 505.29, + "learning_rate": 0.00010027031621164076, + "loss": 0.4556, + "step": 25770 + }, + { + "epoch": 505.49, + "learning_rate": 0.0001002321559930992, + "loss": 0.4565, + "step": 25780 + }, + { + "epoch": 505.69, + "learning_rate": 0.00010019398840733082, + "loss": 0.4603, + "step": 25790 + }, + { + "epoch": 505.88, + "learning_rate": 0.00010015581346547969, + "loss": 0.4582, + "step": 25800 + }, + { + "epoch": 506.0, + "eval_loss": 0.46172964572906494, + "eval_runtime": 2.1975, + "eval_samples_per_second": 1037.086, + "eval_steps_per_second": 4.096, + "step": 25806 + }, + { + "epoch": 506.08, + "learning_rate": 0.00010011763117869207, + "loss": 0.4576, + "step": 25810 + }, + { + "epoch": 506.27, + "learning_rate": 0.00010007944155811633, + "loss": 0.4531, + "step": 25820 + }, + { + "epoch": 506.47, + "learning_rate": 0.00010004124461490297, + "loss": 0.4574, + "step": 25830 + }, + { + "epoch": 506.67, + "learning_rate": 0.00010000304036020466, + "loss": 0.456, + "step": 25840 + }, + { + "epoch": 506.86, + "learning_rate": 9.996482880517619e-05, + "loss": 0.4532, + "step": 25850 + }, + { + "epoch": 507.0, + "eval_loss": 0.4579889178276062, + "eval_runtime": 2.2839, + "eval_samples_per_second": 997.87, + "eval_steps_per_second": 3.941, + "step": 25857 + }, + { + "epoch": 507.06, + "learning_rate": 9.992660996097446e-05, + "loss": 0.4539, + "step": 25860 + }, + { + "epoch": 507.25, + "learning_rate": 9.988838383875856e-05, + "loss": 0.4575, + "step": 25870 + }, + { + "epoch": 507.45, + "learning_rate": 9.985015044968964e-05, + "loss": 0.4508, + "step": 25880 + }, + { + "epoch": 507.65, + "learning_rate": 9.981190980493099e-05, + "loss": 0.4547, + "step": 25890 + }, + { + "epoch": 507.84, + "learning_rate": 9.977366191564806e-05, + "loss": 0.4555, + "step": 25900 + }, + { + "epoch": 508.0, + "eval_loss": 0.46151694655418396, + "eval_runtime": 2.2996, + "eval_samples_per_second": 991.055, + "eval_steps_per_second": 3.914, + "step": 25908 + }, + { + "epoch": 508.04, + "learning_rate": 9.973540679300834e-05, + "loss": 0.4548, + "step": 25910 + }, + { + "epoch": 508.24, + "learning_rate": 9.96971444481815e-05, + "loss": 0.4524, + "step": 25920 + }, + { + "epoch": 508.43, + "learning_rate": 9.965887489233927e-05, + "loss": 0.4586, + "step": 25930 + }, + { + "epoch": 508.63, + "learning_rate": 9.962059813665552e-05, + "loss": 0.4573, + "step": 25940 + }, + { + "epoch": 508.82, + "learning_rate": 9.95823141923062e-05, + "loss": 0.4571, + "step": 25950 + }, + { + "epoch": 509.0, + "eval_loss": 0.4616622030735016, + "eval_runtime": 2.1945, + "eval_samples_per_second": 1038.515, + "eval_steps_per_second": 4.101, + "step": 25959 + }, + { + "epoch": 509.02, + "learning_rate": 9.954402307046938e-05, + "loss": 0.4542, + "step": 25960 + }, + { + "epoch": 509.22, + "learning_rate": 9.950572478232521e-05, + "loss": 0.4546, + "step": 25970 + }, + { + "epoch": 509.41, + "learning_rate": 9.946741933905595e-05, + "loss": 0.4548, + "step": 25980 + }, + { + "epoch": 509.61, + "learning_rate": 9.942910675184589e-05, + "loss": 0.46, + "step": 25990 + }, + { + "epoch": 509.8, + "learning_rate": 9.93907870318815e-05, + "loss": 0.4555, + "step": 26000 + }, + { + "epoch": 510.0, + "learning_rate": 9.935246019035126e-05, + "loss": 0.4561, + "step": 26010 + }, + { + "epoch": 510.0, + "eval_loss": 0.4578864276409149, + "eval_runtime": 2.2028, + "eval_samples_per_second": 1034.57, + "eval_steps_per_second": 4.086, + "step": 26010 + }, + { + "epoch": 510.2, + "learning_rate": 9.931412623844574e-05, + "loss": 0.4559, + "step": 26020 + }, + { + "epoch": 510.39, + "learning_rate": 9.927578518735765e-05, + "loss": 0.4575, + "step": 26030 + }, + { + "epoch": 510.59, + "learning_rate": 9.923743704828166e-05, + "loss": 0.4473, + "step": 26040 + }, + { + "epoch": 510.78, + "learning_rate": 9.919908183241461e-05, + "loss": 0.4509, + "step": 26050 + }, + { + "epoch": 510.98, + "learning_rate": 9.916071955095537e-05, + "loss": 0.4541, + "step": 26060 + }, + { + "epoch": 511.0, + "eval_loss": 0.46014508605003357, + "eval_runtime": 2.2714, + "eval_samples_per_second": 1003.329, + "eval_steps_per_second": 3.962, + "step": 26061 + }, + { + "epoch": 511.18, + "learning_rate": 9.912235021510483e-05, + "loss": 0.4526, + "step": 26070 + }, + { + "epoch": 511.37, + "learning_rate": 9.908397383606601e-05, + "loss": 0.4553, + "step": 26080 + }, + { + "epoch": 511.57, + "learning_rate": 9.904559042504398e-05, + "loss": 0.455, + "step": 26090 + }, + { + "epoch": 511.76, + "learning_rate": 9.900719999324578e-05, + "loss": 0.4538, + "step": 26100 + }, + { + "epoch": 511.96, + "learning_rate": 9.896880255188064e-05, + "loss": 0.4534, + "step": 26110 + }, + { + "epoch": 512.0, + "eval_loss": 0.4626559019088745, + "eval_runtime": 2.1826, + "eval_samples_per_second": 1044.191, + "eval_steps_per_second": 4.124, + "step": 26112 + }, + { + "epoch": 512.16, + "learning_rate": 9.893039811215967e-05, + "loss": 0.4576, + "step": 26120 + }, + { + "epoch": 512.35, + "learning_rate": 9.889198668529617e-05, + "loss": 0.4529, + "step": 26130 + }, + { + "epoch": 512.55, + "learning_rate": 9.88535682825054e-05, + "loss": 0.457, + "step": 26140 + }, + { + "epoch": 512.75, + "learning_rate": 9.881514291500467e-05, + "loss": 0.4545, + "step": 26150 + }, + { + "epoch": 512.94, + "learning_rate": 9.877671059401334e-05, + "loss": 0.4569, + "step": 26160 + }, + { + "epoch": 513.0, + "eval_loss": 0.46150699257850647, + "eval_runtime": 2.2927, + "eval_samples_per_second": 994.031, + "eval_steps_per_second": 3.926, + "step": 26163 + }, + { + "epoch": 513.14, + "learning_rate": 9.873827133075279e-05, + "loss": 0.4467, + "step": 26170 + }, + { + "epoch": 513.33, + "learning_rate": 9.869982513644645e-05, + "loss": 0.4549, + "step": 26180 + }, + { + "epoch": 513.53, + "learning_rate": 9.866137202231968e-05, + "loss": 0.4542, + "step": 26190 + }, + { + "epoch": 513.73, + "learning_rate": 9.86229119996e-05, + "loss": 0.4548, + "step": 26200 + }, + { + "epoch": 513.92, + "learning_rate": 9.858444507951688e-05, + "loss": 0.4583, + "step": 26210 + }, + { + "epoch": 514.0, + "eval_loss": 0.45271191000938416, + "eval_runtime": 2.2058, + "eval_samples_per_second": 1033.196, + "eval_steps_per_second": 4.08, + "step": 26214 + }, + { + "epoch": 514.12, + "learning_rate": 9.854597127330176e-05, + "loss": 0.4542, + "step": 26220 + }, + { + "epoch": 514.31, + "learning_rate": 9.850749059218815e-05, + "loss": 0.4526, + "step": 26230 + }, + { + "epoch": 514.51, + "learning_rate": 9.846900304741158e-05, + "loss": 0.4558, + "step": 26240 + }, + { + "epoch": 514.71, + "learning_rate": 9.84305086502095e-05, + "loss": 0.4566, + "step": 26250 + }, + { + "epoch": 514.9, + "learning_rate": 9.839200741182147e-05, + "loss": 0.4498, + "step": 26260 + }, + { + "epoch": 515.0, + "eval_loss": 0.45869016647338867, + "eval_runtime": 2.3386, + "eval_samples_per_second": 974.499, + "eval_steps_per_second": 3.848, + "step": 26265 + }, + { + "epoch": 515.1, + "learning_rate": 9.835349934348896e-05, + "loss": 0.4524, + "step": 26270 + }, + { + "epoch": 515.29, + "learning_rate": 9.831498445645545e-05, + "loss": 0.4531, + "step": 26280 + }, + { + "epoch": 515.49, + "learning_rate": 9.827646276196647e-05, + "loss": 0.4572, + "step": 26290 + }, + { + "epoch": 515.69, + "learning_rate": 9.82379342712695e-05, + "loss": 0.4539, + "step": 26300 + }, + { + "epoch": 515.88, + "learning_rate": 9.819939899561396e-05, + "loss": 0.4511, + "step": 26310 + }, + { + "epoch": 516.0, + "eval_loss": 0.45518627762794495, + "eval_runtime": 2.1521, + "eval_samples_per_second": 1058.985, + "eval_steps_per_second": 4.182, + "step": 26316 + }, + { + "epoch": 516.08, + "learning_rate": 9.816085694625133e-05, + "loss": 0.4531, + "step": 26320 + }, + { + "epoch": 516.27, + "learning_rate": 9.812230813443498e-05, + "loss": 0.4527, + "step": 26330 + }, + { + "epoch": 516.47, + "learning_rate": 9.808375257142035e-05, + "loss": 0.4538, + "step": 26340 + }, + { + "epoch": 516.67, + "learning_rate": 9.804519026846476e-05, + "loss": 0.4529, + "step": 26350 + }, + { + "epoch": 516.86, + "learning_rate": 9.800662123682759e-05, + "loss": 0.4535, + "step": 26360 + }, + { + "epoch": 517.0, + "eval_loss": 0.457948237657547, + "eval_runtime": 2.2407, + "eval_samples_per_second": 1017.082, + "eval_steps_per_second": 4.017, + "step": 26367 + }, + { + "epoch": 517.06, + "learning_rate": 9.796804548777013e-05, + "loss": 0.456, + "step": 26370 + }, + { + "epoch": 517.25, + "learning_rate": 9.79294630325556e-05, + "loss": 0.4534, + "step": 26380 + }, + { + "epoch": 517.45, + "learning_rate": 9.789087388244927e-05, + "loss": 0.4549, + "step": 26390 + }, + { + "epoch": 517.65, + "learning_rate": 9.785227804871827e-05, + "loss": 0.4502, + "step": 26400 + }, + { + "epoch": 517.84, + "learning_rate": 9.781367554263172e-05, + "loss": 0.4551, + "step": 26410 + }, + { + "epoch": 518.0, + "eval_loss": 0.4542873203754425, + "eval_runtime": 2.2808, + "eval_samples_per_second": 999.213, + "eval_steps_per_second": 3.946, + "step": 26418 + }, + { + "epoch": 518.04, + "learning_rate": 9.777506637546072e-05, + "loss": 0.4515, + "step": 26420 + }, + { + "epoch": 518.24, + "learning_rate": 9.773645055847825e-05, + "loss": 0.4536, + "step": 26430 + }, + { + "epoch": 518.43, + "learning_rate": 9.76978281029593e-05, + "loss": 0.4534, + "step": 26440 + }, + { + "epoch": 518.63, + "learning_rate": 9.76591990201808e-05, + "loss": 0.4444, + "step": 26450 + }, + { + "epoch": 518.82, + "learning_rate": 9.762056332142147e-05, + "loss": 0.4581, + "step": 26460 + }, + { + "epoch": 519.0, + "eval_loss": 0.4596610963344574, + "eval_runtime": 2.2928, + "eval_samples_per_second": 993.96, + "eval_steps_per_second": 3.925, + "step": 26469 + }, + { + "epoch": 519.02, + "learning_rate": 9.758192101796217e-05, + "loss": 0.454, + "step": 26470 + }, + { + "epoch": 519.22, + "learning_rate": 9.754327212108556e-05, + "loss": 0.4528, + "step": 26480 + }, + { + "epoch": 519.41, + "learning_rate": 9.750461664207622e-05, + "loss": 0.4552, + "step": 26490 + }, + { + "epoch": 519.61, + "learning_rate": 9.746595459222076e-05, + "loss": 0.4541, + "step": 26500 + }, + { + "epoch": 519.8, + "learning_rate": 9.742728598280759e-05, + "loss": 0.4487, + "step": 26510 + }, + { + "epoch": 520.0, + "learning_rate": 9.738861082512709e-05, + "loss": 0.4573, + "step": 26520 + }, + { + "epoch": 520.0, + "eval_loss": 0.45396384596824646, + "eval_runtime": 2.2352, + "eval_samples_per_second": 1019.61, + "eval_steps_per_second": 4.027, + "step": 26520 + }, + { + "epoch": 520.2, + "learning_rate": 9.734992913047155e-05, + "loss": 0.4491, + "step": 26530 + }, + { + "epoch": 520.39, + "learning_rate": 9.731124091013513e-05, + "loss": 0.452, + "step": 26540 + }, + { + "epoch": 520.59, + "learning_rate": 9.727254617541398e-05, + "loss": 0.4491, + "step": 26550 + }, + { + "epoch": 520.78, + "learning_rate": 9.723384493760606e-05, + "loss": 0.4543, + "step": 26560 + }, + { + "epoch": 520.98, + "learning_rate": 9.719513720801126e-05, + "loss": 0.4495, + "step": 26570 + }, + { + "epoch": 521.0, + "eval_loss": 0.4577941596508026, + "eval_runtime": 2.2275, + "eval_samples_per_second": 1023.139, + "eval_steps_per_second": 4.04, + "step": 26571 + }, + { + "epoch": 521.18, + "learning_rate": 9.715642299793144e-05, + "loss": 0.4459, + "step": 26580 + }, + { + "epoch": 521.37, + "learning_rate": 9.711770231867022e-05, + "loss": 0.4513, + "step": 26590 + }, + { + "epoch": 521.57, + "learning_rate": 9.70789751815332e-05, + "loss": 0.4512, + "step": 26600 + }, + { + "epoch": 521.76, + "learning_rate": 9.704024159782782e-05, + "loss": 0.4571, + "step": 26610 + }, + { + "epoch": 521.96, + "learning_rate": 9.700150157886345e-05, + "loss": 0.4532, + "step": 26620 + }, + { + "epoch": 522.0, + "eval_loss": 0.4605408012866974, + "eval_runtime": 2.251, + "eval_samples_per_second": 1012.42, + "eval_steps_per_second": 3.998, + "step": 26622 + }, + { + "epoch": 522.16, + "learning_rate": 9.69627551359513e-05, + "loss": 0.455, + "step": 26630 + }, + { + "epoch": 522.35, + "learning_rate": 9.692400228040447e-05, + "loss": 0.4555, + "step": 26640 + }, + { + "epoch": 522.55, + "learning_rate": 9.688524302353792e-05, + "loss": 0.4535, + "step": 26650 + }, + { + "epoch": 522.75, + "learning_rate": 9.68464773766685e-05, + "loss": 0.4543, + "step": 26660 + }, + { + "epoch": 522.94, + "learning_rate": 9.68077053511149e-05, + "loss": 0.4474, + "step": 26670 + }, + { + "epoch": 523.0, + "eval_loss": 0.45791128277778625, + "eval_runtime": 2.139, + "eval_samples_per_second": 1065.459, + "eval_steps_per_second": 4.208, + "step": 26673 + }, + { + "epoch": 523.14, + "learning_rate": 9.67689269581977e-05, + "loss": 0.454, + "step": 26680 + }, + { + "epoch": 523.33, + "learning_rate": 9.673014220923934e-05, + "loss": 0.4547, + "step": 26690 + }, + { + "epoch": 523.53, + "learning_rate": 9.669135111556406e-05, + "loss": 0.4555, + "step": 26700 + }, + { + "epoch": 523.73, + "learning_rate": 9.665255368849804e-05, + "loss": 0.4555, + "step": 26710 + }, + { + "epoch": 523.92, + "learning_rate": 9.661374993936924e-05, + "loss": 0.4504, + "step": 26720 + }, + { + "epoch": 524.0, + "eval_loss": 0.456340491771698, + "eval_runtime": 2.2323, + "eval_samples_per_second": 1020.94, + "eval_steps_per_second": 4.032, + "step": 26724 + }, + { + "epoch": 524.12, + "learning_rate": 9.657493987950747e-05, + "loss": 0.4502, + "step": 26730 + }, + { + "epoch": 524.31, + "learning_rate": 9.653612352024446e-05, + "loss": 0.4512, + "step": 26740 + }, + { + "epoch": 524.51, + "learning_rate": 9.649730087291364e-05, + "loss": 0.4583, + "step": 26750 + }, + { + "epoch": 524.71, + "learning_rate": 9.645847194885042e-05, + "loss": 0.4529, + "step": 26760 + }, + { + "epoch": 524.9, + "learning_rate": 9.641963675939197e-05, + "loss": 0.4529, + "step": 26770 + }, + { + "epoch": 525.0, + "eval_loss": 0.45831215381622314, + "eval_runtime": 2.2948, + "eval_samples_per_second": 993.117, + "eval_steps_per_second": 3.922, + "step": 26775 + }, + { + "epoch": 525.1, + "learning_rate": 9.638079531587728e-05, + "loss": 0.4495, + "step": 26780 + }, + { + "epoch": 525.29, + "learning_rate": 9.63419476296472e-05, + "loss": 0.4527, + "step": 26790 + }, + { + "epoch": 525.49, + "learning_rate": 9.63030937120444e-05, + "loss": 0.4501, + "step": 26800 + }, + { + "epoch": 525.69, + "learning_rate": 9.626423357441331e-05, + "loss": 0.4495, + "step": 26810 + }, + { + "epoch": 525.88, + "learning_rate": 9.622536722810026e-05, + "loss": 0.4475, + "step": 26820 + }, + { + "epoch": 526.0, + "eval_loss": 0.4616130292415619, + "eval_runtime": 2.2822, + "eval_samples_per_second": 998.609, + "eval_steps_per_second": 3.944, + "step": 26826 + }, + { + "epoch": 526.08, + "learning_rate": 9.618649468445336e-05, + "loss": 0.4521, + "step": 26830 + }, + { + "epoch": 526.27, + "learning_rate": 9.614761595482252e-05, + "loss": 0.4569, + "step": 26840 + }, + { + "epoch": 526.47, + "learning_rate": 9.610873105055945e-05, + "loss": 0.458, + "step": 26850 + }, + { + "epoch": 526.67, + "learning_rate": 9.60698399830177e-05, + "loss": 0.4507, + "step": 26860 + }, + { + "epoch": 526.86, + "learning_rate": 9.603094276355257e-05, + "loss": 0.4457, + "step": 26870 + }, + { + "epoch": 527.0, + "eval_loss": 0.4558161199092865, + "eval_runtime": 2.317, + "eval_samples_per_second": 983.601, + "eval_steps_per_second": 3.884, + "step": 26877 + }, + { + "epoch": 527.06, + "learning_rate": 9.599203940352118e-05, + "loss": 0.451, + "step": 26880 + }, + { + "epoch": 527.25, + "learning_rate": 9.595312991428245e-05, + "loss": 0.4454, + "step": 26890 + }, + { + "epoch": 527.45, + "learning_rate": 9.59142143071971e-05, + "loss": 0.4559, + "step": 26900 + }, + { + "epoch": 527.65, + "learning_rate": 9.587529259362759e-05, + "loss": 0.4518, + "step": 26910 + }, + { + "epoch": 527.84, + "learning_rate": 9.583636478493823e-05, + "loss": 0.4532, + "step": 26920 + }, + { + "epoch": 528.0, + "eval_loss": 0.45839163661003113, + "eval_runtime": 2.2859, + "eval_samples_per_second": 996.975, + "eval_steps_per_second": 3.937, + "step": 26928 + }, + { + "epoch": 528.04, + "learning_rate": 9.579743089249503e-05, + "loss": 0.4491, + "step": 26930 + }, + { + "epoch": 528.24, + "learning_rate": 9.575849092766584e-05, + "loss": 0.4496, + "step": 26940 + }, + { + "epoch": 528.43, + "learning_rate": 9.571954490182026e-05, + "loss": 0.4544, + "step": 26950 + }, + { + "epoch": 528.63, + "learning_rate": 9.568059282632964e-05, + "loss": 0.4531, + "step": 26960 + }, + { + "epoch": 528.82, + "learning_rate": 9.564163471256716e-05, + "loss": 0.4566, + "step": 26970 + }, + { + "epoch": 529.0, + "eval_loss": 0.4572843313217163, + "eval_runtime": 2.3239, + "eval_samples_per_second": 980.668, + "eval_steps_per_second": 3.873, + "step": 26979 + }, + { + "epoch": 529.02, + "learning_rate": 9.56026705719077e-05, + "loss": 0.4539, + "step": 26980 + }, + { + "epoch": 529.22, + "learning_rate": 9.55637004157279e-05, + "loss": 0.46, + "step": 26990 + }, + { + "epoch": 529.41, + "learning_rate": 9.552472425540622e-05, + "loss": 0.4526, + "step": 27000 + }, + { + "epoch": 529.61, + "learning_rate": 9.548574210232277e-05, + "loss": 0.454, + "step": 27010 + }, + { + "epoch": 529.8, + "learning_rate": 9.544675396785952e-05, + "loss": 0.4532, + "step": 27020 + }, + { + "epoch": 530.0, + "learning_rate": 9.540775986340012e-05, + "loss": 0.4546, + "step": 27030 + }, + { + "epoch": 530.0, + "eval_loss": 0.4563215970993042, + "eval_runtime": 2.1683, + "eval_samples_per_second": 1051.064, + "eval_steps_per_second": 4.151, + "step": 27030 + }, + { + "epoch": 530.2, + "learning_rate": 9.536875980032996e-05, + "loss": 0.4493, + "step": 27040 + }, + { + "epoch": 530.39, + "learning_rate": 9.532975379003623e-05, + "loss": 0.4495, + "step": 27050 + }, + { + "epoch": 530.59, + "learning_rate": 9.529074184390779e-05, + "loss": 0.4491, + "step": 27060 + }, + { + "epoch": 530.78, + "learning_rate": 9.525172397333525e-05, + "loss": 0.4553, + "step": 27070 + }, + { + "epoch": 530.98, + "learning_rate": 9.521270018971095e-05, + "loss": 0.4479, + "step": 27080 + }, + { + "epoch": 531.0, + "eval_loss": 0.46282848715782166, + "eval_runtime": 2.3449, + "eval_samples_per_second": 971.905, + "eval_steps_per_second": 3.838, + "step": 27081 + }, + { + "epoch": 531.18, + "learning_rate": 9.5173670504429e-05, + "loss": 0.4551, + "step": 27090 + }, + { + "epoch": 531.37, + "learning_rate": 9.513463492888519e-05, + "loss": 0.4503, + "step": 27100 + }, + { + "epoch": 531.57, + "learning_rate": 9.509559347447701e-05, + "loss": 0.4468, + "step": 27110 + }, + { + "epoch": 531.76, + "learning_rate": 9.50565461526037e-05, + "loss": 0.4551, + "step": 27120 + }, + { + "epoch": 531.96, + "learning_rate": 9.501749297466626e-05, + "loss": 0.4485, + "step": 27130 + }, + { + "epoch": 532.0, + "eval_loss": 0.45467355847358704, + "eval_runtime": 2.2383, + "eval_samples_per_second": 1018.188, + "eval_steps_per_second": 4.021, + "step": 27132 + }, + { + "epoch": 532.16, + "learning_rate": 9.497843395206726e-05, + "loss": 0.4511, + "step": 27140 + }, + { + "epoch": 532.35, + "learning_rate": 9.493936909621113e-05, + "loss": 0.4563, + "step": 27150 + }, + { + "epoch": 532.55, + "learning_rate": 9.49002984185039e-05, + "loss": 0.4486, + "step": 27160 + }, + { + "epoch": 532.75, + "learning_rate": 9.486122193035337e-05, + "loss": 0.4467, + "step": 27170 + }, + { + "epoch": 532.94, + "learning_rate": 9.482213964316898e-05, + "loss": 0.4491, + "step": 27180 + }, + { + "epoch": 533.0, + "eval_loss": 0.45390358567237854, + "eval_runtime": 2.2377, + "eval_samples_per_second": 1018.434, + "eval_steps_per_second": 4.022, + "step": 27183 + }, + { + "epoch": 533.14, + "learning_rate": 9.478305156836188e-05, + "loss": 0.4484, + "step": 27190 + }, + { + "epoch": 533.33, + "learning_rate": 9.474395771734493e-05, + "loss": 0.45, + "step": 27200 + }, + { + "epoch": 533.53, + "learning_rate": 9.470485810153268e-05, + "loss": 0.4471, + "step": 27210 + }, + { + "epoch": 533.73, + "learning_rate": 9.46657527323413e-05, + "loss": 0.449, + "step": 27220 + }, + { + "epoch": 533.92, + "learning_rate": 9.462664162118871e-05, + "loss": 0.4522, + "step": 27230 + }, + { + "epoch": 534.0, + "eval_loss": 0.4536179304122925, + "eval_runtime": 2.288, + "eval_samples_per_second": 996.082, + "eval_steps_per_second": 3.934, + "step": 27234 + }, + { + "epoch": 534.12, + "learning_rate": 9.458752477949451e-05, + "loss": 0.4502, + "step": 27240 + }, + { + "epoch": 534.31, + "learning_rate": 9.454840221867989e-05, + "loss": 0.449, + "step": 27250 + }, + { + "epoch": 534.51, + "learning_rate": 9.450927395016781e-05, + "loss": 0.4502, + "step": 27260 + }, + { + "epoch": 534.71, + "learning_rate": 9.447013998538283e-05, + "loss": 0.4442, + "step": 27270 + }, + { + "epoch": 534.9, + "learning_rate": 9.44310003357512e-05, + "loss": 0.4477, + "step": 27280 + }, + { + "epoch": 535.0, + "eval_loss": 0.45614269375801086, + "eval_runtime": 2.3434, + "eval_samples_per_second": 972.526, + "eval_steps_per_second": 3.841, + "step": 27285 + }, + { + "epoch": 535.1, + "learning_rate": 9.439185501270083e-05, + "loss": 0.4535, + "step": 27290 + }, + { + "epoch": 535.29, + "learning_rate": 9.435270402766128e-05, + "loss": 0.4503, + "step": 27300 + }, + { + "epoch": 535.49, + "learning_rate": 9.431354739206374e-05, + "loss": 0.4503, + "step": 27310 + }, + { + "epoch": 535.69, + "learning_rate": 9.42743851173411e-05, + "loss": 0.4542, + "step": 27320 + }, + { + "epoch": 535.88, + "learning_rate": 9.423521721492788e-05, + "loss": 0.45, + "step": 27330 + }, + { + "epoch": 536.0, + "eval_loss": 0.4529837667942047, + "eval_runtime": 2.2382, + "eval_samples_per_second": 1018.237, + "eval_steps_per_second": 4.021, + "step": 27336 + }, + { + "epoch": 536.08, + "learning_rate": 9.419604369626024e-05, + "loss": 0.4511, + "step": 27340 + }, + { + "epoch": 536.27, + "learning_rate": 9.415686457277593e-05, + "loss": 0.4497, + "step": 27350 + }, + { + "epoch": 536.47, + "learning_rate": 9.41176798559144e-05, + "loss": 0.4475, + "step": 27360 + }, + { + "epoch": 536.67, + "learning_rate": 9.407848955711672e-05, + "loss": 0.445, + "step": 27370 + }, + { + "epoch": 536.86, + "learning_rate": 9.403929368782558e-05, + "loss": 0.4522, + "step": 27380 + }, + { + "epoch": 537.0, + "eval_loss": 0.45247963070869446, + "eval_runtime": 2.2916, + "eval_samples_per_second": 994.507, + "eval_steps_per_second": 3.927, + "step": 27387 + }, + { + "epoch": 537.06, + "learning_rate": 9.40000922594853e-05, + "loss": 0.4471, + "step": 27390 + }, + { + "epoch": 537.25, + "learning_rate": 9.396088528354184e-05, + "loss": 0.4472, + "step": 27400 + }, + { + "epoch": 537.45, + "learning_rate": 9.392167277144273e-05, + "loss": 0.4466, + "step": 27410 + }, + { + "epoch": 537.65, + "learning_rate": 9.388245473463717e-05, + "loss": 0.4453, + "step": 27420 + }, + { + "epoch": 537.84, + "learning_rate": 9.384323118457593e-05, + "loss": 0.4475, + "step": 27430 + }, + { + "epoch": 538.0, + "eval_loss": 0.4553549885749817, + "eval_runtime": 2.1572, + "eval_samples_per_second": 1056.468, + "eval_steps_per_second": 4.172, + "step": 27438 + }, + { + "epoch": 538.04, + "learning_rate": 9.380400213271146e-05, + "loss": 0.4426, + "step": 27440 + }, + { + "epoch": 538.24, + "learning_rate": 9.376476759049773e-05, + "loss": 0.4476, + "step": 27450 + }, + { + "epoch": 538.43, + "learning_rate": 9.372552756939033e-05, + "loss": 0.447, + "step": 27460 + }, + { + "epoch": 538.63, + "learning_rate": 9.368628208084654e-05, + "loss": 0.4471, + "step": 27470 + }, + { + "epoch": 538.82, + "learning_rate": 9.36470311363251e-05, + "loss": 0.4475, + "step": 27480 + }, + { + "epoch": 539.0, + "eval_loss": 0.44862520694732666, + "eval_runtime": 2.2485, + "eval_samples_per_second": 1013.542, + "eval_steps_per_second": 4.003, + "step": 27489 + }, + { + "epoch": 539.02, + "learning_rate": 9.360777474728644e-05, + "loss": 0.446, + "step": 27490 + }, + { + "epoch": 539.22, + "learning_rate": 9.356851292519255e-05, + "loss": 0.4503, + "step": 27500 + }, + { + "epoch": 539.41, + "learning_rate": 9.3529245681507e-05, + "loss": 0.4517, + "step": 27510 + }, + { + "epoch": 539.61, + "learning_rate": 9.348997302769497e-05, + "loss": 0.4513, + "step": 27520 + }, + { + "epoch": 539.8, + "learning_rate": 9.345069497522318e-05, + "loss": 0.445, + "step": 27530 + }, + { + "epoch": 540.0, + "learning_rate": 9.341141153555993e-05, + "loss": 0.4512, + "step": 27540 + }, + { + "epoch": 540.0, + "eval_loss": 0.45837074518203735, + "eval_runtime": 2.1784, + "eval_samples_per_second": 1046.161, + "eval_steps_per_second": 4.131, + "step": 27540 + }, + { + "epoch": 540.2, + "learning_rate": 9.337212272017514e-05, + "loss": 0.4532, + "step": 27550 + }, + { + "epoch": 540.39, + "learning_rate": 9.333282854054025e-05, + "loss": 0.454, + "step": 27560 + }, + { + "epoch": 540.59, + "learning_rate": 9.329352900812828e-05, + "loss": 0.4488, + "step": 27570 + }, + { + "epoch": 540.78, + "learning_rate": 9.325422413441384e-05, + "loss": 0.4521, + "step": 27580 + }, + { + "epoch": 540.98, + "learning_rate": 9.321491393087304e-05, + "loss": 0.445, + "step": 27590 + }, + { + "epoch": 541.0, + "eval_loss": 0.4543311297893524, + "eval_runtime": 2.2001, + "eval_samples_per_second": 1035.874, + "eval_steps_per_second": 4.091, + "step": 27591 + }, + { + "epoch": 541.18, + "learning_rate": 9.317559840898362e-05, + "loss": 0.4519, + "step": 27600 + }, + { + "epoch": 541.37, + "learning_rate": 9.313627758022483e-05, + "loss": 0.4481, + "step": 27610 + }, + { + "epoch": 541.57, + "learning_rate": 9.309695145607745e-05, + "loss": 0.4495, + "step": 27620 + }, + { + "epoch": 541.76, + "learning_rate": 9.305762004802384e-05, + "loss": 0.4435, + "step": 27630 + }, + { + "epoch": 541.96, + "learning_rate": 9.30182833675479e-05, + "loss": 0.4478, + "step": 27640 + }, + { + "epoch": 542.0, + "eval_loss": 0.45065122842788696, + "eval_runtime": 2.1992, + "eval_samples_per_second": 1036.271, + "eval_steps_per_second": 4.092, + "step": 27642 + }, + { + "epoch": 542.16, + "learning_rate": 9.297894142613508e-05, + "loss": 0.4459, + "step": 27650 + }, + { + "epoch": 542.35, + "learning_rate": 9.293959423527233e-05, + "loss": 0.4501, + "step": 27660 + }, + { + "epoch": 542.55, + "learning_rate": 9.290024180644814e-05, + "loss": 0.4467, + "step": 27670 + }, + { + "epoch": 542.75, + "learning_rate": 9.286088415115258e-05, + "loss": 0.4475, + "step": 27680 + }, + { + "epoch": 542.94, + "learning_rate": 9.282152128087714e-05, + "loss": 0.4472, + "step": 27690 + }, + { + "epoch": 543.0, + "eval_loss": 0.45196664333343506, + "eval_runtime": 2.1993, + "eval_samples_per_second": 1036.26, + "eval_steps_per_second": 4.092, + "step": 27693 + }, + { + "epoch": 543.14, + "learning_rate": 9.278215320711498e-05, + "loss": 0.4518, + "step": 27700 + }, + { + "epoch": 543.33, + "learning_rate": 9.274277994136068e-05, + "loss": 0.4531, + "step": 27710 + }, + { + "epoch": 543.53, + "learning_rate": 9.270340149511031e-05, + "loss": 0.4465, + "step": 27720 + }, + { + "epoch": 543.73, + "learning_rate": 9.266401787986152e-05, + "loss": 0.4475, + "step": 27730 + }, + { + "epoch": 543.92, + "learning_rate": 9.262462910711349e-05, + "loss": 0.448, + "step": 27740 + }, + { + "epoch": 544.0, + "eval_loss": 0.450714111328125, + "eval_runtime": 2.2524, + "eval_samples_per_second": 1011.789, + "eval_steps_per_second": 3.996, + "step": 27744 + }, + { + "epoch": 544.12, + "learning_rate": 9.25852351883668e-05, + "loss": 0.4441, + "step": 27750 + }, + { + "epoch": 544.31, + "learning_rate": 9.254583613512365e-05, + "loss": 0.4489, + "step": 27760 + }, + { + "epoch": 544.51, + "learning_rate": 9.250643195888763e-05, + "loss": 0.4485, + "step": 27770 + }, + { + "epoch": 544.71, + "learning_rate": 9.24670226711639e-05, + "loss": 0.4464, + "step": 27780 + }, + { + "epoch": 544.9, + "learning_rate": 9.242760828345914e-05, + "loss": 0.4447, + "step": 27790 + }, + { + "epoch": 545.0, + "eval_loss": 0.4513770639896393, + "eval_runtime": 2.2074, + "eval_samples_per_second": 1032.432, + "eval_steps_per_second": 4.077, + "step": 27795 + }, + { + "epoch": 545.1, + "learning_rate": 9.238818880728141e-05, + "loss": 0.4453, + "step": 27800 + }, + { + "epoch": 545.29, + "learning_rate": 9.234876425414038e-05, + "loss": 0.4424, + "step": 27810 + }, + { + "epoch": 545.49, + "learning_rate": 9.230933463554707e-05, + "loss": 0.4513, + "step": 27820 + }, + { + "epoch": 545.69, + "learning_rate": 9.226989996301406e-05, + "loss": 0.4442, + "step": 27830 + }, + { + "epoch": 545.88, + "learning_rate": 9.223046024805545e-05, + "loss": 0.4485, + "step": 27840 + }, + { + "epoch": 546.0, + "eval_loss": 0.45527341961860657, + "eval_runtime": 2.2727, + "eval_samples_per_second": 1002.78, + "eval_steps_per_second": 3.96, + "step": 27846 + }, + { + "epoch": 546.08, + "learning_rate": 9.21910155021867e-05, + "loss": 0.4453, + "step": 27850 + }, + { + "epoch": 546.27, + "learning_rate": 9.215156573692484e-05, + "loss": 0.4465, + "step": 27860 + }, + { + "epoch": 546.47, + "learning_rate": 9.211211096378832e-05, + "loss": 0.4485, + "step": 27870 + }, + { + "epoch": 546.67, + "learning_rate": 9.207265119429701e-05, + "loss": 0.447, + "step": 27880 + }, + { + "epoch": 546.86, + "learning_rate": 9.203318643997231e-05, + "loss": 0.4482, + "step": 27890 + }, + { + "epoch": 547.0, + "eval_loss": 0.45322051644325256, + "eval_runtime": 2.2565, + "eval_samples_per_second": 1009.989, + "eval_steps_per_second": 3.989, + "step": 27897 + }, + { + "epoch": 547.06, + "learning_rate": 9.199371671233703e-05, + "loss": 0.4495, + "step": 27900 + }, + { + "epoch": 547.25, + "learning_rate": 9.19542420229155e-05, + "loss": 0.4469, + "step": 27910 + }, + { + "epoch": 547.45, + "learning_rate": 9.19147623832334e-05, + "loss": 0.4501, + "step": 27920 + }, + { + "epoch": 547.65, + "learning_rate": 9.18752778048179e-05, + "loss": 0.4452, + "step": 27930 + }, + { + "epoch": 547.84, + "learning_rate": 9.183578829919766e-05, + "loss": 0.4448, + "step": 27940 + }, + { + "epoch": 548.0, + "eval_loss": 0.45331457257270813, + "eval_runtime": 2.3182, + "eval_samples_per_second": 983.097, + "eval_steps_per_second": 3.882, + "step": 27948 + }, + { + "epoch": 548.04, + "learning_rate": 9.179629387790273e-05, + "loss": 0.4482, + "step": 27950 + }, + { + "epoch": 548.24, + "learning_rate": 9.175679455246455e-05, + "loss": 0.4493, + "step": 27960 + }, + { + "epoch": 548.43, + "learning_rate": 9.171729033441608e-05, + "loss": 0.4473, + "step": 27970 + }, + { + "epoch": 548.63, + "learning_rate": 9.167778123529166e-05, + "loss": 0.4484, + "step": 27980 + }, + { + "epoch": 548.82, + "learning_rate": 9.163826726662708e-05, + "loss": 0.4467, + "step": 27990 + }, + { + "epoch": 549.0, + "eval_loss": 0.451092928647995, + "eval_runtime": 2.3345, + "eval_samples_per_second": 976.221, + "eval_steps_per_second": 3.855, + "step": 27999 + }, + { + "epoch": 549.02, + "learning_rate": 9.159874843995953e-05, + "loss": 0.441, + "step": 28000 + }, + { + "epoch": 549.22, + "learning_rate": 9.155922476682761e-05, + "loss": 0.4482, + "step": 28010 + }, + { + "epoch": 549.41, + "learning_rate": 9.151969625877138e-05, + "loss": 0.4456, + "step": 28020 + }, + { + "epoch": 549.61, + "learning_rate": 9.148016292733227e-05, + "loss": 0.4459, + "step": 28030 + }, + { + "epoch": 549.8, + "learning_rate": 9.144062478405311e-05, + "loss": 0.447, + "step": 28040 + }, + { + "epoch": 550.0, + "learning_rate": 9.140108184047819e-05, + "loss": 0.4473, + "step": 28050 + }, + { + "epoch": 550.0, + "eval_loss": 0.4531325399875641, + "eval_runtime": 2.2118, + "eval_samples_per_second": 1030.365, + "eval_steps_per_second": 4.069, + "step": 28050 + }, + { + "epoch": 550.2, + "learning_rate": 9.136153410815314e-05, + "loss": 0.4424, + "step": 28060 + }, + { + "epoch": 550.39, + "learning_rate": 9.132198159862502e-05, + "loss": 0.4469, + "step": 28070 + }, + { + "epoch": 550.59, + "learning_rate": 9.128242432344232e-05, + "loss": 0.4473, + "step": 28080 + }, + { + "epoch": 550.78, + "learning_rate": 9.124286229415483e-05, + "loss": 0.4431, + "step": 28090 + }, + { + "epoch": 550.98, + "learning_rate": 9.12032955223138e-05, + "loss": 0.4423, + "step": 28100 + }, + { + "epoch": 551.0, + "eval_loss": 0.4461700916290283, + "eval_runtime": 2.3161, + "eval_samples_per_second": 983.994, + "eval_steps_per_second": 3.886, + "step": 28101 + }, + { + "epoch": 551.18, + "learning_rate": 9.116372401947184e-05, + "loss": 0.4453, + "step": 28110 + }, + { + "epoch": 551.37, + "learning_rate": 9.112414779718297e-05, + "loss": 0.4445, + "step": 28120 + }, + { + "epoch": 551.57, + "learning_rate": 9.108456686700254e-05, + "loss": 0.4451, + "step": 28130 + }, + { + "epoch": 551.76, + "learning_rate": 9.104498124048729e-05, + "loss": 0.4423, + "step": 28140 + }, + { + "epoch": 551.96, + "learning_rate": 9.100539092919539e-05, + "loss": 0.4473, + "step": 28150 + }, + { + "epoch": 552.0, + "eval_loss": 0.45376914739608765, + "eval_runtime": 2.2299, + "eval_samples_per_second": 1022.003, + "eval_steps_per_second": 4.036, + "step": 28152 + }, + { + "epoch": 552.16, + "learning_rate": 9.096579594468628e-05, + "loss": 0.4518, + "step": 28160 + }, + { + "epoch": 552.35, + "learning_rate": 9.092619629852082e-05, + "loss": 0.4467, + "step": 28170 + }, + { + "epoch": 552.55, + "learning_rate": 9.088659200226123e-05, + "loss": 0.4415, + "step": 28180 + }, + { + "epoch": 552.75, + "learning_rate": 9.084698306747106e-05, + "loss": 0.4454, + "step": 28190 + }, + { + "epoch": 552.94, + "learning_rate": 9.080736950571528e-05, + "loss": 0.4463, + "step": 28200 + }, + { + "epoch": 553.0, + "eval_loss": 0.44718244671821594, + "eval_runtime": 2.2312, + "eval_samples_per_second": 1021.403, + "eval_steps_per_second": 4.034, + "step": 28203 + }, + { + "epoch": 553.14, + "learning_rate": 9.076775132856014e-05, + "loss": 0.4421, + "step": 28210 + }, + { + "epoch": 553.33, + "learning_rate": 9.072812854757326e-05, + "loss": 0.4447, + "step": 28220 + }, + { + "epoch": 553.53, + "learning_rate": 9.068850117432362e-05, + "loss": 0.4417, + "step": 28230 + }, + { + "epoch": 553.73, + "learning_rate": 9.064886922038155e-05, + "loss": 0.4433, + "step": 28240 + }, + { + "epoch": 553.92, + "learning_rate": 9.060923269731863e-05, + "loss": 0.4459, + "step": 28250 + }, + { + "epoch": 554.0, + "eval_loss": 0.44858473539352417, + "eval_runtime": 2.2955, + "eval_samples_per_second": 992.806, + "eval_steps_per_second": 3.921, + "step": 28254 + }, + { + "epoch": 554.12, + "learning_rate": 9.056959161670789e-05, + "loss": 0.4465, + "step": 28260 + }, + { + "epoch": 554.31, + "learning_rate": 9.052994599012364e-05, + "loss": 0.4401, + "step": 28270 + }, + { + "epoch": 554.51, + "learning_rate": 9.049029582914152e-05, + "loss": 0.4439, + "step": 28280 + }, + { + "epoch": 554.71, + "learning_rate": 9.045064114533851e-05, + "loss": 0.4463, + "step": 28290 + }, + { + "epoch": 554.9, + "learning_rate": 9.041098195029282e-05, + "loss": 0.4432, + "step": 28300 + }, + { + "epoch": 555.0, + "eval_loss": 0.4470457136631012, + "eval_runtime": 2.3026, + "eval_samples_per_second": 989.739, + "eval_steps_per_second": 3.909, + "step": 28305 + }, + { + "epoch": 555.1, + "learning_rate": 9.037131825558412e-05, + "loss": 0.4453, + "step": 28310 + }, + { + "epoch": 555.29, + "learning_rate": 9.033165007279328e-05, + "loss": 0.4443, + "step": 28320 + }, + { + "epoch": 555.49, + "learning_rate": 9.029197741350259e-05, + "loss": 0.4401, + "step": 28330 + }, + { + "epoch": 555.69, + "learning_rate": 9.025230028929551e-05, + "loss": 0.4378, + "step": 28340 + }, + { + "epoch": 555.88, + "learning_rate": 9.021261871175689e-05, + "loss": 0.4448, + "step": 28350 + }, + { + "epoch": 556.0, + "eval_loss": 0.45223483443260193, + "eval_runtime": 2.292, + "eval_samples_per_second": 994.322, + "eval_steps_per_second": 3.927, + "step": 28356 + }, + { + "epoch": 556.08, + "learning_rate": 9.017293269247294e-05, + "loss": 0.4481, + "step": 28360 + }, + { + "epoch": 556.27, + "learning_rate": 9.0133242243031e-05, + "loss": 0.448, + "step": 28370 + }, + { + "epoch": 556.47, + "learning_rate": 9.009354737501981e-05, + "loss": 0.4442, + "step": 28380 + }, + { + "epoch": 556.67, + "learning_rate": 9.005384810002943e-05, + "loss": 0.4417, + "step": 28390 + }, + { + "epoch": 556.86, + "learning_rate": 9.001414442965111e-05, + "loss": 0.4406, + "step": 28400 + }, + { + "epoch": 557.0, + "eval_loss": 0.45280978083610535, + "eval_runtime": 2.1811, + "eval_samples_per_second": 1044.869, + "eval_steps_per_second": 4.126, + "step": 28407 + }, + { + "epoch": 557.06, + "learning_rate": 8.997443637547749e-05, + "loss": 0.4386, + "step": 28410 + }, + { + "epoch": 557.25, + "learning_rate": 8.99347239491024e-05, + "loss": 0.4453, + "step": 28420 + }, + { + "epoch": 557.45, + "learning_rate": 8.9895007162121e-05, + "loss": 0.445, + "step": 28430 + }, + { + "epoch": 557.65, + "learning_rate": 8.98552860261297e-05, + "loss": 0.4407, + "step": 28440 + }, + { + "epoch": 557.84, + "learning_rate": 8.981556055272618e-05, + "loss": 0.4433, + "step": 28450 + }, + { + "epoch": 558.0, + "eval_loss": 0.45024630427360535, + "eval_runtime": 2.2208, + "eval_samples_per_second": 1026.227, + "eval_steps_per_second": 4.053, + "step": 28458 + }, + { + "epoch": 558.04, + "learning_rate": 8.97758307535094e-05, + "loss": 0.4461, + "step": 28460 + }, + { + "epoch": 558.24, + "learning_rate": 8.973609664007956e-05, + "loss": 0.4451, + "step": 28470 + }, + { + "epoch": 558.43, + "learning_rate": 8.969635822403816e-05, + "loss": 0.4448, + "step": 28480 + }, + { + "epoch": 558.63, + "learning_rate": 8.965661551698793e-05, + "loss": 0.4462, + "step": 28490 + }, + { + "epoch": 558.82, + "learning_rate": 8.961686853053284e-05, + "loss": 0.4447, + "step": 28500 + }, + { + "epoch": 559.0, + "eval_loss": 0.44705361127853394, + "eval_runtime": 2.3532, + "eval_samples_per_second": 968.453, + "eval_steps_per_second": 3.825, + "step": 28509 + }, + { + "epoch": 559.02, + "learning_rate": 8.95771172762781e-05, + "loss": 0.4436, + "step": 28510 + }, + { + "epoch": 559.22, + "learning_rate": 8.953736176583024e-05, + "loss": 0.4435, + "step": 28520 + }, + { + "epoch": 559.41, + "learning_rate": 8.949760201079695e-05, + "loss": 0.4488, + "step": 28530 + }, + { + "epoch": 559.61, + "learning_rate": 8.945783802278721e-05, + "loss": 0.4413, + "step": 28540 + }, + { + "epoch": 559.8, + "learning_rate": 8.941806981341121e-05, + "loss": 0.4442, + "step": 28550 + }, + { + "epoch": 560.0, + "learning_rate": 8.937829739428038e-05, + "loss": 0.4438, + "step": 28560 + }, + { + "epoch": 560.0, + "eval_loss": 0.44999274611473083, + "eval_runtime": 2.2349, + "eval_samples_per_second": 1019.738, + "eval_steps_per_second": 4.027, + "step": 28560 + }, + { + "epoch": 560.2, + "learning_rate": 8.933852077700738e-05, + "loss": 0.4454, + "step": 28570 + }, + { + "epoch": 560.39, + "learning_rate": 8.929873997320608e-05, + "loss": 0.4419, + "step": 28580 + }, + { + "epoch": 560.59, + "learning_rate": 8.92589549944916e-05, + "loss": 0.4426, + "step": 28590 + }, + { + "epoch": 560.78, + "learning_rate": 8.921916585248027e-05, + "loss": 0.4401, + "step": 28600 + }, + { + "epoch": 560.98, + "learning_rate": 8.917937255878963e-05, + "loss": 0.4433, + "step": 28610 + }, + { + "epoch": 561.0, + "eval_loss": 0.4470755159854889, + "eval_runtime": 2.354, + "eval_samples_per_second": 968.157, + "eval_steps_per_second": 3.823, + "step": 28611 + }, + { + "epoch": 561.18, + "learning_rate": 8.913957512503844e-05, + "loss": 0.4507, + "step": 28620 + }, + { + "epoch": 561.37, + "learning_rate": 8.909977356284665e-05, + "loss": 0.4438, + "step": 28630 + }, + { + "epoch": 561.57, + "learning_rate": 8.905996788383543e-05, + "loss": 0.4419, + "step": 28640 + }, + { + "epoch": 561.76, + "learning_rate": 8.902015809962717e-05, + "loss": 0.4417, + "step": 28650 + }, + { + "epoch": 561.96, + "learning_rate": 8.898034422184542e-05, + "loss": 0.4412, + "step": 28660 + }, + { + "epoch": 562.0, + "eval_loss": 0.44909417629241943, + "eval_runtime": 2.2227, + "eval_samples_per_second": 1025.318, + "eval_steps_per_second": 4.049, + "step": 28662 + }, + { + "epoch": 562.16, + "learning_rate": 8.894052626211494e-05, + "loss": 0.4466, + "step": 28670 + }, + { + "epoch": 562.35, + "learning_rate": 8.890070423206171e-05, + "loss": 0.4483, + "step": 28680 + }, + { + "epoch": 562.55, + "learning_rate": 8.886087814331283e-05, + "loss": 0.4412, + "step": 28690 + }, + { + "epoch": 562.75, + "learning_rate": 8.882104800749671e-05, + "loss": 0.4419, + "step": 28700 + }, + { + "epoch": 562.94, + "learning_rate": 8.878121383624278e-05, + "loss": 0.4357, + "step": 28710 + }, + { + "epoch": 563.0, + "eval_loss": 0.4474387466907501, + "eval_runtime": 2.317, + "eval_samples_per_second": 983.597, + "eval_steps_per_second": 3.884, + "step": 28713 + }, + { + "epoch": 563.14, + "learning_rate": 8.874137564118174e-05, + "loss": 0.4407, + "step": 28720 + }, + { + "epoch": 563.33, + "learning_rate": 8.870153343394552e-05, + "loss": 0.4416, + "step": 28730 + }, + { + "epoch": 563.53, + "learning_rate": 8.866168722616707e-05, + "loss": 0.4397, + "step": 28740 + }, + { + "epoch": 563.73, + "learning_rate": 8.862183702948066e-05, + "loss": 0.4414, + "step": 28750 + }, + { + "epoch": 563.92, + "learning_rate": 8.858198285552164e-05, + "loss": 0.4424, + "step": 28760 + }, + { + "epoch": 564.0, + "eval_loss": 0.44805294275283813, + "eval_runtime": 2.3439, + "eval_samples_per_second": 972.294, + "eval_steps_per_second": 3.84, + "step": 28764 + }, + { + "epoch": 564.12, + "learning_rate": 8.854212471592652e-05, + "loss": 0.4423, + "step": 28770 + }, + { + "epoch": 564.31, + "learning_rate": 8.850226262233302e-05, + "loss": 0.4466, + "step": 28780 + }, + { + "epoch": 564.51, + "learning_rate": 8.846239658637994e-05, + "loss": 0.4365, + "step": 28790 + }, + { + "epoch": 564.71, + "learning_rate": 8.84225266197073e-05, + "loss": 0.4449, + "step": 28800 + }, + { + "epoch": 564.9, + "learning_rate": 8.838265273395625e-05, + "loss": 0.4412, + "step": 28810 + }, + { + "epoch": 565.0, + "eval_loss": 0.4479809105396271, + "eval_runtime": 2.2726, + "eval_samples_per_second": 1002.819, + "eval_steps_per_second": 3.96, + "step": 28815 + }, + { + "epoch": 565.1, + "learning_rate": 8.834277494076904e-05, + "loss": 0.4408, + "step": 28820 + }, + { + "epoch": 565.29, + "learning_rate": 8.830289325178915e-05, + "loss": 0.4453, + "step": 28830 + }, + { + "epoch": 565.49, + "learning_rate": 8.826300767866111e-05, + "loss": 0.4493, + "step": 28840 + }, + { + "epoch": 565.69, + "learning_rate": 8.822311823303061e-05, + "loss": 0.4414, + "step": 28850 + }, + { + "epoch": 565.88, + "learning_rate": 8.818322492654448e-05, + "loss": 0.4483, + "step": 28860 + }, + { + "epoch": 566.0, + "eval_loss": 0.4453369081020355, + "eval_runtime": 2.2664, + "eval_samples_per_second": 1005.562, + "eval_steps_per_second": 3.971, + "step": 28866 + }, + { + "epoch": 566.08, + "learning_rate": 8.81433277708507e-05, + "loss": 0.4432, + "step": 28870 + }, + { + "epoch": 566.27, + "learning_rate": 8.810342677759833e-05, + "loss": 0.4364, + "step": 28880 + }, + { + "epoch": 566.47, + "learning_rate": 8.80635219584376e-05, + "loss": 0.439, + "step": 28890 + }, + { + "epoch": 566.67, + "learning_rate": 8.802361332501978e-05, + "loss": 0.4412, + "step": 28900 + }, + { + "epoch": 566.86, + "learning_rate": 8.798370088899733e-05, + "loss": 0.4397, + "step": 28910 + }, + { + "epoch": 567.0, + "eval_loss": 0.4435195326805115, + "eval_runtime": 2.2184, + "eval_samples_per_second": 1027.336, + "eval_steps_per_second": 4.057, + "step": 28917 + }, + { + "epoch": 567.06, + "learning_rate": 8.794378466202377e-05, + "loss": 0.4363, + "step": 28920 + }, + { + "epoch": 567.25, + "learning_rate": 8.790386465575376e-05, + "loss": 0.4437, + "step": 28930 + }, + { + "epoch": 567.45, + "learning_rate": 8.78639408818431e-05, + "loss": 0.4402, + "step": 28940 + }, + { + "epoch": 567.65, + "learning_rate": 8.782401335194854e-05, + "loss": 0.4403, + "step": 28950 + }, + { + "epoch": 567.84, + "learning_rate": 8.778408207772813e-05, + "loss": 0.4377, + "step": 28960 + }, + { + "epoch": 568.0, + "eval_loss": 0.446013480424881, + "eval_runtime": 2.2216, + "eval_samples_per_second": 1025.817, + "eval_steps_per_second": 4.051, + "step": 28968 + }, + { + "epoch": 568.04, + "learning_rate": 8.774414707084085e-05, + "loss": 0.4416, + "step": 28970 + }, + { + "epoch": 568.24, + "learning_rate": 8.770420834294683e-05, + "loss": 0.435, + "step": 28980 + }, + { + "epoch": 568.43, + "learning_rate": 8.766426590570733e-05, + "loss": 0.4426, + "step": 28990 + }, + { + "epoch": 568.63, + "learning_rate": 8.762431977078461e-05, + "loss": 0.4417, + "step": 29000 + }, + { + "epoch": 568.82, + "learning_rate": 8.758436994984206e-05, + "loss": 0.4424, + "step": 29010 + }, + { + "epoch": 569.0, + "eval_loss": 0.4475310444831848, + "eval_runtime": 2.2115, + "eval_samples_per_second": 1030.531, + "eval_steps_per_second": 4.07, + "step": 29019 + }, + { + "epoch": 569.02, + "learning_rate": 8.754441645454416e-05, + "loss": 0.4454, + "step": 29020 + }, + { + "epoch": 569.22, + "learning_rate": 8.750445929655642e-05, + "loss": 0.4375, + "step": 29030 + }, + { + "epoch": 569.41, + "learning_rate": 8.746449848754546e-05, + "loss": 0.44, + "step": 29040 + }, + { + "epoch": 569.61, + "learning_rate": 8.74245340391789e-05, + "loss": 0.4443, + "step": 29050 + }, + { + "epoch": 569.8, + "learning_rate": 8.738456596312549e-05, + "loss": 0.44, + "step": 29060 + }, + { + "epoch": 570.0, + "learning_rate": 8.734459427105504e-05, + "loss": 0.4412, + "step": 29070 + }, + { + "epoch": 570.0, + "eval_loss": 0.44452720880508423, + "eval_runtime": 2.2457, + "eval_samples_per_second": 1014.851, + "eval_steps_per_second": 4.008, + "step": 29070 + }, + { + "epoch": 570.2, + "learning_rate": 8.730461897463838e-05, + "loss": 0.4409, + "step": 29080 + }, + { + "epoch": 570.39, + "learning_rate": 8.726464008554736e-05, + "loss": 0.4418, + "step": 29090 + }, + { + "epoch": 570.59, + "learning_rate": 8.7224657615455e-05, + "loss": 0.4427, + "step": 29100 + }, + { + "epoch": 570.78, + "learning_rate": 8.718467157603525e-05, + "loss": 0.4463, + "step": 29110 + }, + { + "epoch": 570.98, + "learning_rate": 8.714468197896313e-05, + "loss": 0.4435, + "step": 29120 + }, + { + "epoch": 571.0, + "eval_loss": 0.441842257976532, + "eval_runtime": 2.1954, + "eval_samples_per_second": 1038.08, + "eval_steps_per_second": 4.099, + "step": 29121 + }, + { + "epoch": 571.18, + "learning_rate": 8.710468883591474e-05, + "loss": 0.44, + "step": 29130 + }, + { + "epoch": 571.37, + "learning_rate": 8.706469215856715e-05, + "loss": 0.4434, + "step": 29140 + }, + { + "epoch": 571.57, + "learning_rate": 8.702469195859853e-05, + "loss": 0.4397, + "step": 29150 + }, + { + "epoch": 571.76, + "learning_rate": 8.698468824768803e-05, + "loss": 0.4398, + "step": 29160 + }, + { + "epoch": 571.96, + "learning_rate": 8.694468103751586e-05, + "loss": 0.4398, + "step": 29170 + }, + { + "epoch": 572.0, + "eval_loss": 0.4434479773044586, + "eval_runtime": 2.2832, + "eval_samples_per_second": 998.16, + "eval_steps_per_second": 3.942, + "step": 29172 + }, + { + "epoch": 572.16, + "learning_rate": 8.690467033976322e-05, + "loss": 0.4408, + "step": 29180 + }, + { + "epoch": 572.35, + "learning_rate": 8.686465616611232e-05, + "loss": 0.4413, + "step": 29190 + }, + { + "epoch": 572.55, + "learning_rate": 8.682463852824644e-05, + "loss": 0.4385, + "step": 29200 + }, + { + "epoch": 572.75, + "learning_rate": 8.678461743784983e-05, + "loss": 0.4384, + "step": 29210 + }, + { + "epoch": 572.94, + "learning_rate": 8.674459290660773e-05, + "loss": 0.4427, + "step": 29220 + }, + { + "epoch": 573.0, + "eval_loss": 0.44168439507484436, + "eval_runtime": 2.2332, + "eval_samples_per_second": 1020.522, + "eval_steps_per_second": 4.03, + "step": 29223 + }, + { + "epoch": 573.14, + "learning_rate": 8.670456494620645e-05, + "loss": 0.4434, + "step": 29230 + }, + { + "epoch": 573.33, + "learning_rate": 8.666453356833323e-05, + "loss": 0.4396, + "step": 29240 + }, + { + "epoch": 573.53, + "learning_rate": 8.662449878467637e-05, + "loss": 0.4406, + "step": 29250 + }, + { + "epoch": 573.73, + "learning_rate": 8.658446060692512e-05, + "loss": 0.4434, + "step": 29260 + }, + { + "epoch": 573.92, + "learning_rate": 8.65444190467697e-05, + "loss": 0.4409, + "step": 29270 + }, + { + "epoch": 574.0, + "eval_loss": 0.44099777936935425, + "eval_runtime": 2.2519, + "eval_samples_per_second": 1012.023, + "eval_steps_per_second": 3.997, + "step": 29274 + }, + { + "epoch": 574.12, + "learning_rate": 8.650437411590141e-05, + "loss": 0.4399, + "step": 29280 + }, + { + "epoch": 574.31, + "learning_rate": 8.646432582601244e-05, + "loss": 0.4385, + "step": 29290 + }, + { + "epoch": 574.51, + "learning_rate": 8.6424274188796e-05, + "loss": 0.4378, + "step": 29300 + }, + { + "epoch": 574.71, + "learning_rate": 8.63842192159463e-05, + "loss": 0.4413, + "step": 29310 + }, + { + "epoch": 574.9, + "learning_rate": 8.634416091915846e-05, + "loss": 0.4425, + "step": 29320 + }, + { + "epoch": 575.0, + "eval_loss": 0.4433988630771637, + "eval_runtime": 2.3093, + "eval_samples_per_second": 986.89, + "eval_steps_per_second": 3.897, + "step": 29325 + }, + { + "epoch": 575.1, + "learning_rate": 8.630409931012866e-05, + "loss": 0.445, + "step": 29330 + }, + { + "epoch": 575.29, + "learning_rate": 8.626403440055395e-05, + "loss": 0.439, + "step": 29340 + }, + { + "epoch": 575.49, + "learning_rate": 8.622396620213241e-05, + "loss": 0.4458, + "step": 29350 + }, + { + "epoch": 575.69, + "learning_rate": 8.618389472656305e-05, + "loss": 0.4361, + "step": 29360 + }, + { + "epoch": 575.88, + "learning_rate": 8.614381998554585e-05, + "loss": 0.4402, + "step": 29370 + }, + { + "epoch": 576.0, + "eval_loss": 0.4489006996154785, + "eval_runtime": 2.2375, + "eval_samples_per_second": 1018.567, + "eval_steps_per_second": 4.022, + "step": 29376 + }, + { + "epoch": 576.08, + "learning_rate": 8.610374199078179e-05, + "loss": 0.4417, + "step": 29380 + }, + { + "epoch": 576.27, + "learning_rate": 8.606366075397266e-05, + "loss": 0.4436, + "step": 29390 + }, + { + "epoch": 576.47, + "learning_rate": 8.602357628682135e-05, + "loss": 0.4445, + "step": 29400 + }, + { + "epoch": 576.67, + "learning_rate": 8.598348860103162e-05, + "loss": 0.4367, + "step": 29410 + }, + { + "epoch": 576.86, + "learning_rate": 8.594339770830815e-05, + "loss": 0.4394, + "step": 29420 + }, + { + "epoch": 577.0, + "eval_loss": 0.4435146450996399, + "eval_runtime": 2.2075, + "eval_samples_per_second": 1032.38, + "eval_steps_per_second": 4.077, + "step": 29427 + }, + { + "epoch": 577.06, + "learning_rate": 8.590330362035663e-05, + "loss": 0.4387, + "step": 29430 + }, + { + "epoch": 577.25, + "learning_rate": 8.58632063488836e-05, + "loss": 0.4438, + "step": 29440 + }, + { + "epoch": 577.45, + "learning_rate": 8.582310590559662e-05, + "loss": 0.4413, + "step": 29450 + }, + { + "epoch": 577.65, + "learning_rate": 8.578300230220408e-05, + "loss": 0.4334, + "step": 29460 + }, + { + "epoch": 577.84, + "learning_rate": 8.574289555041537e-05, + "loss": 0.4379, + "step": 29470 + }, + { + "epoch": 578.0, + "eval_loss": 0.4446564018726349, + "eval_runtime": 2.2644, + "eval_samples_per_second": 1006.439, + "eval_steps_per_second": 3.975, + "step": 29478 + }, + { + "epoch": 578.04, + "learning_rate": 8.570278566194071e-05, + "loss": 0.4394, + "step": 29480 + }, + { + "epoch": 578.24, + "learning_rate": 8.566267264849137e-05, + "loss": 0.4376, + "step": 29490 + }, + { + "epoch": 578.43, + "learning_rate": 8.56225565217794e-05, + "loss": 0.4346, + "step": 29500 + }, + { + "epoch": 578.63, + "learning_rate": 8.558243729351784e-05, + "loss": 0.442, + "step": 29510 + }, + { + "epoch": 578.82, + "learning_rate": 8.554231497542058e-05, + "loss": 0.4391, + "step": 29520 + }, + { + "epoch": 579.0, + "eval_loss": 0.44711729884147644, + "eval_runtime": 2.3296, + "eval_samples_per_second": 978.296, + "eval_steps_per_second": 3.863, + "step": 29529 + }, + { + "epoch": 579.02, + "learning_rate": 8.550218957920247e-05, + "loss": 0.4372, + "step": 29530 + }, + { + "epoch": 579.22, + "learning_rate": 8.546206111657923e-05, + "loss": 0.4419, + "step": 29540 + }, + { + "epoch": 579.41, + "learning_rate": 8.542192959926748e-05, + "loss": 0.4369, + "step": 29550 + }, + { + "epoch": 579.61, + "learning_rate": 8.538179503898471e-05, + "loss": 0.4351, + "step": 29560 + }, + { + "epoch": 579.8, + "learning_rate": 8.534165744744933e-05, + "loss": 0.4397, + "step": 29570 + }, + { + "epoch": 580.0, + "learning_rate": 8.530151683638061e-05, + "loss": 0.4404, + "step": 29580 + }, + { + "epoch": 580.0, + "eval_loss": 0.44352006912231445, + "eval_runtime": 2.2363, + "eval_samples_per_second": 1019.074, + "eval_steps_per_second": 4.024, + "step": 29580 + }, + { + "epoch": 580.2, + "learning_rate": 8.526137321749872e-05, + "loss": 0.4387, + "step": 29590 + }, + { + "epoch": 580.39, + "learning_rate": 8.522122660252471e-05, + "loss": 0.4392, + "step": 29600 + }, + { + "epoch": 580.59, + "learning_rate": 8.518107700318048e-05, + "loss": 0.4391, + "step": 29610 + }, + { + "epoch": 580.78, + "learning_rate": 8.514092443118883e-05, + "loss": 0.4382, + "step": 29620 + }, + { + "epoch": 580.98, + "learning_rate": 8.51007688982734e-05, + "loss": 0.4399, + "step": 29630 + }, + { + "epoch": 581.0, + "eval_loss": 0.4410766363143921, + "eval_runtime": 2.2013, + "eval_samples_per_second": 1035.315, + "eval_steps_per_second": 4.089, + "step": 29631 + }, + { + "epoch": 581.18, + "learning_rate": 8.506061041615872e-05, + "loss": 0.4412, + "step": 29640 + }, + { + "epoch": 581.37, + "learning_rate": 8.50204489965702e-05, + "loss": 0.4364, + "step": 29650 + }, + { + "epoch": 581.57, + "learning_rate": 8.498028465123402e-05, + "loss": 0.439, + "step": 29660 + }, + { + "epoch": 581.76, + "learning_rate": 8.494011739187732e-05, + "loss": 0.4371, + "step": 29670 + }, + { + "epoch": 581.96, + "learning_rate": 8.489994723022801e-05, + "loss": 0.4353, + "step": 29680 + }, + { + "epoch": 582.0, + "eval_loss": 0.4415852725505829, + "eval_runtime": 2.2155, + "eval_samples_per_second": 1028.655, + "eval_steps_per_second": 4.062, + "step": 29682 + }, + { + "epoch": 582.16, + "learning_rate": 8.485977417801492e-05, + "loss": 0.4406, + "step": 29690 + }, + { + "epoch": 582.35, + "learning_rate": 8.481959824696765e-05, + "loss": 0.4348, + "step": 29700 + }, + { + "epoch": 582.55, + "learning_rate": 8.47794194488167e-05, + "loss": 0.4389, + "step": 29710 + }, + { + "epoch": 582.75, + "learning_rate": 8.473923779529337e-05, + "loss": 0.4417, + "step": 29720 + }, + { + "epoch": 582.94, + "learning_rate": 8.469905329812981e-05, + "loss": 0.4417, + "step": 29730 + }, + { + "epoch": 583.0, + "eval_loss": 0.4416983425617218, + "eval_runtime": 2.1672, + "eval_samples_per_second": 1051.592, + "eval_steps_per_second": 4.153, + "step": 29733 + }, + { + "epoch": 583.14, + "learning_rate": 8.4658865969059e-05, + "loss": 0.4421, + "step": 29740 + }, + { + "epoch": 583.33, + "learning_rate": 8.461867581981472e-05, + "loss": 0.4421, + "step": 29750 + }, + { + "epoch": 583.53, + "learning_rate": 8.457848286213166e-05, + "loss": 0.435, + "step": 29760 + }, + { + "epoch": 583.73, + "learning_rate": 8.453828710774517e-05, + "loss": 0.4389, + "step": 29770 + }, + { + "epoch": 583.92, + "learning_rate": 8.44980885683916e-05, + "loss": 0.4389, + "step": 29780 + }, + { + "epoch": 584.0, + "eval_loss": 0.4399338662624359, + "eval_runtime": 2.2366, + "eval_samples_per_second": 1018.961, + "eval_steps_per_second": 4.024, + "step": 29784 + }, + { + "epoch": 584.12, + "learning_rate": 8.4457887255808e-05, + "loss": 0.4362, + "step": 29790 + }, + { + "epoch": 584.31, + "learning_rate": 8.441768318173226e-05, + "loss": 0.4391, + "step": 29800 + }, + { + "epoch": 584.51, + "learning_rate": 8.437747635790304e-05, + "loss": 0.4367, + "step": 29810 + }, + { + "epoch": 584.71, + "learning_rate": 8.433726679605987e-05, + "loss": 0.4409, + "step": 29820 + }, + { + "epoch": 584.9, + "learning_rate": 8.429705450794304e-05, + "loss": 0.4378, + "step": 29830 + }, + { + "epoch": 585.0, + "eval_loss": 0.44315850734710693, + "eval_runtime": 2.3103, + "eval_samples_per_second": 986.471, + "eval_steps_per_second": 3.896, + "step": 29835 + }, + { + "epoch": 585.1, + "learning_rate": 8.425683950529364e-05, + "loss": 0.4343, + "step": 29840 + }, + { + "epoch": 585.29, + "learning_rate": 8.421662179985356e-05, + "loss": 0.4368, + "step": 29850 + }, + { + "epoch": 585.49, + "learning_rate": 8.417640140336546e-05, + "loss": 0.4365, + "step": 29860 + }, + { + "epoch": 585.69, + "learning_rate": 8.413617832757278e-05, + "loss": 0.4407, + "step": 29870 + }, + { + "epoch": 585.88, + "learning_rate": 8.409595258421981e-05, + "loss": 0.439, + "step": 29880 + }, + { + "epoch": 586.0, + "eval_loss": 0.44265684485435486, + "eval_runtime": 2.3136, + "eval_samples_per_second": 985.054, + "eval_steps_per_second": 3.89, + "step": 29886 + }, + { + "epoch": 586.08, + "learning_rate": 8.405572418505156e-05, + "loss": 0.4379, + "step": 29890 + }, + { + "epoch": 586.27, + "learning_rate": 8.401549314181376e-05, + "loss": 0.4392, + "step": 29900 + }, + { + "epoch": 586.47, + "learning_rate": 8.397525946625307e-05, + "loss": 0.438, + "step": 29910 + }, + { + "epoch": 586.67, + "learning_rate": 8.393502317011676e-05, + "loss": 0.4313, + "step": 29920 + }, + { + "epoch": 586.86, + "learning_rate": 8.389478426515299e-05, + "loss": 0.431, + "step": 29930 + }, + { + "epoch": 587.0, + "eval_loss": 0.4403259754180908, + "eval_runtime": 2.3194, + "eval_samples_per_second": 982.582, + "eval_steps_per_second": 3.88, + "step": 29937 + }, + { + "epoch": 587.06, + "learning_rate": 8.385454276311057e-05, + "loss": 0.4365, + "step": 29940 + }, + { + "epoch": 587.25, + "learning_rate": 8.381429867573911e-05, + "loss": 0.4384, + "step": 29950 + }, + { + "epoch": 587.45, + "learning_rate": 8.377405201478904e-05, + "loss": 0.433, + "step": 29960 + }, + { + "epoch": 587.65, + "learning_rate": 8.373380279201146e-05, + "loss": 0.4378, + "step": 29970 + }, + { + "epoch": 587.84, + "learning_rate": 8.369355101915824e-05, + "loss": 0.4348, + "step": 29980 + }, + { + "epoch": 588.0, + "eval_loss": 0.4408820867538452, + "eval_runtime": 2.3452, + "eval_samples_per_second": 971.782, + "eval_steps_per_second": 3.838, + "step": 29988 + }, + { + "epoch": 588.04, + "learning_rate": 8.365329670798203e-05, + "loss": 0.435, + "step": 29990 + }, + { + "epoch": 588.24, + "learning_rate": 8.361303987023614e-05, + "loss": 0.4375, + "step": 30000 + }, + { + "epoch": 588.43, + "learning_rate": 8.357278051767472e-05, + "loss": 0.4412, + "step": 30010 + }, + { + "epoch": 588.63, + "learning_rate": 8.353251866205257e-05, + "loss": 0.4345, + "step": 30020 + }, + { + "epoch": 588.82, + "learning_rate": 8.349225431512524e-05, + "loss": 0.4363, + "step": 30030 + }, + { + "epoch": 589.0, + "eval_loss": 0.44250038266181946, + "eval_runtime": 2.2027, + "eval_samples_per_second": 1034.662, + "eval_steps_per_second": 4.086, + "step": 30039 + }, + { + "epoch": 589.02, + "learning_rate": 8.345198748864909e-05, + "loss": 0.4344, + "step": 30040 + }, + { + "epoch": 589.22, + "learning_rate": 8.341171819438106e-05, + "loss": 0.4384, + "step": 30050 + }, + { + "epoch": 589.41, + "learning_rate": 8.337144644407893e-05, + "loss": 0.4389, + "step": 30060 + }, + { + "epoch": 589.61, + "learning_rate": 8.333117224950114e-05, + "loss": 0.4378, + "step": 30070 + }, + { + "epoch": 589.8, + "learning_rate": 8.329089562240686e-05, + "loss": 0.4367, + "step": 30080 + }, + { + "epoch": 590.0, + "learning_rate": 8.325061657455594e-05, + "loss": 0.4399, + "step": 30090 + }, + { + "epoch": 590.0, + "eval_loss": 0.4393501877784729, + "eval_runtime": 2.3569, + "eval_samples_per_second": 966.933, + "eval_steps_per_second": 3.819, + "step": 30090 + }, + { + "epoch": 590.2, + "learning_rate": 8.3210335117709e-05, + "loss": 0.4309, + "step": 30100 + }, + { + "epoch": 590.39, + "learning_rate": 8.317005126362731e-05, + "loss": 0.4375, + "step": 30110 + }, + { + "epoch": 590.59, + "learning_rate": 8.312976502407288e-05, + "loss": 0.435, + "step": 30120 + }, + { + "epoch": 590.78, + "learning_rate": 8.308947641080836e-05, + "loss": 0.4352, + "step": 30130 + }, + { + "epoch": 590.98, + "learning_rate": 8.304918543559715e-05, + "loss": 0.4342, + "step": 30140 + }, + { + "epoch": 591.0, + "eval_loss": 0.4411936104297638, + "eval_runtime": 2.2081, + "eval_samples_per_second": 1032.125, + "eval_steps_per_second": 4.076, + "step": 30141 + }, + { + "epoch": 591.18, + "learning_rate": 8.300889211020331e-05, + "loss": 0.4317, + "step": 30150 + }, + { + "epoch": 591.37, + "learning_rate": 8.296859644639157e-05, + "loss": 0.4366, + "step": 30160 + }, + { + "epoch": 591.57, + "learning_rate": 8.292829845592739e-05, + "loss": 0.4366, + "step": 30170 + }, + { + "epoch": 591.76, + "learning_rate": 8.288799815057689e-05, + "loss": 0.4329, + "step": 30180 + }, + { + "epoch": 591.96, + "learning_rate": 8.284769554210685e-05, + "loss": 0.4342, + "step": 30190 + }, + { + "epoch": 592.0, + "eval_loss": 0.4399246275424957, + "eval_runtime": 2.2423, + "eval_samples_per_second": 1016.359, + "eval_steps_per_second": 4.014, + "step": 30192 + }, + { + "epoch": 592.16, + "learning_rate": 8.280739064228471e-05, + "loss": 0.4357, + "step": 30200 + }, + { + "epoch": 592.35, + "learning_rate": 8.276708346287865e-05, + "loss": 0.4347, + "step": 30210 + }, + { + "epoch": 592.55, + "learning_rate": 8.272677401565742e-05, + "loss": 0.4355, + "step": 30220 + }, + { + "epoch": 592.75, + "learning_rate": 8.268646231239052e-05, + "loss": 0.4405, + "step": 30230 + }, + { + "epoch": 592.94, + "learning_rate": 8.264614836484803e-05, + "loss": 0.4348, + "step": 30240 + }, + { + "epoch": 593.0, + "eval_loss": 0.441998690366745, + "eval_runtime": 2.2767, + "eval_samples_per_second": 1001.018, + "eval_steps_per_second": 3.953, + "step": 30243 + }, + { + "epoch": 593.14, + "learning_rate": 8.260583218480075e-05, + "loss": 0.4352, + "step": 30250 + }, + { + "epoch": 593.33, + "learning_rate": 8.256551378402012e-05, + "loss": 0.4402, + "step": 30260 + }, + { + "epoch": 593.53, + "learning_rate": 8.252519317427817e-05, + "loss": 0.436, + "step": 30270 + }, + { + "epoch": 593.73, + "learning_rate": 8.248487036734766e-05, + "loss": 0.4357, + "step": 30280 + }, + { + "epoch": 593.92, + "learning_rate": 8.244454537500189e-05, + "loss": 0.4326, + "step": 30290 + }, + { + "epoch": 594.0, + "eval_loss": 0.44458866119384766, + "eval_runtime": 2.2919, + "eval_samples_per_second": 994.37, + "eval_steps_per_second": 3.927, + "step": 30294 + }, + { + "epoch": 594.12, + "learning_rate": 8.240421820901495e-05, + "loss": 0.4314, + "step": 30300 + }, + { + "epoch": 594.31, + "learning_rate": 8.23638888811614e-05, + "loss": 0.4389, + "step": 30310 + }, + { + "epoch": 594.51, + "learning_rate": 8.232355740321651e-05, + "loss": 0.4343, + "step": 30320 + }, + { + "epoch": 594.71, + "learning_rate": 8.228322378695622e-05, + "loss": 0.433, + "step": 30330 + }, + { + "epoch": 594.9, + "learning_rate": 8.2242888044157e-05, + "loss": 0.4333, + "step": 30340 + }, + { + "epoch": 595.0, + "eval_loss": 0.44296392798423767, + "eval_runtime": 2.1971, + "eval_samples_per_second": 1037.271, + "eval_steps_per_second": 4.096, + "step": 30345 + }, + { + "epoch": 595.1, + "learning_rate": 8.220255018659601e-05, + "loss": 0.4384, + "step": 30350 + }, + { + "epoch": 595.29, + "learning_rate": 8.2162210226051e-05, + "loss": 0.4359, + "step": 30360 + }, + { + "epoch": 595.49, + "learning_rate": 8.212186817430031e-05, + "loss": 0.4307, + "step": 30370 + }, + { + "epoch": 595.69, + "learning_rate": 8.208152404312299e-05, + "loss": 0.4335, + "step": 30380 + }, + { + "epoch": 595.88, + "learning_rate": 8.204117784429856e-05, + "loss": 0.4336, + "step": 30390 + }, + { + "epoch": 596.0, + "eval_loss": 0.4396732747554779, + "eval_runtime": 2.2713, + "eval_samples_per_second": 1003.396, + "eval_steps_per_second": 3.963, + "step": 30396 + }, + { + "epoch": 596.08, + "learning_rate": 8.200082958960723e-05, + "loss": 0.4356, + "step": 30400 + }, + { + "epoch": 596.27, + "learning_rate": 8.196047929082981e-05, + "loss": 0.4374, + "step": 30410 + }, + { + "epoch": 596.47, + "learning_rate": 8.192012695974765e-05, + "loss": 0.4351, + "step": 30420 + }, + { + "epoch": 596.67, + "learning_rate": 8.187977260814275e-05, + "loss": 0.431, + "step": 30430 + }, + { + "epoch": 596.86, + "learning_rate": 8.183941624779769e-05, + "loss": 0.4314, + "step": 30440 + }, + { + "epoch": 597.0, + "eval_loss": 0.44181305170059204, + "eval_runtime": 2.2614, + "eval_samples_per_second": 1007.787, + "eval_steps_per_second": 3.98, + "step": 30447 + }, + { + "epoch": 597.06, + "learning_rate": 8.179905789049561e-05, + "loss": 0.4307, + "step": 30450 + }, + { + "epoch": 597.25, + "learning_rate": 8.175869754802028e-05, + "loss": 0.4337, + "step": 30460 + }, + { + "epoch": 597.45, + "learning_rate": 8.1718335232156e-05, + "loss": 0.4334, + "step": 30470 + }, + { + "epoch": 597.65, + "learning_rate": 8.167797095468766e-05, + "loss": 0.4348, + "step": 30480 + }, + { + "epoch": 597.84, + "learning_rate": 8.163760472740073e-05, + "loss": 0.4371, + "step": 30490 + }, + { + "epoch": 598.0, + "eval_loss": 0.441135048866272, + "eval_runtime": 2.2413, + "eval_samples_per_second": 1016.834, + "eval_steps_per_second": 4.016, + "step": 30498 + }, + { + "epoch": 598.04, + "learning_rate": 8.159723656208126e-05, + "loss": 0.436, + "step": 30500 + }, + { + "epoch": 598.24, + "learning_rate": 8.155686647051584e-05, + "loss": 0.4382, + "step": 30510 + }, + { + "epoch": 598.43, + "learning_rate": 8.151649446449163e-05, + "loss": 0.4335, + "step": 30520 + }, + { + "epoch": 598.63, + "learning_rate": 8.147612055579639e-05, + "loss": 0.434, + "step": 30530 + }, + { + "epoch": 598.82, + "learning_rate": 8.143574475621837e-05, + "loss": 0.4333, + "step": 30540 + }, + { + "epoch": 599.0, + "eval_loss": 0.4385489225387573, + "eval_runtime": 2.2156, + "eval_samples_per_second": 1028.626, + "eval_steps_per_second": 4.062, + "step": 30549 + }, + { + "epoch": 599.02, + "learning_rate": 8.139536707754641e-05, + "loss": 0.4423, + "step": 30550 + }, + { + "epoch": 599.22, + "learning_rate": 8.13549875315699e-05, + "loss": 0.4333, + "step": 30560 + }, + { + "epoch": 599.41, + "learning_rate": 8.131460613007875e-05, + "loss": 0.4335, + "step": 30570 + }, + { + "epoch": 599.61, + "learning_rate": 8.127422288486345e-05, + "loss": 0.432, + "step": 30580 + }, + { + "epoch": 599.8, + "learning_rate": 8.123383780771498e-05, + "loss": 0.4281, + "step": 30590 + }, + { + "epoch": 600.0, + "learning_rate": 8.119345091042493e-05, + "loss": 0.4337, + "step": 30600 + }, + { + "epoch": 600.0, + "eval_loss": 0.43944406509399414, + "eval_runtime": 2.3431, + "eval_samples_per_second": 972.654, + "eval_steps_per_second": 3.841, + "step": 30600 + }, + { + "epoch": 600.2, + "learning_rate": 8.115306220478532e-05, + "loss": 0.4308, + "step": 30610 + }, + { + "epoch": 600.39, + "learning_rate": 8.111267170258878e-05, + "loss": 0.4389, + "step": 30620 + }, + { + "epoch": 600.59, + "learning_rate": 8.107227941562841e-05, + "loss": 0.4416, + "step": 30630 + }, + { + "epoch": 600.78, + "learning_rate": 8.103188535569788e-05, + "loss": 0.4374, + "step": 30640 + }, + { + "epoch": 600.98, + "learning_rate": 8.099148953459137e-05, + "loss": 0.4371, + "step": 30650 + }, + { + "epoch": 601.0, + "eval_loss": 0.44066575169563293, + "eval_runtime": 2.2914, + "eval_samples_per_second": 994.578, + "eval_steps_per_second": 3.928, + "step": 30651 + }, + { + "epoch": 601.18, + "learning_rate": 8.095109196410353e-05, + "loss": 0.4357, + "step": 30660 + }, + { + "epoch": 601.37, + "learning_rate": 8.091069265602957e-05, + "loss": 0.4378, + "step": 30670 + }, + { + "epoch": 601.57, + "learning_rate": 8.087029162216514e-05, + "loss": 0.435, + "step": 30680 + }, + { + "epoch": 601.76, + "learning_rate": 8.082988887430652e-05, + "loss": 0.4314, + "step": 30690 + }, + { + "epoch": 601.96, + "learning_rate": 8.078948442425035e-05, + "loss": 0.4294, + "step": 30700 + }, + { + "epoch": 602.0, + "eval_loss": 0.43954363465309143, + "eval_runtime": 2.1748, + "eval_samples_per_second": 1047.926, + "eval_steps_per_second": 4.138, + "step": 30702 + }, + { + "epoch": 602.16, + "learning_rate": 8.074907828379383e-05, + "loss": 0.4387, + "step": 30710 + }, + { + "epoch": 602.35, + "learning_rate": 8.070867046473468e-05, + "loss": 0.431, + "step": 30720 + }, + { + "epoch": 602.55, + "learning_rate": 8.066826097887109e-05, + "loss": 0.4358, + "step": 30730 + }, + { + "epoch": 602.75, + "learning_rate": 8.062784983800169e-05, + "loss": 0.437, + "step": 30740 + }, + { + "epoch": 602.94, + "learning_rate": 8.058743705392566e-05, + "loss": 0.4323, + "step": 30750 + }, + { + "epoch": 603.0, + "eval_loss": 0.4404396712779999, + "eval_runtime": 2.3305, + "eval_samples_per_second": 977.881, + "eval_steps_per_second": 3.862, + "step": 30753 + }, + { + "epoch": 603.14, + "learning_rate": 8.054702263844258e-05, + "loss": 0.4352, + "step": 30760 + }, + { + "epoch": 603.33, + "learning_rate": 8.050660660335264e-05, + "loss": 0.4357, + "step": 30770 + }, + { + "epoch": 603.53, + "learning_rate": 8.046618896045638e-05, + "loss": 0.4345, + "step": 30780 + }, + { + "epoch": 603.73, + "learning_rate": 8.042576972155484e-05, + "loss": 0.4309, + "step": 30790 + }, + { + "epoch": 603.92, + "learning_rate": 8.038534889844957e-05, + "loss": 0.4303, + "step": 30800 + }, + { + "epoch": 604.0, + "eval_loss": 0.44217541813850403, + "eval_runtime": 2.3094, + "eval_samples_per_second": 986.824, + "eval_steps_per_second": 3.897, + "step": 30804 + }, + { + "epoch": 604.12, + "learning_rate": 8.03449265029425e-05, + "loss": 0.4332, + "step": 30810 + }, + { + "epoch": 604.31, + "learning_rate": 8.030450254683612e-05, + "loss": 0.4369, + "step": 30820 + }, + { + "epoch": 604.51, + "learning_rate": 8.026407704193327e-05, + "loss": 0.4384, + "step": 30830 + }, + { + "epoch": 604.71, + "learning_rate": 8.022365000003734e-05, + "loss": 0.4343, + "step": 30840 + }, + { + "epoch": 604.9, + "learning_rate": 8.01832214329521e-05, + "loss": 0.4325, + "step": 30850 + }, + { + "epoch": 605.0, + "eval_loss": 0.4375738203525543, + "eval_runtime": 2.1954, + "eval_samples_per_second": 1038.103, + "eval_steps_per_second": 4.1, + "step": 30855 + }, + { + "epoch": 605.1, + "learning_rate": 8.014279135248181e-05, + "loss": 0.437, + "step": 30860 + }, + { + "epoch": 605.29, + "learning_rate": 8.010235977043112e-05, + "loss": 0.4353, + "step": 30870 + }, + { + "epoch": 605.49, + "learning_rate": 8.006192669860521e-05, + "loss": 0.4295, + "step": 30880 + }, + { + "epoch": 605.69, + "learning_rate": 8.002149214880955e-05, + "loss": 0.4342, + "step": 30890 + }, + { + "epoch": 605.88, + "learning_rate": 7.99810561328502e-05, + "loss": 0.44, + "step": 30900 + }, + { + "epoch": 606.0, + "eval_loss": 0.4398665130138397, + "eval_runtime": 2.2661, + "eval_samples_per_second": 1005.707, + "eval_steps_per_second": 3.972, + "step": 30906 + }, + { + "epoch": 606.08, + "learning_rate": 7.994061866253355e-05, + "loss": 0.436, + "step": 30910 + }, + { + "epoch": 606.27, + "learning_rate": 7.990017974966642e-05, + "loss": 0.433, + "step": 30920 + }, + { + "epoch": 606.47, + "learning_rate": 7.98597394060561e-05, + "loss": 0.4341, + "step": 30930 + }, + { + "epoch": 606.67, + "learning_rate": 7.981929764351026e-05, + "loss": 0.4318, + "step": 30940 + }, + { + "epoch": 606.86, + "learning_rate": 7.977885447383698e-05, + "loss": 0.4343, + "step": 30950 + }, + { + "epoch": 607.0, + "eval_loss": 0.4403430223464966, + "eval_runtime": 2.2274, + "eval_samples_per_second": 1023.166, + "eval_steps_per_second": 4.041, + "step": 30957 + }, + { + "epoch": 607.06, + "learning_rate": 7.973840990884477e-05, + "loss": 0.4345, + "step": 30960 + }, + { + "epoch": 607.25, + "learning_rate": 7.969796396034253e-05, + "loss": 0.4375, + "step": 30970 + }, + { + "epoch": 607.45, + "learning_rate": 7.965751664013962e-05, + "loss": 0.4318, + "step": 30980 + }, + { + "epoch": 607.65, + "learning_rate": 7.961706796004572e-05, + "loss": 0.4307, + "step": 30990 + }, + { + "epoch": 607.84, + "learning_rate": 7.957661793187091e-05, + "loss": 0.4313, + "step": 31000 + }, + { + "epoch": 608.0, + "eval_loss": 0.43968504667282104, + "eval_runtime": 2.2198, + "eval_samples_per_second": 1026.684, + "eval_steps_per_second": 4.054, + "step": 31008 + }, + { + "epoch": 608.04, + "learning_rate": 7.953616656742579e-05, + "loss": 0.4283, + "step": 31010 + }, + { + "epoch": 608.24, + "learning_rate": 7.949571387852114e-05, + "loss": 0.4336, + "step": 31020 + }, + { + "epoch": 608.43, + "learning_rate": 7.945525987696835e-05, + "loss": 0.4313, + "step": 31030 + }, + { + "epoch": 608.63, + "learning_rate": 7.941480457457901e-05, + "loss": 0.4299, + "step": 31040 + }, + { + "epoch": 608.82, + "learning_rate": 7.937434798316518e-05, + "loss": 0.4338, + "step": 31050 + }, + { + "epoch": 609.0, + "eval_loss": 0.4378510117530823, + "eval_runtime": 2.2222, + "eval_samples_per_second": 1025.549, + "eval_steps_per_second": 4.05, + "step": 31059 + }, + { + "epoch": 609.02, + "learning_rate": 7.933389011453933e-05, + "loss": 0.4263, + "step": 31060 + }, + { + "epoch": 609.22, + "learning_rate": 7.929343098051422e-05, + "loss": 0.4302, + "step": 31070 + }, + { + "epoch": 609.41, + "learning_rate": 7.9252970592903e-05, + "loss": 0.4299, + "step": 31080 + }, + { + "epoch": 609.61, + "learning_rate": 7.921250896351922e-05, + "loss": 0.4326, + "step": 31090 + }, + { + "epoch": 609.8, + "learning_rate": 7.917204610417677e-05, + "loss": 0.4275, + "step": 31100 + }, + { + "epoch": 610.0, + "learning_rate": 7.91315820266899e-05, + "loss": 0.4299, + "step": 31110 + }, + { + "epoch": 610.0, + "eval_loss": 0.4349246919155121, + "eval_runtime": 2.3617, + "eval_samples_per_second": 964.973, + "eval_steps_per_second": 3.811, + "step": 31110 + }, + { + "epoch": 610.2, + "learning_rate": 7.909111674287323e-05, + "loss": 0.4344, + "step": 31120 + }, + { + "epoch": 610.39, + "learning_rate": 7.905065026454171e-05, + "loss": 0.4368, + "step": 31130 + }, + { + "epoch": 610.59, + "learning_rate": 7.901018260351064e-05, + "loss": 0.4281, + "step": 31140 + }, + { + "epoch": 610.78, + "learning_rate": 7.896971377159571e-05, + "loss": 0.4305, + "step": 31150 + }, + { + "epoch": 610.98, + "learning_rate": 7.892924378061289e-05, + "loss": 0.4325, + "step": 31160 + }, + { + "epoch": 611.0, + "eval_loss": 0.4369864761829376, + "eval_runtime": 2.1774, + "eval_samples_per_second": 1046.683, + "eval_steps_per_second": 4.133, + "step": 31161 + }, + { + "epoch": 611.18, + "learning_rate": 7.88887726423785e-05, + "loss": 0.4301, + "step": 31170 + }, + { + "epoch": 611.37, + "learning_rate": 7.884830036870921e-05, + "loss": 0.427, + "step": 31180 + }, + { + "epoch": 611.57, + "learning_rate": 7.880782697142207e-05, + "loss": 0.4304, + "step": 31190 + }, + { + "epoch": 611.76, + "learning_rate": 7.876735246233437e-05, + "loss": 0.4378, + "step": 31200 + }, + { + "epoch": 611.96, + "learning_rate": 7.872687685326375e-05, + "loss": 0.429, + "step": 31210 + }, + { + "epoch": 612.0, + "eval_loss": 0.43705418705940247, + "eval_runtime": 2.3129, + "eval_samples_per_second": 985.344, + "eval_steps_per_second": 3.891, + "step": 31212 + }, + { + "epoch": 612.16, + "learning_rate": 7.868640015602824e-05, + "loss": 0.4303, + "step": 31220 + }, + { + "epoch": 612.35, + "learning_rate": 7.864592238244607e-05, + "loss": 0.4328, + "step": 31230 + }, + { + "epoch": 612.55, + "learning_rate": 7.86054435443359e-05, + "loss": 0.4279, + "step": 31240 + }, + { + "epoch": 612.75, + "learning_rate": 7.85649636535166e-05, + "loss": 0.4313, + "step": 31250 + }, + { + "epoch": 612.94, + "learning_rate": 7.852448272180744e-05, + "loss": 0.4291, + "step": 31260 + }, + { + "epoch": 613.0, + "eval_loss": 0.42991194128990173, + "eval_runtime": 2.234, + "eval_samples_per_second": 1020.124, + "eval_steps_per_second": 4.029, + "step": 31263 + }, + { + "epoch": 613.14, + "learning_rate": 7.848400076102792e-05, + "loss": 0.4317, + "step": 31270 + }, + { + "epoch": 613.33, + "learning_rate": 7.844351778299788e-05, + "loss": 0.4335, + "step": 31280 + }, + { + "epoch": 613.53, + "learning_rate": 7.840303379953746e-05, + "loss": 0.4325, + "step": 31290 + }, + { + "epoch": 613.73, + "learning_rate": 7.836254882246704e-05, + "loss": 0.4302, + "step": 31300 + }, + { + "epoch": 613.92, + "learning_rate": 7.832206286360736e-05, + "loss": 0.4349, + "step": 31310 + }, + { + "epoch": 614.0, + "eval_loss": 0.43643268942832947, + "eval_runtime": 2.2319, + "eval_samples_per_second": 1021.095, + "eval_steps_per_second": 4.032, + "step": 31314 + }, + { + "epoch": 614.12, + "learning_rate": 7.828157593477942e-05, + "loss": 0.4316, + "step": 31320 + }, + { + "epoch": 614.31, + "learning_rate": 7.82410880478045e-05, + "loss": 0.4332, + "step": 31330 + }, + { + "epoch": 614.51, + "learning_rate": 7.820059921450414e-05, + "loss": 0.431, + "step": 31340 + }, + { + "epoch": 614.71, + "learning_rate": 7.816010944670021e-05, + "loss": 0.435, + "step": 31350 + }, + { + "epoch": 614.9, + "learning_rate": 7.811961875621478e-05, + "loss": 0.4308, + "step": 31360 + }, + { + "epoch": 615.0, + "eval_loss": 0.43355175852775574, + "eval_runtime": 2.3224, + "eval_samples_per_second": 981.333, + "eval_steps_per_second": 3.875, + "step": 31365 + }, + { + "epoch": 615.1, + "learning_rate": 7.807912715487025e-05, + "loss": 0.4322, + "step": 31370 + }, + { + "epoch": 615.29, + "learning_rate": 7.803863465448927e-05, + "loss": 0.4239, + "step": 31380 + }, + { + "epoch": 615.49, + "learning_rate": 7.799814126689471e-05, + "loss": 0.4337, + "step": 31390 + }, + { + "epoch": 615.69, + "learning_rate": 7.79576470039098e-05, + "loss": 0.4332, + "step": 31400 + }, + { + "epoch": 615.88, + "learning_rate": 7.791715187735792e-05, + "loss": 0.4305, + "step": 31410 + }, + { + "epoch": 616.0, + "eval_loss": 0.4343326687812805, + "eval_runtime": 2.3142, + "eval_samples_per_second": 984.779, + "eval_steps_per_second": 3.889, + "step": 31416 + }, + { + "epoch": 616.08, + "learning_rate": 7.787665589906275e-05, + "loss": 0.4302, + "step": 31420 + }, + { + "epoch": 616.27, + "learning_rate": 7.783615908084822e-05, + "loss": 0.4308, + "step": 31430 + }, + { + "epoch": 616.47, + "learning_rate": 7.779566143453846e-05, + "loss": 0.4339, + "step": 31440 + }, + { + "epoch": 616.67, + "learning_rate": 7.775516297195794e-05, + "loss": 0.4281, + "step": 31450 + }, + { + "epoch": 616.86, + "learning_rate": 7.771466370493127e-05, + "loss": 0.4267, + "step": 31460 + }, + { + "epoch": 617.0, + "eval_loss": 0.4391220808029175, + "eval_runtime": 2.3012, + "eval_samples_per_second": 990.354, + "eval_steps_per_second": 3.911, + "step": 31467 + }, + { + "epoch": 617.06, + "learning_rate": 7.767416364528332e-05, + "loss": 0.4367, + "step": 31470 + }, + { + "epoch": 617.25, + "learning_rate": 7.763366280483926e-05, + "loss": 0.4325, + "step": 31480 + }, + { + "epoch": 617.45, + "learning_rate": 7.759316119542437e-05, + "loss": 0.4279, + "step": 31490 + }, + { + "epoch": 617.65, + "learning_rate": 7.755265882886426e-05, + "loss": 0.4301, + "step": 31500 + }, + { + "epoch": 617.84, + "learning_rate": 7.75121557169847e-05, + "loss": 0.4329, + "step": 31510 + }, + { + "epoch": 618.0, + "eval_loss": 0.43645963072776794, + "eval_runtime": 2.1947, + "eval_samples_per_second": 1038.398, + "eval_steps_per_second": 4.101, + "step": 31518 + }, + { + "epoch": 618.04, + "learning_rate": 7.747165187161168e-05, + "loss": 0.4271, + "step": 31520 + }, + { + "epoch": 618.24, + "learning_rate": 7.743114730457145e-05, + "loss": 0.428, + "step": 31530 + }, + { + "epoch": 618.43, + "learning_rate": 7.739064202769044e-05, + "loss": 0.4333, + "step": 31540 + }, + { + "epoch": 618.63, + "learning_rate": 7.735013605279525e-05, + "loss": 0.4311, + "step": 31550 + }, + { + "epoch": 618.82, + "learning_rate": 7.730962939171278e-05, + "loss": 0.4269, + "step": 31560 + }, + { + "epoch": 619.0, + "eval_loss": 0.43330323696136475, + "eval_runtime": 2.2191, + "eval_samples_per_second": 1026.991, + "eval_steps_per_second": 4.056, + "step": 31569 + }, + { + "epoch": 619.02, + "learning_rate": 7.726912205627e-05, + "loss": 0.4339, + "step": 31570 + }, + { + "epoch": 619.22, + "learning_rate": 7.722861405829422e-05, + "loss": 0.4303, + "step": 31580 + }, + { + "epoch": 619.41, + "learning_rate": 7.718810540961281e-05, + "loss": 0.43, + "step": 31590 + }, + { + "epoch": 619.61, + "learning_rate": 7.714759612205342e-05, + "loss": 0.4249, + "step": 31600 + }, + { + "epoch": 619.8, + "learning_rate": 7.710708620744387e-05, + "loss": 0.4292, + "step": 31610 + }, + { + "epoch": 620.0, + "learning_rate": 7.706657567761216e-05, + "loss": 0.4251, + "step": 31620 + }, + { + "epoch": 620.0, + "eval_loss": 0.4343318045139313, + "eval_runtime": 2.1695, + "eval_samples_per_second": 1050.467, + "eval_steps_per_second": 4.148, + "step": 31620 + }, + { + "epoch": 620.2, + "learning_rate": 7.702606454438641e-05, + "loss": 0.4296, + "step": 31630 + }, + { + "epoch": 620.39, + "learning_rate": 7.698555281959501e-05, + "loss": 0.4276, + "step": 31640 + }, + { + "epoch": 620.59, + "learning_rate": 7.694504051506647e-05, + "loss": 0.4316, + "step": 31650 + }, + { + "epoch": 620.78, + "learning_rate": 7.690452764262947e-05, + "loss": 0.4313, + "step": 31660 + }, + { + "epoch": 620.98, + "learning_rate": 7.686401421411288e-05, + "loss": 0.427, + "step": 31670 + }, + { + "epoch": 621.0, + "eval_loss": 0.4344363510608673, + "eval_runtime": 2.2851, + "eval_samples_per_second": 997.328, + "eval_steps_per_second": 3.939, + "step": 31671 + }, + { + "epoch": 621.18, + "learning_rate": 7.68235002413457e-05, + "loss": 0.427, + "step": 31680 + }, + { + "epoch": 621.37, + "learning_rate": 7.678298573615714e-05, + "loss": 0.4335, + "step": 31690 + }, + { + "epoch": 621.57, + "learning_rate": 7.67424707103765e-05, + "loss": 0.4321, + "step": 31700 + }, + { + "epoch": 621.76, + "learning_rate": 7.670195517583325e-05, + "loss": 0.4297, + "step": 31710 + }, + { + "epoch": 621.96, + "learning_rate": 7.666143914435709e-05, + "loss": 0.4327, + "step": 31720 + }, + { + "epoch": 622.0, + "eval_loss": 0.434471994638443, + "eval_runtime": 2.1731, + "eval_samples_per_second": 1048.718, + "eval_steps_per_second": 4.141, + "step": 31722 + }, + { + "epoch": 622.16, + "learning_rate": 7.662092262777771e-05, + "loss": 0.4279, + "step": 31730 + }, + { + "epoch": 622.35, + "learning_rate": 7.658040563792508e-05, + "loss": 0.4317, + "step": 31740 + }, + { + "epoch": 622.55, + "learning_rate": 7.653988818662927e-05, + "loss": 0.4361, + "step": 31750 + }, + { + "epoch": 622.75, + "learning_rate": 7.649937028572046e-05, + "loss": 0.4314, + "step": 31760 + }, + { + "epoch": 622.94, + "learning_rate": 7.645885194702896e-05, + "loss": 0.4263, + "step": 31770 + }, + { + "epoch": 623.0, + "eval_loss": 0.4369990825653076, + "eval_runtime": 2.2753, + "eval_samples_per_second": 1001.637, + "eval_steps_per_second": 3.956, + "step": 31773 + }, + { + "epoch": 623.14, + "learning_rate": 7.641833318238519e-05, + "loss": 0.429, + "step": 31780 + }, + { + "epoch": 623.33, + "learning_rate": 7.63778140036198e-05, + "loss": 0.4323, + "step": 31790 + }, + { + "epoch": 623.53, + "learning_rate": 7.633729442256343e-05, + "loss": 0.4243, + "step": 31800 + }, + { + "epoch": 623.73, + "learning_rate": 7.629677445104691e-05, + "loss": 0.427, + "step": 31810 + }, + { + "epoch": 623.92, + "learning_rate": 7.62562541009012e-05, + "loss": 0.4288, + "step": 31820 + }, + { + "epoch": 624.0, + "eval_loss": 0.43234148621559143, + "eval_runtime": 2.3479, + "eval_samples_per_second": 970.674, + "eval_steps_per_second": 3.833, + "step": 31824 + }, + { + "epoch": 624.12, + "learning_rate": 7.621573338395731e-05, + "loss": 0.4321, + "step": 31830 + }, + { + "epoch": 624.31, + "learning_rate": 7.617521231204636e-05, + "loss": 0.4283, + "step": 31840 + }, + { + "epoch": 624.51, + "learning_rate": 7.613469089699965e-05, + "loss": 0.433, + "step": 31850 + }, + { + "epoch": 624.71, + "learning_rate": 7.609416915064846e-05, + "loss": 0.4244, + "step": 31860 + }, + { + "epoch": 624.9, + "learning_rate": 7.605364708482432e-05, + "loss": 0.4316, + "step": 31870 + }, + { + "epoch": 625.0, + "eval_loss": 0.4324721395969391, + "eval_runtime": 2.2438, + "eval_samples_per_second": 1015.687, + "eval_steps_per_second": 4.011, + "step": 31875 + }, + { + "epoch": 625.1, + "learning_rate": 7.60131247113587e-05, + "loss": 0.4274, + "step": 31880 + }, + { + "epoch": 625.29, + "learning_rate": 7.597260204208328e-05, + "loss": 0.4307, + "step": 31890 + }, + { + "epoch": 625.49, + "learning_rate": 7.593207908882977e-05, + "loss": 0.4314, + "step": 31900 + }, + { + "epoch": 625.69, + "learning_rate": 7.589155586342992e-05, + "loss": 0.4292, + "step": 31910 + }, + { + "epoch": 625.88, + "learning_rate": 7.585103237771566e-05, + "loss": 0.431, + "step": 31920 + }, + { + "epoch": 626.0, + "eval_loss": 0.43281376361846924, + "eval_runtime": 2.193, + "eval_samples_per_second": 1039.194, + "eval_steps_per_second": 4.104, + "step": 31926 + }, + { + "epoch": 626.08, + "learning_rate": 7.581050864351893e-05, + "loss": 0.4305, + "step": 31930 + }, + { + "epoch": 626.27, + "learning_rate": 7.576998467267174e-05, + "loss": 0.435, + "step": 31940 + }, + { + "epoch": 626.47, + "learning_rate": 7.57294604770062e-05, + "loss": 0.4336, + "step": 31950 + }, + { + "epoch": 626.67, + "learning_rate": 7.568893606835449e-05, + "loss": 0.4347, + "step": 31960 + }, + { + "epoch": 626.86, + "learning_rate": 7.56484114585488e-05, + "loss": 0.4316, + "step": 31970 + }, + { + "epoch": 627.0, + "eval_loss": 0.4315592646598816, + "eval_runtime": 2.2604, + "eval_samples_per_second": 1008.212, + "eval_steps_per_second": 3.982, + "step": 31977 + }, + { + "epoch": 627.06, + "learning_rate": 7.56078866594214e-05, + "loss": 0.4269, + "step": 31980 + }, + { + "epoch": 627.25, + "learning_rate": 7.556736168280467e-05, + "loss": 0.4258, + "step": 31990 + }, + { + "epoch": 627.45, + "learning_rate": 7.552683654053099e-05, + "loss": 0.431, + "step": 32000 + }, + { + "epoch": 627.65, + "learning_rate": 7.548631124443279e-05, + "loss": 0.4276, + "step": 32010 + }, + { + "epoch": 627.84, + "learning_rate": 7.544578580634253e-05, + "loss": 0.4325, + "step": 32020 + }, + { + "epoch": 628.0, + "eval_loss": 0.43107348680496216, + "eval_runtime": 2.2599, + "eval_samples_per_second": 1008.442, + "eval_steps_per_second": 3.982, + "step": 32028 + }, + { + "epoch": 628.04, + "learning_rate": 7.54052602380928e-05, + "loss": 0.4279, + "step": 32030 + }, + { + "epoch": 628.24, + "learning_rate": 7.536473455151605e-05, + "loss": 0.4284, + "step": 32040 + }, + { + "epoch": 628.43, + "learning_rate": 7.532420875844502e-05, + "loss": 0.4271, + "step": 32050 + }, + { + "epoch": 628.63, + "learning_rate": 7.528368287071222e-05, + "loss": 0.4283, + "step": 32060 + }, + { + "epoch": 628.82, + "learning_rate": 7.524315690015034e-05, + "loss": 0.4287, + "step": 32070 + }, + { + "epoch": 629.0, + "eval_loss": 0.4322940707206726, + "eval_runtime": 2.2832, + "eval_samples_per_second": 998.171, + "eval_steps_per_second": 3.942, + "step": 32079 + }, + { + "epoch": 629.02, + "learning_rate": 7.52026308585921e-05, + "loss": 0.4283, + "step": 32080 + }, + { + "epoch": 629.22, + "learning_rate": 7.516210475787015e-05, + "loss": 0.4231, + "step": 32090 + }, + { + "epoch": 629.41, + "learning_rate": 7.512157860981725e-05, + "loss": 0.4285, + "step": 32100 + }, + { + "epoch": 629.61, + "learning_rate": 7.508105242626608e-05, + "loss": 0.4219, + "step": 32110 + }, + { + "epoch": 629.8, + "learning_rate": 7.504052621904941e-05, + "loss": 0.4272, + "step": 32120 + }, + { + "epoch": 630.0, + "learning_rate": 7.5e-05, + "loss": 0.4267, + "step": 32130 + }, + { + "epoch": 630.0, + "eval_loss": 0.4301910400390625, + "eval_runtime": 2.3346, + "eval_samples_per_second": 976.183, + "eval_steps_per_second": 3.855, + "step": 32130 + }, + { + "epoch": 630.2, + "learning_rate": 7.495947378095059e-05, + "loss": 0.4302, + "step": 32140 + }, + { + "epoch": 630.39, + "learning_rate": 7.49189475737339e-05, + "loss": 0.4287, + "step": 32150 + }, + { + "epoch": 630.59, + "learning_rate": 7.487842139018277e-05, + "loss": 0.4281, + "step": 32160 + }, + { + "epoch": 630.78, + "learning_rate": 7.483789524212983e-05, + "loss": 0.4288, + "step": 32170 + }, + { + "epoch": 630.98, + "learning_rate": 7.47973691414079e-05, + "loss": 0.426, + "step": 32180 + }, + { + "epoch": 631.0, + "eval_loss": 0.43417489528656006, + "eval_runtime": 2.248, + "eval_samples_per_second": 1013.791, + "eval_steps_per_second": 4.004, + "step": 32181 + }, + { + "epoch": 631.18, + "learning_rate": 7.475684309984963e-05, + "loss": 0.428, + "step": 32190 + }, + { + "epoch": 631.37, + "learning_rate": 7.471631712928778e-05, + "loss": 0.4266, + "step": 32200 + }, + { + "epoch": 631.57, + "learning_rate": 7.467579124155501e-05, + "loss": 0.4296, + "step": 32210 + }, + { + "epoch": 631.76, + "learning_rate": 7.463526544848393e-05, + "loss": 0.4288, + "step": 32220 + }, + { + "epoch": 631.96, + "learning_rate": 7.459473976190722e-05, + "loss": 0.4259, + "step": 32230 + }, + { + "epoch": 632.0, + "eval_loss": 0.4324003756046295, + "eval_runtime": 2.2505, + "eval_samples_per_second": 1012.666, + "eval_steps_per_second": 3.999, + "step": 32232 + }, + { + "epoch": 632.16, + "learning_rate": 7.455421419365746e-05, + "loss": 0.4285, + "step": 32240 + }, + { + "epoch": 632.35, + "learning_rate": 7.451368875556721e-05, + "loss": 0.4285, + "step": 32250 + }, + { + "epoch": 632.55, + "learning_rate": 7.4473163459469e-05, + "loss": 0.4277, + "step": 32260 + }, + { + "epoch": 632.75, + "learning_rate": 7.443263831719533e-05, + "loss": 0.4292, + "step": 32270 + }, + { + "epoch": 632.94, + "learning_rate": 7.439211334057861e-05, + "loss": 0.427, + "step": 32280 + }, + { + "epoch": 633.0, + "eval_loss": 0.4315228760242462, + "eval_runtime": 2.2264, + "eval_samples_per_second": 1023.622, + "eval_steps_per_second": 4.042, + "step": 32283 + }, + { + "epoch": 633.14, + "learning_rate": 7.435158854145122e-05, + "loss": 0.4282, + "step": 32290 + }, + { + "epoch": 633.33, + "learning_rate": 7.431106393164551e-05, + "loss": 0.4249, + "step": 32300 + }, + { + "epoch": 633.53, + "learning_rate": 7.427053952299378e-05, + "loss": 0.4264, + "step": 32310 + }, + { + "epoch": 633.73, + "learning_rate": 7.423001532732826e-05, + "loss": 0.425, + "step": 32320 + }, + { + "epoch": 633.92, + "learning_rate": 7.418949135648106e-05, + "loss": 0.4268, + "step": 32330 + }, + { + "epoch": 634.0, + "eval_loss": 0.4299897253513336, + "eval_runtime": 2.1933, + "eval_samples_per_second": 1039.059, + "eval_steps_per_second": 4.103, + "step": 32334 + }, + { + "epoch": 634.12, + "learning_rate": 7.414896762228434e-05, + "loss": 0.4227, + "step": 32340 + }, + { + "epoch": 634.31, + "learning_rate": 7.410844413657008e-05, + "loss": 0.4277, + "step": 32350 + }, + { + "epoch": 634.51, + "learning_rate": 7.406792091117022e-05, + "loss": 0.4286, + "step": 32360 + }, + { + "epoch": 634.71, + "learning_rate": 7.402739795791672e-05, + "loss": 0.4228, + "step": 32370 + }, + { + "epoch": 634.9, + "learning_rate": 7.398687528864128e-05, + "loss": 0.4251, + "step": 32380 + }, + { + "epoch": 635.0, + "eval_loss": 0.4384912848472595, + "eval_runtime": 2.2966, + "eval_samples_per_second": 992.338, + "eval_steps_per_second": 3.919, + "step": 32385 + }, + { + "epoch": 635.1, + "learning_rate": 7.394635291517568e-05, + "loss": 0.4337, + "step": 32390 + }, + { + "epoch": 635.29, + "learning_rate": 7.390583084935152e-05, + "loss": 0.4361, + "step": 32400 + }, + { + "epoch": 635.49, + "learning_rate": 7.386530910300036e-05, + "loss": 0.4254, + "step": 32410 + }, + { + "epoch": 635.69, + "learning_rate": 7.382478768795366e-05, + "loss": 0.4283, + "step": 32420 + }, + { + "epoch": 635.88, + "learning_rate": 7.37842666160427e-05, + "loss": 0.4291, + "step": 32430 + }, + { + "epoch": 636.0, + "eval_loss": 0.43578821420669556, + "eval_runtime": 2.3566, + "eval_samples_per_second": 967.062, + "eval_steps_per_second": 3.819, + "step": 32436 + }, + { + "epoch": 636.08, + "learning_rate": 7.37437458990988e-05, + "loss": 0.4256, + "step": 32440 + }, + { + "epoch": 636.27, + "learning_rate": 7.370322554895306e-05, + "loss": 0.4282, + "step": 32450 + }, + { + "epoch": 636.47, + "learning_rate": 7.366270557743655e-05, + "loss": 0.4266, + "step": 32460 + }, + { + "epoch": 636.67, + "learning_rate": 7.362218599638018e-05, + "loss": 0.4216, + "step": 32470 + }, + { + "epoch": 636.86, + "learning_rate": 7.35816668176148e-05, + "loss": 0.4273, + "step": 32480 + }, + { + "epoch": 637.0, + "eval_loss": 0.43420571088790894, + "eval_runtime": 2.2015, + "eval_samples_per_second": 1035.206, + "eval_steps_per_second": 4.088, + "step": 32487 + }, + { + "epoch": 637.06, + "learning_rate": 7.354114805297107e-05, + "loss": 0.4285, + "step": 32490 + }, + { + "epoch": 637.25, + "learning_rate": 7.350062971427954e-05, + "loss": 0.4278, + "step": 32500 + }, + { + "epoch": 637.45, + "learning_rate": 7.346011181337071e-05, + "loss": 0.424, + "step": 32510 + }, + { + "epoch": 637.65, + "learning_rate": 7.341959436207488e-05, + "loss": 0.4276, + "step": 32520 + }, + { + "epoch": 637.84, + "learning_rate": 7.337907737222228e-05, + "loss": 0.4238, + "step": 32530 + }, + { + "epoch": 638.0, + "eval_loss": 0.4311440587043762, + "eval_runtime": 2.2684, + "eval_samples_per_second": 1004.661, + "eval_steps_per_second": 3.968, + "step": 32538 + }, + { + "epoch": 638.04, + "learning_rate": 7.333856085564293e-05, + "loss": 0.4303, + "step": 32540 + }, + { + "epoch": 638.24, + "learning_rate": 7.329804482416673e-05, + "loss": 0.4302, + "step": 32550 + }, + { + "epoch": 638.43, + "learning_rate": 7.325752928962352e-05, + "loss": 0.4234, + "step": 32560 + }, + { + "epoch": 638.63, + "learning_rate": 7.321701426384285e-05, + "loss": 0.4266, + "step": 32570 + }, + { + "epoch": 638.82, + "learning_rate": 7.31764997586543e-05, + "loss": 0.4262, + "step": 32580 + }, + { + "epoch": 639.0, + "eval_loss": 0.432700514793396, + "eval_runtime": 2.3427, + "eval_samples_per_second": 972.821, + "eval_steps_per_second": 3.842, + "step": 32589 + }, + { + "epoch": 639.02, + "learning_rate": 7.313598578588712e-05, + "loss": 0.4235, + "step": 32590 + }, + { + "epoch": 639.22, + "learning_rate": 7.309547235737053e-05, + "loss": 0.4276, + "step": 32600 + }, + { + "epoch": 639.41, + "learning_rate": 7.305495948493354e-05, + "loss": 0.4363, + "step": 32610 + }, + { + "epoch": 639.61, + "learning_rate": 7.301444718040499e-05, + "loss": 0.4284, + "step": 32620 + }, + { + "epoch": 639.8, + "learning_rate": 7.29739354556136e-05, + "loss": 0.4243, + "step": 32630 + }, + { + "epoch": 640.0, + "learning_rate": 7.293342432238786e-05, + "loss": 0.4251, + "step": 32640 + }, + { + "epoch": 640.0, + "eval_loss": 0.43292704224586487, + "eval_runtime": 2.3486, + "eval_samples_per_second": 970.382, + "eval_steps_per_second": 3.832, + "step": 32640 + }, + { + "epoch": 640.2, + "learning_rate": 7.289291379255611e-05, + "loss": 0.4293, + "step": 32650 + }, + { + "epoch": 640.39, + "learning_rate": 7.285240387794655e-05, + "loss": 0.433, + "step": 32660 + }, + { + "epoch": 640.59, + "learning_rate": 7.281189459038718e-05, + "loss": 0.4284, + "step": 32670 + }, + { + "epoch": 640.78, + "learning_rate": 7.27713859417058e-05, + "loss": 0.4247, + "step": 32680 + }, + { + "epoch": 640.98, + "learning_rate": 7.273087794373e-05, + "loss": 0.4276, + "step": 32690 + }, + { + "epoch": 641.0, + "eval_loss": 0.43443429470062256, + "eval_runtime": 2.2905, + "eval_samples_per_second": 994.976, + "eval_steps_per_second": 3.929, + "step": 32691 + }, + { + "epoch": 641.18, + "learning_rate": 7.269037060828723e-05, + "loss": 0.4231, + "step": 32700 + }, + { + "epoch": 641.37, + "learning_rate": 7.264986394720473e-05, + "loss": 0.4244, + "step": 32710 + }, + { + "epoch": 641.57, + "learning_rate": 7.260935797230956e-05, + "loss": 0.4202, + "step": 32720 + }, + { + "epoch": 641.76, + "learning_rate": 7.256885269542851e-05, + "loss": 0.4239, + "step": 32730 + }, + { + "epoch": 641.96, + "learning_rate": 7.252834812838831e-05, + "loss": 0.4274, + "step": 32740 + }, + { + "epoch": 642.0, + "eval_loss": 0.43040502071380615, + "eval_runtime": 2.223, + "eval_samples_per_second": 1025.208, + "eval_steps_per_second": 4.049, + "step": 32742 + }, + { + "epoch": 642.16, + "learning_rate": 7.248784428301531e-05, + "loss": 0.4222, + "step": 32750 + }, + { + "epoch": 642.35, + "learning_rate": 7.244734117113573e-05, + "loss": 0.4268, + "step": 32760 + }, + { + "epoch": 642.55, + "learning_rate": 7.240683880457563e-05, + "loss": 0.4229, + "step": 32770 + }, + { + "epoch": 642.75, + "learning_rate": 7.236633719516073e-05, + "loss": 0.425, + "step": 32780 + }, + { + "epoch": 642.94, + "learning_rate": 7.232583635471668e-05, + "loss": 0.4269, + "step": 32790 + }, + { + "epoch": 643.0, + "eval_loss": 0.42628300189971924, + "eval_runtime": 2.2073, + "eval_samples_per_second": 1032.467, + "eval_steps_per_second": 4.077, + "step": 32793 + }, + { + "epoch": 643.14, + "learning_rate": 7.228533629506874e-05, + "loss": 0.4247, + "step": 32800 + }, + { + "epoch": 643.33, + "learning_rate": 7.224483702804207e-05, + "loss": 0.4222, + "step": 32810 + }, + { + "epoch": 643.53, + "learning_rate": 7.220433856546153e-05, + "loss": 0.4268, + "step": 32820 + }, + { + "epoch": 643.73, + "learning_rate": 7.216384091915178e-05, + "loss": 0.4273, + "step": 32830 + }, + { + "epoch": 643.92, + "learning_rate": 7.212334410093727e-05, + "loss": 0.4217, + "step": 32840 + }, + { + "epoch": 644.0, + "eval_loss": 0.43053871393203735, + "eval_runtime": 2.4095, + "eval_samples_per_second": 945.824, + "eval_steps_per_second": 3.735, + "step": 32844 + }, + { + "epoch": 644.12, + "learning_rate": 7.208284812264208e-05, + "loss": 0.43, + "step": 32850 + }, + { + "epoch": 644.31, + "learning_rate": 7.20423529960902e-05, + "loss": 0.424, + "step": 32860 + }, + { + "epoch": 644.51, + "learning_rate": 7.200185873310526e-05, + "loss": 0.4227, + "step": 32870 + }, + { + "epoch": 644.71, + "learning_rate": 7.196136534551073e-05, + "loss": 0.4279, + "step": 32880 + }, + { + "epoch": 644.9, + "learning_rate": 7.192087284512977e-05, + "loss": 0.4204, + "step": 32890 + }, + { + "epoch": 645.0, + "eval_loss": 0.431392639875412, + "eval_runtime": 2.3078, + "eval_samples_per_second": 987.517, + "eval_steps_per_second": 3.9, + "step": 32895 + }, + { + "epoch": 645.1, + "learning_rate": 7.188038124378522e-05, + "loss": 0.4231, + "step": 32900 + }, + { + "epoch": 645.29, + "learning_rate": 7.18398905532998e-05, + "loss": 0.4287, + "step": 32910 + }, + { + "epoch": 645.49, + "learning_rate": 7.179940078549585e-05, + "loss": 0.4257, + "step": 32920 + }, + { + "epoch": 645.69, + "learning_rate": 7.17589119521955e-05, + "loss": 0.4217, + "step": 32930 + }, + { + "epoch": 645.88, + "learning_rate": 7.171842406522055e-05, + "loss": 0.4268, + "step": 32940 + }, + { + "epoch": 646.0, + "eval_loss": 0.4283953607082367, + "eval_runtime": 2.1783, + "eval_samples_per_second": 1046.25, + "eval_steps_per_second": 4.132, + "step": 32946 + }, + { + "epoch": 646.08, + "learning_rate": 7.167793713639264e-05, + "loss": 0.4245, + "step": 32950 + }, + { + "epoch": 646.27, + "learning_rate": 7.163745117753296e-05, + "loss": 0.4217, + "step": 32960 + }, + { + "epoch": 646.47, + "learning_rate": 7.159696620046254e-05, + "loss": 0.4245, + "step": 32970 + }, + { + "epoch": 646.67, + "learning_rate": 7.15564822170021e-05, + "loss": 0.4269, + "step": 32980 + }, + { + "epoch": 646.86, + "learning_rate": 7.151599923897207e-05, + "loss": 0.4227, + "step": 32990 + }, + { + "epoch": 647.0, + "eval_loss": 0.42809709906578064, + "eval_runtime": 2.3106, + "eval_samples_per_second": 986.327, + "eval_steps_per_second": 3.895, + "step": 32997 + }, + { + "epoch": 647.06, + "learning_rate": 7.147551727819256e-05, + "loss": 0.4247, + "step": 33000 + }, + { + "epoch": 647.25, + "learning_rate": 7.143503634648338e-05, + "loss": 0.4278, + "step": 33010 + }, + { + "epoch": 647.45, + "learning_rate": 7.13945564556641e-05, + "loss": 0.424, + "step": 33020 + }, + { + "epoch": 647.65, + "learning_rate": 7.135407761755393e-05, + "loss": 0.4287, + "step": 33030 + }, + { + "epoch": 647.84, + "learning_rate": 7.131359984397175e-05, + "loss": 0.4236, + "step": 33040 + }, + { + "epoch": 648.0, + "eval_loss": 0.4320383071899414, + "eval_runtime": 2.2798, + "eval_samples_per_second": 999.632, + "eval_steps_per_second": 3.948, + "step": 33048 + }, + { + "epoch": 648.04, + "learning_rate": 7.127312314673624e-05, + "loss": 0.4257, + "step": 33050 + }, + { + "epoch": 648.24, + "learning_rate": 7.123264753766563e-05, + "loss": 0.4239, + "step": 33060 + }, + { + "epoch": 648.43, + "learning_rate": 7.119217302857792e-05, + "loss": 0.4216, + "step": 33070 + }, + { + "epoch": 648.63, + "learning_rate": 7.115169963129076e-05, + "loss": 0.4278, + "step": 33080 + }, + { + "epoch": 648.82, + "learning_rate": 7.11112273576215e-05, + "loss": 0.4245, + "step": 33090 + }, + { + "epoch": 649.0, + "eval_loss": 0.42947202920913696, + "eval_runtime": 2.2983, + "eval_samples_per_second": 991.598, + "eval_steps_per_second": 3.916, + "step": 33099 + }, + { + "epoch": 649.02, + "learning_rate": 7.107075621938714e-05, + "loss": 0.4214, + "step": 33100 + }, + { + "epoch": 649.22, + "learning_rate": 7.103028622840429e-05, + "loss": 0.4207, + "step": 33110 + }, + { + "epoch": 649.41, + "learning_rate": 7.098981739648934e-05, + "loss": 0.4245, + "step": 33120 + }, + { + "epoch": 649.61, + "learning_rate": 7.094934973545827e-05, + "loss": 0.4224, + "step": 33130 + }, + { + "epoch": 649.8, + "learning_rate": 7.090888325712676e-05, + "loss": 0.4268, + "step": 33140 + }, + { + "epoch": 650.0, + "learning_rate": 7.086841797331007e-05, + "loss": 0.4229, + "step": 33150 + }, + { + "epoch": 650.0, + "eval_loss": 0.42622441053390503, + "eval_runtime": 2.3563, + "eval_samples_per_second": 967.185, + "eval_steps_per_second": 3.82, + "step": 33150 + }, + { + "epoch": 650.2, + "learning_rate": 7.082795389582323e-05, + "loss": 0.4253, + "step": 33160 + }, + { + "epoch": 650.39, + "learning_rate": 7.078749103648079e-05, + "loss": 0.4191, + "step": 33170 + }, + { + "epoch": 650.59, + "learning_rate": 7.074702940709699e-05, + "loss": 0.4206, + "step": 33180 + }, + { + "epoch": 650.78, + "learning_rate": 7.070656901948578e-05, + "loss": 0.4246, + "step": 33190 + }, + { + "epoch": 650.98, + "learning_rate": 7.066610988546065e-05, + "loss": 0.423, + "step": 33200 + }, + { + "epoch": 651.0, + "eval_loss": 0.4238925576210022, + "eval_runtime": 2.2734, + "eval_samples_per_second": 1002.449, + "eval_steps_per_second": 3.959, + "step": 33201 + }, + { + "epoch": 651.18, + "learning_rate": 7.06256520168348e-05, + "loss": 0.4184, + "step": 33210 + }, + { + "epoch": 651.37, + "learning_rate": 7.0585195425421e-05, + "loss": 0.421, + "step": 33220 + }, + { + "epoch": 651.57, + "learning_rate": 7.054474012303166e-05, + "loss": 0.4231, + "step": 33230 + }, + { + "epoch": 651.76, + "learning_rate": 7.050428612147885e-05, + "loss": 0.4226, + "step": 33240 + }, + { + "epoch": 651.96, + "learning_rate": 7.046383343257421e-05, + "loss": 0.4209, + "step": 33250 + }, + { + "epoch": 652.0, + "eval_loss": 0.42940622568130493, + "eval_runtime": 2.2106, + "eval_samples_per_second": 1030.937, + "eval_steps_per_second": 4.071, + "step": 33252 + }, + { + "epoch": 652.16, + "learning_rate": 7.042338206812907e-05, + "loss": 0.4214, + "step": 33260 + }, + { + "epoch": 652.35, + "learning_rate": 7.038293203995428e-05, + "loss": 0.4221, + "step": 33270 + }, + { + "epoch": 652.55, + "learning_rate": 7.034248335986037e-05, + "loss": 0.4252, + "step": 33280 + }, + { + "epoch": 652.75, + "learning_rate": 7.030203603965747e-05, + "loss": 0.4217, + "step": 33290 + }, + { + "epoch": 652.94, + "learning_rate": 7.026159009115522e-05, + "loss": 0.4209, + "step": 33300 + }, + { + "epoch": 653.0, + "eval_loss": 0.43150344491004944, + "eval_runtime": 2.3003, + "eval_samples_per_second": 990.759, + "eval_steps_per_second": 3.913, + "step": 33303 + }, + { + "epoch": 653.14, + "learning_rate": 7.022114552616304e-05, + "loss": 0.4225, + "step": 33310 + }, + { + "epoch": 653.33, + "learning_rate": 7.018070235648975e-05, + "loss": 0.421, + "step": 33320 + }, + { + "epoch": 653.53, + "learning_rate": 7.01402605939439e-05, + "loss": 0.4221, + "step": 33330 + }, + { + "epoch": 653.73, + "learning_rate": 7.009982025033356e-05, + "loss": 0.4231, + "step": 33340 + }, + { + "epoch": 653.92, + "learning_rate": 7.005938133746645e-05, + "loss": 0.425, + "step": 33350 + }, + { + "epoch": 654.0, + "eval_loss": 0.42987799644470215, + "eval_runtime": 2.1499, + "eval_samples_per_second": 1060.042, + "eval_steps_per_second": 4.186, + "step": 33354 + }, + { + "epoch": 654.12, + "learning_rate": 7.001894386714981e-05, + "loss": 0.4214, + "step": 33360 + }, + { + "epoch": 654.31, + "learning_rate": 6.997850785119044e-05, + "loss": 0.4247, + "step": 33370 + }, + { + "epoch": 654.51, + "learning_rate": 6.993807330139481e-05, + "loss": 0.4203, + "step": 33380 + }, + { + "epoch": 654.71, + "learning_rate": 6.989764022956885e-05, + "loss": 0.4245, + "step": 33390 + }, + { + "epoch": 654.9, + "learning_rate": 6.985720864751819e-05, + "loss": 0.418, + "step": 33400 + }, + { + "epoch": 655.0, + "eval_loss": 0.42820972204208374, + "eval_runtime": 2.2377, + "eval_samples_per_second": 1018.475, + "eval_steps_per_second": 4.022, + "step": 33405 + }, + { + "epoch": 655.1, + "learning_rate": 6.981677856704788e-05, + "loss": 0.4256, + "step": 33410 + }, + { + "epoch": 655.29, + "learning_rate": 6.977634999996266e-05, + "loss": 0.4199, + "step": 33420 + }, + { + "epoch": 655.49, + "learning_rate": 6.973592295806673e-05, + "loss": 0.4251, + "step": 33430 + }, + { + "epoch": 655.69, + "learning_rate": 6.96954974531639e-05, + "loss": 0.4221, + "step": 33440 + }, + { + "epoch": 655.88, + "learning_rate": 6.96550734970575e-05, + "loss": 0.423, + "step": 33450 + }, + { + "epoch": 656.0, + "eval_loss": 0.4263513386249542, + "eval_runtime": 2.317, + "eval_samples_per_second": 983.589, + "eval_steps_per_second": 3.884, + "step": 33456 + }, + { + "epoch": 656.08, + "learning_rate": 6.961465110155042e-05, + "loss": 0.4227, + "step": 33460 + }, + { + "epoch": 656.27, + "learning_rate": 6.957423027844515e-05, + "loss": 0.425, + "step": 33470 + }, + { + "epoch": 656.47, + "learning_rate": 6.95338110395436e-05, + "loss": 0.419, + "step": 33480 + }, + { + "epoch": 656.67, + "learning_rate": 6.949339339664735e-05, + "loss": 0.4221, + "step": 33490 + }, + { + "epoch": 656.86, + "learning_rate": 6.945297736155742e-05, + "loss": 0.4267, + "step": 33500 + }, + { + "epoch": 657.0, + "eval_loss": 0.42961037158966064, + "eval_runtime": 2.3194, + "eval_samples_per_second": 982.598, + "eval_steps_per_second": 3.88, + "step": 33507 + }, + { + "epoch": 657.06, + "learning_rate": 6.941256294607435e-05, + "loss": 0.4198, + "step": 33510 + }, + { + "epoch": 657.25, + "learning_rate": 6.937215016199833e-05, + "loss": 0.4256, + "step": 33520 + }, + { + "epoch": 657.45, + "learning_rate": 6.933173902112892e-05, + "loss": 0.4255, + "step": 33530 + }, + { + "epoch": 657.65, + "learning_rate": 6.929132953526531e-05, + "loss": 0.424, + "step": 33540 + }, + { + "epoch": 657.84, + "learning_rate": 6.925092171620616e-05, + "loss": 0.4226, + "step": 33550 + }, + { + "epoch": 658.0, + "eval_loss": 0.4268750548362732, + "eval_runtime": 2.2466, + "eval_samples_per_second": 1014.438, + "eval_steps_per_second": 4.006, + "step": 33558 + }, + { + "epoch": 658.04, + "learning_rate": 6.921051557574965e-05, + "loss": 0.4151, + "step": 33560 + }, + { + "epoch": 658.24, + "learning_rate": 6.91701111256935e-05, + "loss": 0.4231, + "step": 33570 + }, + { + "epoch": 658.43, + "learning_rate": 6.912970837783485e-05, + "loss": 0.4223, + "step": 33580 + }, + { + "epoch": 658.63, + "learning_rate": 6.908930734397044e-05, + "loss": 0.4202, + "step": 33590 + }, + { + "epoch": 658.82, + "learning_rate": 6.904890803589645e-05, + "loss": 0.4213, + "step": 33600 + }, + { + "epoch": 659.0, + "eval_loss": 0.42958545684814453, + "eval_runtime": 2.3495, + "eval_samples_per_second": 969.976, + "eval_steps_per_second": 3.831, + "step": 33609 + }, + { + "epoch": 659.02, + "learning_rate": 6.900851046540862e-05, + "loss": 0.4146, + "step": 33610 + }, + { + "epoch": 659.22, + "learning_rate": 6.896811464430209e-05, + "loss": 0.4223, + "step": 33620 + }, + { + "epoch": 659.41, + "learning_rate": 6.892772058437158e-05, + "loss": 0.4202, + "step": 33630 + }, + { + "epoch": 659.61, + "learning_rate": 6.888732829741124e-05, + "loss": 0.4214, + "step": 33640 + }, + { + "epoch": 659.8, + "learning_rate": 6.884693779521468e-05, + "loss": 0.4244, + "step": 33650 + }, + { + "epoch": 660.0, + "learning_rate": 6.880654908957507e-05, + "loss": 0.4192, + "step": 33660 + }, + { + "epoch": 660.0, + "eval_loss": 0.4259309470653534, + "eval_runtime": 2.3163, + "eval_samples_per_second": 983.884, + "eval_steps_per_second": 3.885, + "step": 33660 + }, + { + "epoch": 660.2, + "learning_rate": 6.876616219228499e-05, + "loss": 0.4216, + "step": 33670 + }, + { + "epoch": 660.39, + "learning_rate": 6.872577711513655e-05, + "loss": 0.4201, + "step": 33680 + }, + { + "epoch": 660.59, + "learning_rate": 6.868539386992124e-05, + "loss": 0.4154, + "step": 33690 + }, + { + "epoch": 660.78, + "learning_rate": 6.86450124684301e-05, + "loss": 0.4229, + "step": 33700 + }, + { + "epoch": 660.98, + "learning_rate": 6.860463292245359e-05, + "loss": 0.4234, + "step": 33710 + }, + { + "epoch": 661.0, + "eval_loss": 0.42434313893318176, + "eval_runtime": 2.3499, + "eval_samples_per_second": 969.824, + "eval_steps_per_second": 3.83, + "step": 33711 + }, + { + "epoch": 661.18, + "learning_rate": 6.856425524378163e-05, + "loss": 0.4272, + "step": 33720 + }, + { + "epoch": 661.37, + "learning_rate": 6.852387944420363e-05, + "loss": 0.4188, + "step": 33730 + }, + { + "epoch": 661.57, + "learning_rate": 6.848350553550837e-05, + "loss": 0.4209, + "step": 33740 + }, + { + "epoch": 661.76, + "learning_rate": 6.844313352948416e-05, + "loss": 0.4176, + "step": 33750 + }, + { + "epoch": 661.96, + "learning_rate": 6.840276343791873e-05, + "loss": 0.4205, + "step": 33760 + }, + { + "epoch": 662.0, + "eval_loss": 0.42560145258903503, + "eval_runtime": 2.2243, + "eval_samples_per_second": 1024.574, + "eval_steps_per_second": 4.046, + "step": 33762 + }, + { + "epoch": 662.16, + "learning_rate": 6.836239527259926e-05, + "loss": 0.4257, + "step": 33770 + }, + { + "epoch": 662.35, + "learning_rate": 6.832202904531235e-05, + "loss": 0.4229, + "step": 33780 + }, + { + "epoch": 662.55, + "learning_rate": 6.8281664767844e-05, + "loss": 0.4202, + "step": 33790 + }, + { + "epoch": 662.75, + "learning_rate": 6.824130245197971e-05, + "loss": 0.4254, + "step": 33800 + }, + { + "epoch": 662.94, + "learning_rate": 6.820094210950436e-05, + "loss": 0.4185, + "step": 33810 + }, + { + "epoch": 663.0, + "eval_loss": 0.4250730872154236, + "eval_runtime": 2.2197, + "eval_samples_per_second": 1026.736, + "eval_steps_per_second": 4.055, + "step": 33813 + }, + { + "epoch": 663.14, + "learning_rate": 6.81605837522023e-05, + "loss": 0.424, + "step": 33820 + }, + { + "epoch": 663.33, + "learning_rate": 6.812022739185722e-05, + "loss": 0.4151, + "step": 33830 + }, + { + "epoch": 663.53, + "learning_rate": 6.807987304025235e-05, + "loss": 0.4183, + "step": 33840 + }, + { + "epoch": 663.73, + "learning_rate": 6.80395207091702e-05, + "loss": 0.4142, + "step": 33850 + }, + { + "epoch": 663.92, + "learning_rate": 6.799917041039276e-05, + "loss": 0.4212, + "step": 33860 + }, + { + "epoch": 664.0, + "eval_loss": 0.42312219738960266, + "eval_runtime": 2.2415, + "eval_samples_per_second": 1016.727, + "eval_steps_per_second": 4.015, + "step": 33864 + }, + { + "epoch": 664.12, + "learning_rate": 6.795882215570143e-05, + "loss": 0.4202, + "step": 33870 + }, + { + "epoch": 664.31, + "learning_rate": 6.7918475956877e-05, + "loss": 0.4145, + "step": 33880 + }, + { + "epoch": 664.51, + "learning_rate": 6.787813182569968e-05, + "loss": 0.4202, + "step": 33890 + }, + { + "epoch": 664.71, + "learning_rate": 6.783778977394899e-05, + "loss": 0.4203, + "step": 33900 + }, + { + "epoch": 664.9, + "learning_rate": 6.779744981340399e-05, + "loss": 0.4228, + "step": 33910 + }, + { + "epoch": 665.0, + "eval_loss": 0.42498070001602173, + "eval_runtime": 2.2609, + "eval_samples_per_second": 1008.014, + "eval_steps_per_second": 3.981, + "step": 33915 + }, + { + "epoch": 665.1, + "learning_rate": 6.775711195584299e-05, + "loss": 0.421, + "step": 33920 + }, + { + "epoch": 665.29, + "learning_rate": 6.771677621304376e-05, + "loss": 0.4228, + "step": 33930 + }, + { + "epoch": 665.49, + "learning_rate": 6.767644259678348e-05, + "loss": 0.4227, + "step": 33940 + }, + { + "epoch": 665.69, + "learning_rate": 6.76361111188386e-05, + "loss": 0.4181, + "step": 33950 + }, + { + "epoch": 665.88, + "learning_rate": 6.759578179098505e-05, + "loss": 0.421, + "step": 33960 + }, + { + "epoch": 666.0, + "eval_loss": 0.42840486764907837, + "eval_runtime": 2.2641, + "eval_samples_per_second": 1006.588, + "eval_steps_per_second": 3.975, + "step": 33966 + }, + { + "epoch": 666.08, + "learning_rate": 6.755545462499812e-05, + "loss": 0.4178, + "step": 33970 + }, + { + "epoch": 666.27, + "learning_rate": 6.751512963265234e-05, + "loss": 0.419, + "step": 33980 + }, + { + "epoch": 666.47, + "learning_rate": 6.747480682572185e-05, + "loss": 0.4198, + "step": 33990 + }, + { + "epoch": 666.67, + "learning_rate": 6.743448621597988e-05, + "loss": 0.4206, + "step": 34000 + }, + { + "epoch": 666.86, + "learning_rate": 6.739416781519924e-05, + "loss": 0.4226, + "step": 34010 + }, + { + "epoch": 667.0, + "eval_loss": 0.4243127107620239, + "eval_runtime": 2.2983, + "eval_samples_per_second": 991.589, + "eval_steps_per_second": 3.916, + "step": 34017 + }, + { + "epoch": 667.06, + "learning_rate": 6.735385163515194e-05, + "loss": 0.4209, + "step": 34020 + }, + { + "epoch": 667.25, + "learning_rate": 6.731353768760947e-05, + "loss": 0.4203, + "step": 34030 + }, + { + "epoch": 667.45, + "learning_rate": 6.727322598434259e-05, + "loss": 0.4172, + "step": 34040 + }, + { + "epoch": 667.65, + "learning_rate": 6.723291653712135e-05, + "loss": 0.4184, + "step": 34050 + }, + { + "epoch": 667.84, + "learning_rate": 6.719260935771529e-05, + "loss": 0.4201, + "step": 34060 + }, + { + "epoch": 668.0, + "eval_loss": 0.4278631806373596, + "eval_runtime": 2.1713, + "eval_samples_per_second": 1049.583, + "eval_steps_per_second": 4.145, + "step": 34068 + }, + { + "epoch": 668.04, + "learning_rate": 6.715230445789315e-05, + "loss": 0.424, + "step": 34070 + }, + { + "epoch": 668.24, + "learning_rate": 6.711200184942311e-05, + "loss": 0.4177, + "step": 34080 + }, + { + "epoch": 668.43, + "learning_rate": 6.70717015440726e-05, + "loss": 0.4189, + "step": 34090 + }, + { + "epoch": 668.63, + "learning_rate": 6.703140355360843e-05, + "loss": 0.4213, + "step": 34100 + }, + { + "epoch": 668.82, + "learning_rate": 6.69911078897967e-05, + "loss": 0.4213, + "step": 34110 + }, + { + "epoch": 669.0, + "eval_loss": 0.4210264980792999, + "eval_runtime": 2.3532, + "eval_samples_per_second": 968.452, + "eval_steps_per_second": 3.825, + "step": 34119 + }, + { + "epoch": 669.02, + "learning_rate": 6.695081456440284e-05, + "loss": 0.4204, + "step": 34120 + }, + { + "epoch": 669.22, + "learning_rate": 6.691052358919162e-05, + "loss": 0.4156, + "step": 34130 + }, + { + "epoch": 669.41, + "learning_rate": 6.687023497592709e-05, + "loss": 0.4221, + "step": 34140 + }, + { + "epoch": 669.61, + "learning_rate": 6.682994873637267e-05, + "loss": 0.42, + "step": 34150 + }, + { + "epoch": 669.8, + "learning_rate": 6.678966488229099e-05, + "loss": 0.4154, + "step": 34160 + }, + { + "epoch": 670.0, + "learning_rate": 6.674938342544404e-05, + "loss": 0.4237, + "step": 34170 + }, + { + "epoch": 670.0, + "eval_loss": 0.4264044165611267, + "eval_runtime": 2.2635, + "eval_samples_per_second": 1006.853, + "eval_steps_per_second": 3.976, + "step": 34170 + }, + { + "epoch": 670.2, + "learning_rate": 6.670910437759317e-05, + "loss": 0.4231, + "step": 34180 + }, + { + "epoch": 670.39, + "learning_rate": 6.666882775049885e-05, + "loss": 0.4167, + "step": 34190 + }, + { + "epoch": 670.59, + "learning_rate": 6.662855355592109e-05, + "loss": 0.4183, + "step": 34200 + }, + { + "epoch": 670.78, + "learning_rate": 6.658828180561893e-05, + "loss": 0.4166, + "step": 34210 + }, + { + "epoch": 670.98, + "learning_rate": 6.654801251135092e-05, + "loss": 0.4228, + "step": 34220 + }, + { + "epoch": 671.0, + "eval_loss": 0.4236544668674469, + "eval_runtime": 2.3232, + "eval_samples_per_second": 980.983, + "eval_steps_per_second": 3.874, + "step": 34221 + }, + { + "epoch": 671.18, + "learning_rate": 6.650774568487473e-05, + "loss": 0.4168, + "step": 34230 + }, + { + "epoch": 671.37, + "learning_rate": 6.646748133794743e-05, + "loss": 0.4202, + "step": 34240 + }, + { + "epoch": 671.57, + "learning_rate": 6.64272194823253e-05, + "loss": 0.4232, + "step": 34250 + }, + { + "epoch": 671.76, + "learning_rate": 6.638696012976386e-05, + "loss": 0.4189, + "step": 34260 + }, + { + "epoch": 671.96, + "learning_rate": 6.634670329201798e-05, + "loss": 0.4181, + "step": 34270 + }, + { + "epoch": 672.0, + "eval_loss": 0.4245344400405884, + "eval_runtime": 2.3276, + "eval_samples_per_second": 979.099, + "eval_steps_per_second": 3.867, + "step": 34272 + }, + { + "epoch": 672.16, + "learning_rate": 6.630644898084175e-05, + "loss": 0.4182, + "step": 34280 + }, + { + "epoch": 672.35, + "learning_rate": 6.626619720798854e-05, + "loss": 0.4228, + "step": 34290 + }, + { + "epoch": 672.55, + "learning_rate": 6.622594798521094e-05, + "loss": 0.4226, + "step": 34300 + }, + { + "epoch": 672.75, + "learning_rate": 6.618570132426088e-05, + "loss": 0.4187, + "step": 34310 + }, + { + "epoch": 672.94, + "learning_rate": 6.614545723688945e-05, + "loss": 0.4242, + "step": 34320 + }, + { + "epoch": 673.0, + "eval_loss": 0.42444732785224915, + "eval_runtime": 2.1962, + "eval_samples_per_second": 1037.713, + "eval_steps_per_second": 4.098, + "step": 34323 + }, + { + "epoch": 673.14, + "learning_rate": 6.610521573484701e-05, + "loss": 0.4215, + "step": 34330 + }, + { + "epoch": 673.33, + "learning_rate": 6.606497682988323e-05, + "loss": 0.4206, + "step": 34340 + }, + { + "epoch": 673.53, + "learning_rate": 6.60247405337469e-05, + "loss": 0.4224, + "step": 34350 + }, + { + "epoch": 673.73, + "learning_rate": 6.598450685818622e-05, + "loss": 0.4226, + "step": 34360 + }, + { + "epoch": 673.92, + "learning_rate": 6.594427581494844e-05, + "loss": 0.4178, + "step": 34370 + }, + { + "epoch": 674.0, + "eval_loss": 0.424979567527771, + "eval_runtime": 2.2868, + "eval_samples_per_second": 996.581, + "eval_steps_per_second": 3.936, + "step": 34374 + }, + { + "epoch": 674.12, + "learning_rate": 6.590404741578018e-05, + "loss": 0.4136, + "step": 34380 + }, + { + "epoch": 674.31, + "learning_rate": 6.586382167242722e-05, + "loss": 0.414, + "step": 34390 + }, + { + "epoch": 674.51, + "learning_rate": 6.582359859663454e-05, + "loss": 0.4187, + "step": 34400 + }, + { + "epoch": 674.71, + "learning_rate": 6.578337820014644e-05, + "loss": 0.4224, + "step": 34410 + }, + { + "epoch": 674.9, + "learning_rate": 6.574316049470635e-05, + "loss": 0.4184, + "step": 34420 + }, + { + "epoch": 675.0, + "eval_loss": 0.427442342042923, + "eval_runtime": 2.3158, + "eval_samples_per_second": 984.096, + "eval_steps_per_second": 3.886, + "step": 34425 + }, + { + "epoch": 675.1, + "learning_rate": 6.570294549205695e-05, + "loss": 0.4225, + "step": 34430 + }, + { + "epoch": 675.29, + "learning_rate": 6.56627332039401e-05, + "loss": 0.4172, + "step": 34440 + }, + { + "epoch": 675.49, + "learning_rate": 6.562252364209694e-05, + "loss": 0.4148, + "step": 34450 + }, + { + "epoch": 675.69, + "learning_rate": 6.558231681826776e-05, + "loss": 0.418, + "step": 34460 + }, + { + "epoch": 675.88, + "learning_rate": 6.5542112744192e-05, + "loss": 0.4163, + "step": 34470 + }, + { + "epoch": 676.0, + "eval_loss": 0.4221233129501343, + "eval_runtime": 2.3487, + "eval_samples_per_second": 970.314, + "eval_steps_per_second": 3.832, + "step": 34476 + }, + { + "epoch": 676.08, + "learning_rate": 6.550191143160839e-05, + "loss": 0.4191, + "step": 34480 + }, + { + "epoch": 676.27, + "learning_rate": 6.54617128922548e-05, + "loss": 0.4216, + "step": 34490 + }, + { + "epoch": 676.47, + "learning_rate": 6.542151713786834e-05, + "loss": 0.4202, + "step": 34500 + }, + { + "epoch": 676.67, + "learning_rate": 6.538132418018525e-05, + "loss": 0.4188, + "step": 34510 + }, + { + "epoch": 676.86, + "learning_rate": 6.5341134030941e-05, + "loss": 0.4288, + "step": 34520 + }, + { + "epoch": 677.0, + "eval_loss": 0.42452743649482727, + "eval_runtime": 2.2039, + "eval_samples_per_second": 1034.062, + "eval_steps_per_second": 4.084, + "step": 34527 + }, + { + "epoch": 677.06, + "learning_rate": 6.530094670187019e-05, + "loss": 0.4153, + "step": 34530 + }, + { + "epoch": 677.25, + "learning_rate": 6.526076220470661e-05, + "loss": 0.423, + "step": 34540 + }, + { + "epoch": 677.45, + "learning_rate": 6.52205805511833e-05, + "loss": 0.4241, + "step": 34550 + }, + { + "epoch": 677.65, + "learning_rate": 6.518040175303233e-05, + "loss": 0.4211, + "step": 34560 + }, + { + "epoch": 677.84, + "learning_rate": 6.514022582198508e-05, + "loss": 0.4205, + "step": 34570 + }, + { + "epoch": 678.0, + "eval_loss": 0.42583590745925903, + "eval_runtime": 2.3161, + "eval_samples_per_second": 983.968, + "eval_steps_per_second": 3.886, + "step": 34578 + }, + { + "epoch": 678.04, + "learning_rate": 6.510005276977197e-05, + "loss": 0.425, + "step": 34580 + }, + { + "epoch": 678.24, + "learning_rate": 6.505988260812268e-05, + "loss": 0.4236, + "step": 34590 + }, + { + "epoch": 678.43, + "learning_rate": 6.501971534876599e-05, + "loss": 0.4227, + "step": 34600 + }, + { + "epoch": 678.63, + "learning_rate": 6.497955100342979e-05, + "loss": 0.4136, + "step": 34610 + }, + { + "epoch": 678.82, + "learning_rate": 6.493938958384127e-05, + "loss": 0.4167, + "step": 34620 + }, + { + "epoch": 679.0, + "eval_loss": 0.4242996275424957, + "eval_runtime": 2.1858, + "eval_samples_per_second": 1042.625, + "eval_steps_per_second": 4.117, + "step": 34629 + }, + { + "epoch": 679.02, + "learning_rate": 6.489923110172659e-05, + "loss": 0.4198, + "step": 34630 + }, + { + "epoch": 679.22, + "learning_rate": 6.485907556881117e-05, + "loss": 0.4204, + "step": 34640 + }, + { + "epoch": 679.41, + "learning_rate": 6.481892299681953e-05, + "loss": 0.4209, + "step": 34650 + }, + { + "epoch": 679.61, + "learning_rate": 6.477877339747528e-05, + "loss": 0.4188, + "step": 34660 + }, + { + "epoch": 679.8, + "learning_rate": 6.473862678250128e-05, + "loss": 0.4228, + "step": 34670 + }, + { + "epoch": 680.0, + "learning_rate": 6.469848316361938e-05, + "loss": 0.4172, + "step": 34680 + }, + { + "epoch": 680.0, + "eval_loss": 0.4240746796131134, + "eval_runtime": 2.2674, + "eval_samples_per_second": 1005.137, + "eval_steps_per_second": 3.969, + "step": 34680 + }, + { + "epoch": 680.2, + "learning_rate": 6.465834255255067e-05, + "loss": 0.4173, + "step": 34690 + }, + { + "epoch": 680.39, + "learning_rate": 6.461820496101528e-05, + "loss": 0.4136, + "step": 34700 + }, + { + "epoch": 680.59, + "learning_rate": 6.45780704007325e-05, + "loss": 0.4175, + "step": 34710 + }, + { + "epoch": 680.78, + "learning_rate": 6.453793888342077e-05, + "loss": 0.4162, + "step": 34720 + }, + { + "epoch": 680.98, + "learning_rate": 6.449781042079752e-05, + "loss": 0.4212, + "step": 34730 + }, + { + "epoch": 681.0, + "eval_loss": 0.4216473698616028, + "eval_runtime": 2.2937, + "eval_samples_per_second": 993.581, + "eval_steps_per_second": 3.924, + "step": 34731 + }, + { + "epoch": 681.18, + "learning_rate": 6.445768502457942e-05, + "loss": 0.4212, + "step": 34740 + }, + { + "epoch": 681.37, + "learning_rate": 6.441756270648216e-05, + "loss": 0.4171, + "step": 34750 + }, + { + "epoch": 681.57, + "learning_rate": 6.43774434782206e-05, + "loss": 0.4208, + "step": 34760 + }, + { + "epoch": 681.76, + "learning_rate": 6.433732735150862e-05, + "loss": 0.4218, + "step": 34770 + }, + { + "epoch": 681.96, + "learning_rate": 6.429721433805928e-05, + "loss": 0.4164, + "step": 34780 + }, + { + "epoch": 682.0, + "eval_loss": 0.42144080996513367, + "eval_runtime": 2.2344, + "eval_samples_per_second": 1019.974, + "eval_steps_per_second": 4.028, + "step": 34782 + }, + { + "epoch": 682.16, + "learning_rate": 6.425710444958465e-05, + "loss": 0.4188, + "step": 34790 + }, + { + "epoch": 682.35, + "learning_rate": 6.42169976977959e-05, + "loss": 0.4141, + "step": 34800 + }, + { + "epoch": 682.55, + "learning_rate": 6.417689409440339e-05, + "loss": 0.4203, + "step": 34810 + }, + { + "epoch": 682.75, + "learning_rate": 6.413679365111635e-05, + "loss": 0.4194, + "step": 34820 + }, + { + "epoch": 682.94, + "learning_rate": 6.409669637964337e-05, + "loss": 0.4171, + "step": 34830 + }, + { + "epoch": 683.0, + "eval_loss": 0.42304977774620056, + "eval_runtime": 2.2703, + "eval_samples_per_second": 1003.818, + "eval_steps_per_second": 3.964, + "step": 34833 + }, + { + "epoch": 683.14, + "learning_rate": 6.405660229169183e-05, + "loss": 0.4168, + "step": 34840 + }, + { + "epoch": 683.33, + "learning_rate": 6.401651139896838e-05, + "loss": 0.4201, + "step": 34850 + }, + { + "epoch": 683.53, + "learning_rate": 6.397642371317866e-05, + "loss": 0.4153, + "step": 34860 + }, + { + "epoch": 683.73, + "learning_rate": 6.393633924602733e-05, + "loss": 0.4181, + "step": 34870 + }, + { + "epoch": 683.92, + "learning_rate": 6.389625800921824e-05, + "loss": 0.4166, + "step": 34880 + }, + { + "epoch": 684.0, + "eval_loss": 0.42609888315200806, + "eval_runtime": 2.3337, + "eval_samples_per_second": 976.565, + "eval_steps_per_second": 3.857, + "step": 34884 + }, + { + "epoch": 684.12, + "learning_rate": 6.385618001445413e-05, + "loss": 0.4207, + "step": 34890 + }, + { + "epoch": 684.31, + "learning_rate": 6.381610527343694e-05, + "loss": 0.4198, + "step": 34900 + }, + { + "epoch": 684.51, + "learning_rate": 6.377603379786757e-05, + "loss": 0.415, + "step": 34910 + }, + { + "epoch": 684.71, + "learning_rate": 6.373596559944604e-05, + "loss": 0.4151, + "step": 34920 + }, + { + "epoch": 684.9, + "learning_rate": 6.369590068987135e-05, + "loss": 0.4172, + "step": 34930 + }, + { + "epoch": 685.0, + "eval_loss": 0.4224391579627991, + "eval_runtime": 2.3437, + "eval_samples_per_second": 972.39, + "eval_steps_per_second": 3.84, + "step": 34935 + }, + { + "epoch": 685.1, + "learning_rate": 6.365583908084152e-05, + "loss": 0.4098, + "step": 34940 + }, + { + "epoch": 685.29, + "learning_rate": 6.361578078405371e-05, + "loss": 0.4191, + "step": 34950 + }, + { + "epoch": 685.49, + "learning_rate": 6.357572581120399e-05, + "loss": 0.4186, + "step": 34960 + }, + { + "epoch": 685.69, + "learning_rate": 6.353567417398756e-05, + "loss": 0.4173, + "step": 34970 + }, + { + "epoch": 685.88, + "learning_rate": 6.349562588409858e-05, + "loss": 0.4188, + "step": 34980 + }, + { + "epoch": 686.0, + "eval_loss": 0.42092400789260864, + "eval_runtime": 2.2152, + "eval_samples_per_second": 1028.793, + "eval_steps_per_second": 4.063, + "step": 34986 + }, + { + "epoch": 686.08, + "learning_rate": 6.34555809532303e-05, + "loss": 0.422, + "step": 34990 + }, + { + "epoch": 686.27, + "learning_rate": 6.34155393930749e-05, + "loss": 0.416, + "step": 35000 + }, + { + "epoch": 686.47, + "learning_rate": 6.337550121532362e-05, + "loss": 0.4176, + "step": 35010 + }, + { + "epoch": 686.67, + "learning_rate": 6.333546643166678e-05, + "loss": 0.4183, + "step": 35020 + }, + { + "epoch": 686.86, + "learning_rate": 6.329543505379354e-05, + "loss": 0.4187, + "step": 35030 + }, + { + "epoch": 687.0, + "eval_loss": 0.41680261492729187, + "eval_runtime": 2.3457, + "eval_samples_per_second": 971.556, + "eval_steps_per_second": 3.837, + "step": 35037 + }, + { + "epoch": 687.06, + "learning_rate": 6.325540709339227e-05, + "loss": 0.4162, + "step": 35040 + }, + { + "epoch": 687.25, + "learning_rate": 6.321538256215017e-05, + "loss": 0.4152, + "step": 35050 + }, + { + "epoch": 687.45, + "learning_rate": 6.317536147175356e-05, + "loss": 0.4199, + "step": 35060 + }, + { + "epoch": 687.65, + "learning_rate": 6.31353438338877e-05, + "loss": 0.4167, + "step": 35070 + }, + { + "epoch": 687.84, + "learning_rate": 6.309532966023678e-05, + "loss": 0.4174, + "step": 35080 + }, + { + "epoch": 688.0, + "eval_loss": 0.4200821816921234, + "eval_runtime": 2.3042, + "eval_samples_per_second": 989.077, + "eval_steps_per_second": 3.906, + "step": 35088 + }, + { + "epoch": 688.04, + "learning_rate": 6.305531896248415e-05, + "loss": 0.4156, + "step": 35090 + }, + { + "epoch": 688.24, + "learning_rate": 6.301531175231196e-05, + "loss": 0.4127, + "step": 35100 + }, + { + "epoch": 688.43, + "learning_rate": 6.297530804140147e-05, + "loss": 0.4149, + "step": 35110 + }, + { + "epoch": 688.63, + "learning_rate": 6.293530784143284e-05, + "loss": 0.4141, + "step": 35120 + }, + { + "epoch": 688.82, + "learning_rate": 6.289531116408526e-05, + "loss": 0.4184, + "step": 35130 + }, + { + "epoch": 689.0, + "eval_loss": 0.41768765449523926, + "eval_runtime": 2.2735, + "eval_samples_per_second": 1002.432, + "eval_steps_per_second": 3.959, + "step": 35139 + }, + { + "epoch": 689.02, + "learning_rate": 6.285531802103688e-05, + "loss": 0.4164, + "step": 35140 + }, + { + "epoch": 689.22, + "learning_rate": 6.281532842396476e-05, + "loss": 0.4215, + "step": 35150 + }, + { + "epoch": 689.41, + "learning_rate": 6.2775342384545e-05, + "loss": 0.4152, + "step": 35160 + }, + { + "epoch": 689.61, + "learning_rate": 6.273535991445261e-05, + "loss": 0.4231, + "step": 35170 + }, + { + "epoch": 689.8, + "learning_rate": 6.269538102536163e-05, + "loss": 0.4135, + "step": 35180 + }, + { + "epoch": 690.0, + "learning_rate": 6.265540572894494e-05, + "loss": 0.4126, + "step": 35190 + }, + { + "epoch": 690.0, + "eval_loss": 0.41916319727897644, + "eval_runtime": 2.3438, + "eval_samples_per_second": 972.34, + "eval_steps_per_second": 3.84, + "step": 35190 + }, + { + "epoch": 690.2, + "learning_rate": 6.26154340368745e-05, + "loss": 0.4184, + "step": 35200 + }, + { + "epoch": 690.39, + "learning_rate": 6.25754659608211e-05, + "loss": 0.416, + "step": 35210 + }, + { + "epoch": 690.59, + "learning_rate": 6.253550151245454e-05, + "loss": 0.4136, + "step": 35220 + }, + { + "epoch": 690.78, + "learning_rate": 6.249554070344358e-05, + "loss": 0.4141, + "step": 35230 + }, + { + "epoch": 690.98, + "learning_rate": 6.245558354545582e-05, + "loss": 0.4168, + "step": 35240 + }, + { + "epoch": 691.0, + "eval_loss": 0.4171481728553772, + "eval_runtime": 2.3458, + "eval_samples_per_second": 971.51, + "eval_steps_per_second": 3.837, + "step": 35241 + }, + { + "epoch": 691.18, + "learning_rate": 6.241563005015792e-05, + "loss": 0.4149, + "step": 35250 + }, + { + "epoch": 691.37, + "learning_rate": 6.237568022921537e-05, + "loss": 0.4149, + "step": 35260 + }, + { + "epoch": 691.57, + "learning_rate": 6.233573409429267e-05, + "loss": 0.4113, + "step": 35270 + }, + { + "epoch": 691.76, + "learning_rate": 6.229579165705317e-05, + "loss": 0.4205, + "step": 35280 + }, + { + "epoch": 691.96, + "learning_rate": 6.225585292915914e-05, + "loss": 0.4152, + "step": 35290 + }, + { + "epoch": 692.0, + "eval_loss": 0.4202278256416321, + "eval_runtime": 2.3116, + "eval_samples_per_second": 985.918, + "eval_steps_per_second": 3.893, + "step": 35292 + }, + { + "epoch": 692.16, + "learning_rate": 6.221591792227188e-05, + "loss": 0.4188, + "step": 35300 + }, + { + "epoch": 692.35, + "learning_rate": 6.217598664805143e-05, + "loss": 0.4182, + "step": 35310 + }, + { + "epoch": 692.55, + "learning_rate": 6.21360591181569e-05, + "loss": 0.4183, + "step": 35320 + }, + { + "epoch": 692.75, + "learning_rate": 6.209613534424624e-05, + "loss": 0.413, + "step": 35330 + }, + { + "epoch": 692.94, + "learning_rate": 6.205621533797621e-05, + "loss": 0.4137, + "step": 35340 + }, + { + "epoch": 693.0, + "eval_loss": 0.42095068097114563, + "eval_runtime": 2.1897, + "eval_samples_per_second": 1040.785, + "eval_steps_per_second": 4.11, + "step": 35343 + }, + { + "epoch": 693.14, + "learning_rate": 6.201629911100269e-05, + "loss": 0.4146, + "step": 35350 + }, + { + "epoch": 693.33, + "learning_rate": 6.197638667498022e-05, + "loss": 0.4122, + "step": 35360 + }, + { + "epoch": 693.53, + "learning_rate": 6.193647804156241e-05, + "loss": 0.4166, + "step": 35370 + }, + { + "epoch": 693.73, + "learning_rate": 6.189657322240165e-05, + "loss": 0.4164, + "step": 35380 + }, + { + "epoch": 693.92, + "learning_rate": 6.185667222914928e-05, + "loss": 0.4139, + "step": 35390 + }, + { + "epoch": 694.0, + "eval_loss": 0.4143226146697998, + "eval_runtime": 2.205, + "eval_samples_per_second": 1033.539, + "eval_steps_per_second": 4.082, + "step": 35394 + }, + { + "epoch": 694.12, + "learning_rate": 6.181677507345552e-05, + "loss": 0.4172, + "step": 35400 + }, + { + "epoch": 694.31, + "learning_rate": 6.17768817669694e-05, + "loss": 0.4172, + "step": 35410 + }, + { + "epoch": 694.51, + "learning_rate": 6.17369923213389e-05, + "loss": 0.4159, + "step": 35420 + }, + { + "epoch": 694.71, + "learning_rate": 6.169710674821085e-05, + "loss": 0.4136, + "step": 35430 + }, + { + "epoch": 694.9, + "learning_rate": 6.165722505923096e-05, + "loss": 0.418, + "step": 35440 + }, + { + "epoch": 695.0, + "eval_loss": 0.4250470697879791, + "eval_runtime": 2.1916, + "eval_samples_per_second": 1039.863, + "eval_steps_per_second": 4.107, + "step": 35445 + }, + { + "epoch": 695.1, + "learning_rate": 6.161734726604374e-05, + "loss": 0.4154, + "step": 35450 + }, + { + "epoch": 695.29, + "learning_rate": 6.15774733802927e-05, + "loss": 0.4189, + "step": 35460 + }, + { + "epoch": 695.49, + "learning_rate": 6.153760341362007e-05, + "loss": 0.412, + "step": 35470 + }, + { + "epoch": 695.69, + "learning_rate": 6.1497737377667e-05, + "loss": 0.4208, + "step": 35480 + }, + { + "epoch": 695.88, + "learning_rate": 6.145787528407348e-05, + "loss": 0.4116, + "step": 35490 + }, + { + "epoch": 696.0, + "eval_loss": 0.4236893951892853, + "eval_runtime": 2.2843, + "eval_samples_per_second": 997.695, + "eval_steps_per_second": 3.94, + "step": 35496 + }, + { + "epoch": 696.08, + "learning_rate": 6.141801714447834e-05, + "loss": 0.4162, + "step": 35500 + }, + { + "epoch": 696.27, + "learning_rate": 6.137816297051934e-05, + "loss": 0.4133, + "step": 35510 + }, + { + "epoch": 696.47, + "learning_rate": 6.13383127738329e-05, + "loss": 0.4182, + "step": 35520 + }, + { + "epoch": 696.67, + "learning_rate": 6.129846656605448e-05, + "loss": 0.4202, + "step": 35530 + }, + { + "epoch": 696.86, + "learning_rate": 6.125862435881826e-05, + "loss": 0.4113, + "step": 35540 + }, + { + "epoch": 697.0, + "eval_loss": 0.41717728972435, + "eval_runtime": 2.2931, + "eval_samples_per_second": 993.869, + "eval_steps_per_second": 3.925, + "step": 35547 + }, + { + "epoch": 697.06, + "learning_rate": 6.12187861637572e-05, + "loss": 0.4114, + "step": 35550 + }, + { + "epoch": 697.25, + "learning_rate": 6.11789519925033e-05, + "loss": 0.4151, + "step": 35560 + }, + { + "epoch": 697.45, + "learning_rate": 6.113912185668715e-05, + "loss": 0.4122, + "step": 35570 + }, + { + "epoch": 697.65, + "learning_rate": 6.109929576793829e-05, + "loss": 0.4172, + "step": 35580 + }, + { + "epoch": 697.84, + "learning_rate": 6.105947373788505e-05, + "loss": 0.4131, + "step": 35590 + }, + { + "epoch": 698.0, + "eval_loss": 0.4218674898147583, + "eval_runtime": 2.2896, + "eval_samples_per_second": 995.388, + "eval_steps_per_second": 3.931, + "step": 35598 + }, + { + "epoch": 698.04, + "learning_rate": 6.101965577815458e-05, + "loss": 0.4105, + "step": 35600 + }, + { + "epoch": 698.24, + "learning_rate": 6.097984190037284e-05, + "loss": 0.4159, + "step": 35610 + }, + { + "epoch": 698.43, + "learning_rate": 6.0940032116164555e-05, + "loss": 0.4124, + "step": 35620 + }, + { + "epoch": 698.63, + "learning_rate": 6.090022643715335e-05, + "loss": 0.4153, + "step": 35630 + }, + { + "epoch": 698.82, + "learning_rate": 6.086042487496155e-05, + "loss": 0.4148, + "step": 35640 + }, + { + "epoch": 699.0, + "eval_loss": 0.4179209768772125, + "eval_runtime": 2.2257, + "eval_samples_per_second": 1023.944, + "eval_steps_per_second": 4.044, + "step": 35649 + }, + { + "epoch": 699.02, + "learning_rate": 6.082062744121038e-05, + "loss": 0.4175, + "step": 35650 + }, + { + "epoch": 699.22, + "learning_rate": 6.0780834147519704e-05, + "loss": 0.4176, + "step": 35660 + }, + { + "epoch": 699.41, + "learning_rate": 6.07410450055084e-05, + "loss": 0.4121, + "step": 35670 + }, + { + "epoch": 699.61, + "learning_rate": 6.070126002679393e-05, + "loss": 0.4108, + "step": 35680 + }, + { + "epoch": 699.8, + "learning_rate": 6.066147922299262e-05, + "loss": 0.4144, + "step": 35690 + }, + { + "epoch": 700.0, + "learning_rate": 6.062170260571963e-05, + "loss": 0.4117, + "step": 35700 + }, + { + "epoch": 700.0, + "eval_loss": 0.4264349341392517, + "eval_runtime": 2.3638, + "eval_samples_per_second": 964.145, + "eval_steps_per_second": 3.808, + "step": 35700 + }, + { + "epoch": 700.2, + "learning_rate": 6.058193018658876e-05, + "loss": 0.4168, + "step": 35710 + }, + { + "epoch": 700.39, + "learning_rate": 6.0542161977212775e-05, + "loss": 0.4138, + "step": 35720 + }, + { + "epoch": 700.59, + "learning_rate": 6.050239798920303e-05, + "loss": 0.4155, + "step": 35730 + }, + { + "epoch": 700.78, + "learning_rate": 6.046263823416974e-05, + "loss": 0.4106, + "step": 35740 + }, + { + "epoch": 700.98, + "learning_rate": 6.04228827237219e-05, + "loss": 0.4115, + "step": 35750 + }, + { + "epoch": 701.0, + "eval_loss": 0.4244195818901062, + "eval_runtime": 2.2262, + "eval_samples_per_second": 1023.724, + "eval_steps_per_second": 4.043, + "step": 35751 + }, + { + "epoch": 701.18, + "learning_rate": 6.0383131469467157e-05, + "loss": 0.4141, + "step": 35760 + }, + { + "epoch": 701.37, + "learning_rate": 6.034338448301207e-05, + "loss": 0.4116, + "step": 35770 + }, + { + "epoch": 701.57, + "learning_rate": 6.030364177596182e-05, + "loss": 0.4154, + "step": 35780 + }, + { + "epoch": 701.76, + "learning_rate": 6.0263903359920426e-05, + "loss": 0.4175, + "step": 35790 + }, + { + "epoch": 701.96, + "learning_rate": 6.0224169246490586e-05, + "loss": 0.4149, + "step": 35800 + }, + { + "epoch": 702.0, + "eval_loss": 0.42228615283966064, + "eval_runtime": 2.217, + "eval_samples_per_second": 1027.969, + "eval_steps_per_second": 4.06, + "step": 35802 + }, + { + "epoch": 702.16, + "learning_rate": 6.018443944727381e-05, + "loss": 0.416, + "step": 35810 + }, + { + "epoch": 702.35, + "learning_rate": 6.014471397387032e-05, + "loss": 0.4147, + "step": 35820 + }, + { + "epoch": 702.55, + "learning_rate": 6.0104992837879e-05, + "loss": 0.4142, + "step": 35830 + }, + { + "epoch": 702.75, + "learning_rate": 6.0065276050897597e-05, + "loss": 0.4148, + "step": 35840 + }, + { + "epoch": 702.94, + "learning_rate": 6.00255636245225e-05, + "loss": 0.4129, + "step": 35850 + }, + { + "epoch": 703.0, + "eval_loss": 0.4189659357070923, + "eval_runtime": 2.3057, + "eval_samples_per_second": 988.411, + "eval_steps_per_second": 3.903, + "step": 35853 + }, + { + "epoch": 703.14, + "learning_rate": 5.998585557034889e-05, + "loss": 0.4139, + "step": 35860 + }, + { + "epoch": 703.33, + "learning_rate": 5.994615189997056e-05, + "loss": 0.4141, + "step": 35870 + }, + { + "epoch": 703.53, + "learning_rate": 5.990645262498019e-05, + "loss": 0.41, + "step": 35880 + }, + { + "epoch": 703.73, + "learning_rate": 5.986675775696903e-05, + "loss": 0.4108, + "step": 35890 + }, + { + "epoch": 703.92, + "learning_rate": 5.9827067307527067e-05, + "loss": 0.4134, + "step": 35900 + }, + { + "epoch": 704.0, + "eval_loss": 0.41974902153015137, + "eval_runtime": 2.3539, + "eval_samples_per_second": 968.194, + "eval_steps_per_second": 3.823, + "step": 35904 + }, + { + "epoch": 704.12, + "learning_rate": 5.9787381288243094e-05, + "loss": 0.4149, + "step": 35910 + }, + { + "epoch": 704.31, + "learning_rate": 5.9747699710704466e-05, + "loss": 0.4169, + "step": 35920 + }, + { + "epoch": 704.51, + "learning_rate": 5.970802258649742e-05, + "loss": 0.4134, + "step": 35930 + }, + { + "epoch": 704.71, + "learning_rate": 5.9668349927206696e-05, + "loss": 0.4128, + "step": 35940 + }, + { + "epoch": 704.9, + "learning_rate": 5.9628681744415877e-05, + "loss": 0.4155, + "step": 35950 + }, + { + "epoch": 705.0, + "eval_loss": 0.4202663004398346, + "eval_runtime": 2.2216, + "eval_samples_per_second": 1025.818, + "eval_steps_per_second": 4.051, + "step": 35955 + }, + { + "epoch": 705.1, + "learning_rate": 5.95890180497072e-05, + "loss": 0.4137, + "step": 35960 + }, + { + "epoch": 705.29, + "learning_rate": 5.95493588546615e-05, + "loss": 0.4165, + "step": 35970 + }, + { + "epoch": 705.49, + "learning_rate": 5.950970417085848e-05, + "loss": 0.4115, + "step": 35980 + }, + { + "epoch": 705.69, + "learning_rate": 5.9470054009876336e-05, + "loss": 0.4094, + "step": 35990 + }, + { + "epoch": 705.88, + "learning_rate": 5.943040838329209e-05, + "loss": 0.4112, + "step": 36000 + }, + { + "epoch": 706.0, + "eval_loss": 0.42057114839553833, + "eval_runtime": 2.2541, + "eval_samples_per_second": 1011.051, + "eval_steps_per_second": 3.993, + "step": 36006 + }, + { + "epoch": 706.08, + "learning_rate": 5.939076730268135e-05, + "loss": 0.4104, + "step": 36010 + }, + { + "epoch": 706.27, + "learning_rate": 5.935113077961845e-05, + "loss": 0.4129, + "step": 36020 + }, + { + "epoch": 706.47, + "learning_rate": 5.931149882567638e-05, + "loss": 0.4138, + "step": 36030 + }, + { + "epoch": 706.67, + "learning_rate": 5.927187145242672e-05, + "loss": 0.4106, + "step": 36040 + }, + { + "epoch": 706.86, + "learning_rate": 5.923224867143985e-05, + "loss": 0.4113, + "step": 36050 + }, + { + "epoch": 707.0, + "eval_loss": 0.41755741834640503, + "eval_runtime": 2.2519, + "eval_samples_per_second": 1012.054, + "eval_steps_per_second": 3.997, + "step": 36057 + }, + { + "epoch": 707.06, + "learning_rate": 5.9192630494284696e-05, + "loss": 0.4082, + "step": 36060 + }, + { + "epoch": 707.25, + "learning_rate": 5.915301693252894e-05, + "loss": 0.4138, + "step": 36070 + }, + { + "epoch": 707.45, + "learning_rate": 5.911340799773879e-05, + "loss": 0.4161, + "step": 36080 + }, + { + "epoch": 707.65, + "learning_rate": 5.907380370147919e-05, + "loss": 0.412, + "step": 36090 + }, + { + "epoch": 707.84, + "learning_rate": 5.9034204055313734e-05, + "loss": 0.4117, + "step": 36100 + }, + { + "epoch": 708.0, + "eval_loss": 0.42017531394958496, + "eval_runtime": 2.2028, + "eval_samples_per_second": 1034.59, + "eval_steps_per_second": 4.086, + "step": 36108 + }, + { + "epoch": 708.04, + "learning_rate": 5.89946090708046e-05, + "loss": 0.4043, + "step": 36110 + }, + { + "epoch": 708.24, + "learning_rate": 5.895501875951271e-05, + "loss": 0.4111, + "step": 36120 + }, + { + "epoch": 708.43, + "learning_rate": 5.891543313299744e-05, + "loss": 0.4106, + "step": 36130 + }, + { + "epoch": 708.63, + "learning_rate": 5.887585220281703e-05, + "loss": 0.4154, + "step": 36140 + }, + { + "epoch": 708.82, + "learning_rate": 5.8836275980528155e-05, + "loss": 0.4128, + "step": 36150 + }, + { + "epoch": 709.0, + "eval_loss": 0.4185832738876343, + "eval_runtime": 2.1993, + "eval_samples_per_second": 1036.236, + "eval_steps_per_second": 4.092, + "step": 36159 + }, + { + "epoch": 709.02, + "learning_rate": 5.879670447768619e-05, + "loss": 0.4109, + "step": 36160 + }, + { + "epoch": 709.22, + "learning_rate": 5.875713770584518e-05, + "loss": 0.4148, + "step": 36170 + }, + { + "epoch": 709.41, + "learning_rate": 5.8717575676557666e-05, + "loss": 0.4093, + "step": 36180 + }, + { + "epoch": 709.61, + "learning_rate": 5.867801840137497e-05, + "loss": 0.4075, + "step": 36190 + }, + { + "epoch": 709.8, + "learning_rate": 5.8638465891846854e-05, + "loss": 0.4155, + "step": 36200 + }, + { + "epoch": 710.0, + "learning_rate": 5.859891815952181e-05, + "loss": 0.4111, + "step": 36210 + }, + { + "epoch": 710.0, + "eval_loss": 0.41964903473854065, + "eval_runtime": 2.3088, + "eval_samples_per_second": 987.106, + "eval_steps_per_second": 3.898, + "step": 36210 + }, + { + "epoch": 710.2, + "learning_rate": 5.85593752159469e-05, + "loss": 0.4116, + "step": 36220 + }, + { + "epoch": 710.39, + "learning_rate": 5.8519837072667725e-05, + "loss": 0.4109, + "step": 36230 + }, + { + "epoch": 710.59, + "learning_rate": 5.848030374122862e-05, + "loss": 0.4143, + "step": 36240 + }, + { + "epoch": 710.78, + "learning_rate": 5.844077523317238e-05, + "loss": 0.4121, + "step": 36250 + }, + { + "epoch": 710.98, + "learning_rate": 5.8401251560040463e-05, + "loss": 0.4168, + "step": 36260 + }, + { + "epoch": 711.0, + "eval_loss": 0.42247816920280457, + "eval_runtime": 2.3042, + "eval_samples_per_second": 989.074, + "eval_steps_per_second": 3.906, + "step": 36261 + }, + { + "epoch": 711.18, + "learning_rate": 5.83617327333729e-05, + "loss": 0.4147, + "step": 36270 + }, + { + "epoch": 711.37, + "learning_rate": 5.8322218764708336e-05, + "loss": 0.41, + "step": 36280 + }, + { + "epoch": 711.57, + "learning_rate": 5.828270966558392e-05, + "loss": 0.4146, + "step": 36290 + }, + { + "epoch": 711.76, + "learning_rate": 5.824320544753545e-05, + "loss": 0.4069, + "step": 36300 + }, + { + "epoch": 711.96, + "learning_rate": 5.8203706122097275e-05, + "loss": 0.408, + "step": 36310 + }, + { + "epoch": 712.0, + "eval_loss": 0.41460511088371277, + "eval_runtime": 2.3599, + "eval_samples_per_second": 965.725, + "eval_steps_per_second": 3.814, + "step": 36312 + }, + { + "epoch": 712.16, + "learning_rate": 5.8164211700802316e-05, + "loss": 0.409, + "step": 36320 + }, + { + "epoch": 712.35, + "learning_rate": 5.812472219518209e-05, + "loss": 0.4118, + "step": 36330 + }, + { + "epoch": 712.55, + "learning_rate": 5.808523761676658e-05, + "loss": 0.4099, + "step": 36340 + }, + { + "epoch": 712.75, + "learning_rate": 5.8045757977084504e-05, + "loss": 0.4136, + "step": 36350 + }, + { + "epoch": 712.94, + "learning_rate": 5.800628328766296e-05, + "loss": 0.4117, + "step": 36360 + }, + { + "epoch": 713.0, + "eval_loss": 0.4185248911380768, + "eval_runtime": 2.2916, + "eval_samples_per_second": 994.5, + "eval_steps_per_second": 3.927, + "step": 36363 + }, + { + "epoch": 713.14, + "learning_rate": 5.796681356002769e-05, + "loss": 0.4059, + "step": 36370 + }, + { + "epoch": 713.33, + "learning_rate": 5.792734880570301e-05, + "loss": 0.408, + "step": 36380 + }, + { + "epoch": 713.53, + "learning_rate": 5.788788903621168e-05, + "loss": 0.4113, + "step": 36390 + }, + { + "epoch": 713.73, + "learning_rate": 5.784843426307516e-05, + "loss": 0.4136, + "step": 36400 + }, + { + "epoch": 713.92, + "learning_rate": 5.780898449781328e-05, + "loss": 0.4089, + "step": 36410 + }, + { + "epoch": 714.0, + "eval_loss": 0.4214355945587158, + "eval_runtime": 2.3419, + "eval_samples_per_second": 973.124, + "eval_steps_per_second": 3.843, + "step": 36414 + }, + { + "epoch": 714.12, + "learning_rate": 5.7769539751944544e-05, + "loss": 0.4134, + "step": 36420 + }, + { + "epoch": 714.31, + "learning_rate": 5.773010003698595e-05, + "loss": 0.4166, + "step": 36430 + }, + { + "epoch": 714.51, + "learning_rate": 5.769066536445294e-05, + "loss": 0.4101, + "step": 36440 + }, + { + "epoch": 714.71, + "learning_rate": 5.765123574585965e-05, + "loss": 0.4171, + "step": 36450 + }, + { + "epoch": 714.9, + "learning_rate": 5.7611811192718576e-05, + "loss": 0.408, + "step": 36460 + }, + { + "epoch": 715.0, + "eval_loss": 0.4195824861526489, + "eval_runtime": 2.2721, + "eval_samples_per_second": 1003.05, + "eval_steps_per_second": 3.961, + "step": 36465 + }, + { + "epoch": 715.1, + "learning_rate": 5.757239171654086e-05, + "loss": 0.4114, + "step": 36470 + }, + { + "epoch": 715.29, + "learning_rate": 5.753297732883607e-05, + "loss": 0.4121, + "step": 36480 + }, + { + "epoch": 715.49, + "learning_rate": 5.749356804111238e-05, + "loss": 0.411, + "step": 36490 + }, + { + "epoch": 715.69, + "learning_rate": 5.7454163864876376e-05, + "loss": 0.4138, + "step": 36500 + }, + { + "epoch": 715.88, + "learning_rate": 5.741476481163319e-05, + "loss": 0.4126, + "step": 36510 + }, + { + "epoch": 716.0, + "eval_loss": 0.41745418310165405, + "eval_runtime": 2.1957, + "eval_samples_per_second": 1037.945, + "eval_steps_per_second": 4.099, + "step": 36516 + }, + { + "epoch": 716.08, + "learning_rate": 5.737537089288652e-05, + "loss": 0.4114, + "step": 36520 + }, + { + "epoch": 716.27, + "learning_rate": 5.7335982120138456e-05, + "loss": 0.4128, + "step": 36530 + }, + { + "epoch": 716.47, + "learning_rate": 5.72965985048897e-05, + "loss": 0.4117, + "step": 36540 + }, + { + "epoch": 716.67, + "learning_rate": 5.725722005863931e-05, + "loss": 0.4086, + "step": 36550 + }, + { + "epoch": 716.86, + "learning_rate": 5.7217846792885e-05, + "loss": 0.4106, + "step": 36560 + }, + { + "epoch": 717.0, + "eval_loss": 0.4145427942276001, + "eval_runtime": 2.3265, + "eval_samples_per_second": 979.583, + "eval_steps_per_second": 3.868, + "step": 36567 + }, + { + "epoch": 717.06, + "learning_rate": 5.717847871912284e-05, + "loss": 0.4051, + "step": 36570 + }, + { + "epoch": 717.25, + "learning_rate": 5.7139115848847425e-05, + "loss": 0.409, + "step": 36580 + }, + { + "epoch": 717.45, + "learning_rate": 5.709975819355187e-05, + "loss": 0.4174, + "step": 36590 + }, + { + "epoch": 717.65, + "learning_rate": 5.706040576472766e-05, + "loss": 0.4098, + "step": 36600 + }, + { + "epoch": 717.84, + "learning_rate": 5.7021058573864924e-05, + "loss": 0.4112, + "step": 36610 + }, + { + "epoch": 718.0, + "eval_loss": 0.41604405641555786, + "eval_runtime": 2.2861, + "eval_samples_per_second": 996.902, + "eval_steps_per_second": 3.937, + "step": 36618 + }, + { + "epoch": 718.04, + "learning_rate": 5.6981716632452086e-05, + "loss": 0.4096, + "step": 36620 + }, + { + "epoch": 718.24, + "learning_rate": 5.694237995197615e-05, + "loss": 0.4088, + "step": 36630 + }, + { + "epoch": 718.43, + "learning_rate": 5.690304854392257e-05, + "loss": 0.41, + "step": 36640 + }, + { + "epoch": 718.63, + "learning_rate": 5.6863722419775166e-05, + "loss": 0.4145, + "step": 36650 + }, + { + "epoch": 718.82, + "learning_rate": 5.6824401591016385e-05, + "loss": 0.4064, + "step": 36660 + }, + { + "epoch": 719.0, + "eval_loss": 0.41746461391448975, + "eval_runtime": 2.2192, + "eval_samples_per_second": 1026.951, + "eval_steps_per_second": 4.056, + "step": 36669 + }, + { + "epoch": 719.02, + "learning_rate": 5.678508606912694e-05, + "loss": 0.409, + "step": 36670 + }, + { + "epoch": 719.22, + "learning_rate": 5.674577586558616e-05, + "loss": 0.4115, + "step": 36680 + }, + { + "epoch": 719.41, + "learning_rate": 5.6706470991871706e-05, + "loss": 0.4076, + "step": 36690 + }, + { + "epoch": 719.61, + "learning_rate": 5.666717145945976e-05, + "loss": 0.4139, + "step": 36700 + }, + { + "epoch": 719.8, + "learning_rate": 5.662787727982487e-05, + "loss": 0.4089, + "step": 36710 + }, + { + "epoch": 720.0, + "learning_rate": 5.658858846444006e-05, + "loss": 0.41, + "step": 36720 + }, + { + "epoch": 720.0, + "eval_loss": 0.4181264638900757, + "eval_runtime": 2.2542, + "eval_samples_per_second": 1011.006, + "eval_steps_per_second": 3.993, + "step": 36720 + }, + { + "epoch": 720.2, + "learning_rate": 5.654930502477682e-05, + "loss": 0.4102, + "step": 36730 + }, + { + "epoch": 720.39, + "learning_rate": 5.651002697230501e-05, + "loss": 0.4134, + "step": 36740 + }, + { + "epoch": 720.59, + "learning_rate": 5.647075431849299e-05, + "loss": 0.4091, + "step": 36750 + }, + { + "epoch": 720.78, + "learning_rate": 5.643148707480745e-05, + "loss": 0.4132, + "step": 36760 + }, + { + "epoch": 720.98, + "learning_rate": 5.639222525271355e-05, + "loss": 0.4046, + "step": 36770 + }, + { + "epoch": 721.0, + "eval_loss": 0.4158802032470703, + "eval_runtime": 2.327, + "eval_samples_per_second": 979.383, + "eval_steps_per_second": 3.868, + "step": 36771 + }, + { + "epoch": 721.18, + "learning_rate": 5.63529688636749e-05, + "loss": 0.4074, + "step": 36780 + }, + { + "epoch": 721.37, + "learning_rate": 5.631371791915345e-05, + "loss": 0.4129, + "step": 36790 + }, + { + "epoch": 721.57, + "learning_rate": 5.627447243060967e-05, + "loss": 0.4086, + "step": 36800 + }, + { + "epoch": 721.76, + "learning_rate": 5.623523240950225e-05, + "loss": 0.4106, + "step": 36810 + }, + { + "epoch": 721.96, + "learning_rate": 5.6195997867288534e-05, + "loss": 0.4141, + "step": 36820 + }, + { + "epoch": 722.0, + "eval_loss": 0.41193127632141113, + "eval_runtime": 2.2521, + "eval_samples_per_second": 1011.933, + "eval_steps_per_second": 3.996, + "step": 36822 + }, + { + "epoch": 722.16, + "learning_rate": 5.615676881542405e-05, + "loss": 0.4093, + "step": 36830 + }, + { + "epoch": 722.35, + "learning_rate": 5.611754526536282e-05, + "loss": 0.4119, + "step": 36840 + }, + { + "epoch": 722.55, + "learning_rate": 5.6078327228557274e-05, + "loss": 0.4092, + "step": 36850 + }, + { + "epoch": 722.75, + "learning_rate": 5.6039114716458145e-05, + "loss": 0.4097, + "step": 36860 + }, + { + "epoch": 722.94, + "learning_rate": 5.599990774051469e-05, + "loss": 0.414, + "step": 36870 + }, + { + "epoch": 723.0, + "eval_loss": 0.4167172610759735, + "eval_runtime": 2.2632, + "eval_samples_per_second": 1006.978, + "eval_steps_per_second": 3.977, + "step": 36873 + }, + { + "epoch": 723.14, + "learning_rate": 5.596070631217441e-05, + "loss": 0.409, + "step": 36880 + }, + { + "epoch": 723.33, + "learning_rate": 5.592151044288327e-05, + "loss": 0.4143, + "step": 36890 + }, + { + "epoch": 723.53, + "learning_rate": 5.588232014408561e-05, + "loss": 0.4101, + "step": 36900 + }, + { + "epoch": 723.73, + "learning_rate": 5.5843135427224076e-05, + "loss": 0.4125, + "step": 36910 + }, + { + "epoch": 723.92, + "learning_rate": 5.580395630373977e-05, + "loss": 0.4118, + "step": 36920 + }, + { + "epoch": 724.0, + "eval_loss": 0.4165693521499634, + "eval_runtime": 2.1966, + "eval_samples_per_second": 1037.5, + "eval_steps_per_second": 4.097, + "step": 36924 + }, + { + "epoch": 724.12, + "learning_rate": 5.57647827850721e-05, + "loss": 0.4082, + "step": 36930 + }, + { + "epoch": 724.31, + "learning_rate": 5.5725614882658874e-05, + "loss": 0.4093, + "step": 36940 + }, + { + "epoch": 724.51, + "learning_rate": 5.5686452607936226e-05, + "loss": 0.4138, + "step": 36950 + }, + { + "epoch": 724.71, + "learning_rate": 5.564729597233873e-05, + "loss": 0.4079, + "step": 36960 + }, + { + "epoch": 724.9, + "learning_rate": 5.560814498729918e-05, + "loss": 0.4106, + "step": 36970 + }, + { + "epoch": 725.0, + "eval_loss": 0.41572317481040955, + "eval_runtime": 2.3431, + "eval_samples_per_second": 972.633, + "eval_steps_per_second": 3.841, + "step": 36975 + }, + { + "epoch": 725.1, + "learning_rate": 5.556899966424879e-05, + "loss": 0.4097, + "step": 36980 + }, + { + "epoch": 725.29, + "learning_rate": 5.552986001461716e-05, + "loss": 0.4088, + "step": 36990 + }, + { + "epoch": 725.49, + "learning_rate": 5.549072604983218e-05, + "loss": 0.4092, + "step": 37000 + }, + { + "epoch": 725.69, + "learning_rate": 5.545159778132011e-05, + "loss": 0.4109, + "step": 37010 + }, + { + "epoch": 725.88, + "learning_rate": 5.5412475220505475e-05, + "loss": 0.4079, + "step": 37020 + }, + { + "epoch": 726.0, + "eval_loss": 0.4175969958305359, + "eval_runtime": 2.3512, + "eval_samples_per_second": 969.275, + "eval_steps_per_second": 3.828, + "step": 37026 + }, + { + "epoch": 726.08, + "learning_rate": 5.537335837881127e-05, + "loss": 0.4021, + "step": 37030 + }, + { + "epoch": 726.27, + "learning_rate": 5.53342472676587e-05, + "loss": 0.4103, + "step": 37040 + }, + { + "epoch": 726.47, + "learning_rate": 5.529514189846732e-05, + "loss": 0.4125, + "step": 37050 + }, + { + "epoch": 726.67, + "learning_rate": 5.525604228265507e-05, + "loss": 0.4095, + "step": 37060 + }, + { + "epoch": 726.86, + "learning_rate": 5.521694843163809e-05, + "loss": 0.4114, + "step": 37070 + }, + { + "epoch": 727.0, + "eval_loss": 0.4107976257801056, + "eval_runtime": 2.2678, + "eval_samples_per_second": 1004.941, + "eval_steps_per_second": 3.969, + "step": 37077 + }, + { + "epoch": 727.06, + "learning_rate": 5.517786035683102e-05, + "loss": 0.4111, + "step": 37080 + }, + { + "epoch": 727.25, + "learning_rate": 5.5138778069646614e-05, + "loss": 0.4112, + "step": 37090 + }, + { + "epoch": 727.45, + "learning_rate": 5.509970158149608e-05, + "loss": 0.4078, + "step": 37100 + }, + { + "epoch": 727.65, + "learning_rate": 5.5060630903788886e-05, + "loss": 0.4088, + "step": 37110 + }, + { + "epoch": 727.84, + "learning_rate": 5.5021566047932736e-05, + "loss": 0.4117, + "step": 37120 + }, + { + "epoch": 728.0, + "eval_loss": 0.41353392601013184, + "eval_runtime": 2.2102, + "eval_samples_per_second": 1031.137, + "eval_steps_per_second": 4.072, + "step": 37128 + }, + { + "epoch": 728.04, + "learning_rate": 5.4982507025333756e-05, + "loss": 0.4077, + "step": 37130 + }, + { + "epoch": 728.24, + "learning_rate": 5.4943453847396275e-05, + "loss": 0.4131, + "step": 37140 + }, + { + "epoch": 728.43, + "learning_rate": 5.490440652552298e-05, + "loss": 0.4127, + "step": 37150 + }, + { + "epoch": 728.63, + "learning_rate": 5.48653650711148e-05, + "loss": 0.4126, + "step": 37160 + }, + { + "epoch": 728.82, + "learning_rate": 5.4826329495571e-05, + "loss": 0.4155, + "step": 37170 + }, + { + "epoch": 729.0, + "eval_loss": 0.4170650243759155, + "eval_runtime": 2.2637, + "eval_samples_per_second": 1006.775, + "eval_steps_per_second": 3.976, + "step": 37179 + }, + { + "epoch": 729.02, + "learning_rate": 5.478729981028905e-05, + "loss": 0.4111, + "step": 37180 + }, + { + "epoch": 729.22, + "learning_rate": 5.474827602666475e-05, + "loss": 0.4102, + "step": 37190 + }, + { + "epoch": 729.41, + "learning_rate": 5.4709258156092214e-05, + "loss": 0.4115, + "step": 37200 + }, + { + "epoch": 729.61, + "learning_rate": 5.467024620996375e-05, + "loss": 0.4083, + "step": 37210 + }, + { + "epoch": 729.8, + "learning_rate": 5.4631240199670036e-05, + "loss": 0.4099, + "step": 37220 + }, + { + "epoch": 730.0, + "learning_rate": 5.4592240136599856e-05, + "loss": 0.4117, + "step": 37230 + }, + { + "epoch": 730.0, + "eval_loss": 0.4147026836872101, + "eval_runtime": 2.228, + "eval_samples_per_second": 1022.869, + "eval_steps_per_second": 4.039, + "step": 37230 + }, + { + "epoch": 730.2, + "learning_rate": 5.455324603214047e-05, + "loss": 0.4102, + "step": 37240 + }, + { + "epoch": 730.39, + "learning_rate": 5.4514257897677227e-05, + "loss": 0.4123, + "step": 37250 + }, + { + "epoch": 730.59, + "learning_rate": 5.447527574459378e-05, + "loss": 0.4088, + "step": 37260 + }, + { + "epoch": 730.78, + "learning_rate": 5.44362995842721e-05, + "loss": 0.4096, + "step": 37270 + }, + { + "epoch": 730.98, + "learning_rate": 5.439732942809228e-05, + "loss": 0.4092, + "step": 37280 + }, + { + "epoch": 731.0, + "eval_loss": 0.4093756079673767, + "eval_runtime": 2.3646, + "eval_samples_per_second": 963.815, + "eval_steps_per_second": 3.806, + "step": 37281 + }, + { + "epoch": 731.18, + "learning_rate": 5.435836528743283e-05, + "loss": 0.4079, + "step": 37290 + }, + { + "epoch": 731.37, + "learning_rate": 5.431940717367033e-05, + "loss": 0.4105, + "step": 37300 + }, + { + "epoch": 731.57, + "learning_rate": 5.428045509817974e-05, + "loss": 0.4073, + "step": 37310 + }, + { + "epoch": 731.76, + "learning_rate": 5.424150907233418e-05, + "loss": 0.403, + "step": 37320 + }, + { + "epoch": 731.96, + "learning_rate": 5.420256910750497e-05, + "loss": 0.4091, + "step": 37330 + }, + { + "epoch": 732.0, + "eval_loss": 0.41333431005477905, + "eval_runtime": 2.2202, + "eval_samples_per_second": 1026.491, + "eval_steps_per_second": 4.054, + "step": 37332 + }, + { + "epoch": 732.16, + "learning_rate": 5.416363521506178e-05, + "loss": 0.4111, + "step": 37340 + }, + { + "epoch": 732.35, + "learning_rate": 5.4124707406372384e-05, + "loss": 0.4095, + "step": 37350 + }, + { + "epoch": 732.55, + "learning_rate": 5.408578569280289e-05, + "loss": 0.4087, + "step": 37360 + }, + { + "epoch": 732.75, + "learning_rate": 5.404687008571752e-05, + "loss": 0.4072, + "step": 37370 + }, + { + "epoch": 732.94, + "learning_rate": 5.400796059647882e-05, + "loss": 0.4081, + "step": 37380 + }, + { + "epoch": 733.0, + "eval_loss": 0.4142116606235504, + "eval_runtime": 2.3382, + "eval_samples_per_second": 974.674, + "eval_steps_per_second": 3.849, + "step": 37383 + }, + { + "epoch": 733.14, + "learning_rate": 5.396905723644744e-05, + "loss": 0.4078, + "step": 37390 + }, + { + "epoch": 733.33, + "learning_rate": 5.39301600169823e-05, + "loss": 0.4053, + "step": 37400 + }, + { + "epoch": 733.53, + "learning_rate": 5.389126894944054e-05, + "loss": 0.4094, + "step": 37410 + }, + { + "epoch": 733.73, + "learning_rate": 5.385238404517747e-05, + "loss": 0.4079, + "step": 37420 + }, + { + "epoch": 733.92, + "learning_rate": 5.381350531554664e-05, + "loss": 0.4084, + "step": 37430 + }, + { + "epoch": 734.0, + "eval_loss": 0.4169609546661377, + "eval_runtime": 2.3612, + "eval_samples_per_second": 965.176, + "eval_steps_per_second": 3.812, + "step": 37434 + }, + { + "epoch": 734.12, + "learning_rate": 5.377463277189971e-05, + "loss": 0.4031, + "step": 37440 + }, + { + "epoch": 734.31, + "learning_rate": 5.3735766425586685e-05, + "loss": 0.4042, + "step": 37450 + }, + { + "epoch": 734.51, + "learning_rate": 5.3696906287955614e-05, + "loss": 0.408, + "step": 37460 + }, + { + "epoch": 734.71, + "learning_rate": 5.365805237035279e-05, + "loss": 0.4106, + "step": 37470 + }, + { + "epoch": 734.9, + "learning_rate": 5.361920468412273e-05, + "loss": 0.4082, + "step": 37480 + }, + { + "epoch": 735.0, + "eval_loss": 0.41578948497772217, + "eval_runtime": 2.3433, + "eval_samples_per_second": 972.578, + "eval_steps_per_second": 3.841, + "step": 37485 + }, + { + "epoch": 735.1, + "learning_rate": 5.3580363240608015e-05, + "loss": 0.4131, + "step": 37490 + }, + { + "epoch": 735.29, + "learning_rate": 5.3541528051149574e-05, + "loss": 0.4123, + "step": 37500 + }, + { + "epoch": 735.49, + "learning_rate": 5.350269912708636e-05, + "loss": 0.4099, + "step": 37510 + }, + { + "epoch": 735.69, + "learning_rate": 5.346387647975555e-05, + "loss": 0.4101, + "step": 37520 + }, + { + "epoch": 735.88, + "learning_rate": 5.342506012049253e-05, + "loss": 0.4097, + "step": 37530 + }, + { + "epoch": 736.0, + "eval_loss": 0.41176244616508484, + "eval_runtime": 2.2484, + "eval_samples_per_second": 1013.627, + "eval_steps_per_second": 4.003, + "step": 37536 + }, + { + "epoch": 736.08, + "learning_rate": 5.3386250060630765e-05, + "loss": 0.4079, + "step": 37540 + }, + { + "epoch": 736.27, + "learning_rate": 5.334744631150196e-05, + "loss": 0.4045, + "step": 37550 + }, + { + "epoch": 736.47, + "learning_rate": 5.3308648884435914e-05, + "loss": 0.4082, + "step": 37560 + }, + { + "epoch": 736.67, + "learning_rate": 5.326985779076066e-05, + "loss": 0.4075, + "step": 37570 + }, + { + "epoch": 736.86, + "learning_rate": 5.32310730418023e-05, + "loss": 0.4082, + "step": 37580 + }, + { + "epoch": 737.0, + "eval_loss": 0.4105346202850342, + "eval_runtime": 2.3444, + "eval_samples_per_second": 972.103, + "eval_steps_per_second": 3.839, + "step": 37587 + }, + { + "epoch": 737.06, + "learning_rate": 5.3192294648885086e-05, + "loss": 0.4049, + "step": 37590 + }, + { + "epoch": 737.25, + "learning_rate": 5.3153522623331504e-05, + "loss": 0.4119, + "step": 37600 + }, + { + "epoch": 737.45, + "learning_rate": 5.311475697646207e-05, + "loss": 0.4071, + "step": 37610 + }, + { + "epoch": 737.65, + "learning_rate": 5.307599771959553e-05, + "loss": 0.404, + "step": 37620 + }, + { + "epoch": 737.84, + "learning_rate": 5.303724486404868e-05, + "loss": 0.4043, + "step": 37630 + }, + { + "epoch": 738.0, + "eval_loss": 0.4161665141582489, + "eval_runtime": 2.3799, + "eval_samples_per_second": 957.598, + "eval_steps_per_second": 3.782, + "step": 37638 + }, + { + "epoch": 738.04, + "learning_rate": 5.2998498421136554e-05, + "loss": 0.4074, + "step": 37640 + }, + { + "epoch": 738.24, + "learning_rate": 5.2959758402172184e-05, + "loss": 0.4047, + "step": 37650 + }, + { + "epoch": 738.43, + "learning_rate": 5.29210248184668e-05, + "loss": 0.4049, + "step": 37660 + }, + { + "epoch": 738.63, + "learning_rate": 5.288229768132978e-05, + "loss": 0.4089, + "step": 37670 + }, + { + "epoch": 738.82, + "learning_rate": 5.284357700206855e-05, + "loss": 0.4011, + "step": 37680 + }, + { + "epoch": 739.0, + "eval_loss": 0.4121991693973541, + "eval_runtime": 2.3743, + "eval_samples_per_second": 959.861, + "eval_steps_per_second": 3.791, + "step": 37689 + }, + { + "epoch": 739.02, + "learning_rate": 5.2804862791988724e-05, + "loss": 0.4083, + "step": 37690 + }, + { + "epoch": 739.22, + "learning_rate": 5.276615506239393e-05, + "loss": 0.4038, + "step": 37700 + }, + { + "epoch": 739.41, + "learning_rate": 5.272745382458602e-05, + "loss": 0.4015, + "step": 37710 + }, + { + "epoch": 739.61, + "learning_rate": 5.2688759089864874e-05, + "loss": 0.4029, + "step": 37720 + }, + { + "epoch": 739.8, + "learning_rate": 5.265007086952845e-05, + "loss": 0.4126, + "step": 37730 + }, + { + "epoch": 740.0, + "learning_rate": 5.2611389174872926e-05, + "loss": 0.4082, + "step": 37740 + }, + { + "epoch": 740.0, + "eval_loss": 0.4157661199569702, + "eval_runtime": 2.2442, + "eval_samples_per_second": 1015.507, + "eval_steps_per_second": 4.01, + "step": 37740 + }, + { + "epoch": 740.2, + "learning_rate": 5.25727140171924e-05, + "loss": 0.4108, + "step": 37750 + }, + { + "epoch": 740.39, + "learning_rate": 5.253404540777924e-05, + "loss": 0.4039, + "step": 37760 + }, + { + "epoch": 740.59, + "learning_rate": 5.249538335792376e-05, + "loss": 0.4067, + "step": 37770 + }, + { + "epoch": 740.78, + "learning_rate": 5.245672787891444e-05, + "loss": 0.4043, + "step": 37780 + }, + { + "epoch": 740.98, + "learning_rate": 5.241807898203785e-05, + "loss": 0.4098, + "step": 37790 + }, + { + "epoch": 741.0, + "eval_loss": 0.41529256105422974, + "eval_runtime": 2.2331, + "eval_samples_per_second": 1020.576, + "eval_steps_per_second": 4.03, + "step": 37791 + }, + { + "epoch": 741.18, + "learning_rate": 5.237943667857853e-05, + "loss": 0.4056, + "step": 37800 + }, + { + "epoch": 741.37, + "learning_rate": 5.234080097981923e-05, + "loss": 0.407, + "step": 37810 + }, + { + "epoch": 741.57, + "learning_rate": 5.230217189704068e-05, + "loss": 0.4052, + "step": 37820 + }, + { + "epoch": 741.76, + "learning_rate": 5.226354944152174e-05, + "loss": 0.4068, + "step": 37830 + }, + { + "epoch": 741.96, + "learning_rate": 5.222493362453928e-05, + "loss": 0.4082, + "step": 37840 + }, + { + "epoch": 742.0, + "eval_loss": 0.4107462167739868, + "eval_runtime": 2.3772, + "eval_samples_per_second": 958.707, + "eval_steps_per_second": 3.786, + "step": 37842 + }, + { + "epoch": 742.16, + "learning_rate": 5.218632445736829e-05, + "loss": 0.4024, + "step": 37850 + }, + { + "epoch": 742.35, + "learning_rate": 5.214772195128175e-05, + "loss": 0.4072, + "step": 37860 + }, + { + "epoch": 742.55, + "learning_rate": 5.2109126117550734e-05, + "loss": 0.4071, + "step": 37870 + }, + { + "epoch": 742.75, + "learning_rate": 5.207053696744439e-05, + "loss": 0.4052, + "step": 37880 + }, + { + "epoch": 742.94, + "learning_rate": 5.203195451222986e-05, + "loss": 0.4073, + "step": 37890 + }, + { + "epoch": 743.0, + "eval_loss": 0.4117482900619507, + "eval_runtime": 2.3515, + "eval_samples_per_second": 969.164, + "eval_steps_per_second": 3.827, + "step": 37893 + }, + { + "epoch": 743.14, + "learning_rate": 5.1993378763172405e-05, + "loss": 0.4065, + "step": 37900 + }, + { + "epoch": 743.33, + "learning_rate": 5.1954809731535205e-05, + "loss": 0.4052, + "step": 37910 + }, + { + "epoch": 743.53, + "learning_rate": 5.1916247428579655e-05, + "loss": 0.4077, + "step": 37920 + }, + { + "epoch": 743.73, + "learning_rate": 5.187769186556503e-05, + "loss": 0.4065, + "step": 37930 + }, + { + "epoch": 743.92, + "learning_rate": 5.183914305374867e-05, + "loss": 0.403, + "step": 37940 + }, + { + "epoch": 744.0, + "eval_loss": 0.4163016974925995, + "eval_runtime": 2.3484, + "eval_samples_per_second": 970.467, + "eval_steps_per_second": 3.832, + "step": 37944 + }, + { + "epoch": 744.12, + "learning_rate": 5.180060100438604e-05, + "loss": 0.4029, + "step": 37950 + }, + { + "epoch": 744.31, + "learning_rate": 5.176206572873049e-05, + "loss": 0.4093, + "step": 37960 + }, + { + "epoch": 744.51, + "learning_rate": 5.172353723803352e-05, + "loss": 0.4116, + "step": 37970 + }, + { + "epoch": 744.71, + "learning_rate": 5.1685015543544524e-05, + "loss": 0.4061, + "step": 37980 + }, + { + "epoch": 744.9, + "learning_rate": 5.164650065651104e-05, + "loss": 0.4024, + "step": 37990 + }, + { + "epoch": 745.0, + "eval_loss": 0.4079929292201996, + "eval_runtime": 2.2235, + "eval_samples_per_second": 1024.945, + "eval_steps_per_second": 4.048, + "step": 37995 + }, + { + "epoch": 745.1, + "learning_rate": 5.160799258817854e-05, + "loss": 0.4086, + "step": 38000 + }, + { + "epoch": 745.29, + "learning_rate": 5.156949134979049e-05, + "loss": 0.4048, + "step": 38010 + }, + { + "epoch": 745.49, + "learning_rate": 5.153099695258843e-05, + "loss": 0.4021, + "step": 38020 + }, + { + "epoch": 745.69, + "learning_rate": 5.149250940781183e-05, + "loss": 0.4061, + "step": 38030 + }, + { + "epoch": 745.88, + "learning_rate": 5.145402872669824e-05, + "loss": 0.4098, + "step": 38040 + }, + { + "epoch": 746.0, + "eval_loss": 0.4082351624965668, + "eval_runtime": 2.2082, + "eval_samples_per_second": 1032.048, + "eval_steps_per_second": 4.076, + "step": 38046 + }, + { + "epoch": 746.08, + "learning_rate": 5.141555492048311e-05, + "loss": 0.4062, + "step": 38050 + }, + { + "epoch": 746.27, + "learning_rate": 5.137708800039999e-05, + "loss": 0.4041, + "step": 38060 + }, + { + "epoch": 746.47, + "learning_rate": 5.1338627977680316e-05, + "loss": 0.408, + "step": 38070 + }, + { + "epoch": 746.67, + "learning_rate": 5.130017486355356e-05, + "loss": 0.4061, + "step": 38080 + }, + { + "epoch": 746.86, + "learning_rate": 5.1261728669247204e-05, + "loss": 0.4072, + "step": 38090 + }, + { + "epoch": 747.0, + "eval_loss": 0.4111115038394928, + "eval_runtime": 2.3737, + "eval_samples_per_second": 960.092, + "eval_steps_per_second": 3.792, + "step": 38097 + }, + { + "epoch": 747.06, + "learning_rate": 5.1223289405986644e-05, + "loss": 0.4063, + "step": 38100 + }, + { + "epoch": 747.25, + "learning_rate": 5.118485708499533e-05, + "loss": 0.4049, + "step": 38110 + }, + { + "epoch": 747.45, + "learning_rate": 5.114643171749458e-05, + "loss": 0.4083, + "step": 38120 + }, + { + "epoch": 747.65, + "learning_rate": 5.1108013314703824e-05, + "loss": 0.4074, + "step": 38130 + }, + { + "epoch": 747.84, + "learning_rate": 5.106960188784033e-05, + "loss": 0.4065, + "step": 38140 + }, + { + "epoch": 748.0, + "eval_loss": 0.4118553102016449, + "eval_runtime": 2.1762, + "eval_samples_per_second": 1047.261, + "eval_steps_per_second": 4.136, + "step": 38148 + }, + { + "epoch": 748.04, + "learning_rate": 5.103119744811936e-05, + "loss": 0.4044, + "step": 38150 + }, + { + "epoch": 748.24, + "learning_rate": 5.099280000675421e-05, + "loss": 0.4071, + "step": 38160 + }, + { + "epoch": 748.43, + "learning_rate": 5.095440957495602e-05, + "loss": 0.4055, + "step": 38170 + }, + { + "epoch": 748.63, + "learning_rate": 5.0916026163933973e-05, + "loss": 0.4047, + "step": 38180 + }, + { + "epoch": 748.82, + "learning_rate": 5.0877649784895176e-05, + "loss": 0.404, + "step": 38190 + }, + { + "epoch": 749.0, + "eval_loss": 0.40867891907691956, + "eval_runtime": 2.2236, + "eval_samples_per_second": 1024.932, + "eval_steps_per_second": 4.048, + "step": 38199 + }, + { + "epoch": 749.02, + "learning_rate": 5.083928044904464e-05, + "loss": 0.4102, + "step": 38200 + }, + { + "epoch": 749.22, + "learning_rate": 5.08009181675854e-05, + "loss": 0.4052, + "step": 38210 + }, + { + "epoch": 749.41, + "learning_rate": 5.0762562951718336e-05, + "loss": 0.4052, + "step": 38220 + }, + { + "epoch": 749.61, + "learning_rate": 5.0724214812642355e-05, + "loss": 0.4073, + "step": 38230 + }, + { + "epoch": 749.8, + "learning_rate": 5.068587376155423e-05, + "loss": 0.4002, + "step": 38240 + }, + { + "epoch": 750.0, + "learning_rate": 5.064753980964874e-05, + "loss": 0.4024, + "step": 38250 + }, + { + "epoch": 750.0, + "eval_loss": 0.40933549404144287, + "eval_runtime": 2.3322, + "eval_samples_per_second": 977.173, + "eval_steps_per_second": 3.859, + "step": 38250 + }, + { + "epoch": 750.2, + "learning_rate": 5.060921296811852e-05, + "loss": 0.4045, + "step": 38260 + }, + { + "epoch": 750.39, + "learning_rate": 5.0570893248154106e-05, + "loss": 0.4013, + "step": 38270 + }, + { + "epoch": 750.59, + "learning_rate": 5.053258066094407e-05, + "loss": 0.4054, + "step": 38280 + }, + { + "epoch": 750.78, + "learning_rate": 5.0494275217674776e-05, + "loss": 0.4054, + "step": 38290 + }, + { + "epoch": 750.98, + "learning_rate": 5.045597692953061e-05, + "loss": 0.4054, + "step": 38300 + }, + { + "epoch": 751.0, + "eval_loss": 0.41108617186546326, + "eval_runtime": 2.3915, + "eval_samples_per_second": 952.947, + "eval_steps_per_second": 3.763, + "step": 38301 + }, + { + "epoch": 751.18, + "learning_rate": 5.0417685807693785e-05, + "loss": 0.4077, + "step": 38310 + }, + { + "epoch": 751.37, + "learning_rate": 5.037940186334449e-05, + "loss": 0.4014, + "step": 38320 + }, + { + "epoch": 751.57, + "learning_rate": 5.034112510766074e-05, + "loss": 0.4087, + "step": 38330 + }, + { + "epoch": 751.76, + "learning_rate": 5.0302855551818505e-05, + "loss": 0.4092, + "step": 38340 + }, + { + "epoch": 751.96, + "learning_rate": 5.026459320699166e-05, + "loss": 0.403, + "step": 38350 + }, + { + "epoch": 752.0, + "eval_loss": 0.40934062004089355, + "eval_runtime": 2.194, + "eval_samples_per_second": 1038.731, + "eval_steps_per_second": 4.102, + "step": 38352 + }, + { + "epoch": 752.16, + "learning_rate": 5.022633808435193e-05, + "loss": 0.403, + "step": 38360 + }, + { + "epoch": 752.35, + "learning_rate": 5.0188090195069e-05, + "loss": 0.4061, + "step": 38370 + }, + { + "epoch": 752.55, + "learning_rate": 5.0149849550310346e-05, + "loss": 0.4043, + "step": 38380 + }, + { + "epoch": 752.75, + "learning_rate": 5.0111616161241436e-05, + "loss": 0.4077, + "step": 38390 + }, + { + "epoch": 752.94, + "learning_rate": 5.007339003902553e-05, + "loss": 0.4042, + "step": 38400 + }, + { + "epoch": 753.0, + "eval_loss": 0.4117041826248169, + "eval_runtime": 2.3245, + "eval_samples_per_second": 980.409, + "eval_steps_per_second": 3.872, + "step": 38403 + }, + { + "epoch": 753.14, + "learning_rate": 5.0035171194823804e-05, + "loss": 0.4031, + "step": 38410 + }, + { + "epoch": 753.33, + "learning_rate": 4.9996959639795356e-05, + "loss": 0.4069, + "step": 38420 + }, + { + "epoch": 753.53, + "learning_rate": 4.9958755385097035e-05, + "loss": 0.4078, + "step": 38430 + }, + { + "epoch": 753.73, + "learning_rate": 4.992055844188368e-05, + "loss": 0.4035, + "step": 38440 + }, + { + "epoch": 753.92, + "learning_rate": 4.988236882130792e-05, + "loss": 0.4025, + "step": 38450 + }, + { + "epoch": 754.0, + "eval_loss": 0.40878942608833313, + "eval_runtime": 2.2485, + "eval_samples_per_second": 1013.564, + "eval_steps_per_second": 4.003, + "step": 38454 + }, + { + "epoch": 754.12, + "learning_rate": 4.9844186534520305e-05, + "loss": 0.4056, + "step": 38460 + }, + { + "epoch": 754.31, + "learning_rate": 4.9806011592669205e-05, + "loss": 0.4034, + "step": 38470 + }, + { + "epoch": 754.51, + "learning_rate": 4.9767844006900806e-05, + "loss": 0.4106, + "step": 38480 + }, + { + "epoch": 754.71, + "learning_rate": 4.9729683788359245e-05, + "loss": 0.4055, + "step": 38490 + }, + { + "epoch": 754.9, + "learning_rate": 4.969153094818643e-05, + "loss": 0.4025, + "step": 38500 + }, + { + "epoch": 755.0, + "eval_loss": 0.41021928191185, + "eval_runtime": 2.3051, + "eval_samples_per_second": 988.664, + "eval_steps_per_second": 3.904, + "step": 38505 + }, + { + "epoch": 755.1, + "learning_rate": 4.965338549752215e-05, + "loss": 0.4049, + "step": 38510 + }, + { + "epoch": 755.29, + "learning_rate": 4.9615247447504e-05, + "loss": 0.4027, + "step": 38520 + }, + { + "epoch": 755.49, + "learning_rate": 4.9577116809267496e-05, + "loss": 0.4066, + "step": 38530 + }, + { + "epoch": 755.69, + "learning_rate": 4.9538993593945874e-05, + "loss": 0.4049, + "step": 38540 + }, + { + "epoch": 755.88, + "learning_rate": 4.9500877812670294e-05, + "loss": 0.4056, + "step": 38550 + }, + { + "epoch": 756.0, + "eval_loss": 0.41347235441207886, + "eval_runtime": 2.3625, + "eval_samples_per_second": 964.654, + "eval_steps_per_second": 3.81, + "step": 38556 + }, + { + "epoch": 756.08, + "learning_rate": 4.94627694765697e-05, + "loss": 0.4062, + "step": 38560 + }, + { + "epoch": 756.27, + "learning_rate": 4.942466859677087e-05, + "loss": 0.4062, + "step": 38570 + }, + { + "epoch": 756.47, + "learning_rate": 4.938657518439847e-05, + "loss": 0.4028, + "step": 38580 + }, + { + "epoch": 756.67, + "learning_rate": 4.934848925057484e-05, + "loss": 0.4066, + "step": 38590 + }, + { + "epoch": 756.86, + "learning_rate": 4.931041080642028e-05, + "loss": 0.4025, + "step": 38600 + }, + { + "epoch": 757.0, + "eval_loss": 0.4124828279018402, + "eval_runtime": 2.3537, + "eval_samples_per_second": 968.283, + "eval_steps_per_second": 3.824, + "step": 38607 + }, + { + "epoch": 757.06, + "learning_rate": 4.927233986305284e-05, + "loss": 0.4103, + "step": 38610 + }, + { + "epoch": 757.25, + "learning_rate": 4.923427643158835e-05, + "loss": 0.4059, + "step": 38620 + }, + { + "epoch": 757.45, + "learning_rate": 4.9196220523140555e-05, + "loss": 0.4034, + "step": 38630 + }, + { + "epoch": 757.65, + "learning_rate": 4.9158172148820846e-05, + "loss": 0.4035, + "step": 38640 + }, + { + "epoch": 757.84, + "learning_rate": 4.9120131319738555e-05, + "loss": 0.4035, + "step": 38650 + }, + { + "epoch": 758.0, + "eval_loss": 0.410969078540802, + "eval_runtime": 2.213, + "eval_samples_per_second": 1029.805, + "eval_steps_per_second": 4.067, + "step": 38658 + }, + { + "epoch": 758.04, + "learning_rate": 4.908209804700074e-05, + "loss": 0.4063, + "step": 38660 + }, + { + "epoch": 758.24, + "learning_rate": 4.904407234171227e-05, + "loss": 0.4016, + "step": 38670 + }, + { + "epoch": 758.43, + "learning_rate": 4.900605421497583e-05, + "loss": 0.4021, + "step": 38680 + }, + { + "epoch": 758.63, + "learning_rate": 4.896804367789179e-05, + "loss": 0.3997, + "step": 38690 + }, + { + "epoch": 758.82, + "learning_rate": 4.893004074155846e-05, + "loss": 0.4026, + "step": 38700 + }, + { + "epoch": 759.0, + "eval_loss": 0.41265445947647095, + "eval_runtime": 2.3125, + "eval_samples_per_second": 985.522, + "eval_steps_per_second": 3.892, + "step": 38709 + }, + { + "epoch": 759.02, + "learning_rate": 4.889204541707179e-05, + "loss": 0.4016, + "step": 38710 + }, + { + "epoch": 759.22, + "learning_rate": 4.885405771552561e-05, + "loss": 0.3999, + "step": 38720 + }, + { + "epoch": 759.41, + "learning_rate": 4.881607764801146e-05, + "loss": 0.4046, + "step": 38730 + }, + { + "epoch": 759.61, + "learning_rate": 4.8778105225618705e-05, + "loss": 0.4063, + "step": 38740 + }, + { + "epoch": 759.8, + "learning_rate": 4.8740140459434405e-05, + "loss": 0.4072, + "step": 38750 + }, + { + "epoch": 760.0, + "learning_rate": 4.8702183360543426e-05, + "loss": 0.4028, + "step": 38760 + }, + { + "epoch": 760.0, + "eval_loss": 0.4106810688972473, + "eval_runtime": 2.3297, + "eval_samples_per_second": 978.218, + "eval_steps_per_second": 3.863, + "step": 38760 + }, + { + "epoch": 760.2, + "learning_rate": 4.866423394002841e-05, + "loss": 0.402, + "step": 38770 + }, + { + "epoch": 760.39, + "learning_rate": 4.8626292208969734e-05, + "loss": 0.4087, + "step": 38780 + }, + { + "epoch": 760.59, + "learning_rate": 4.858835817844557e-05, + "loss": 0.4029, + "step": 38790 + }, + { + "epoch": 760.78, + "learning_rate": 4.855043185953175e-05, + "loss": 0.4045, + "step": 38800 + }, + { + "epoch": 760.98, + "learning_rate": 4.851251326330196e-05, + "loss": 0.4007, + "step": 38810 + }, + { + "epoch": 761.0, + "eval_loss": 0.4079250693321228, + "eval_runtime": 2.2583, + "eval_samples_per_second": 1009.151, + "eval_steps_per_second": 3.985, + "step": 38811 + }, + { + "epoch": 761.18, + "learning_rate": 4.8474602400827575e-05, + "loss": 0.4066, + "step": 38820 + }, + { + "epoch": 761.37, + "learning_rate": 4.843669928317769e-05, + "loss": 0.4041, + "step": 38830 + }, + { + "epoch": 761.57, + "learning_rate": 4.8398803921419235e-05, + "loss": 0.4006, + "step": 38840 + }, + { + "epoch": 761.76, + "learning_rate": 4.8360916326616735e-05, + "loss": 0.4042, + "step": 38850 + }, + { + "epoch": 761.96, + "learning_rate": 4.832303650983258e-05, + "loss": 0.4043, + "step": 38860 + }, + { + "epoch": 762.0, + "eval_loss": 0.4105600118637085, + "eval_runtime": 2.2483, + "eval_samples_per_second": 1013.643, + "eval_steps_per_second": 4.003, + "step": 38862 + }, + { + "epoch": 762.16, + "learning_rate": 4.82851644821268e-05, + "loss": 0.407, + "step": 38870 + }, + { + "epoch": 762.35, + "learning_rate": 4.824730025455719e-05, + "loss": 0.4043, + "step": 38880 + }, + { + "epoch": 762.55, + "learning_rate": 4.820944383817928e-05, + "loss": 0.3991, + "step": 38890 + }, + { + "epoch": 762.75, + "learning_rate": 4.8171595244046256e-05, + "loss": 0.4011, + "step": 38900 + }, + { + "epoch": 762.94, + "learning_rate": 4.8133754483209105e-05, + "loss": 0.3979, + "step": 38910 + }, + { + "epoch": 763.0, + "eval_loss": 0.40839433670043945, + "eval_runtime": 2.2395, + "eval_samples_per_second": 1017.652, + "eval_steps_per_second": 4.019, + "step": 38913 + }, + { + "epoch": 763.14, + "learning_rate": 4.809592156671645e-05, + "loss": 0.4054, + "step": 38920 + }, + { + "epoch": 763.33, + "learning_rate": 4.8058096505614704e-05, + "loss": 0.4027, + "step": 38930 + }, + { + "epoch": 763.53, + "learning_rate": 4.8020279310947924e-05, + "loss": 0.401, + "step": 38940 + }, + { + "epoch": 763.73, + "learning_rate": 4.798246999375785e-05, + "loss": 0.4027, + "step": 38950 + }, + { + "epoch": 763.92, + "learning_rate": 4.7944668565084e-05, + "loss": 0.4071, + "step": 38960 + }, + { + "epoch": 764.0, + "eval_loss": 0.4093049466609955, + "eval_runtime": 2.2294, + "eval_samples_per_second": 1022.268, + "eval_steps_per_second": 4.037, + "step": 38964 + }, + { + "epoch": 764.12, + "learning_rate": 4.790687503596353e-05, + "loss": 0.4103, + "step": 38970 + }, + { + "epoch": 764.31, + "learning_rate": 4.786908941743132e-05, + "loss": 0.4023, + "step": 38980 + }, + { + "epoch": 764.51, + "learning_rate": 4.783131172051991e-05, + "loss": 0.4053, + "step": 38990 + }, + { + "epoch": 764.71, + "learning_rate": 4.779354195625958e-05, + "loss": 0.403, + "step": 39000 + }, + { + "epoch": 764.9, + "learning_rate": 4.775578013567824e-05, + "loss": 0.4097, + "step": 39010 + }, + { + "epoch": 765.0, + "eval_loss": 0.41303664445877075, + "eval_runtime": 2.3567, + "eval_samples_per_second": 967.04, + "eval_steps_per_second": 3.819, + "step": 39015 + }, + { + "epoch": 765.1, + "learning_rate": 4.7718026269801465e-05, + "loss": 0.4063, + "step": 39020 + }, + { + "epoch": 765.29, + "learning_rate": 4.7680280369652595e-05, + "loss": 0.4052, + "step": 39030 + }, + { + "epoch": 765.49, + "learning_rate": 4.7642542446252544e-05, + "loss": 0.4057, + "step": 39040 + }, + { + "epoch": 765.69, + "learning_rate": 4.760481251062001e-05, + "loss": 0.4035, + "step": 39050 + }, + { + "epoch": 765.88, + "learning_rate": 4.756709057377121e-05, + "loss": 0.4052, + "step": 39060 + }, + { + "epoch": 766.0, + "eval_loss": 0.4117512106895447, + "eval_runtime": 2.2303, + "eval_samples_per_second": 1021.815, + "eval_steps_per_second": 4.035, + "step": 39066 + }, + { + "epoch": 766.08, + "learning_rate": 4.7529376646720166e-05, + "loss": 0.4075, + "step": 39070 + }, + { + "epoch": 766.27, + "learning_rate": 4.7491670740478496e-05, + "loss": 0.4024, + "step": 39080 + }, + { + "epoch": 766.47, + "learning_rate": 4.745397286605545e-05, + "loss": 0.4008, + "step": 39090 + }, + { + "epoch": 766.67, + "learning_rate": 4.741628303445802e-05, + "loss": 0.3995, + "step": 39100 + }, + { + "epoch": 766.86, + "learning_rate": 4.737860125669074e-05, + "loss": 0.4063, + "step": 39110 + }, + { + "epoch": 767.0, + "eval_loss": 0.4054567217826843, + "eval_runtime": 2.1606, + "eval_samples_per_second": 1054.78, + "eval_steps_per_second": 4.165, + "step": 39117 + }, + { + "epoch": 767.06, + "learning_rate": 4.73409275437559e-05, + "loss": 0.4031, + "step": 39120 + }, + { + "epoch": 767.25, + "learning_rate": 4.730326190665333e-05, + "loss": 0.4016, + "step": 39130 + }, + { + "epoch": 767.45, + "learning_rate": 4.726560435638061e-05, + "loss": 0.4006, + "step": 39140 + }, + { + "epoch": 767.65, + "learning_rate": 4.72279549039329e-05, + "loss": 0.4071, + "step": 39150 + }, + { + "epoch": 767.84, + "learning_rate": 4.719031356030294e-05, + "loss": 0.4051, + "step": 39160 + }, + { + "epoch": 768.0, + "eval_loss": 0.4055671691894531, + "eval_runtime": 2.2198, + "eval_samples_per_second": 1026.654, + "eval_steps_per_second": 4.054, + "step": 39168 + }, + { + "epoch": 768.04, + "learning_rate": 4.715268033648123e-05, + "loss": 0.4019, + "step": 39170 + }, + { + "epoch": 768.24, + "learning_rate": 4.711505524345578e-05, + "loss": 0.4041, + "step": 39180 + }, + { + "epoch": 768.43, + "learning_rate": 4.707743829221233e-05, + "loss": 0.4001, + "step": 39190 + }, + { + "epoch": 768.63, + "learning_rate": 4.703982949373414e-05, + "loss": 0.4049, + "step": 39200 + }, + { + "epoch": 768.82, + "learning_rate": 4.700222885900221e-05, + "loss": 0.403, + "step": 39210 + }, + { + "epoch": 769.0, + "eval_loss": 0.4054199457168579, + "eval_runtime": 2.2681, + "eval_samples_per_second": 1004.796, + "eval_steps_per_second": 3.968, + "step": 39219 + }, + { + "epoch": 769.02, + "learning_rate": 4.696463639899501e-05, + "loss": 0.3987, + "step": 39220 + }, + { + "epoch": 769.22, + "learning_rate": 4.692705212468873e-05, + "loss": 0.3996, + "step": 39230 + }, + { + "epoch": 769.41, + "learning_rate": 4.688947604705715e-05, + "loss": 0.4056, + "step": 39240 + }, + { + "epoch": 769.61, + "learning_rate": 4.685190817707163e-05, + "loss": 0.4049, + "step": 39250 + }, + { + "epoch": 769.8, + "learning_rate": 4.681434852570118e-05, + "loss": 0.402, + "step": 39260 + }, + { + "epoch": 770.0, + "learning_rate": 4.6776797103912336e-05, + "loss": 0.4061, + "step": 39270 + }, + { + "epoch": 770.0, + "eval_loss": 0.41016271710395813, + "eval_runtime": 2.241, + "eval_samples_per_second": 1016.966, + "eval_steps_per_second": 4.016, + "step": 39270 + }, + { + "epoch": 770.2, + "learning_rate": 4.673925392266931e-05, + "loss": 0.403, + "step": 39280 + }, + { + "epoch": 770.39, + "learning_rate": 4.670171899293387e-05, + "loss": 0.4001, + "step": 39290 + }, + { + "epoch": 770.59, + "learning_rate": 4.6664192325665355e-05, + "loss": 0.3978, + "step": 39300 + }, + { + "epoch": 770.78, + "learning_rate": 4.6626673931820754e-05, + "loss": 0.4044, + "step": 39310 + }, + { + "epoch": 770.98, + "learning_rate": 4.658916382235455e-05, + "loss": 0.3989, + "step": 39320 + }, + { + "epoch": 771.0, + "eval_loss": 0.4141434133052826, + "eval_runtime": 2.3668, + "eval_samples_per_second": 962.921, + "eval_steps_per_second": 3.803, + "step": 39321 + }, + { + "epoch": 771.18, + "learning_rate": 4.655166200821891e-05, + "loss": 0.4017, + "step": 39330 + }, + { + "epoch": 771.37, + "learning_rate": 4.651416850036347e-05, + "loss": 0.4017, + "step": 39340 + }, + { + "epoch": 771.57, + "learning_rate": 4.6476683309735577e-05, + "loss": 0.4026, + "step": 39350 + }, + { + "epoch": 771.76, + "learning_rate": 4.6439206447280014e-05, + "loss": 0.3998, + "step": 39360 + }, + { + "epoch": 771.96, + "learning_rate": 4.640173792393918e-05, + "loss": 0.4022, + "step": 39370 + }, + { + "epoch": 772.0, + "eval_loss": 0.4049689769744873, + "eval_runtime": 2.3487, + "eval_samples_per_second": 970.308, + "eval_steps_per_second": 3.832, + "step": 39372 + }, + { + "epoch": 772.16, + "learning_rate": 4.636427775065309e-05, + "loss": 0.4021, + "step": 39380 + }, + { + "epoch": 772.35, + "learning_rate": 4.632682593835923e-05, + "loss": 0.3976, + "step": 39390 + }, + { + "epoch": 772.55, + "learning_rate": 4.628938249799275e-05, + "loss": 0.4025, + "step": 39400 + }, + { + "epoch": 772.75, + "learning_rate": 4.6251947440486256e-05, + "loss": 0.4031, + "step": 39410 + }, + { + "epoch": 772.94, + "learning_rate": 4.621452077676999e-05, + "loss": 0.4018, + "step": 39420 + }, + { + "epoch": 773.0, + "eval_loss": 0.4097810685634613, + "eval_runtime": 2.2555, + "eval_samples_per_second": 1010.415, + "eval_steps_per_second": 3.99, + "step": 39423 + }, + { + "epoch": 773.14, + "learning_rate": 4.6177102517771665e-05, + "loss": 0.4007, + "step": 39430 + }, + { + "epoch": 773.33, + "learning_rate": 4.613969267441658e-05, + "loss": 0.4034, + "step": 39440 + }, + { + "epoch": 773.53, + "learning_rate": 4.6102291257627594e-05, + "loss": 0.4008, + "step": 39450 + }, + { + "epoch": 773.73, + "learning_rate": 4.606489827832507e-05, + "loss": 0.3996, + "step": 39460 + }, + { + "epoch": 773.92, + "learning_rate": 4.602751374742697e-05, + "loss": 0.3993, + "step": 39470 + }, + { + "epoch": 774.0, + "eval_loss": 0.40897953510284424, + "eval_runtime": 2.2539, + "eval_samples_per_second": 1011.143, + "eval_steps_per_second": 3.993, + "step": 39474 + }, + { + "epoch": 774.12, + "learning_rate": 4.5990137675848666e-05, + "loss": 0.3995, + "step": 39480 + }, + { + "epoch": 774.31, + "learning_rate": 4.595277007450319e-05, + "loss": 0.4037, + "step": 39490 + }, + { + "epoch": 774.51, + "learning_rate": 4.591541095430105e-05, + "loss": 0.4052, + "step": 39500 + }, + { + "epoch": 774.71, + "learning_rate": 4.5878060326150234e-05, + "loss": 0.3992, + "step": 39510 + }, + { + "epoch": 774.9, + "learning_rate": 4.584071820095636e-05, + "loss": 0.3984, + "step": 39520 + }, + { + "epoch": 775.0, + "eval_loss": 0.40743353962898254, + "eval_runtime": 2.3001, + "eval_samples_per_second": 990.816, + "eval_steps_per_second": 3.913, + "step": 39525 + }, + { + "epoch": 775.1, + "learning_rate": 4.580338458962242e-05, + "loss": 0.4075, + "step": 39530 + }, + { + "epoch": 775.29, + "learning_rate": 4.576605950304905e-05, + "loss": 0.401, + "step": 39540 + }, + { + "epoch": 775.49, + "learning_rate": 4.572874295213431e-05, + "loss": 0.4026, + "step": 39550 + }, + { + "epoch": 775.69, + "learning_rate": 4.569143494777383e-05, + "loss": 0.4046, + "step": 39560 + }, + { + "epoch": 775.88, + "learning_rate": 4.5654135500860715e-05, + "loss": 0.4034, + "step": 39570 + }, + { + "epoch": 776.0, + "eval_loss": 0.40677332878112793, + "eval_runtime": 2.3793, + "eval_samples_per_second": 957.851, + "eval_steps_per_second": 3.783, + "step": 39576 + }, + { + "epoch": 776.08, + "learning_rate": 4.561684462228553e-05, + "loss": 0.4034, + "step": 39580 + }, + { + "epoch": 776.27, + "learning_rate": 4.5579562322936416e-05, + "loss": 0.3984, + "step": 39590 + }, + { + "epoch": 776.47, + "learning_rate": 4.554228861369895e-05, + "loss": 0.3992, + "step": 39600 + }, + { + "epoch": 776.67, + "learning_rate": 4.550502350545626e-05, + "loss": 0.4, + "step": 39610 + }, + { + "epoch": 776.86, + "learning_rate": 4.546776700908892e-05, + "loss": 0.4036, + "step": 39620 + }, + { + "epoch": 777.0, + "eval_loss": 0.4042729437351227, + "eval_runtime": 2.3533, + "eval_samples_per_second": 968.434, + "eval_steps_per_second": 3.824, + "step": 39627 + }, + { + "epoch": 777.06, + "learning_rate": 4.543051913547495e-05, + "loss": 0.4006, + "step": 39630 + }, + { + "epoch": 777.25, + "learning_rate": 4.5393279895489934e-05, + "loss": 0.403, + "step": 39640 + }, + { + "epoch": 777.45, + "learning_rate": 4.535604930000689e-05, + "loss": 0.4016, + "step": 39650 + }, + { + "epoch": 777.65, + "learning_rate": 4.531882735989633e-05, + "loss": 0.398, + "step": 39660 + }, + { + "epoch": 777.84, + "learning_rate": 4.5281614086026227e-05, + "loss": 0.4027, + "step": 39670 + }, + { + "epoch": 778.0, + "eval_loss": 0.40563011169433594, + "eval_runtime": 2.2438, + "eval_samples_per_second": 1015.69, + "eval_steps_per_second": 4.011, + "step": 39678 + }, + { + "epoch": 778.04, + "learning_rate": 4.5244409489262054e-05, + "loss": 0.3998, + "step": 39680 + }, + { + "epoch": 778.24, + "learning_rate": 4.520721358046667e-05, + "loss": 0.3989, + "step": 39690 + }, + { + "epoch": 778.43, + "learning_rate": 4.5170026370500465e-05, + "loss": 0.3968, + "step": 39700 + }, + { + "epoch": 778.63, + "learning_rate": 4.51328478702213e-05, + "loss": 0.4031, + "step": 39710 + }, + { + "epoch": 778.82, + "learning_rate": 4.509567809048445e-05, + "loss": 0.3999, + "step": 39720 + }, + { + "epoch": 779.0, + "eval_loss": 0.410388708114624, + "eval_runtime": 2.3154, + "eval_samples_per_second": 984.289, + "eval_steps_per_second": 3.887, + "step": 39729 + }, + { + "epoch": 779.02, + "learning_rate": 4.505851704214269e-05, + "loss": 0.4029, + "step": 39730 + }, + { + "epoch": 779.22, + "learning_rate": 4.502136473604616e-05, + "loss": 0.3985, + "step": 39740 + }, + { + "epoch": 779.41, + "learning_rate": 4.4984221183042566e-05, + "loss": 0.3996, + "step": 39750 + }, + { + "epoch": 779.61, + "learning_rate": 4.494708639397696e-05, + "loss": 0.4005, + "step": 39760 + }, + { + "epoch": 779.8, + "learning_rate": 4.490996037969187e-05, + "loss": 0.4032, + "step": 39770 + }, + { + "epoch": 780.0, + "learning_rate": 4.48728431510273e-05, + "loss": 0.401, + "step": 39780 + }, + { + "epoch": 780.0, + "eval_loss": 0.4033023416996002, + "eval_runtime": 2.1991, + "eval_samples_per_second": 1036.345, + "eval_steps_per_second": 4.093, + "step": 39780 + }, + { + "epoch": 780.2, + "learning_rate": 4.483573471882061e-05, + "loss": 0.3999, + "step": 39790 + }, + { + "epoch": 780.39, + "learning_rate": 4.479863509390666e-05, + "loss": 0.3978, + "step": 39800 + }, + { + "epoch": 780.59, + "learning_rate": 4.4761544287117696e-05, + "loss": 0.3999, + "step": 39810 + }, + { + "epoch": 780.78, + "learning_rate": 4.472446230928343e-05, + "loss": 0.4011, + "step": 39820 + }, + { + "epoch": 780.98, + "learning_rate": 4.4687389171230975e-05, + "loss": 0.4058, + "step": 39830 + }, + { + "epoch": 781.0, + "eval_loss": 0.40577030181884766, + "eval_runtime": 2.3092, + "eval_samples_per_second": 986.928, + "eval_steps_per_second": 3.897, + "step": 39831 + }, + { + "epoch": 781.18, + "learning_rate": 4.465032488378481e-05, + "loss": 0.3979, + "step": 39840 + }, + { + "epoch": 781.37, + "learning_rate": 4.461326945776694e-05, + "loss": 0.4013, + "step": 39850 + }, + { + "epoch": 781.57, + "learning_rate": 4.457622290399668e-05, + "loss": 0.4018, + "step": 39860 + }, + { + "epoch": 781.76, + "learning_rate": 4.453918523329084e-05, + "loss": 0.4026, + "step": 39870 + }, + { + "epoch": 781.96, + "learning_rate": 4.4502156456463536e-05, + "loss": 0.3977, + "step": 39880 + }, + { + "epoch": 782.0, + "eval_loss": 0.40937620401382446, + "eval_runtime": 2.3002, + "eval_samples_per_second": 990.777, + "eval_steps_per_second": 3.913, + "step": 39882 + }, + { + "epoch": 782.16, + "learning_rate": 4.446513658432642e-05, + "loss": 0.4016, + "step": 39890 + }, + { + "epoch": 782.35, + "learning_rate": 4.44281256276884e-05, + "loss": 0.3995, + "step": 39900 + }, + { + "epoch": 782.55, + "learning_rate": 4.439112359735588e-05, + "loss": 0.4021, + "step": 39910 + }, + { + "epoch": 782.75, + "learning_rate": 4.4354130504132636e-05, + "loss": 0.399, + "step": 39920 + }, + { + "epoch": 782.94, + "learning_rate": 4.4317146358819794e-05, + "loss": 0.402, + "step": 39930 + }, + { + "epoch": 783.0, + "eval_loss": 0.4056869447231293, + "eval_runtime": 2.1949, + "eval_samples_per_second": 1038.316, + "eval_steps_per_second": 4.1, + "step": 39933 + }, + { + "epoch": 783.14, + "learning_rate": 4.428017117221596e-05, + "loss": 0.3983, + "step": 39940 + }, + { + "epoch": 783.33, + "learning_rate": 4.4243204955116995e-05, + "loss": 0.3968, + "step": 39950 + }, + { + "epoch": 783.53, + "learning_rate": 4.420624771831625e-05, + "loss": 0.3991, + "step": 39960 + }, + { + "epoch": 783.73, + "learning_rate": 4.41692994726044e-05, + "loss": 0.3984, + "step": 39970 + }, + { + "epoch": 783.92, + "learning_rate": 4.4132360228769506e-05, + "loss": 0.3972, + "step": 39980 + }, + { + "epoch": 784.0, + "eval_loss": 0.4044448733329773, + "eval_runtime": 2.3046, + "eval_samples_per_second": 988.877, + "eval_steps_per_second": 3.905, + "step": 39984 + }, + { + "epoch": 784.12, + "learning_rate": 4.409542999759703e-05, + "loss": 0.404, + "step": 39990 + }, + { + "epoch": 784.31, + "learning_rate": 4.4058508789869736e-05, + "loss": 0.4004, + "step": 40000 + }, + { + "epoch": 784.51, + "learning_rate": 4.4021596616367825e-05, + "loss": 0.3969, + "step": 40010 + }, + { + "epoch": 784.71, + "learning_rate": 4.3984693487868806e-05, + "loss": 0.4001, + "step": 40020 + }, + { + "epoch": 784.9, + "learning_rate": 4.394779941514759e-05, + "loss": 0.3997, + "step": 40030 + }, + { + "epoch": 785.0, + "eval_loss": 0.40749338269233704, + "eval_runtime": 2.2723, + "eval_samples_per_second": 1002.965, + "eval_steps_per_second": 3.961, + "step": 40035 + }, + { + "epoch": 785.1, + "learning_rate": 4.3910914408976426e-05, + "loss": 0.3994, + "step": 40040 + }, + { + "epoch": 785.29, + "learning_rate": 4.3874038480124876e-05, + "loss": 0.4003, + "step": 40050 + }, + { + "epoch": 785.49, + "learning_rate": 4.383717163935992e-05, + "loss": 0.3993, + "step": 40060 + }, + { + "epoch": 785.69, + "learning_rate": 4.380031389744584e-05, + "loss": 0.4, + "step": 40070 + }, + { + "epoch": 785.88, + "learning_rate": 4.376346526514429e-05, + "loss": 0.4003, + "step": 40080 + }, + { + "epoch": 786.0, + "eval_loss": 0.4073701798915863, + "eval_runtime": 2.1998, + "eval_samples_per_second": 1036.005, + "eval_steps_per_second": 4.091, + "step": 40086 + }, + { + "epoch": 786.08, + "learning_rate": 4.372662575321423e-05, + "loss": 0.3957, + "step": 40090 + }, + { + "epoch": 786.27, + "learning_rate": 4.368979537241202e-05, + "loss": 0.3999, + "step": 40100 + }, + { + "epoch": 786.47, + "learning_rate": 4.365297413349127e-05, + "loss": 0.3975, + "step": 40110 + }, + { + "epoch": 786.67, + "learning_rate": 4.3616162047202904e-05, + "loss": 0.3998, + "step": 40120 + }, + { + "epoch": 786.86, + "learning_rate": 4.3579359124295356e-05, + "loss": 0.3973, + "step": 40130 + }, + { + "epoch": 787.0, + "eval_loss": 0.4044763445854187, + "eval_runtime": 2.3538, + "eval_samples_per_second": 968.204, + "eval_steps_per_second": 3.824, + "step": 40137 + }, + { + "epoch": 787.06, + "learning_rate": 4.3542565375514164e-05, + "loss": 0.3938, + "step": 40140 + }, + { + "epoch": 787.25, + "learning_rate": 4.350578081160235e-05, + "loss": 0.3976, + "step": 40150 + }, + { + "epoch": 787.45, + "learning_rate": 4.346900544330011e-05, + "loss": 0.3953, + "step": 40160 + }, + { + "epoch": 787.65, + "learning_rate": 4.343223928134511e-05, + "loss": 0.4008, + "step": 40170 + }, + { + "epoch": 787.84, + "learning_rate": 4.3395482336472175e-05, + "loss": 0.3989, + "step": 40180 + }, + { + "epoch": 788.0, + "eval_loss": 0.4078381061553955, + "eval_runtime": 2.2214, + "eval_samples_per_second": 1025.951, + "eval_steps_per_second": 4.052, + "step": 40188 + }, + { + "epoch": 788.04, + "learning_rate": 4.335873461941355e-05, + "loss": 0.3958, + "step": 40190 + }, + { + "epoch": 788.24, + "learning_rate": 4.332199614089878e-05, + "loss": 0.4011, + "step": 40200 + }, + { + "epoch": 788.43, + "learning_rate": 4.328526691165462e-05, + "loss": 0.398, + "step": 40210 + }, + { + "epoch": 788.63, + "learning_rate": 4.3248546942405235e-05, + "loss": 0.3984, + "step": 40220 + }, + { + "epoch": 788.82, + "learning_rate": 4.321183624387196e-05, + "loss": 0.4029, + "step": 40230 + }, + { + "epoch": 789.0, + "eval_loss": 0.40923169255256653, + "eval_runtime": 2.2528, + "eval_samples_per_second": 1011.619, + "eval_steps_per_second": 3.995, + "step": 40239 + }, + { + "epoch": 789.02, + "learning_rate": 4.3175134826773626e-05, + "loss": 0.3972, + "step": 40240 + }, + { + "epoch": 789.22, + "learning_rate": 4.313844270182615e-05, + "loss": 0.3998, + "step": 40250 + }, + { + "epoch": 789.41, + "learning_rate": 4.31017598797428e-05, + "loss": 0.4007, + "step": 40260 + }, + { + "epoch": 789.61, + "learning_rate": 4.306508637123419e-05, + "loss": 0.3986, + "step": 40270 + }, + { + "epoch": 789.8, + "learning_rate": 4.302842218700808e-05, + "loss": 0.396, + "step": 40280 + }, + { + "epoch": 790.0, + "learning_rate": 4.299176733776972e-05, + "loss": 0.4011, + "step": 40290 + }, + { + "epoch": 790.0, + "eval_loss": 0.4051341116428375, + "eval_runtime": 2.1983, + "eval_samples_per_second": 1036.73, + "eval_steps_per_second": 4.094, + "step": 40290 + }, + { + "epoch": 790.2, + "learning_rate": 4.295512183422145e-05, + "loss": 0.3981, + "step": 40300 + }, + { + "epoch": 790.39, + "learning_rate": 4.291848568706289e-05, + "loss": 0.401, + "step": 40310 + }, + { + "epoch": 790.59, + "learning_rate": 4.288185890699107e-05, + "loss": 0.4029, + "step": 40320 + }, + { + "epoch": 790.78, + "learning_rate": 4.284524150470007e-05, + "loss": 0.3987, + "step": 40330 + }, + { + "epoch": 790.98, + "learning_rate": 4.28086334908815e-05, + "loss": 0.3975, + "step": 40340 + }, + { + "epoch": 791.0, + "eval_loss": 0.4008138179779053, + "eval_runtime": 2.3676, + "eval_samples_per_second": 962.596, + "eval_steps_per_second": 3.801, + "step": 40341 + }, + { + "epoch": 791.18, + "learning_rate": 4.277203487622397e-05, + "loss": 0.4027, + "step": 40350 + }, + { + "epoch": 791.37, + "learning_rate": 4.273544567141354e-05, + "loss": 0.3978, + "step": 40360 + }, + { + "epoch": 791.57, + "learning_rate": 4.2698865887133414e-05, + "loss": 0.3928, + "step": 40370 + }, + { + "epoch": 791.76, + "learning_rate": 4.266229553406403e-05, + "loss": 0.3963, + "step": 40380 + }, + { + "epoch": 791.96, + "learning_rate": 4.262573462288314e-05, + "loss": 0.3952, + "step": 40390 + }, + { + "epoch": 792.0, + "eval_loss": 0.4049001634120941, + "eval_runtime": 2.1953, + "eval_samples_per_second": 1038.115, + "eval_steps_per_second": 4.1, + "step": 40392 + }, + { + "epoch": 792.16, + "learning_rate": 4.258918316426573e-05, + "loss": 0.3965, + "step": 40400 + }, + { + "epoch": 792.35, + "learning_rate": 4.255264116888404e-05, + "loss": 0.3969, + "step": 40410 + }, + { + "epoch": 792.55, + "learning_rate": 4.251610864740744e-05, + "loss": 0.3987, + "step": 40420 + }, + { + "epoch": 792.75, + "learning_rate": 4.247958561050269e-05, + "loss": 0.3956, + "step": 40430 + }, + { + "epoch": 792.94, + "learning_rate": 4.244307206883364e-05, + "loss": 0.4032, + "step": 40440 + }, + { + "epoch": 793.0, + "eval_loss": 0.4053691029548645, + "eval_runtime": 2.2536, + "eval_samples_per_second": 1011.28, + "eval_steps_per_second": 3.994, + "step": 40443 + }, + { + "epoch": 793.14, + "learning_rate": 4.240656803306145e-05, + "loss": 0.3987, + "step": 40450 + }, + { + "epoch": 793.33, + "learning_rate": 4.2370073513844523e-05, + "loss": 0.3955, + "step": 40460 + }, + { + "epoch": 793.53, + "learning_rate": 4.233358852183838e-05, + "loss": 0.3925, + "step": 40470 + }, + { + "epoch": 793.73, + "learning_rate": 4.2297113067695884e-05, + "loss": 0.3954, + "step": 40480 + }, + { + "epoch": 793.92, + "learning_rate": 4.2260647162066976e-05, + "loss": 0.4027, + "step": 40490 + }, + { + "epoch": 794.0, + "eval_loss": 0.40342316031455994, + "eval_runtime": 2.3542, + "eval_samples_per_second": 968.056, + "eval_steps_per_second": 3.823, + "step": 40494 + }, + { + "epoch": 794.12, + "learning_rate": 4.222419081559899e-05, + "loss": 0.4001, + "step": 40500 + }, + { + "epoch": 794.31, + "learning_rate": 4.218774403893632e-05, + "loss": 0.3975, + "step": 40510 + }, + { + "epoch": 794.51, + "learning_rate": 4.2151306842720574e-05, + "loss": 0.3954, + "step": 40520 + }, + { + "epoch": 794.71, + "learning_rate": 4.211487923759066e-05, + "loss": 0.3969, + "step": 40530 + }, + { + "epoch": 794.9, + "learning_rate": 4.207846123418254e-05, + "loss": 0.397, + "step": 40540 + }, + { + "epoch": 795.0, + "eval_loss": 0.40420642495155334, + "eval_runtime": 2.219, + "eval_samples_per_second": 1027.021, + "eval_steps_per_second": 4.056, + "step": 40545 + }, + { + "epoch": 795.1, + "learning_rate": 4.2042052843129586e-05, + "loss": 0.3991, + "step": 40550 + }, + { + "epoch": 795.29, + "learning_rate": 4.200565407506214e-05, + "loss": 0.3996, + "step": 40560 + }, + { + "epoch": 795.49, + "learning_rate": 4.196926494060788e-05, + "loss": 0.3975, + "step": 40570 + }, + { + "epoch": 795.69, + "learning_rate": 4.1932885450391594e-05, + "loss": 0.3958, + "step": 40580 + }, + { + "epoch": 795.88, + "learning_rate": 4.189651561503527e-05, + "loss": 0.3941, + "step": 40590 + }, + { + "epoch": 796.0, + "eval_loss": 0.40304508805274963, + "eval_runtime": 2.3229, + "eval_samples_per_second": 981.082, + "eval_steps_per_second": 3.874, + "step": 40596 + }, + { + "epoch": 796.08, + "learning_rate": 4.1860155445158104e-05, + "loss": 0.3953, + "step": 40600 + }, + { + "epoch": 796.27, + "learning_rate": 4.182380495137646e-05, + "loss": 0.3923, + "step": 40610 + }, + { + "epoch": 796.47, + "learning_rate": 4.1787464144303895e-05, + "loss": 0.4072, + "step": 40620 + }, + { + "epoch": 796.67, + "learning_rate": 4.175113303455106e-05, + "loss": 0.3953, + "step": 40630 + }, + { + "epoch": 796.86, + "learning_rate": 4.1714811632725886e-05, + "loss": 0.3929, + "step": 40640 + }, + { + "epoch": 797.0, + "eval_loss": 0.4031014144420624, + "eval_runtime": 2.2705, + "eval_samples_per_second": 1003.756, + "eval_steps_per_second": 3.964, + "step": 40647 + }, + { + "epoch": 797.06, + "learning_rate": 4.167849994943336e-05, + "loss": 0.4023, + "step": 40650 + }, + { + "epoch": 797.25, + "learning_rate": 4.16421979952757e-05, + "loss": 0.4006, + "step": 40660 + }, + { + "epoch": 797.45, + "learning_rate": 4.16059057808523e-05, + "loss": 0.399, + "step": 40670 + }, + { + "epoch": 797.65, + "learning_rate": 4.156962331675963e-05, + "loss": 0.3976, + "step": 40680 + }, + { + "epoch": 797.84, + "learning_rate": 4.153335061359141e-05, + "loss": 0.4016, + "step": 40690 + }, + { + "epoch": 798.0, + "eval_loss": 0.4003075361251831, + "eval_runtime": 2.2633, + "eval_samples_per_second": 1006.944, + "eval_steps_per_second": 3.977, + "step": 40698 + }, + { + "epoch": 798.04, + "learning_rate": 4.149708768193837e-05, + "loss": 0.4008, + "step": 40700 + }, + { + "epoch": 798.24, + "learning_rate": 4.14608345323886e-05, + "loss": 0.3992, + "step": 40710 + }, + { + "epoch": 798.43, + "learning_rate": 4.142459117552715e-05, + "loss": 0.4014, + "step": 40720 + }, + { + "epoch": 798.63, + "learning_rate": 4.1388357621936246e-05, + "loss": 0.3954, + "step": 40730 + }, + { + "epoch": 798.82, + "learning_rate": 4.1352133882195335e-05, + "loss": 0.3926, + "step": 40740 + }, + { + "epoch": 799.0, + "eval_loss": 0.4025706648826599, + "eval_runtime": 2.2185, + "eval_samples_per_second": 1027.263, + "eval_steps_per_second": 4.057, + "step": 40749 + }, + { + "epoch": 799.02, + "learning_rate": 4.131591996688084e-05, + "loss": 0.395, + "step": 40750 + }, + { + "epoch": 799.22, + "learning_rate": 4.127971588656656e-05, + "loss": 0.3989, + "step": 40760 + }, + { + "epoch": 799.41, + "learning_rate": 4.124352165182317e-05, + "loss": 0.3976, + "step": 40770 + }, + { + "epoch": 799.61, + "learning_rate": 4.120733727321864e-05, + "loss": 0.3955, + "step": 40780 + }, + { + "epoch": 799.8, + "learning_rate": 4.117116276131798e-05, + "loss": 0.3981, + "step": 40790 + }, + { + "epoch": 800.0, + "learning_rate": 4.113499812668331e-05, + "loss": 0.3985, + "step": 40800 + }, + { + "epoch": 800.0, + "eval_loss": 0.4045719802379608, + "eval_runtime": 2.3024, + "eval_samples_per_second": 989.823, + "eval_steps_per_second": 3.909, + "step": 40800 + }, + { + "epoch": 800.2, + "learning_rate": 4.1098843379873926e-05, + "loss": 0.3982, + "step": 40810 + }, + { + "epoch": 800.39, + "learning_rate": 4.10626985314462e-05, + "loss": 0.3994, + "step": 40820 + }, + { + "epoch": 800.59, + "learning_rate": 4.102656359195366e-05, + "loss": 0.3951, + "step": 40830 + }, + { + "epoch": 800.78, + "learning_rate": 4.099043857194684e-05, + "loss": 0.3987, + "step": 40840 + }, + { + "epoch": 800.98, + "learning_rate": 4.09543234819735e-05, + "loss": 0.3978, + "step": 40850 + }, + { + "epoch": 801.0, + "eval_loss": 0.4002394676208496, + "eval_runtime": 2.2534, + "eval_samples_per_second": 1011.373, + "eval_steps_per_second": 3.994, + "step": 40851 + }, + { + "epoch": 801.18, + "learning_rate": 4.091821833257838e-05, + "loss": 0.4006, + "step": 40860 + }, + { + "epoch": 801.37, + "learning_rate": 4.088212313430342e-05, + "loss": 0.3967, + "step": 40870 + }, + { + "epoch": 801.57, + "learning_rate": 4.084603789768762e-05, + "loss": 0.4034, + "step": 40880 + }, + { + "epoch": 801.76, + "learning_rate": 4.080996263326702e-05, + "loss": 0.3974, + "step": 40890 + }, + { + "epoch": 801.96, + "learning_rate": 4.0773897351574846e-05, + "loss": 0.3972, + "step": 40900 + }, + { + "epoch": 802.0, + "eval_loss": 0.4057813584804535, + "eval_runtime": 2.287, + "eval_samples_per_second": 996.519, + "eval_steps_per_second": 3.935, + "step": 40902 + }, + { + "epoch": 802.16, + "learning_rate": 4.073784206314127e-05, + "loss": 0.3923, + "step": 40910 + }, + { + "epoch": 802.35, + "learning_rate": 4.070179677849375e-05, + "loss": 0.3933, + "step": 40920 + }, + { + "epoch": 802.55, + "learning_rate": 4.0665761508156654e-05, + "loss": 0.3955, + "step": 40930 + }, + { + "epoch": 802.75, + "learning_rate": 4.062973626265144e-05, + "loss": 0.3995, + "step": 40940 + }, + { + "epoch": 802.94, + "learning_rate": 4.0593721052496725e-05, + "loss": 0.3993, + "step": 40950 + }, + { + "epoch": 803.0, + "eval_loss": 0.4025868773460388, + "eval_runtime": 2.3877, + "eval_samples_per_second": 954.49, + "eval_steps_per_second": 3.769, + "step": 40953 + }, + { + "epoch": 803.14, + "learning_rate": 4.055771588820808e-05, + "loss": 0.3984, + "step": 40960 + }, + { + "epoch": 803.33, + "learning_rate": 4.0521720780298315e-05, + "loss": 0.3989, + "step": 40970 + }, + { + "epoch": 803.53, + "learning_rate": 4.04857357392771e-05, + "loss": 0.3969, + "step": 40980 + }, + { + "epoch": 803.73, + "learning_rate": 4.044976077565136e-05, + "loss": 0.3943, + "step": 40990 + }, + { + "epoch": 803.92, + "learning_rate": 4.041379589992491e-05, + "loss": 0.3935, + "step": 41000 + }, + { + "epoch": 804.0, + "eval_loss": 0.4048832058906555, + "eval_runtime": 2.3357, + "eval_samples_per_second": 975.736, + "eval_steps_per_second": 3.853, + "step": 41004 + }, + { + "epoch": 804.12, + "learning_rate": 4.037784112259868e-05, + "loss": 0.3994, + "step": 41010 + }, + { + "epoch": 804.31, + "learning_rate": 4.03418964541707e-05, + "loss": 0.3917, + "step": 41020 + }, + { + "epoch": 804.51, + "learning_rate": 4.0305961905135996e-05, + "loss": 0.3989, + "step": 41030 + }, + { + "epoch": 804.71, + "learning_rate": 4.0270037485986705e-05, + "loss": 0.3977, + "step": 41040 + }, + { + "epoch": 804.9, + "learning_rate": 4.023412320721191e-05, + "loss": 0.3973, + "step": 41050 + }, + { + "epoch": 805.0, + "eval_loss": 0.39889949560165405, + "eval_runtime": 2.2245, + "eval_samples_per_second": 1024.521, + "eval_steps_per_second": 4.046, + "step": 41055 + }, + { + "epoch": 805.1, + "learning_rate": 4.019821907929776e-05, + "loss": 0.3906, + "step": 41060 + }, + { + "epoch": 805.29, + "learning_rate": 4.016232511272747e-05, + "loss": 0.3924, + "step": 41070 + }, + { + "epoch": 805.49, + "learning_rate": 4.0126441317981306e-05, + "loss": 0.3991, + "step": 41080 + }, + { + "epoch": 805.69, + "learning_rate": 4.009056770553654e-05, + "loss": 0.3966, + "step": 41090 + }, + { + "epoch": 805.88, + "learning_rate": 4.0054704285867425e-05, + "loss": 0.4002, + "step": 41100 + }, + { + "epoch": 806.0, + "eval_loss": 0.40028414130210876, + "eval_runtime": 2.2018, + "eval_samples_per_second": 1035.077, + "eval_steps_per_second": 4.088, + "step": 41106 + }, + { + "epoch": 806.08, + "learning_rate": 4.0018851069445334e-05, + "loss": 0.3994, + "step": 41110 + }, + { + "epoch": 806.27, + "learning_rate": 3.9983008066738534e-05, + "loss": 0.3967, + "step": 41120 + }, + { + "epoch": 806.47, + "learning_rate": 3.9947175288212434e-05, + "loss": 0.3946, + "step": 41130 + }, + { + "epoch": 806.67, + "learning_rate": 3.9911352744329424e-05, + "loss": 0.3947, + "step": 41140 + }, + { + "epoch": 806.86, + "learning_rate": 3.9875540445548835e-05, + "loss": 0.3918, + "step": 41150 + }, + { + "epoch": 807.0, + "eval_loss": 0.4006493389606476, + "eval_runtime": 2.2025, + "eval_samples_per_second": 1034.733, + "eval_steps_per_second": 4.086, + "step": 41157 + }, + { + "epoch": 807.06, + "learning_rate": 3.9839738402327106e-05, + "loss": 0.3991, + "step": 41160 + }, + { + "epoch": 807.25, + "learning_rate": 3.980394662511756e-05, + "loss": 0.3929, + "step": 41170 + }, + { + "epoch": 807.45, + "learning_rate": 3.976816512437071e-05, + "loss": 0.3981, + "step": 41180 + }, + { + "epoch": 807.65, + "learning_rate": 3.973239391053389e-05, + "loss": 0.3941, + "step": 41190 + }, + { + "epoch": 807.84, + "learning_rate": 3.9696632994051476e-05, + "loss": 0.4001, + "step": 41200 + }, + { + "epoch": 808.0, + "eval_loss": 0.3997151553630829, + "eval_runtime": 2.2178, + "eval_samples_per_second": 1027.58, + "eval_steps_per_second": 4.058, + "step": 41208 + }, + { + "epoch": 808.04, + "learning_rate": 3.966088238536492e-05, + "loss": 0.3915, + "step": 41210 + }, + { + "epoch": 808.24, + "learning_rate": 3.962514209491254e-05, + "loss": 0.3964, + "step": 41220 + }, + { + "epoch": 808.43, + "learning_rate": 3.958941213312973e-05, + "loss": 0.3962, + "step": 41230 + }, + { + "epoch": 808.63, + "learning_rate": 3.955369251044884e-05, + "loss": 0.3923, + "step": 41240 + }, + { + "epoch": 808.82, + "learning_rate": 3.951798323729925e-05, + "loss": 0.397, + "step": 41250 + }, + { + "epoch": 809.0, + "eval_loss": 0.40183350443840027, + "eval_runtime": 2.2856, + "eval_samples_per_second": 997.107, + "eval_steps_per_second": 3.938, + "step": 41259 + }, + { + "epoch": 809.02, + "learning_rate": 3.948228432410722e-05, + "loss": 0.3932, + "step": 41260 + }, + { + "epoch": 809.22, + "learning_rate": 3.944659578129602e-05, + "loss": 0.3923, + "step": 41270 + }, + { + "epoch": 809.41, + "learning_rate": 3.9410917619285926e-05, + "loss": 0.3966, + "step": 41280 + }, + { + "epoch": 809.61, + "learning_rate": 3.9375249848494184e-05, + "loss": 0.3941, + "step": 41290 + }, + { + "epoch": 809.8, + "learning_rate": 3.9339592479335e-05, + "loss": 0.3929, + "step": 41300 + }, + { + "epoch": 810.0, + "learning_rate": 3.930394552221948e-05, + "loss": 0.3984, + "step": 41310 + }, + { + "epoch": 810.0, + "eval_loss": 0.4029523730278015, + "eval_runtime": 2.225, + "eval_samples_per_second": 1024.248, + "eval_steps_per_second": 4.045, + "step": 41310 + }, + { + "epoch": 810.2, + "learning_rate": 3.9268308987555794e-05, + "loss": 0.3987, + "step": 41320 + }, + { + "epoch": 810.39, + "learning_rate": 3.9232682885748965e-05, + "loss": 0.3974, + "step": 41330 + }, + { + "epoch": 810.59, + "learning_rate": 3.9197067227201044e-05, + "loss": 0.3971, + "step": 41340 + }, + { + "epoch": 810.78, + "learning_rate": 3.916146202231105e-05, + "loss": 0.3933, + "step": 41350 + }, + { + "epoch": 810.98, + "learning_rate": 3.912586728147482e-05, + "loss": 0.3925, + "step": 41360 + }, + { + "epoch": 811.0, + "eval_loss": 0.40738365054130554, + "eval_runtime": 2.2715, + "eval_samples_per_second": 1003.286, + "eval_steps_per_second": 3.962, + "step": 41361 + }, + { + "epoch": 811.18, + "learning_rate": 3.9090283015085305e-05, + "loss": 0.4008, + "step": 41370 + }, + { + "epoch": 811.37, + "learning_rate": 3.905470923353224e-05, + "loss": 0.3922, + "step": 41380 + }, + { + "epoch": 811.57, + "learning_rate": 3.901914594720247e-05, + "loss": 0.3944, + "step": 41390 + }, + { + "epoch": 811.76, + "learning_rate": 3.8983593166479635e-05, + "loss": 0.3957, + "step": 41400 + }, + { + "epoch": 811.96, + "learning_rate": 3.894805090174432e-05, + "loss": 0.398, + "step": 41410 + }, + { + "epoch": 812.0, + "eval_loss": 0.4031858444213867, + "eval_runtime": 2.2163, + "eval_samples_per_second": 1028.271, + "eval_steps_per_second": 4.061, + "step": 41412 + }, + { + "epoch": 812.16, + "learning_rate": 3.891251916337413e-05, + "loss": 0.3923, + "step": 41420 + }, + { + "epoch": 812.35, + "learning_rate": 3.8876997961743495e-05, + "loss": 0.3998, + "step": 41430 + }, + { + "epoch": 812.55, + "learning_rate": 3.8841487307223826e-05, + "loss": 0.393, + "step": 41440 + }, + { + "epoch": 812.75, + "learning_rate": 3.880598721018346e-05, + "loss": 0.3957, + "step": 41450 + }, + { + "epoch": 812.94, + "learning_rate": 3.8770497680987645e-05, + "loss": 0.4, + "step": 41460 + }, + { + "epoch": 813.0, + "eval_loss": 0.3986700773239136, + "eval_runtime": 2.2742, + "eval_samples_per_second": 1002.097, + "eval_steps_per_second": 3.957, + "step": 41463 + }, + { + "epoch": 813.14, + "learning_rate": 3.873501872999851e-05, + "loss": 0.392, + "step": 41470 + }, + { + "epoch": 813.33, + "learning_rate": 3.8699550367575105e-05, + "loss": 0.3931, + "step": 41480 + }, + { + "epoch": 813.53, + "learning_rate": 3.8664092604073404e-05, + "loss": 0.3973, + "step": 41490 + }, + { + "epoch": 813.73, + "learning_rate": 3.862864544984628e-05, + "loss": 0.3959, + "step": 41500 + }, + { + "epoch": 813.92, + "learning_rate": 3.8593208915243566e-05, + "loss": 0.3943, + "step": 41510 + }, + { + "epoch": 814.0, + "eval_loss": 0.40154093503952026, + "eval_runtime": 2.3168, + "eval_samples_per_second": 983.691, + "eval_steps_per_second": 3.885, + "step": 41514 + }, + { + "epoch": 814.12, + "learning_rate": 3.855778301061188e-05, + "loss": 0.4014, + "step": 41520 + }, + { + "epoch": 814.31, + "learning_rate": 3.852236774629483e-05, + "loss": 0.3984, + "step": 41530 + }, + { + "epoch": 814.51, + "learning_rate": 3.848696313263284e-05, + "loss": 0.3954, + "step": 41540 + }, + { + "epoch": 814.71, + "learning_rate": 3.8451569179963295e-05, + "loss": 0.3955, + "step": 41550 + }, + { + "epoch": 814.9, + "learning_rate": 3.8416185898620465e-05, + "loss": 0.3973, + "step": 41560 + }, + { + "epoch": 815.0, + "eval_loss": 0.3962329924106598, + "eval_runtime": 2.2968, + "eval_samples_per_second": 992.272, + "eval_steps_per_second": 3.919, + "step": 41565 + }, + { + "epoch": 815.1, + "learning_rate": 3.838081329893543e-05, + "loss": 0.3943, + "step": 41570 + }, + { + "epoch": 815.29, + "learning_rate": 3.834545139123626e-05, + "loss": 0.3969, + "step": 41580 + }, + { + "epoch": 815.49, + "learning_rate": 3.831010018584774e-05, + "loss": 0.3928, + "step": 41590 + }, + { + "epoch": 815.69, + "learning_rate": 3.827475969309177e-05, + "loss": 0.3924, + "step": 41600 + }, + { + "epoch": 815.88, + "learning_rate": 3.823942992328691e-05, + "loss": 0.3922, + "step": 41610 + }, + { + "epoch": 816.0, + "eval_loss": 0.403202623128891, + "eval_runtime": 2.1921, + "eval_samples_per_second": 1039.664, + "eval_steps_per_second": 4.106, + "step": 41616 + }, + { + "epoch": 816.08, + "learning_rate": 3.8204110886748645e-05, + "loss": 0.3916, + "step": 41620 + }, + { + "epoch": 816.27, + "learning_rate": 3.816880259378941e-05, + "loss": 0.393, + "step": 41630 + }, + { + "epoch": 816.47, + "learning_rate": 3.813350505471836e-05, + "loss": 0.3974, + "step": 41640 + }, + { + "epoch": 816.67, + "learning_rate": 3.809821827984164e-05, + "loss": 0.3995, + "step": 41650 + }, + { + "epoch": 816.86, + "learning_rate": 3.806294227946219e-05, + "loss": 0.3902, + "step": 41660 + }, + { + "epoch": 817.0, + "eval_loss": 0.3992672860622406, + "eval_runtime": 2.2198, + "eval_samples_per_second": 1026.655, + "eval_steps_per_second": 4.054, + "step": 41667 + }, + { + "epoch": 817.06, + "learning_rate": 3.8027677063879836e-05, + "loss": 0.3939, + "step": 41670 + }, + { + "epoch": 817.25, + "learning_rate": 3.799242264339123e-05, + "loss": 0.3945, + "step": 41680 + }, + { + "epoch": 817.45, + "learning_rate": 3.7957179028289835e-05, + "loss": 0.3941, + "step": 41690 + }, + { + "epoch": 817.65, + "learning_rate": 3.792194622886602e-05, + "loss": 0.3934, + "step": 41700 + }, + { + "epoch": 817.84, + "learning_rate": 3.788672425540699e-05, + "loss": 0.3942, + "step": 41710 + }, + { + "epoch": 818.0, + "eval_loss": 0.40182411670684814, + "eval_runtime": 2.2756, + "eval_samples_per_second": 1001.5, + "eval_steps_per_second": 3.955, + "step": 41718 + }, + { + "epoch": 818.04, + "learning_rate": 3.78515131181968e-05, + "loss": 0.4016, + "step": 41720 + }, + { + "epoch": 818.24, + "learning_rate": 3.781631282751629e-05, + "loss": 0.3948, + "step": 41730 + }, + { + "epoch": 818.43, + "learning_rate": 3.7781123393643125e-05, + "loss": 0.3937, + "step": 41740 + }, + { + "epoch": 818.63, + "learning_rate": 3.7745944826851866e-05, + "loss": 0.3977, + "step": 41750 + }, + { + "epoch": 818.82, + "learning_rate": 3.771077713741388e-05, + "loss": 0.3994, + "step": 41760 + }, + { + "epoch": 819.0, + "eval_loss": 0.40313833951950073, + "eval_runtime": 2.2345, + "eval_samples_per_second": 1019.911, + "eval_steps_per_second": 4.028, + "step": 41769 + }, + { + "epoch": 819.02, + "learning_rate": 3.767562033559736e-05, + "loss": 0.3932, + "step": 41770 + }, + { + "epoch": 819.22, + "learning_rate": 3.7640474431667264e-05, + "loss": 0.391, + "step": 41780 + }, + { + "epoch": 819.41, + "learning_rate": 3.760533943588546e-05, + "loss": 0.3949, + "step": 41790 + }, + { + "epoch": 819.61, + "learning_rate": 3.757021535851053e-05, + "loss": 0.3966, + "step": 41800 + }, + { + "epoch": 819.8, + "learning_rate": 3.753510220979795e-05, + "loss": 0.3985, + "step": 41810 + }, + { + "epoch": 820.0, + "learning_rate": 3.750000000000001e-05, + "loss": 0.3959, + "step": 41820 + }, + { + "epoch": 820.0, + "eval_loss": 0.4008371829986572, + "eval_runtime": 2.3152, + "eval_samples_per_second": 984.368, + "eval_steps_per_second": 3.887, + "step": 41820 + }, + { + "epoch": 820.2, + "learning_rate": 3.746490873936571e-05, + "loss": 0.396, + "step": 41830 + }, + { + "epoch": 820.39, + "learning_rate": 3.742982843814097e-05, + "loss": 0.3978, + "step": 41840 + }, + { + "epoch": 820.59, + "learning_rate": 3.73947591065684e-05, + "loss": 0.3908, + "step": 41850 + }, + { + "epoch": 820.78, + "learning_rate": 3.73597007548875e-05, + "loss": 0.4008, + "step": 41860 + }, + { + "epoch": 820.98, + "learning_rate": 3.732465339333454e-05, + "loss": 0.3911, + "step": 41870 + }, + { + "epoch": 821.0, + "eval_loss": 0.4035691022872925, + "eval_runtime": 2.3143, + "eval_samples_per_second": 984.737, + "eval_steps_per_second": 3.889, + "step": 41871 + }, + { + "epoch": 821.18, + "learning_rate": 3.728961703214252e-05, + "loss": 0.3986, + "step": 41880 + }, + { + "epoch": 821.37, + "learning_rate": 3.7254591681541327e-05, + "loss": 0.3976, + "step": 41890 + }, + { + "epoch": 821.57, + "learning_rate": 3.721957735175754e-05, + "loss": 0.3909, + "step": 41900 + }, + { + "epoch": 821.76, + "learning_rate": 3.7184574053014585e-05, + "loss": 0.3977, + "step": 41910 + }, + { + "epoch": 821.96, + "learning_rate": 3.714958179553263e-05, + "loss": 0.3941, + "step": 41920 + }, + { + "epoch": 822.0, + "eval_loss": 0.3997298777103424, + "eval_runtime": 2.3656, + "eval_samples_per_second": 963.412, + "eval_steps_per_second": 3.805, + "step": 41922 + }, + { + "epoch": 822.16, + "learning_rate": 3.7114600589528675e-05, + "loss": 0.3965, + "step": 41930 + }, + { + "epoch": 822.35, + "learning_rate": 3.707963044521642e-05, + "loss": 0.3952, + "step": 41940 + }, + { + "epoch": 822.55, + "learning_rate": 3.704467137280635e-05, + "loss": 0.3936, + "step": 41950 + }, + { + "epoch": 822.75, + "learning_rate": 3.700972338250574e-05, + "loss": 0.3961, + "step": 41960 + }, + { + "epoch": 822.94, + "learning_rate": 3.697478648451864e-05, + "loss": 0.3936, + "step": 41970 + }, + { + "epoch": 823.0, + "eval_loss": 0.3970935344696045, + "eval_runtime": 2.2729, + "eval_samples_per_second": 1002.69, + "eval_steps_per_second": 3.96, + "step": 41973 + }, + { + "epoch": 823.14, + "learning_rate": 3.693986068904588e-05, + "loss": 0.3919, + "step": 41980 + }, + { + "epoch": 823.33, + "learning_rate": 3.6904946006284936e-05, + "loss": 0.3954, + "step": 41990 + }, + { + "epoch": 823.53, + "learning_rate": 3.6870042446430185e-05, + "loss": 0.3947, + "step": 42000 + }, + { + "epoch": 823.73, + "learning_rate": 3.683515001967264e-05, + "loss": 0.391, + "step": 42010 + }, + { + "epoch": 823.92, + "learning_rate": 3.680026873620012e-05, + "loss": 0.397, + "step": 42020 + }, + { + "epoch": 824.0, + "eval_loss": 0.4010617733001709, + "eval_runtime": 2.2259, + "eval_samples_per_second": 1023.865, + "eval_steps_per_second": 4.043, + "step": 42024 + }, + { + "epoch": 824.12, + "learning_rate": 3.676539860619723e-05, + "loss": 0.393, + "step": 42030 + }, + { + "epoch": 824.31, + "learning_rate": 3.67305396398452e-05, + "loss": 0.392, + "step": 42040 + }, + { + "epoch": 824.51, + "learning_rate": 3.669569184732213e-05, + "loss": 0.3962, + "step": 42050 + }, + { + "epoch": 824.71, + "learning_rate": 3.666085523880274e-05, + "loss": 0.3946, + "step": 42060 + }, + { + "epoch": 824.9, + "learning_rate": 3.662602982445859e-05, + "loss": 0.3974, + "step": 42070 + }, + { + "epoch": 825.0, + "eval_loss": 0.3963702917098999, + "eval_runtime": 2.2579, + "eval_samples_per_second": 1009.332, + "eval_steps_per_second": 3.986, + "step": 42075 + }, + { + "epoch": 825.1, + "learning_rate": 3.659121561445792e-05, + "loss": 0.3919, + "step": 42080 + }, + { + "epoch": 825.29, + "learning_rate": 3.655641261896567e-05, + "loss": 0.3924, + "step": 42090 + }, + { + "epoch": 825.49, + "learning_rate": 3.6521620848143584e-05, + "loss": 0.3921, + "step": 42100 + }, + { + "epoch": 825.69, + "learning_rate": 3.648684031215004e-05, + "loss": 0.398, + "step": 42110 + }, + { + "epoch": 825.88, + "learning_rate": 3.6452071021140184e-05, + "loss": 0.3921, + "step": 42120 + }, + { + "epoch": 826.0, + "eval_loss": 0.4010373055934906, + "eval_runtime": 2.3877, + "eval_samples_per_second": 954.483, + "eval_steps_per_second": 3.769, + "step": 42126 + }, + { + "epoch": 826.08, + "learning_rate": 3.64173129852659e-05, + "loss": 0.3915, + "step": 42130 + }, + { + "epoch": 826.27, + "learning_rate": 3.638256621467577e-05, + "loss": 0.3926, + "step": 42140 + }, + { + "epoch": 826.47, + "learning_rate": 3.634783071951506e-05, + "loss": 0.393, + "step": 42150 + }, + { + "epoch": 826.67, + "learning_rate": 3.631310650992572e-05, + "loss": 0.3911, + "step": 42160 + }, + { + "epoch": 826.86, + "learning_rate": 3.6278393596046476e-05, + "loss": 0.3961, + "step": 42170 + }, + { + "epoch": 827.0, + "eval_loss": 0.401947021484375, + "eval_runtime": 2.2833, + "eval_samples_per_second": 998.113, + "eval_steps_per_second": 3.942, + "step": 42177 + }, + { + "epoch": 827.06, + "learning_rate": 3.624369198801272e-05, + "loss": 0.3977, + "step": 42180 + }, + { + "epoch": 827.25, + "learning_rate": 3.620900169595659e-05, + "loss": 0.3955, + "step": 42190 + }, + { + "epoch": 827.45, + "learning_rate": 3.617432273000681e-05, + "loss": 0.3979, + "step": 42200 + }, + { + "epoch": 827.65, + "learning_rate": 3.613965510028893e-05, + "loss": 0.3954, + "step": 42210 + }, + { + "epoch": 827.84, + "learning_rate": 3.610499881692506e-05, + "loss": 0.3912, + "step": 42220 + }, + { + "epoch": 828.0, + "eval_loss": 0.4004402756690979, + "eval_runtime": 2.3254, + "eval_samples_per_second": 980.067, + "eval_steps_per_second": 3.87, + "step": 42228 + }, + { + "epoch": 828.04, + "learning_rate": 3.607035389003409e-05, + "loss": 0.391, + "step": 42230 + }, + { + "epoch": 828.24, + "learning_rate": 3.60357203297316e-05, + "loss": 0.3938, + "step": 42240 + }, + { + "epoch": 828.43, + "learning_rate": 3.6001098146129756e-05, + "loss": 0.3945, + "step": 42250 + }, + { + "epoch": 828.63, + "learning_rate": 3.596648734933752e-05, + "loss": 0.3921, + "step": 42260 + }, + { + "epoch": 828.82, + "learning_rate": 3.5931887949460425e-05, + "loss": 0.3939, + "step": 42270 + }, + { + "epoch": 829.0, + "eval_loss": 0.39803311228752136, + "eval_runtime": 2.2329, + "eval_samples_per_second": 1020.649, + "eval_steps_per_second": 4.031, + "step": 42279 + }, + { + "epoch": 829.02, + "learning_rate": 3.5897299956600735e-05, + "loss": 0.4018, + "step": 42280 + }, + { + "epoch": 829.22, + "learning_rate": 3.586272338085742e-05, + "loss": 0.3925, + "step": 42290 + }, + { + "epoch": 829.41, + "learning_rate": 3.5828158232326e-05, + "loss": 0.3942, + "step": 42300 + }, + { + "epoch": 829.61, + "learning_rate": 3.5793604521098796e-05, + "loss": 0.3919, + "step": 42310 + }, + { + "epoch": 829.8, + "learning_rate": 3.5759062257264645e-05, + "loss": 0.393, + "step": 42320 + }, + { + "epoch": 830.0, + "learning_rate": 3.572453145090916e-05, + "loss": 0.3917, + "step": 42330 + }, + { + "epoch": 830.0, + "eval_loss": 0.40272367000579834, + "eval_runtime": 2.3185, + "eval_samples_per_second": 982.944, + "eval_steps_per_second": 3.882, + "step": 42330 + }, + { + "epoch": 830.2, + "learning_rate": 3.569001211211456e-05, + "loss": 0.3938, + "step": 42340 + }, + { + "epoch": 830.39, + "learning_rate": 3.565550425095976e-05, + "loss": 0.3942, + "step": 42350 + }, + { + "epoch": 830.59, + "learning_rate": 3.562100787752025e-05, + "loss": 0.3922, + "step": 42360 + }, + { + "epoch": 830.78, + "learning_rate": 3.558652300186817e-05, + "loss": 0.3926, + "step": 42370 + }, + { + "epoch": 830.98, + "learning_rate": 3.5552049634072366e-05, + "loss": 0.3977, + "step": 42380 + }, + { + "epoch": 831.0, + "eval_loss": 0.4004882574081421, + "eval_runtime": 2.3806, + "eval_samples_per_second": 957.34, + "eval_steps_per_second": 3.781, + "step": 42381 + }, + { + "epoch": 831.18, + "learning_rate": 3.55175877841983e-05, + "loss": 0.3928, + "step": 42390 + }, + { + "epoch": 831.37, + "learning_rate": 3.548313746230809e-05, + "loss": 0.3944, + "step": 42400 + }, + { + "epoch": 831.57, + "learning_rate": 3.544869867846039e-05, + "loss": 0.3948, + "step": 42410 + }, + { + "epoch": 831.76, + "learning_rate": 3.541427144271064e-05, + "loss": 0.392, + "step": 42420 + }, + { + "epoch": 831.96, + "learning_rate": 3.537985576511074e-05, + "loss": 0.3881, + "step": 42430 + }, + { + "epoch": 832.0, + "eval_loss": 0.39829736948013306, + "eval_runtime": 2.2221, + "eval_samples_per_second": 1025.626, + "eval_steps_per_second": 4.05, + "step": 42432 + }, + { + "epoch": 832.16, + "learning_rate": 3.534545165570934e-05, + "loss": 0.3913, + "step": 42440 + }, + { + "epoch": 832.35, + "learning_rate": 3.531105912455172e-05, + "loss": 0.3917, + "step": 42450 + }, + { + "epoch": 832.55, + "learning_rate": 3.5276678181679636e-05, + "loss": 0.3904, + "step": 42460 + }, + { + "epoch": 832.75, + "learning_rate": 3.524230883713164e-05, + "loss": 0.3893, + "step": 42470 + }, + { + "epoch": 832.94, + "learning_rate": 3.5207951100942765e-05, + "loss": 0.3939, + "step": 42480 + }, + { + "epoch": 833.0, + "eval_loss": 0.4025621712207794, + "eval_runtime": 2.2271, + "eval_samples_per_second": 1023.286, + "eval_steps_per_second": 4.041, + "step": 42483 + }, + { + "epoch": 833.14, + "learning_rate": 3.5173604983144714e-05, + "loss": 0.3946, + "step": 42490 + }, + { + "epoch": 833.33, + "learning_rate": 3.513927049376582e-05, + "loss": 0.3929, + "step": 42500 + }, + { + "epoch": 833.53, + "learning_rate": 3.5104947642830934e-05, + "loss": 0.3909, + "step": 42510 + }, + { + "epoch": 833.73, + "learning_rate": 3.5070636440361615e-05, + "loss": 0.3942, + "step": 42520 + }, + { + "epoch": 833.92, + "learning_rate": 3.5036336896375924e-05, + "loss": 0.393, + "step": 42530 + }, + { + "epoch": 834.0, + "eval_loss": 0.399141788482666, + "eval_runtime": 2.1939, + "eval_samples_per_second": 1038.783, + "eval_steps_per_second": 4.102, + "step": 42534 + }, + { + "epoch": 834.12, + "learning_rate": 3.500204902088857e-05, + "loss": 0.3934, + "step": 42540 + }, + { + "epoch": 834.31, + "learning_rate": 3.49677728239109e-05, + "loss": 0.3953, + "step": 42550 + }, + { + "epoch": 834.51, + "learning_rate": 3.493350831545073e-05, + "loss": 0.3942, + "step": 42560 + }, + { + "epoch": 834.71, + "learning_rate": 3.4899255505512593e-05, + "loss": 0.394, + "step": 42570 + }, + { + "epoch": 834.9, + "learning_rate": 3.4865014404097475e-05, + "loss": 0.3928, + "step": 42580 + }, + { + "epoch": 835.0, + "eval_loss": 0.398049920797348, + "eval_runtime": 2.3222, + "eval_samples_per_second": 981.383, + "eval_steps_per_second": 3.876, + "step": 42585 + }, + { + "epoch": 835.1, + "learning_rate": 3.483078502120307e-05, + "loss": 0.3925, + "step": 42590 + }, + { + "epoch": 835.29, + "learning_rate": 3.4796567366823564e-05, + "loss": 0.3931, + "step": 42600 + }, + { + "epoch": 835.49, + "learning_rate": 3.47623614509498e-05, + "loss": 0.3872, + "step": 42610 + }, + { + "epoch": 835.69, + "learning_rate": 3.47281672835691e-05, + "loss": 0.392, + "step": 42620 + }, + { + "epoch": 835.88, + "learning_rate": 3.4693984874665384e-05, + "loss": 0.394, + "step": 42630 + }, + { + "epoch": 836.0, + "eval_loss": 0.39526310563087463, + "eval_runtime": 2.2715, + "eval_samples_per_second": 1003.309, + "eval_steps_per_second": 3.962, + "step": 42636 + }, + { + "epoch": 836.08, + "learning_rate": 3.465981423421917e-05, + "loss": 0.3924, + "step": 42640 + }, + { + "epoch": 836.27, + "learning_rate": 3.462565537220753e-05, + "loss": 0.3902, + "step": 42650 + }, + { + "epoch": 836.47, + "learning_rate": 3.459150829860411e-05, + "loss": 0.3939, + "step": 42660 + }, + { + "epoch": 836.67, + "learning_rate": 3.455737302337904e-05, + "loss": 0.3937, + "step": 42670 + }, + { + "epoch": 836.86, + "learning_rate": 3.452324955649911e-05, + "loss": 0.3908, + "step": 42680 + }, + { + "epoch": 837.0, + "eval_loss": 0.4002179503440857, + "eval_runtime": 2.3723, + "eval_samples_per_second": 960.663, + "eval_steps_per_second": 3.794, + "step": 42687 + }, + { + "epoch": 837.06, + "learning_rate": 3.448913790792757e-05, + "loss": 0.3886, + "step": 42690 + }, + { + "epoch": 837.25, + "learning_rate": 3.445503808762429e-05, + "loss": 0.3923, + "step": 42700 + }, + { + "epoch": 837.45, + "learning_rate": 3.442095010554567e-05, + "loss": 0.3926, + "step": 42710 + }, + { + "epoch": 837.65, + "learning_rate": 3.4386873971644586e-05, + "loss": 0.3957, + "step": 42720 + }, + { + "epoch": 837.84, + "learning_rate": 3.4352809695870565e-05, + "loss": 0.3926, + "step": 42730 + }, + { + "epoch": 838.0, + "eval_loss": 0.4014919400215149, + "eval_runtime": 2.3297, + "eval_samples_per_second": 978.232, + "eval_steps_per_second": 3.863, + "step": 42738 + }, + { + "epoch": 838.04, + "learning_rate": 3.431875728816958e-05, + "loss": 0.3962, + "step": 42740 + }, + { + "epoch": 838.24, + "learning_rate": 3.4284716758484175e-05, + "loss": 0.3914, + "step": 42750 + }, + { + "epoch": 838.43, + "learning_rate": 3.4250688116753464e-05, + "loss": 0.392, + "step": 42760 + }, + { + "epoch": 838.63, + "learning_rate": 3.4216671372913005e-05, + "loss": 0.3918, + "step": 42770 + }, + { + "epoch": 838.82, + "learning_rate": 3.418266653689497e-05, + "loss": 0.3947, + "step": 42780 + }, + { + "epoch": 839.0, + "eval_loss": 0.3990994989871979, + "eval_runtime": 2.3005, + "eval_samples_per_second": 990.663, + "eval_steps_per_second": 3.912, + "step": 42789 + }, + { + "epoch": 839.02, + "learning_rate": 3.414867361862797e-05, + "loss": 0.3898, + "step": 42790 + }, + { + "epoch": 839.22, + "learning_rate": 3.41146926280372e-05, + "loss": 0.3938, + "step": 42800 + }, + { + "epoch": 839.41, + "learning_rate": 3.408072357504435e-05, + "loss": 0.3916, + "step": 42810 + }, + { + "epoch": 839.61, + "learning_rate": 3.404676646956765e-05, + "loss": 0.3888, + "step": 42820 + }, + { + "epoch": 839.8, + "learning_rate": 3.4012821321521806e-05, + "loss": 0.3912, + "step": 42830 + }, + { + "epoch": 840.0, + "learning_rate": 3.3978888140817996e-05, + "loss": 0.3965, + "step": 42840 + }, + { + "epoch": 840.0, + "eval_loss": 0.3969307243824005, + "eval_runtime": 2.2053, + "eval_samples_per_second": 1033.431, + "eval_steps_per_second": 4.081, + "step": 42840 + }, + { + "epoch": 840.2, + "learning_rate": 3.394496693736399e-05, + "loss": 0.3947, + "step": 42850 + }, + { + "epoch": 840.39, + "learning_rate": 3.391105772106403e-05, + "loss": 0.3936, + "step": 42860 + }, + { + "epoch": 840.59, + "learning_rate": 3.387716050181886e-05, + "loss": 0.3952, + "step": 42870 + }, + { + "epoch": 840.78, + "learning_rate": 3.384327528952568e-05, + "loss": 0.3956, + "step": 42880 + }, + { + "epoch": 840.98, + "learning_rate": 3.380940209407825e-05, + "loss": 0.3934, + "step": 42890 + }, + { + "epoch": 841.0, + "eval_loss": 0.4002283215522766, + "eval_runtime": 2.2711, + "eval_samples_per_second": 1003.473, + "eval_steps_per_second": 3.963, + "step": 42891 + }, + { + "epoch": 841.18, + "learning_rate": 3.377554092536674e-05, + "loss": 0.3937, + "step": 42900 + }, + { + "epoch": 841.37, + "learning_rate": 3.374169179327789e-05, + "loss": 0.3956, + "step": 42910 + }, + { + "epoch": 841.57, + "learning_rate": 3.370785470769491e-05, + "loss": 0.3965, + "step": 42920 + }, + { + "epoch": 841.76, + "learning_rate": 3.367402967849743e-05, + "loss": 0.395, + "step": 42930 + }, + { + "epoch": 841.96, + "learning_rate": 3.364021671556165e-05, + "loss": 0.3916, + "step": 42940 + }, + { + "epoch": 842.0, + "eval_loss": 0.39687296748161316, + "eval_runtime": 2.3553, + "eval_samples_per_second": 967.62, + "eval_steps_per_second": 3.821, + "step": 42942 + }, + { + "epoch": 842.16, + "learning_rate": 3.360641582876015e-05, + "loss": 0.3895, + "step": 42950 + }, + { + "epoch": 842.35, + "learning_rate": 3.357262702796206e-05, + "loss": 0.3921, + "step": 42960 + }, + { + "epoch": 842.55, + "learning_rate": 3.3538850323032984e-05, + "loss": 0.3965, + "step": 42970 + }, + { + "epoch": 842.75, + "learning_rate": 3.3505085723834917e-05, + "loss": 0.3899, + "step": 42980 + }, + { + "epoch": 842.94, + "learning_rate": 3.3471333240226414e-05, + "loss": 0.3887, + "step": 42990 + }, + { + "epoch": 843.0, + "eval_loss": 0.39406681060791016, + "eval_runtime": 2.385, + "eval_samples_per_second": 955.561, + "eval_steps_per_second": 3.774, + "step": 42993 + }, + { + "epoch": 843.14, + "learning_rate": 3.3437592882062406e-05, + "loss": 0.3873, + "step": 43000 + }, + { + "epoch": 843.33, + "learning_rate": 3.340386465919434e-05, + "loss": 0.3939, + "step": 43010 + }, + { + "epoch": 843.53, + "learning_rate": 3.3370148581470106e-05, + "loss": 0.3885, + "step": 43020 + }, + { + "epoch": 843.73, + "learning_rate": 3.333644465873408e-05, + "loss": 0.3911, + "step": 43030 + }, + { + "epoch": 843.92, + "learning_rate": 3.3302752900827025e-05, + "loss": 0.3938, + "step": 43040 + }, + { + "epoch": 844.0, + "eval_loss": 0.3971950113773346, + "eval_runtime": 2.3473, + "eval_samples_per_second": 970.919, + "eval_steps_per_second": 3.834, + "step": 43044 + }, + { + "epoch": 844.12, + "learning_rate": 3.3269073317586156e-05, + "loss": 0.39, + "step": 43050 + }, + { + "epoch": 844.31, + "learning_rate": 3.32354059188452e-05, + "loss": 0.3895, + "step": 43060 + }, + { + "epoch": 844.51, + "learning_rate": 3.3201750714434264e-05, + "loss": 0.3975, + "step": 43070 + }, + { + "epoch": 844.71, + "learning_rate": 3.3168107714179954e-05, + "loss": 0.3905, + "step": 43080 + }, + { + "epoch": 844.9, + "learning_rate": 3.3134476927905234e-05, + "loss": 0.3928, + "step": 43090 + }, + { + "epoch": 845.0, + "eval_loss": 0.40146586298942566, + "eval_runtime": 2.1799, + "eval_samples_per_second": 1045.449, + "eval_steps_per_second": 4.129, + "step": 43095 + }, + { + "epoch": 845.1, + "learning_rate": 3.3100858365429575e-05, + "loss": 0.3932, + "step": 43100 + }, + { + "epoch": 845.29, + "learning_rate": 3.306725203656881e-05, + "loss": 0.3904, + "step": 43110 + }, + { + "epoch": 845.49, + "learning_rate": 3.303365795113525e-05, + "loss": 0.392, + "step": 43120 + }, + { + "epoch": 845.69, + "learning_rate": 3.300007611893766e-05, + "loss": 0.3893, + "step": 43130 + }, + { + "epoch": 845.88, + "learning_rate": 3.2966506549781134e-05, + "loss": 0.3948, + "step": 43140 + }, + { + "epoch": 846.0, + "eval_loss": 0.39760321378707886, + "eval_runtime": 2.3454, + "eval_samples_per_second": 971.692, + "eval_steps_per_second": 3.837, + "step": 43146 + }, + { + "epoch": 846.08, + "learning_rate": 3.2932949253467276e-05, + "loss": 0.395, + "step": 43150 + }, + { + "epoch": 846.27, + "learning_rate": 3.2899404239794034e-05, + "loss": 0.388, + "step": 43160 + }, + { + "epoch": 846.47, + "learning_rate": 3.2865871518555814e-05, + "loss": 0.394, + "step": 43170 + }, + { + "epoch": 846.67, + "learning_rate": 3.283235109954345e-05, + "loss": 0.3921, + "step": 43180 + }, + { + "epoch": 846.86, + "learning_rate": 3.279884299254411e-05, + "loss": 0.3925, + "step": 43190 + }, + { + "epoch": 847.0, + "eval_loss": 0.3952951729297638, + "eval_runtime": 2.212, + "eval_samples_per_second": 1030.312, + "eval_steps_per_second": 4.069, + "step": 43197 + }, + { + "epoch": 847.06, + "learning_rate": 3.276534720734147e-05, + "loss": 0.3906, + "step": 43200 + }, + { + "epoch": 847.25, + "learning_rate": 3.273186375371549e-05, + "loss": 0.3919, + "step": 43210 + }, + { + "epoch": 847.45, + "learning_rate": 3.269839264144263e-05, + "loss": 0.3918, + "step": 43220 + }, + { + "epoch": 847.65, + "learning_rate": 3.266493388029572e-05, + "loss": 0.3954, + "step": 43230 + }, + { + "epoch": 847.84, + "learning_rate": 3.263148748004393e-05, + "loss": 0.3876, + "step": 43240 + }, + { + "epoch": 848.0, + "eval_loss": 0.3958490490913391, + "eval_runtime": 2.2194, + "eval_samples_per_second": 1026.867, + "eval_steps_per_second": 4.055, + "step": 43248 + }, + { + "epoch": 848.04, + "learning_rate": 3.2598053450452914e-05, + "loss": 0.3967, + "step": 43250 + }, + { + "epoch": 848.24, + "learning_rate": 3.2564631801284604e-05, + "loss": 0.3884, + "step": 43260 + }, + { + "epoch": 848.43, + "learning_rate": 3.253122254229742e-05, + "loss": 0.3852, + "step": 43270 + }, + { + "epoch": 848.63, + "learning_rate": 3.24978256832461e-05, + "loss": 0.3918, + "step": 43280 + }, + { + "epoch": 848.82, + "learning_rate": 3.246444123388181e-05, + "loss": 0.3857, + "step": 43290 + }, + { + "epoch": 849.0, + "eval_loss": 0.39667844772338867, + "eval_runtime": 2.3103, + "eval_samples_per_second": 986.453, + "eval_steps_per_second": 3.896, + "step": 43299 + }, + { + "epoch": 849.02, + "learning_rate": 3.2431069203952046e-05, + "loss": 0.393, + "step": 43300 + }, + { + "epoch": 849.22, + "learning_rate": 3.239770960320067e-05, + "loss": 0.391, + "step": 43310 + }, + { + "epoch": 849.41, + "learning_rate": 3.236436244136797e-05, + "loss": 0.3919, + "step": 43320 + }, + { + "epoch": 849.61, + "learning_rate": 3.233102772819057e-05, + "loss": 0.3887, + "step": 43330 + }, + { + "epoch": 849.8, + "learning_rate": 3.22977054734015e-05, + "loss": 0.3876, + "step": 43340 + }, + { + "epoch": 850.0, + "learning_rate": 3.226439568673003e-05, + "loss": 0.389, + "step": 43350 + }, + { + "epoch": 850.0, + "eval_loss": 0.3974584639072418, + "eval_runtime": 2.2721, + "eval_samples_per_second": 1003.056, + "eval_steps_per_second": 3.961, + "step": 43350 + }, + { + "epoch": 850.2, + "learning_rate": 3.2231098377901966e-05, + "loss": 0.3892, + "step": 43360 + }, + { + "epoch": 850.39, + "learning_rate": 3.21978135566393e-05, + "loss": 0.3901, + "step": 43370 + }, + { + "epoch": 850.59, + "learning_rate": 3.21645412326605e-05, + "loss": 0.3902, + "step": 43380 + }, + { + "epoch": 850.78, + "learning_rate": 3.2131281415680365e-05, + "loss": 0.3908, + "step": 43390 + }, + { + "epoch": 850.98, + "learning_rate": 3.2098034115409956e-05, + "loss": 0.3905, + "step": 43400 + }, + { + "epoch": 851.0, + "eval_loss": 0.3915613889694214, + "eval_runtime": 2.2983, + "eval_samples_per_second": 991.585, + "eval_steps_per_second": 3.916, + "step": 43401 + }, + { + "epoch": 851.18, + "learning_rate": 3.206479934155681e-05, + "loss": 0.3891, + "step": 43410 + }, + { + "epoch": 851.37, + "learning_rate": 3.203157710382469e-05, + "loss": 0.3889, + "step": 43420 + }, + { + "epoch": 851.57, + "learning_rate": 3.199836741191375e-05, + "loss": 0.3898, + "step": 43430 + }, + { + "epoch": 851.76, + "learning_rate": 3.1965170275520534e-05, + "loss": 0.3915, + "step": 43440 + }, + { + "epoch": 851.96, + "learning_rate": 3.1931985704337804e-05, + "loss": 0.389, + "step": 43450 + }, + { + "epoch": 852.0, + "eval_loss": 0.3987075686454773, + "eval_runtime": 2.2702, + "eval_samples_per_second": 1003.882, + "eval_steps_per_second": 3.964, + "step": 43452 + }, + { + "epoch": 852.16, + "learning_rate": 3.189881370805475e-05, + "loss": 0.3933, + "step": 43460 + }, + { + "epoch": 852.35, + "learning_rate": 3.1865654296356835e-05, + "loss": 0.3869, + "step": 43470 + }, + { + "epoch": 852.55, + "learning_rate": 3.183250747892587e-05, + "loss": 0.39, + "step": 43480 + }, + { + "epoch": 852.75, + "learning_rate": 3.1799373265439985e-05, + "loss": 0.391, + "step": 43490 + }, + { + "epoch": 852.94, + "learning_rate": 3.1766251665573676e-05, + "loss": 0.3872, + "step": 43500 + }, + { + "epoch": 853.0, + "eval_loss": 0.39647892117500305, + "eval_runtime": 2.3604, + "eval_samples_per_second": 965.511, + "eval_steps_per_second": 3.813, + "step": 43503 + }, + { + "epoch": 853.14, + "learning_rate": 3.173314268899767e-05, + "loss": 0.3842, + "step": 43510 + }, + { + "epoch": 853.33, + "learning_rate": 3.170004634537903e-05, + "loss": 0.3962, + "step": 43520 + }, + { + "epoch": 853.53, + "learning_rate": 3.1666962644381165e-05, + "loss": 0.3919, + "step": 43530 + }, + { + "epoch": 853.73, + "learning_rate": 3.1633891595663795e-05, + "loss": 0.3922, + "step": 43540 + }, + { + "epoch": 853.92, + "learning_rate": 3.1600833208882954e-05, + "loss": 0.3902, + "step": 43550 + }, + { + "epoch": 854.0, + "eval_loss": 0.3962956666946411, + "eval_runtime": 2.2913, + "eval_samples_per_second": 994.612, + "eval_steps_per_second": 3.928, + "step": 43554 + }, + { + "epoch": 854.12, + "learning_rate": 3.156778749369088e-05, + "loss": 0.3889, + "step": 43560 + }, + { + "epoch": 854.31, + "learning_rate": 3.1534754459736256e-05, + "loss": 0.3916, + "step": 43570 + }, + { + "epoch": 854.51, + "learning_rate": 3.150173411666394e-05, + "loss": 0.389, + "step": 43580 + }, + { + "epoch": 854.71, + "learning_rate": 3.1468726474115156e-05, + "loss": 0.3866, + "step": 43590 + }, + { + "epoch": 854.9, + "learning_rate": 3.143573154172743e-05, + "loss": 0.3883, + "step": 43600 + }, + { + "epoch": 855.0, + "eval_loss": 0.3941084146499634, + "eval_runtime": 2.2726, + "eval_samples_per_second": 1002.825, + "eval_steps_per_second": 3.96, + "step": 43605 + }, + { + "epoch": 855.1, + "learning_rate": 3.1402749329134476e-05, + "loss": 0.3919, + "step": 43610 + }, + { + "epoch": 855.29, + "learning_rate": 3.1369779845966446e-05, + "loss": 0.3902, + "step": 43620 + }, + { + "epoch": 855.49, + "learning_rate": 3.133682310184961e-05, + "loss": 0.3866, + "step": 43630 + }, + { + "epoch": 855.69, + "learning_rate": 3.1303879106406664e-05, + "loss": 0.39, + "step": 43640 + }, + { + "epoch": 855.88, + "learning_rate": 3.127094786925651e-05, + "loss": 0.393, + "step": 43650 + }, + { + "epoch": 856.0, + "eval_loss": 0.3944731056690216, + "eval_runtime": 2.2622, + "eval_samples_per_second": 1007.445, + "eval_steps_per_second": 3.979, + "step": 43656 + }, + { + "epoch": 856.08, + "learning_rate": 3.1238029400014305e-05, + "loss": 0.388, + "step": 43660 + }, + { + "epoch": 856.27, + "learning_rate": 3.120512370829156e-05, + "loss": 0.3884, + "step": 43670 + }, + { + "epoch": 856.47, + "learning_rate": 3.1172230803695945e-05, + "loss": 0.3895, + "step": 43680 + }, + { + "epoch": 856.67, + "learning_rate": 3.113935069583147e-05, + "loss": 0.3919, + "step": 43690 + }, + { + "epoch": 856.86, + "learning_rate": 3.1106483394298416e-05, + "loss": 0.3908, + "step": 43700 + }, + { + "epoch": 857.0, + "eval_loss": 0.3987371325492859, + "eval_runtime": 2.2357, + "eval_samples_per_second": 1019.351, + "eval_steps_per_second": 4.026, + "step": 43707 + }, + { + "epoch": 857.06, + "learning_rate": 3.107362890869332e-05, + "loss": 0.3887, + "step": 43710 + }, + { + "epoch": 857.25, + "learning_rate": 3.104078724860892e-05, + "loss": 0.3931, + "step": 43720 + }, + { + "epoch": 857.45, + "learning_rate": 3.1007958423634235e-05, + "loss": 0.3914, + "step": 43730 + }, + { + "epoch": 857.65, + "learning_rate": 3.097514244335457e-05, + "loss": 0.3868, + "step": 43740 + }, + { + "epoch": 857.84, + "learning_rate": 3.094233931735147e-05, + "loss": 0.3891, + "step": 43750 + }, + { + "epoch": 858.0, + "eval_loss": 0.3969500958919525, + "eval_runtime": 2.2738, + "eval_samples_per_second": 1002.285, + "eval_steps_per_second": 3.958, + "step": 43758 + }, + { + "epoch": 858.04, + "learning_rate": 3.090954905520272e-05, + "loss": 0.3888, + "step": 43760 + }, + { + "epoch": 858.24, + "learning_rate": 3.087677166648232e-05, + "loss": 0.3955, + "step": 43770 + }, + { + "epoch": 858.43, + "learning_rate": 3.0844007160760576e-05, + "loss": 0.3885, + "step": 43780 + }, + { + "epoch": 858.63, + "learning_rate": 3.0811255547603925e-05, + "loss": 0.3864, + "step": 43790 + }, + { + "epoch": 858.82, + "learning_rate": 3.077851683657517e-05, + "loss": 0.39, + "step": 43800 + }, + { + "epoch": 859.0, + "eval_loss": 0.3933936655521393, + "eval_runtime": 2.2192, + "eval_samples_per_second": 1026.96, + "eval_steps_per_second": 4.056, + "step": 43809 + }, + { + "epoch": 859.02, + "learning_rate": 3.074579103723327e-05, + "loss": 0.3814, + "step": 43810 + }, + { + "epoch": 859.22, + "learning_rate": 3.071307815913342e-05, + "loss": 0.3884, + "step": 43820 + }, + { + "epoch": 859.41, + "learning_rate": 3.068037821182706e-05, + "loss": 0.3902, + "step": 43830 + }, + { + "epoch": 859.61, + "learning_rate": 3.064769120486182e-05, + "loss": 0.3885, + "step": 43840 + }, + { + "epoch": 859.8, + "learning_rate": 3.061501714778159e-05, + "loss": 0.3898, + "step": 43850 + }, + { + "epoch": 860.0, + "learning_rate": 3.05823560501265e-05, + "loss": 0.3894, + "step": 43860 + }, + { + "epoch": 860.0, + "eval_loss": 0.3981279730796814, + "eval_runtime": 2.295, + "eval_samples_per_second": 993.016, + "eval_steps_per_second": 3.922, + "step": 43860 + }, + { + "epoch": 860.2, + "learning_rate": 3.054970792143282e-05, + "loss": 0.3903, + "step": 43870 + }, + { + "epoch": 860.39, + "learning_rate": 3.0517072771233103e-05, + "loss": 0.388, + "step": 43880 + }, + { + "epoch": 860.59, + "learning_rate": 3.0484450609056048e-05, + "loss": 0.3893, + "step": 43890 + }, + { + "epoch": 860.78, + "learning_rate": 3.0451841444426625e-05, + "loss": 0.3894, + "step": 43900 + }, + { + "epoch": 860.98, + "learning_rate": 3.0419245286865998e-05, + "loss": 0.3859, + "step": 43910 + }, + { + "epoch": 861.0, + "eval_loss": 0.39396482706069946, + "eval_runtime": 2.2654, + "eval_samples_per_second": 1005.99, + "eval_steps_per_second": 3.973, + "step": 43911 + }, + { + "epoch": 861.18, + "learning_rate": 3.038666214589148e-05, + "loss": 0.3893, + "step": 43920 + }, + { + "epoch": 861.37, + "learning_rate": 3.035409203101667e-05, + "loss": 0.3916, + "step": 43930 + }, + { + "epoch": 861.57, + "learning_rate": 3.032153495175126e-05, + "loss": 0.3859, + "step": 43940 + }, + { + "epoch": 861.76, + "learning_rate": 3.028899091760121e-05, + "loss": 0.389, + "step": 43950 + }, + { + "epoch": 861.96, + "learning_rate": 3.025645993806866e-05, + "loss": 0.3896, + "step": 43960 + }, + { + "epoch": 862.0, + "eval_loss": 0.39559245109558105, + "eval_runtime": 2.3499, + "eval_samples_per_second": 969.843, + "eval_steps_per_second": 3.83, + "step": 43962 + }, + { + "epoch": 862.16, + "learning_rate": 3.022394202265196e-05, + "loss": 0.3863, + "step": 43970 + }, + { + "epoch": 862.35, + "learning_rate": 3.019143718084559e-05, + "loss": 0.3895, + "step": 43980 + }, + { + "epoch": 862.55, + "learning_rate": 3.015894542214021e-05, + "loss": 0.3908, + "step": 43990 + }, + { + "epoch": 862.75, + "learning_rate": 3.0126466756022707e-05, + "loss": 0.3892, + "step": 44000 + }, + { + "epoch": 862.94, + "learning_rate": 3.0094001191976134e-05, + "loss": 0.3897, + "step": 44010 + }, + { + "epoch": 863.0, + "eval_loss": 0.3952128291130066, + "eval_runtime": 2.2684, + "eval_samples_per_second": 1004.679, + "eval_steps_per_second": 3.968, + "step": 44013 + }, + { + "epoch": 863.14, + "learning_rate": 3.0061548739479748e-05, + "loss": 0.3845, + "step": 44020 + }, + { + "epoch": 863.33, + "learning_rate": 3.0029109408008867e-05, + "loss": 0.3908, + "step": 44030 + }, + { + "epoch": 863.53, + "learning_rate": 2.9996683207035127e-05, + "loss": 0.3871, + "step": 44040 + }, + { + "epoch": 863.73, + "learning_rate": 2.9964270146026188e-05, + "loss": 0.3866, + "step": 44050 + }, + { + "epoch": 863.92, + "learning_rate": 2.993187023444597e-05, + "loss": 0.385, + "step": 44060 + }, + { + "epoch": 864.0, + "eval_loss": 0.3940654695034027, + "eval_runtime": 2.3365, + "eval_samples_per_second": 975.387, + "eval_steps_per_second": 3.852, + "step": 44064 + }, + { + "epoch": 864.12, + "learning_rate": 2.989948348175456e-05, + "loss": 0.3915, + "step": 44070 + }, + { + "epoch": 864.31, + "learning_rate": 2.98671098974081e-05, + "loss": 0.389, + "step": 44080 + }, + { + "epoch": 864.51, + "learning_rate": 2.983474949085902e-05, + "loss": 0.3863, + "step": 44090 + }, + { + "epoch": 864.71, + "learning_rate": 2.9802402271555775e-05, + "loss": 0.3926, + "step": 44100 + }, + { + "epoch": 864.9, + "learning_rate": 2.9770068248943062e-05, + "loss": 0.3876, + "step": 44110 + }, + { + "epoch": 865.0, + "eval_loss": 0.3937053680419922, + "eval_runtime": 2.3814, + "eval_samples_per_second": 956.993, + "eval_steps_per_second": 3.779, + "step": 44115 + }, + { + "epoch": 865.1, + "learning_rate": 2.973774743246173e-05, + "loss": 0.3901, + "step": 44120 + }, + { + "epoch": 865.29, + "learning_rate": 2.9705439831548672e-05, + "loss": 0.3932, + "step": 44130 + }, + { + "epoch": 865.49, + "learning_rate": 2.967314545563704e-05, + "loss": 0.3903, + "step": 44140 + }, + { + "epoch": 865.69, + "learning_rate": 2.9640864314156017e-05, + "loss": 0.3917, + "step": 44150 + }, + { + "epoch": 865.88, + "learning_rate": 2.9608596416531015e-05, + "loss": 0.3889, + "step": 44160 + }, + { + "epoch": 866.0, + "eval_loss": 0.39745286107063293, + "eval_runtime": 2.2179, + "eval_samples_per_second": 1027.526, + "eval_steps_per_second": 4.058, + "step": 44166 + }, + { + "epoch": 866.08, + "learning_rate": 2.957634177218353e-05, + "loss": 0.3876, + "step": 44170 + }, + { + "epoch": 866.27, + "learning_rate": 2.954410039053123e-05, + "loss": 0.3854, + "step": 44180 + }, + { + "epoch": 866.47, + "learning_rate": 2.951187228098785e-05, + "loss": 0.3888, + "step": 44190 + }, + { + "epoch": 866.67, + "learning_rate": 2.9479657452963253e-05, + "loss": 0.3906, + "step": 44200 + }, + { + "epoch": 866.86, + "learning_rate": 2.9447455915863477e-05, + "loss": 0.3926, + "step": 44210 + }, + { + "epoch": 867.0, + "eval_loss": 0.39533084630966187, + "eval_runtime": 2.2146, + "eval_samples_per_second": 1029.072, + "eval_steps_per_second": 4.064, + "step": 44217 + }, + { + "epoch": 867.06, + "learning_rate": 2.9415267679090657e-05, + "loss": 0.3895, + "step": 44220 + }, + { + "epoch": 867.25, + "learning_rate": 2.938309275204306e-05, + "loss": 0.3908, + "step": 44230 + }, + { + "epoch": 867.45, + "learning_rate": 2.9350931144115e-05, + "loss": 0.3886, + "step": 44240 + }, + { + "epoch": 867.65, + "learning_rate": 2.9318782864696995e-05, + "loss": 0.385, + "step": 44250 + }, + { + "epoch": 867.84, + "learning_rate": 2.928664792317558e-05, + "loss": 0.3895, + "step": 44260 + }, + { + "epoch": 868.0, + "eval_loss": 0.391824334859848, + "eval_runtime": 2.2277, + "eval_samples_per_second": 1023.015, + "eval_steps_per_second": 4.04, + "step": 44268 + }, + { + "epoch": 868.04, + "learning_rate": 2.925452632893346e-05, + "loss": 0.3865, + "step": 44270 + }, + { + "epoch": 868.24, + "learning_rate": 2.9222418091349463e-05, + "loss": 0.3874, + "step": 44280 + }, + { + "epoch": 868.43, + "learning_rate": 2.9190323219798413e-05, + "loss": 0.3864, + "step": 44290 + }, + { + "epoch": 868.63, + "learning_rate": 2.9158241723651357e-05, + "loss": 0.3867, + "step": 44300 + }, + { + "epoch": 868.82, + "learning_rate": 2.9126173612275315e-05, + "loss": 0.3926, + "step": 44310 + }, + { + "epoch": 869.0, + "eval_loss": 0.39258652925491333, + "eval_runtime": 2.2004, + "eval_samples_per_second": 1035.742, + "eval_steps_per_second": 4.09, + "step": 44319 + }, + { + "epoch": 869.02, + "learning_rate": 2.9094118895033494e-05, + "loss": 0.3904, + "step": 44320 + }, + { + "epoch": 869.22, + "learning_rate": 2.9062077581285187e-05, + "loss": 0.3853, + "step": 44330 + }, + { + "epoch": 869.41, + "learning_rate": 2.9030049680385685e-05, + "loss": 0.3888, + "step": 44340 + }, + { + "epoch": 869.61, + "learning_rate": 2.899803520168647e-05, + "loss": 0.3925, + "step": 44350 + }, + { + "epoch": 869.8, + "learning_rate": 2.8966034154535005e-05, + "loss": 0.3892, + "step": 44360 + }, + { + "epoch": 870.0, + "learning_rate": 2.893404654827491e-05, + "loss": 0.3861, + "step": 44370 + }, + { + "epoch": 870.0, + "eval_loss": 0.39331433176994324, + "eval_runtime": 2.2363, + "eval_samples_per_second": 1019.085, + "eval_steps_per_second": 4.024, + "step": 44370 + }, + { + "epoch": 870.2, + "learning_rate": 2.8902072392245856e-05, + "loss": 0.3868, + "step": 44380 + }, + { + "epoch": 870.39, + "learning_rate": 2.8870111695783603e-05, + "loss": 0.3874, + "step": 44390 + }, + { + "epoch": 870.59, + "learning_rate": 2.883816446821994e-05, + "loss": 0.3876, + "step": 44400 + }, + { + "epoch": 870.78, + "learning_rate": 2.880623071888271e-05, + "loss": 0.386, + "step": 44410 + }, + { + "epoch": 870.98, + "learning_rate": 2.87743104570959e-05, + "loss": 0.3881, + "step": 44420 + }, + { + "epoch": 871.0, + "eval_loss": 0.39406803250312805, + "eval_runtime": 2.3068, + "eval_samples_per_second": 987.965, + "eval_steps_per_second": 3.902, + "step": 44421 + }, + { + "epoch": 871.18, + "learning_rate": 2.87424036921795e-05, + "loss": 0.3829, + "step": 44430 + }, + { + "epoch": 871.37, + "learning_rate": 2.8710510433449598e-05, + "loss": 0.3881, + "step": 44440 + }, + { + "epoch": 871.57, + "learning_rate": 2.8678630690218274e-05, + "loss": 0.3879, + "step": 44450 + }, + { + "epoch": 871.76, + "learning_rate": 2.864676447179375e-05, + "loss": 0.3864, + "step": 44460 + }, + { + "epoch": 871.96, + "learning_rate": 2.8614911787480188e-05, + "loss": 0.3863, + "step": 44470 + }, + { + "epoch": 872.0, + "eval_loss": 0.3938988447189331, + "eval_runtime": 2.3231, + "eval_samples_per_second": 981.03, + "eval_steps_per_second": 3.874, + "step": 44472 + }, + { + "epoch": 872.16, + "learning_rate": 2.8583072646577905e-05, + "loss": 0.3849, + "step": 44480 + }, + { + "epoch": 872.35, + "learning_rate": 2.8551247058383234e-05, + "loss": 0.3887, + "step": 44490 + }, + { + "epoch": 872.55, + "learning_rate": 2.8519435032188488e-05, + "loss": 0.3927, + "step": 44500 + }, + { + "epoch": 872.75, + "learning_rate": 2.8487636577282115e-05, + "loss": 0.3897, + "step": 44510 + }, + { + "epoch": 872.94, + "learning_rate": 2.8455851702948522e-05, + "loss": 0.3863, + "step": 44520 + }, + { + "epoch": 873.0, + "eval_loss": 0.3912711441516876, + "eval_runtime": 2.1836, + "eval_samples_per_second": 1043.695, + "eval_steps_per_second": 4.122, + "step": 44523 + }, + { + "epoch": 873.14, + "learning_rate": 2.8424080418468184e-05, + "loss": 0.392, + "step": 44530 + }, + { + "epoch": 873.33, + "learning_rate": 2.8392322733117654e-05, + "loss": 0.3873, + "step": 44540 + }, + { + "epoch": 873.53, + "learning_rate": 2.83605786561694e-05, + "loss": 0.3863, + "step": 44550 + }, + { + "epoch": 873.73, + "learning_rate": 2.832884819689205e-05, + "loss": 0.3924, + "step": 44560 + }, + { + "epoch": 873.92, + "learning_rate": 2.8297131364550138e-05, + "loss": 0.386, + "step": 44570 + }, + { + "epoch": 874.0, + "eval_loss": 0.3918991982936859, + "eval_runtime": 2.2676, + "eval_samples_per_second": 1005.049, + "eval_steps_per_second": 3.969, + "step": 44574 + }, + { + "epoch": 874.12, + "learning_rate": 2.8265428168404287e-05, + "loss": 0.3857, + "step": 44580 + }, + { + "epoch": 874.31, + "learning_rate": 2.8233738617711158e-05, + "loss": 0.3906, + "step": 44590 + }, + { + "epoch": 874.51, + "learning_rate": 2.8202062721723325e-05, + "loss": 0.3855, + "step": 44600 + }, + { + "epoch": 874.71, + "learning_rate": 2.817040048968952e-05, + "loss": 0.3865, + "step": 44610 + }, + { + "epoch": 874.9, + "learning_rate": 2.8138751930854347e-05, + "loss": 0.382, + "step": 44620 + }, + { + "epoch": 875.0, + "eval_loss": 0.38788464665412903, + "eval_runtime": 2.2983, + "eval_samples_per_second": 991.581, + "eval_steps_per_second": 3.916, + "step": 44625 + }, + { + "epoch": 875.1, + "learning_rate": 2.8107117054458496e-05, + "loss": 0.3885, + "step": 44630 + }, + { + "epoch": 875.29, + "learning_rate": 2.8075495869738657e-05, + "loss": 0.3876, + "step": 44640 + }, + { + "epoch": 875.49, + "learning_rate": 2.8043888385927525e-05, + "loss": 0.385, + "step": 44650 + }, + { + "epoch": 875.69, + "learning_rate": 2.8012294612253767e-05, + "loss": 0.3894, + "step": 44660 + }, + { + "epoch": 875.88, + "learning_rate": 2.798071455794203e-05, + "loss": 0.384, + "step": 44670 + }, + { + "epoch": 876.0, + "eval_loss": 0.393778532743454, + "eval_runtime": 2.3202, + "eval_samples_per_second": 982.244, + "eval_steps_per_second": 3.879, + "step": 44676 + }, + { + "epoch": 876.08, + "learning_rate": 2.7949148232213006e-05, + "loss": 0.387, + "step": 44680 + }, + { + "epoch": 876.27, + "learning_rate": 2.7917595644283365e-05, + "loss": 0.3904, + "step": 44690 + }, + { + "epoch": 876.47, + "learning_rate": 2.7886056803365777e-05, + "loss": 0.3851, + "step": 44700 + }, + { + "epoch": 876.67, + "learning_rate": 2.7854531718668842e-05, + "loss": 0.389, + "step": 44710 + }, + { + "epoch": 876.86, + "learning_rate": 2.7823020399397213e-05, + "loss": 0.3898, + "step": 44720 + }, + { + "epoch": 877.0, + "eval_loss": 0.3949425220489502, + "eval_runtime": 2.2739, + "eval_samples_per_second": 1002.245, + "eval_steps_per_second": 3.958, + "step": 44727 + }, + { + "epoch": 877.06, + "learning_rate": 2.779152285475146e-05, + "loss": 0.3867, + "step": 44730 + }, + { + "epoch": 877.25, + "learning_rate": 2.776003909392819e-05, + "loss": 0.384, + "step": 44740 + }, + { + "epoch": 877.45, + "learning_rate": 2.7728569126119966e-05, + "loss": 0.3847, + "step": 44750 + }, + { + "epoch": 877.65, + "learning_rate": 2.7697112960515283e-05, + "loss": 0.3864, + "step": 44760 + }, + { + "epoch": 877.84, + "learning_rate": 2.7665670606298682e-05, + "loss": 0.3913, + "step": 44770 + }, + { + "epoch": 878.0, + "eval_loss": 0.3947102725505829, + "eval_runtime": 2.3101, + "eval_samples_per_second": 986.554, + "eval_steps_per_second": 3.896, + "step": 44778 + }, + { + "epoch": 878.04, + "learning_rate": 2.7634242072650577e-05, + "loss": 0.3859, + "step": 44780 + }, + { + "epoch": 878.24, + "learning_rate": 2.760282736874743e-05, + "loss": 0.3849, + "step": 44790 + }, + { + "epoch": 878.43, + "learning_rate": 2.7571426503761657e-05, + "loss": 0.3884, + "step": 44800 + }, + { + "epoch": 878.63, + "learning_rate": 2.754003948686156e-05, + "loss": 0.3871, + "step": 44810 + }, + { + "epoch": 878.82, + "learning_rate": 2.75086663272115e-05, + "loss": 0.3859, + "step": 44820 + }, + { + "epoch": 879.0, + "eval_loss": 0.3952098488807678, + "eval_runtime": 2.2397, + "eval_samples_per_second": 1017.564, + "eval_steps_per_second": 4.018, + "step": 44829 + }, + { + "epoch": 879.02, + "learning_rate": 2.7477307033971687e-05, + "loss": 0.3867, + "step": 44830 + }, + { + "epoch": 879.22, + "learning_rate": 2.744596161629836e-05, + "loss": 0.3881, + "step": 44840 + }, + { + "epoch": 879.41, + "learning_rate": 2.7414630083343687e-05, + "loss": 0.3871, + "step": 44850 + }, + { + "epoch": 879.61, + "learning_rate": 2.7383312444255793e-05, + "loss": 0.383, + "step": 44860 + }, + { + "epoch": 879.8, + "learning_rate": 2.7352008708178714e-05, + "loss": 0.3874, + "step": 44870 + }, + { + "epoch": 880.0, + "learning_rate": 2.7320718884252412e-05, + "loss": 0.385, + "step": 44880 + }, + { + "epoch": 880.0, + "eval_loss": 0.395025372505188, + "eval_runtime": 2.3571, + "eval_samples_per_second": 966.867, + "eval_steps_per_second": 3.818, + "step": 44880 + }, + { + "epoch": 880.2, + "learning_rate": 2.728944298161284e-05, + "loss": 0.388, + "step": 44890 + }, + { + "epoch": 880.39, + "learning_rate": 2.725818100939187e-05, + "loss": 0.3887, + "step": 44900 + }, + { + "epoch": 880.59, + "learning_rate": 2.7226932976717336e-05, + "loss": 0.3882, + "step": 44910 + }, + { + "epoch": 880.78, + "learning_rate": 2.7195698892712894e-05, + "loss": 0.3846, + "step": 44920 + }, + { + "epoch": 880.98, + "learning_rate": 2.716447876649826e-05, + "loss": 0.3872, + "step": 44930 + }, + { + "epoch": 881.0, + "eval_loss": 0.3877263069152832, + "eval_runtime": 2.2028, + "eval_samples_per_second": 1034.586, + "eval_steps_per_second": 4.086, + "step": 44931 + }, + { + "epoch": 881.18, + "learning_rate": 2.7133272607188975e-05, + "loss": 0.3849, + "step": 44940 + }, + { + "epoch": 881.37, + "learning_rate": 2.710208042389655e-05, + "loss": 0.3876, + "step": 44950 + }, + { + "epoch": 881.57, + "learning_rate": 2.707090222572844e-05, + "loss": 0.3888, + "step": 44960 + }, + { + "epoch": 881.76, + "learning_rate": 2.7039738021787926e-05, + "loss": 0.3888, + "step": 44970 + }, + { + "epoch": 881.96, + "learning_rate": 2.7008587821174328e-05, + "loss": 0.383, + "step": 44980 + }, + { + "epoch": 882.0, + "eval_loss": 0.3904988169670105, + "eval_runtime": 2.2822, + "eval_samples_per_second": 998.591, + "eval_steps_per_second": 3.944, + "step": 44982 + }, + { + "epoch": 882.16, + "learning_rate": 2.697745163298274e-05, + "loss": 0.3868, + "step": 44990 + }, + { + "epoch": 882.35, + "learning_rate": 2.6946329466304274e-05, + "loss": 0.3821, + "step": 45000 + }, + { + "epoch": 882.55, + "learning_rate": 2.691522133022593e-05, + "loss": 0.3869, + "step": 45010 + }, + { + "epoch": 882.75, + "learning_rate": 2.6884127233830533e-05, + "loss": 0.3848, + "step": 45020 + }, + { + "epoch": 882.94, + "learning_rate": 2.6853047186196924e-05, + "loss": 0.387, + "step": 45030 + }, + { + "epoch": 883.0, + "eval_loss": 0.39386025071144104, + "eval_runtime": 2.2917, + "eval_samples_per_second": 994.475, + "eval_steps_per_second": 3.927, + "step": 45033 + }, + { + "epoch": 883.14, + "learning_rate": 2.6821981196399727e-05, + "loss": 0.3901, + "step": 45040 + }, + { + "epoch": 883.33, + "learning_rate": 2.6790929273509545e-05, + "loss": 0.3869, + "step": 45050 + }, + { + "epoch": 883.53, + "learning_rate": 2.675989142659285e-05, + "loss": 0.3875, + "step": 45060 + }, + { + "epoch": 883.73, + "learning_rate": 2.6728867664712033e-05, + "loss": 0.3818, + "step": 45070 + }, + { + "epoch": 883.92, + "learning_rate": 2.66978579969253e-05, + "loss": 0.3834, + "step": 45080 + }, + { + "epoch": 884.0, + "eval_loss": 0.39473608136177063, + "eval_runtime": 2.2587, + "eval_samples_per_second": 1009.006, + "eval_steps_per_second": 3.985, + "step": 45084 + }, + { + "epoch": 884.12, + "learning_rate": 2.6666862432286758e-05, + "loss": 0.3834, + "step": 45090 + }, + { + "epoch": 884.31, + "learning_rate": 2.6635880979846462e-05, + "loss": 0.3862, + "step": 45100 + }, + { + "epoch": 884.51, + "learning_rate": 2.6604913648650295e-05, + "loss": 0.3878, + "step": 45110 + }, + { + "epoch": 884.71, + "learning_rate": 2.6573960447740055e-05, + "loss": 0.3898, + "step": 45120 + }, + { + "epoch": 884.9, + "learning_rate": 2.6543021386153322e-05, + "loss": 0.3866, + "step": 45130 + }, + { + "epoch": 885.0, + "eval_loss": 0.39346495270729065, + "eval_runtime": 2.245, + "eval_samples_per_second": 1015.159, + "eval_steps_per_second": 4.009, + "step": 45135 + }, + { + "epoch": 885.1, + "learning_rate": 2.651209647292368e-05, + "loss": 0.3878, + "step": 45140 + }, + { + "epoch": 885.29, + "learning_rate": 2.6481185717080457e-05, + "loss": 0.3882, + "step": 45150 + }, + { + "epoch": 885.49, + "learning_rate": 2.645028912764893e-05, + "loss": 0.3869, + "step": 45160 + }, + { + "epoch": 885.69, + "learning_rate": 2.6419406713650245e-05, + "loss": 0.3878, + "step": 45170 + }, + { + "epoch": 885.88, + "learning_rate": 2.638853848410132e-05, + "loss": 0.3834, + "step": 45180 + }, + { + "epoch": 886.0, + "eval_loss": 0.3925130069255829, + "eval_runtime": 2.2463, + "eval_samples_per_second": 1014.544, + "eval_steps_per_second": 4.007, + "step": 45186 + }, + { + "epoch": 886.08, + "learning_rate": 2.6357684448015038e-05, + "loss": 0.3826, + "step": 45190 + }, + { + "epoch": 886.27, + "learning_rate": 2.6326844614400038e-05, + "loss": 0.387, + "step": 45200 + }, + { + "epoch": 886.47, + "learning_rate": 2.6296018992260903e-05, + "loss": 0.3832, + "step": 45210 + }, + { + "epoch": 886.67, + "learning_rate": 2.626520759059804e-05, + "loss": 0.3887, + "step": 45220 + }, + { + "epoch": 886.86, + "learning_rate": 2.623441041840765e-05, + "loss": 0.3848, + "step": 45230 + }, + { + "epoch": 887.0, + "eval_loss": 0.3903357684612274, + "eval_runtime": 2.3016, + "eval_samples_per_second": 990.196, + "eval_steps_per_second": 3.91, + "step": 45237 + }, + { + "epoch": 887.06, + "learning_rate": 2.620362748468186e-05, + "loss": 0.3878, + "step": 45240 + }, + { + "epoch": 887.25, + "learning_rate": 2.6172858798408557e-05, + "loss": 0.3854, + "step": 45250 + }, + { + "epoch": 887.45, + "learning_rate": 2.6142104368571522e-05, + "loss": 0.386, + "step": 45260 + }, + { + "epoch": 887.65, + "learning_rate": 2.6111364204150414e-05, + "loss": 0.3814, + "step": 45270 + }, + { + "epoch": 887.84, + "learning_rate": 2.60806383141206e-05, + "loss": 0.3896, + "step": 45280 + }, + { + "epoch": 888.0, + "eval_loss": 0.39181816577911377, + "eval_runtime": 2.3791, + "eval_samples_per_second": 957.935, + "eval_steps_per_second": 3.783, + "step": 45288 + }, + { + "epoch": 888.04, + "learning_rate": 2.6049926707453428e-05, + "loss": 0.3842, + "step": 45290 + }, + { + "epoch": 888.24, + "learning_rate": 2.6019229393115935e-05, + "loss": 0.389, + "step": 45300 + }, + { + "epoch": 888.43, + "learning_rate": 2.5988546380071072e-05, + "loss": 0.3812, + "step": 45310 + }, + { + "epoch": 888.63, + "learning_rate": 2.5957877677277615e-05, + "loss": 0.3885, + "step": 45320 + }, + { + "epoch": 888.82, + "learning_rate": 2.592722329369016e-05, + "loss": 0.3863, + "step": 45330 + }, + { + "epoch": 889.0, + "eval_loss": 0.3879792094230652, + "eval_runtime": 2.2596, + "eval_samples_per_second": 1008.603, + "eval_steps_per_second": 3.983, + "step": 45339 + }, + { + "epoch": 889.02, + "learning_rate": 2.5896583238259064e-05, + "loss": 0.3845, + "step": 45340 + }, + { + "epoch": 889.22, + "learning_rate": 2.5865957519930526e-05, + "loss": 0.3888, + "step": 45350 + }, + { + "epoch": 889.41, + "learning_rate": 2.5835346147646597e-05, + "loss": 0.3855, + "step": 45360 + }, + { + "epoch": 889.61, + "learning_rate": 2.580474913034512e-05, + "loss": 0.3849, + "step": 45370 + }, + { + "epoch": 889.8, + "learning_rate": 2.5774166476959758e-05, + "loss": 0.3828, + "step": 45380 + }, + { + "epoch": 890.0, + "learning_rate": 2.574359819641992e-05, + "loss": 0.384, + "step": 45390 + }, + { + "epoch": 890.0, + "eval_loss": 0.388411283493042, + "eval_runtime": 2.2402, + "eval_samples_per_second": 1017.319, + "eval_steps_per_second": 4.017, + "step": 45390 + }, + { + "epoch": 890.2, + "learning_rate": 2.5713044297650904e-05, + "loss": 0.3816, + "step": 45400 + }, + { + "epoch": 890.39, + "learning_rate": 2.568250478957372e-05, + "loss": 0.3868, + "step": 45410 + }, + { + "epoch": 890.59, + "learning_rate": 2.5651979681105258e-05, + "loss": 0.3859, + "step": 45420 + }, + { + "epoch": 890.78, + "learning_rate": 2.562146898115819e-05, + "loss": 0.3863, + "step": 45430 + }, + { + "epoch": 890.98, + "learning_rate": 2.5590972698640892e-05, + "loss": 0.3844, + "step": 45440 + }, + { + "epoch": 891.0, + "eval_loss": 0.3906935751438141, + "eval_runtime": 2.3035, + "eval_samples_per_second": 989.365, + "eval_steps_per_second": 3.907, + "step": 45441 + }, + { + "epoch": 891.18, + "learning_rate": 2.5560490842457675e-05, + "loss": 0.3852, + "step": 45450 + }, + { + "epoch": 891.37, + "learning_rate": 2.553002342150849e-05, + "loss": 0.3851, + "step": 45460 + }, + { + "epoch": 891.57, + "learning_rate": 2.549957044468919e-05, + "loss": 0.3863, + "step": 45470 + }, + { + "epoch": 891.76, + "learning_rate": 2.546913192089137e-05, + "loss": 0.3835, + "step": 45480 + }, + { + "epoch": 891.96, + "learning_rate": 2.543870785900236e-05, + "loss": 0.3863, + "step": 45490 + }, + { + "epoch": 892.0, + "eval_loss": 0.39539283514022827, + "eval_runtime": 2.2062, + "eval_samples_per_second": 1032.976, + "eval_steps_per_second": 4.079, + "step": 45492 + }, + { + "epoch": 892.16, + "learning_rate": 2.5408298267905357e-05, + "loss": 0.3857, + "step": 45500 + }, + { + "epoch": 892.35, + "learning_rate": 2.5377903156479235e-05, + "loss": 0.3898, + "step": 45510 + }, + { + "epoch": 892.55, + "learning_rate": 2.5347522533598706e-05, + "loss": 0.3868, + "step": 45520 + }, + { + "epoch": 892.75, + "learning_rate": 2.531715640813424e-05, + "loss": 0.3801, + "step": 45530 + }, + { + "epoch": 892.94, + "learning_rate": 2.52868047889521e-05, + "loss": 0.3872, + "step": 45540 + }, + { + "epoch": 893.0, + "eval_loss": 0.3918585479259491, + "eval_runtime": 2.1779, + "eval_samples_per_second": 1046.398, + "eval_steps_per_second": 4.132, + "step": 45543 + }, + { + "epoch": 893.14, + "learning_rate": 2.525646768491424e-05, + "loss": 0.385, + "step": 45550 + }, + { + "epoch": 893.33, + "learning_rate": 2.5226145104878405e-05, + "loss": 0.3884, + "step": 45560 + }, + { + "epoch": 893.53, + "learning_rate": 2.5195837057698134e-05, + "loss": 0.3824, + "step": 45570 + }, + { + "epoch": 893.73, + "learning_rate": 2.5165543552222706e-05, + "loss": 0.3824, + "step": 45580 + }, + { + "epoch": 893.92, + "learning_rate": 2.5135264597297166e-05, + "loss": 0.3869, + "step": 45590 + }, + { + "epoch": 894.0, + "eval_loss": 0.39283615350723267, + "eval_runtime": 2.2359, + "eval_samples_per_second": 1019.291, + "eval_steps_per_second": 4.025, + "step": 45594 + }, + { + "epoch": 894.12, + "learning_rate": 2.5105000201762253e-05, + "loss": 0.3907, + "step": 45600 + }, + { + "epoch": 894.31, + "learning_rate": 2.5074750374454532e-05, + "loss": 0.3855, + "step": 45610 + }, + { + "epoch": 894.51, + "learning_rate": 2.504451512420624e-05, + "loss": 0.383, + "step": 45620 + }, + { + "epoch": 894.71, + "learning_rate": 2.5014294459845418e-05, + "loss": 0.384, + "step": 45630 + }, + { + "epoch": 894.9, + "learning_rate": 2.498408839019584e-05, + "loss": 0.3801, + "step": 45640 + }, + { + "epoch": 895.0, + "eval_loss": 0.39413610100746155, + "eval_runtime": 2.344, + "eval_samples_per_second": 972.252, + "eval_steps_per_second": 3.84, + "step": 45645 + }, + { + "epoch": 895.1, + "learning_rate": 2.4953896924076978e-05, + "loss": 0.3843, + "step": 45650 + }, + { + "epoch": 895.29, + "learning_rate": 2.4923720070304088e-05, + "loss": 0.386, + "step": 45660 + }, + { + "epoch": 895.49, + "learning_rate": 2.4893557837688108e-05, + "loss": 0.3901, + "step": 45670 + }, + { + "epoch": 895.69, + "learning_rate": 2.486341023503576e-05, + "loss": 0.3842, + "step": 45680 + }, + { + "epoch": 895.88, + "learning_rate": 2.4833277271149496e-05, + "loss": 0.3832, + "step": 45690 + }, + { + "epoch": 896.0, + "eval_loss": 0.39303308725357056, + "eval_runtime": 2.2279, + "eval_samples_per_second": 1022.934, + "eval_steps_per_second": 4.04, + "step": 45696 + }, + { + "epoch": 896.08, + "learning_rate": 2.480315895482742e-05, + "loss": 0.3866, + "step": 45700 + }, + { + "epoch": 896.27, + "learning_rate": 2.4773055294863443e-05, + "loss": 0.3891, + "step": 45710 + }, + { + "epoch": 896.47, + "learning_rate": 2.4742966300047138e-05, + "loss": 0.3846, + "step": 45720 + }, + { + "epoch": 896.67, + "learning_rate": 2.4712891979163826e-05, + "loss": 0.3854, + "step": 45730 + }, + { + "epoch": 896.86, + "learning_rate": 2.4682832340994544e-05, + "loss": 0.3886, + "step": 45740 + }, + { + "epoch": 897.0, + "eval_loss": 0.3933192193508148, + "eval_runtime": 2.243, + "eval_samples_per_second": 1016.036, + "eval_steps_per_second": 4.012, + "step": 45747 + }, + { + "epoch": 897.06, + "learning_rate": 2.4652787394316066e-05, + "loss": 0.3919, + "step": 45750 + }, + { + "epoch": 897.25, + "learning_rate": 2.4622757147900816e-05, + "loss": 0.3843, + "step": 45760 + }, + { + "epoch": 897.45, + "learning_rate": 2.459274161051693e-05, + "loss": 0.3851, + "step": 45770 + }, + { + "epoch": 897.65, + "learning_rate": 2.4562740790928304e-05, + "loss": 0.3832, + "step": 45780 + }, + { + "epoch": 897.84, + "learning_rate": 2.4532754697894512e-05, + "loss": 0.3871, + "step": 45790 + }, + { + "epoch": 898.0, + "eval_loss": 0.3916724920272827, + "eval_runtime": 2.233, + "eval_samples_per_second": 1020.585, + "eval_steps_per_second": 4.03, + "step": 45798 + }, + { + "epoch": 898.04, + "learning_rate": 2.4502783340170833e-05, + "loss": 0.3837, + "step": 45800 + }, + { + "epoch": 898.24, + "learning_rate": 2.4472826726508207e-05, + "loss": 0.3866, + "step": 45810 + }, + { + "epoch": 898.43, + "learning_rate": 2.4442884865653332e-05, + "loss": 0.3846, + "step": 45820 + }, + { + "epoch": 898.63, + "learning_rate": 2.4412957766348516e-05, + "loss": 0.3798, + "step": 45830 + }, + { + "epoch": 898.82, + "learning_rate": 2.4383045437331835e-05, + "loss": 0.3892, + "step": 45840 + }, + { + "epoch": 899.0, + "eval_loss": 0.39273956418037415, + "eval_runtime": 2.3197, + "eval_samples_per_second": 982.453, + "eval_steps_per_second": 3.88, + "step": 45849 + }, + { + "epoch": 899.02, + "learning_rate": 2.4353147887337042e-05, + "loss": 0.3851, + "step": 45850 + }, + { + "epoch": 899.22, + "learning_rate": 2.4323265125093507e-05, + "loss": 0.386, + "step": 45860 + }, + { + "epoch": 899.41, + "learning_rate": 2.4293397159326384e-05, + "loss": 0.3838, + "step": 45870 + }, + { + "epoch": 899.61, + "learning_rate": 2.4263543998756392e-05, + "loss": 0.3802, + "step": 45880 + }, + { + "epoch": 899.8, + "learning_rate": 2.4233705652100026e-05, + "loss": 0.3867, + "step": 45890 + }, + { + "epoch": 900.0, + "learning_rate": 2.420388212806943e-05, + "loss": 0.3864, + "step": 45900 + }, + { + "epoch": 900.0, + "eval_loss": 0.3934266269207001, + "eval_runtime": 2.2188, + "eval_samples_per_second": 1027.135, + "eval_steps_per_second": 4.056, + "step": 45900 + }, + { + "epoch": 900.2, + "learning_rate": 2.417407343537237e-05, + "loss": 0.3864, + "step": 45910 + }, + { + "epoch": 900.39, + "learning_rate": 2.4144279582712353e-05, + "loss": 0.384, + "step": 45920 + }, + { + "epoch": 900.59, + "learning_rate": 2.4114500578788486e-05, + "loss": 0.3841, + "step": 45930 + }, + { + "epoch": 900.78, + "learning_rate": 2.40847364322956e-05, + "loss": 0.3825, + "step": 45940 + }, + { + "epoch": 900.98, + "learning_rate": 2.405498715192415e-05, + "loss": 0.3827, + "step": 45950 + }, + { + "epoch": 901.0, + "eval_loss": 0.39162707328796387, + "eval_runtime": 2.2278, + "eval_samples_per_second": 1022.964, + "eval_steps_per_second": 4.04, + "step": 45951 + }, + { + "epoch": 901.18, + "learning_rate": 2.402525274636029e-05, + "loss": 0.3816, + "step": 45960 + }, + { + "epoch": 901.37, + "learning_rate": 2.3995533224285788e-05, + "loss": 0.3832, + "step": 45970 + }, + { + "epoch": 901.57, + "learning_rate": 2.3965828594378042e-05, + "loss": 0.3892, + "step": 45980 + }, + { + "epoch": 901.76, + "learning_rate": 2.3936138865310177e-05, + "loss": 0.3815, + "step": 45990 + }, + { + "epoch": 901.96, + "learning_rate": 2.3906464045750927e-05, + "loss": 0.3838, + "step": 46000 + }, + { + "epoch": 902.0, + "eval_loss": 0.3931758403778076, + "eval_runtime": 2.2246, + "eval_samples_per_second": 1024.454, + "eval_steps_per_second": 4.046, + "step": 46002 + }, + { + "epoch": 902.16, + "learning_rate": 2.387680414436471e-05, + "loss": 0.3852, + "step": 46010 + }, + { + "epoch": 902.35, + "learning_rate": 2.384715916981152e-05, + "loss": 0.3842, + "step": 46020 + }, + { + "epoch": 902.55, + "learning_rate": 2.3817529130747002e-05, + "loss": 0.3843, + "step": 46030 + }, + { + "epoch": 902.75, + "learning_rate": 2.3787914035822512e-05, + "loss": 0.3856, + "step": 46040 + }, + { + "epoch": 902.94, + "learning_rate": 2.3758313893684976e-05, + "loss": 0.3859, + "step": 46050 + }, + { + "epoch": 903.0, + "eval_loss": 0.3901480436325073, + "eval_runtime": 2.2773, + "eval_samples_per_second": 1000.738, + "eval_steps_per_second": 3.952, + "step": 46053 + }, + { + "epoch": 903.14, + "learning_rate": 2.3728728712977005e-05, + "loss": 0.377, + "step": 46060 + }, + { + "epoch": 903.33, + "learning_rate": 2.369915850233677e-05, + "loss": 0.3878, + "step": 46070 + }, + { + "epoch": 903.53, + "learning_rate": 2.366960327039815e-05, + "loss": 0.3839, + "step": 46080 + }, + { + "epoch": 903.73, + "learning_rate": 2.3640063025790577e-05, + "loss": 0.3786, + "step": 46090 + }, + { + "epoch": 903.92, + "learning_rate": 2.3610537777139165e-05, + "loss": 0.382, + "step": 46100 + }, + { + "epoch": 904.0, + "eval_loss": 0.3918239176273346, + "eval_runtime": 2.2928, + "eval_samples_per_second": 993.994, + "eval_steps_per_second": 3.925, + "step": 46104 + }, + { + "epoch": 904.12, + "learning_rate": 2.358102753306465e-05, + "loss": 0.3819, + "step": 46110 + }, + { + "epoch": 904.31, + "learning_rate": 2.3551532302183307e-05, + "loss": 0.3855, + "step": 46120 + }, + { + "epoch": 904.51, + "learning_rate": 2.3522052093107154e-05, + "loss": 0.3824, + "step": 46130 + }, + { + "epoch": 904.71, + "learning_rate": 2.3492586914443693e-05, + "loss": 0.3829, + "step": 46140 + }, + { + "epoch": 904.9, + "learning_rate": 2.346313677479613e-05, + "loss": 0.3824, + "step": 46150 + }, + { + "epoch": 905.0, + "eval_loss": 0.39389172196388245, + "eval_runtime": 2.3228, + "eval_samples_per_second": 981.125, + "eval_steps_per_second": 3.875, + "step": 46155 + }, + { + "epoch": 905.1, + "learning_rate": 2.3433701682763262e-05, + "loss": 0.3865, + "step": 46160 + }, + { + "epoch": 905.29, + "learning_rate": 2.3404281646939442e-05, + "loss": 0.385, + "step": 46170 + }, + { + "epoch": 905.49, + "learning_rate": 2.3374876675914704e-05, + "loss": 0.386, + "step": 46180 + }, + { + "epoch": 905.69, + "learning_rate": 2.3345486778274604e-05, + "loss": 0.386, + "step": 46190 + }, + { + "epoch": 905.88, + "learning_rate": 2.331611196260036e-05, + "loss": 0.3799, + "step": 46200 + }, + { + "epoch": 906.0, + "eval_loss": 0.390666663646698, + "eval_runtime": 2.2415, + "eval_samples_per_second": 1016.712, + "eval_steps_per_second": 4.015, + "step": 46206 + }, + { + "epoch": 906.08, + "learning_rate": 2.328675223746876e-05, + "loss": 0.3824, + "step": 46210 + }, + { + "epoch": 906.27, + "learning_rate": 2.3257407611452215e-05, + "loss": 0.3818, + "step": 46220 + }, + { + "epoch": 906.47, + "learning_rate": 2.322807809311867e-05, + "loss": 0.3869, + "step": 46230 + }, + { + "epoch": 906.67, + "learning_rate": 2.3198763691031675e-05, + "loss": 0.3806, + "step": 46240 + }, + { + "epoch": 906.86, + "learning_rate": 2.31694644137504e-05, + "loss": 0.3851, + "step": 46250 + }, + { + "epoch": 907.0, + "eval_loss": 0.38907137513160706, + "eval_runtime": 2.3737, + "eval_samples_per_second": 960.124, + "eval_steps_per_second": 3.792, + "step": 46257 + }, + { + "epoch": 907.06, + "learning_rate": 2.3140180269829587e-05, + "loss": 0.3824, + "step": 46260 + }, + { + "epoch": 907.25, + "learning_rate": 2.311091126781957e-05, + "loss": 0.3862, + "step": 46270 + }, + { + "epoch": 907.45, + "learning_rate": 2.3081657416266202e-05, + "loss": 0.3848, + "step": 46280 + }, + { + "epoch": 907.65, + "learning_rate": 2.3052418723711e-05, + "loss": 0.3833, + "step": 46290 + }, + { + "epoch": 907.84, + "learning_rate": 2.3023195198690968e-05, + "loss": 0.3854, + "step": 46300 + }, + { + "epoch": 908.0, + "eval_loss": 0.38849303126335144, + "eval_runtime": 2.3757, + "eval_samples_per_second": 959.284, + "eval_steps_per_second": 3.788, + "step": 46308 + }, + { + "epoch": 908.04, + "learning_rate": 2.2993986849738735e-05, + "loss": 0.3819, + "step": 46310 + }, + { + "epoch": 908.24, + "learning_rate": 2.2964793685382518e-05, + "loss": 0.384, + "step": 46320 + }, + { + "epoch": 908.43, + "learning_rate": 2.293561571414603e-05, + "loss": 0.3822, + "step": 46330 + }, + { + "epoch": 908.63, + "learning_rate": 2.2906452944548622e-05, + "loss": 0.381, + "step": 46340 + }, + { + "epoch": 908.82, + "learning_rate": 2.2877305385105134e-05, + "loss": 0.3855, + "step": 46350 + }, + { + "epoch": 909.0, + "eval_loss": 0.3911910653114319, + "eval_runtime": 2.2359, + "eval_samples_per_second": 1019.275, + "eval_steps_per_second": 4.025, + "step": 46359 + }, + { + "epoch": 909.02, + "learning_rate": 2.2848173044326036e-05, + "loss": 0.3796, + "step": 46360 + }, + { + "epoch": 909.22, + "learning_rate": 2.2819055930717316e-05, + "loss": 0.3814, + "step": 46370 + }, + { + "epoch": 909.41, + "learning_rate": 2.2789954052780508e-05, + "loss": 0.3793, + "step": 46380 + }, + { + "epoch": 909.61, + "learning_rate": 2.2760867419012732e-05, + "loss": 0.3837, + "step": 46390 + }, + { + "epoch": 909.8, + "learning_rate": 2.273179603790661e-05, + "loss": 0.3862, + "step": 46400 + }, + { + "epoch": 910.0, + "learning_rate": 2.2702739917950342e-05, + "loss": 0.3855, + "step": 46410 + }, + { + "epoch": 910.0, + "eval_loss": 0.3912041187286377, + "eval_runtime": 2.2617, + "eval_samples_per_second": 1007.639, + "eval_steps_per_second": 3.979, + "step": 46410 + }, + { + "epoch": 910.2, + "learning_rate": 2.267369906762768e-05, + "loss": 0.3839, + "step": 46420 + }, + { + "epoch": 910.39, + "learning_rate": 2.2644673495417922e-05, + "loss": 0.3809, + "step": 46430 + }, + { + "epoch": 910.59, + "learning_rate": 2.261566320979587e-05, + "loss": 0.3842, + "step": 46440 + }, + { + "epoch": 910.78, + "learning_rate": 2.2586668219231847e-05, + "loss": 0.3804, + "step": 46450 + }, + { + "epoch": 910.98, + "learning_rate": 2.255768853219178e-05, + "loss": 0.3799, + "step": 46460 + }, + { + "epoch": 911.0, + "eval_loss": 0.38822638988494873, + "eval_runtime": 2.2355, + "eval_samples_per_second": 1019.442, + "eval_steps_per_second": 4.026, + "step": 46461 + }, + { + "epoch": 911.18, + "learning_rate": 2.2528724157137082e-05, + "loss": 0.3793, + "step": 46470 + }, + { + "epoch": 911.37, + "learning_rate": 2.2499775102524725e-05, + "loss": 0.3807, + "step": 46480 + }, + { + "epoch": 911.57, + "learning_rate": 2.2470841376807154e-05, + "loss": 0.3854, + "step": 46490 + }, + { + "epoch": 911.76, + "learning_rate": 2.2441922988432405e-05, + "loss": 0.3827, + "step": 46500 + }, + { + "epoch": 911.96, + "learning_rate": 2.2413019945843964e-05, + "loss": 0.387, + "step": 46510 + }, + { + "epoch": 912.0, + "eval_loss": 0.3894227147102356, + "eval_runtime": 2.3651, + "eval_samples_per_second": 963.582, + "eval_steps_per_second": 3.805, + "step": 46512 + }, + { + "epoch": 912.16, + "learning_rate": 2.2384132257480898e-05, + "loss": 0.384, + "step": 46520 + }, + { + "epoch": 912.35, + "learning_rate": 2.2355259931777784e-05, + "loss": 0.387, + "step": 46530 + }, + { + "epoch": 912.55, + "learning_rate": 2.2326402977164658e-05, + "loss": 0.3837, + "step": 46540 + }, + { + "epoch": 912.75, + "learning_rate": 2.2297561402067148e-05, + "loss": 0.3859, + "step": 46550 + }, + { + "epoch": 912.94, + "learning_rate": 2.226873521490631e-05, + "loss": 0.3792, + "step": 46560 + }, + { + "epoch": 913.0, + "eval_loss": 0.3886968791484833, + "eval_runtime": 2.3469, + "eval_samples_per_second": 971.078, + "eval_steps_per_second": 3.835, + "step": 46563 + }, + { + "epoch": 913.14, + "learning_rate": 2.223992442409876e-05, + "loss": 0.3861, + "step": 46570 + }, + { + "epoch": 913.33, + "learning_rate": 2.2211129038056646e-05, + "loss": 0.3809, + "step": 46580 + }, + { + "epoch": 913.53, + "learning_rate": 2.218234906518752e-05, + "loss": 0.3852, + "step": 46590 + }, + { + "epoch": 913.73, + "learning_rate": 2.2153584513894547e-05, + "loss": 0.3826, + "step": 46600 + }, + { + "epoch": 913.92, + "learning_rate": 2.2124835392576275e-05, + "loss": 0.3831, + "step": 46610 + }, + { + "epoch": 914.0, + "eval_loss": 0.3874710500240326, + "eval_runtime": 2.2984, + "eval_samples_per_second": 991.568, + "eval_steps_per_second": 3.916, + "step": 46614 + }, + { + "epoch": 914.12, + "learning_rate": 2.209610170962685e-05, + "loss": 0.3827, + "step": 46620 + }, + { + "epoch": 914.31, + "learning_rate": 2.2067383473435844e-05, + "loss": 0.3832, + "step": 46630 + }, + { + "epoch": 914.51, + "learning_rate": 2.203868069238838e-05, + "loss": 0.385, + "step": 46640 + }, + { + "epoch": 914.71, + "learning_rate": 2.2009993374864997e-05, + "loss": 0.3857, + "step": 46650 + }, + { + "epoch": 914.9, + "learning_rate": 2.198132152924173e-05, + "loss": 0.3821, + "step": 46660 + }, + { + "epoch": 915.0, + "eval_loss": 0.38625603914260864, + "eval_runtime": 2.2102, + "eval_samples_per_second": 1031.139, + "eval_steps_per_second": 4.072, + "step": 46665 + }, + { + "epoch": 915.1, + "learning_rate": 2.195266516389015e-05, + "loss": 0.381, + "step": 46670 + }, + { + "epoch": 915.29, + "learning_rate": 2.192402428717728e-05, + "loss": 0.3842, + "step": 46680 + }, + { + "epoch": 915.49, + "learning_rate": 2.189539890746562e-05, + "loss": 0.381, + "step": 46690 + }, + { + "epoch": 915.69, + "learning_rate": 2.1866789033113142e-05, + "loss": 0.3763, + "step": 46700 + }, + { + "epoch": 915.88, + "learning_rate": 2.1838194672473254e-05, + "loss": 0.3853, + "step": 46710 + }, + { + "epoch": 916.0, + "eval_loss": 0.3884444534778595, + "eval_runtime": 2.358, + "eval_samples_per_second": 966.491, + "eval_steps_per_second": 3.817, + "step": 46716 + }, + { + "epoch": 916.08, + "learning_rate": 2.1809615833894893e-05, + "loss": 0.38, + "step": 46720 + }, + { + "epoch": 916.27, + "learning_rate": 2.178105252572245e-05, + "loss": 0.3845, + "step": 46730 + }, + { + "epoch": 916.47, + "learning_rate": 2.17525047562958e-05, + "loss": 0.3847, + "step": 46740 + }, + { + "epoch": 916.67, + "learning_rate": 2.1723972533950197e-05, + "loss": 0.3907, + "step": 46750 + }, + { + "epoch": 916.86, + "learning_rate": 2.1695455867016466e-05, + "loss": 0.381, + "step": 46760 + }, + { + "epoch": 917.0, + "eval_loss": 0.38729578256607056, + "eval_runtime": 2.3567, + "eval_samples_per_second": 967.025, + "eval_steps_per_second": 3.819, + "step": 46767 + }, + { + "epoch": 917.06, + "learning_rate": 2.1666954763820795e-05, + "loss": 0.3843, + "step": 46770 + }, + { + "epoch": 917.25, + "learning_rate": 2.1638469232684892e-05, + "loss": 0.3812, + "step": 46780 + }, + { + "epoch": 917.45, + "learning_rate": 2.1609999281925916e-05, + "loss": 0.3827, + "step": 46790 + }, + { + "epoch": 917.65, + "learning_rate": 2.1581544919856415e-05, + "loss": 0.3857, + "step": 46800 + }, + { + "epoch": 917.84, + "learning_rate": 2.1553106154784482e-05, + "loss": 0.3847, + "step": 46810 + }, + { + "epoch": 918.0, + "eval_loss": 0.3849899470806122, + "eval_runtime": 2.2047, + "eval_samples_per_second": 1033.685, + "eval_steps_per_second": 4.082, + "step": 46818 + }, + { + "epoch": 918.04, + "learning_rate": 2.152468299501353e-05, + "loss": 0.3817, + "step": 46820 + }, + { + "epoch": 918.24, + "learning_rate": 2.1496275448842536e-05, + "loss": 0.3891, + "step": 46830 + }, + { + "epoch": 918.43, + "learning_rate": 2.1467883524565886e-05, + "loss": 0.3834, + "step": 46840 + }, + { + "epoch": 918.63, + "learning_rate": 2.1439507230473345e-05, + "loss": 0.3803, + "step": 46850 + }, + { + "epoch": 918.82, + "learning_rate": 2.14111465748502e-05, + "loss": 0.3813, + "step": 46860 + }, + { + "epoch": 919.0, + "eval_loss": 0.387481153011322, + "eval_runtime": 2.1882, + "eval_samples_per_second": 1041.495, + "eval_steps_per_second": 4.113, + "step": 46869 + }, + { + "epoch": 919.02, + "learning_rate": 2.1382801565977082e-05, + "loss": 0.3762, + "step": 46870 + }, + { + "epoch": 919.22, + "learning_rate": 2.135447221213013e-05, + "loss": 0.3835, + "step": 46880 + }, + { + "epoch": 919.41, + "learning_rate": 2.1326158521580874e-05, + "loss": 0.3842, + "step": 46890 + }, + { + "epoch": 919.61, + "learning_rate": 2.129786050259632e-05, + "loss": 0.3819, + "step": 46900 + }, + { + "epoch": 919.8, + "learning_rate": 2.1269578163438812e-05, + "loss": 0.3833, + "step": 46910 + }, + { + "epoch": 920.0, + "learning_rate": 2.1241311512366167e-05, + "loss": 0.3853, + "step": 46920 + }, + { + "epoch": 920.0, + "eval_loss": 0.3859827518463135, + "eval_runtime": 2.2216, + "eval_samples_per_second": 1025.821, + "eval_steps_per_second": 4.051, + "step": 46920 + }, + { + "epoch": 920.2, + "learning_rate": 2.1213060557631614e-05, + "loss": 0.3855, + "step": 46930 + }, + { + "epoch": 920.39, + "learning_rate": 2.1184825307483818e-05, + "loss": 0.3855, + "step": 46940 + }, + { + "epoch": 920.59, + "learning_rate": 2.115660577016686e-05, + "loss": 0.3826, + "step": 46950 + }, + { + "epoch": 920.78, + "learning_rate": 2.1128401953920172e-05, + "loss": 0.3822, + "step": 46960 + }, + { + "epoch": 920.98, + "learning_rate": 2.1100213866978683e-05, + "loss": 0.3849, + "step": 46970 + }, + { + "epoch": 921.0, + "eval_loss": 0.38799986243247986, + "eval_runtime": 2.175, + "eval_samples_per_second": 1047.835, + "eval_steps_per_second": 4.138, + "step": 46971 + }, + { + "epoch": 921.18, + "learning_rate": 2.1072041517572635e-05, + "loss": 0.3817, + "step": 46980 + }, + { + "epoch": 921.37, + "learning_rate": 2.1043884913927757e-05, + "loss": 0.3842, + "step": 46990 + }, + { + "epoch": 921.57, + "learning_rate": 2.1015744064265165e-05, + "loss": 0.3846, + "step": 47000 + }, + { + "epoch": 921.76, + "learning_rate": 2.098761897680132e-05, + "loss": 0.3848, + "step": 47010 + }, + { + "epoch": 921.96, + "learning_rate": 2.095950965974817e-05, + "loss": 0.3771, + "step": 47020 + }, + { + "epoch": 922.0, + "eval_loss": 0.3890962600708008, + "eval_runtime": 2.3192, + "eval_samples_per_second": 982.67, + "eval_steps_per_second": 3.881, + "step": 47022 + }, + { + "epoch": 922.16, + "learning_rate": 2.0931416121312948e-05, + "loss": 0.3789, + "step": 47030 + }, + { + "epoch": 922.35, + "learning_rate": 2.0903338369698376e-05, + "loss": 0.3837, + "step": 47040 + }, + { + "epoch": 922.55, + "learning_rate": 2.0875276413102553e-05, + "loss": 0.3806, + "step": 47050 + }, + { + "epoch": 922.75, + "learning_rate": 2.084723025971889e-05, + "loss": 0.3833, + "step": 47060 + }, + { + "epoch": 922.94, + "learning_rate": 2.0819199917736294e-05, + "loss": 0.3815, + "step": 47070 + }, + { + "epoch": 923.0, + "eval_loss": 0.3886687457561493, + "eval_runtime": 2.278, + "eval_samples_per_second": 1000.437, + "eval_steps_per_second": 3.951, + "step": 47073 + }, + { + "epoch": 923.14, + "learning_rate": 2.0791185395338944e-05, + "loss": 0.3829, + "step": 47080 + }, + { + "epoch": 923.33, + "learning_rate": 2.076318670070649e-05, + "loss": 0.3815, + "step": 47090 + }, + { + "epoch": 923.53, + "learning_rate": 2.0735203842013924e-05, + "loss": 0.384, + "step": 47100 + }, + { + "epoch": 923.73, + "learning_rate": 2.0707236827431635e-05, + "loss": 0.3862, + "step": 47110 + }, + { + "epoch": 923.92, + "learning_rate": 2.0679285665125343e-05, + "loss": 0.3827, + "step": 47120 + }, + { + "epoch": 924.0, + "eval_loss": 0.39016667008399963, + "eval_runtime": 2.192, + "eval_samples_per_second": 1039.686, + "eval_steps_per_second": 4.106, + "step": 47124 + }, + { + "epoch": 924.12, + "learning_rate": 2.0651350363256144e-05, + "loss": 0.3817, + "step": 47130 + }, + { + "epoch": 924.31, + "learning_rate": 2.062343092998055e-05, + "loss": 0.379, + "step": 47140 + }, + { + "epoch": 924.51, + "learning_rate": 2.0595527373450406e-05, + "loss": 0.3816, + "step": 47150 + }, + { + "epoch": 924.71, + "learning_rate": 2.0567639701812956e-05, + "loss": 0.3834, + "step": 47160 + }, + { + "epoch": 924.9, + "learning_rate": 2.0539767923210733e-05, + "loss": 0.3828, + "step": 47170 + }, + { + "epoch": 925.0, + "eval_loss": 0.39003047347068787, + "eval_runtime": 2.2931, + "eval_samples_per_second": 993.858, + "eval_steps_per_second": 3.925, + "step": 47175 + }, + { + "epoch": 925.1, + "learning_rate": 2.0511912045781716e-05, + "loss": 0.3807, + "step": 47180 + }, + { + "epoch": 925.29, + "learning_rate": 2.0484072077659158e-05, + "loss": 0.3796, + "step": 47190 + }, + { + "epoch": 925.49, + "learning_rate": 2.045624802697173e-05, + "loss": 0.3844, + "step": 47200 + }, + { + "epoch": 925.69, + "learning_rate": 2.0428439901843452e-05, + "loss": 0.3805, + "step": 47210 + }, + { + "epoch": 925.88, + "learning_rate": 2.0400647710393635e-05, + "loss": 0.3861, + "step": 47220 + }, + { + "epoch": 926.0, + "eval_loss": 0.39150747656822205, + "eval_runtime": 2.2942, + "eval_samples_per_second": 993.379, + "eval_steps_per_second": 3.923, + "step": 47226 + }, + { + "epoch": 926.08, + "learning_rate": 2.037287146073703e-05, + "loss": 0.3812, + "step": 47230 + }, + { + "epoch": 926.27, + "learning_rate": 2.0345111160983632e-05, + "loss": 0.3801, + "step": 47240 + }, + { + "epoch": 926.47, + "learning_rate": 2.0317366819238855e-05, + "loss": 0.3837, + "step": 47250 + }, + { + "epoch": 926.67, + "learning_rate": 2.0289638443603447e-05, + "loss": 0.3836, + "step": 47260 + }, + { + "epoch": 926.86, + "learning_rate": 2.0261926042173433e-05, + "loss": 0.383, + "step": 47270 + }, + { + "epoch": 927.0, + "eval_loss": 0.39107587933540344, + "eval_runtime": 2.2238, + "eval_samples_per_second": 1024.819, + "eval_steps_per_second": 4.047, + "step": 47277 + }, + { + "epoch": 927.06, + "learning_rate": 2.023422962304026e-05, + "loss": 0.3848, + "step": 47280 + }, + { + "epoch": 927.25, + "learning_rate": 2.0206549194290613e-05, + "loss": 0.3828, + "step": 47290 + }, + { + "epoch": 927.45, + "learning_rate": 2.0178884764006595e-05, + "loss": 0.3804, + "step": 47300 + }, + { + "epoch": 927.65, + "learning_rate": 2.0151236340265593e-05, + "loss": 0.3857, + "step": 47310 + }, + { + "epoch": 927.84, + "learning_rate": 2.0123603931140354e-05, + "loss": 0.3785, + "step": 47320 + }, + { + "epoch": 928.0, + "eval_loss": 0.3836647570133209, + "eval_runtime": 2.3837, + "eval_samples_per_second": 956.084, + "eval_steps_per_second": 3.776, + "step": 47328 + }, + { + "epoch": 928.04, + "learning_rate": 2.0095987544698916e-05, + "loss": 0.3847, + "step": 47330 + }, + { + "epoch": 928.24, + "learning_rate": 2.0068387189004604e-05, + "loss": 0.3833, + "step": 47340 + }, + { + "epoch": 928.43, + "learning_rate": 2.004080287211614e-05, + "loss": 0.3813, + "step": 47350 + }, + { + "epoch": 928.63, + "learning_rate": 2.0013234602087526e-05, + "loss": 0.3792, + "step": 47360 + }, + { + "epoch": 928.82, + "learning_rate": 1.998568238696811e-05, + "loss": 0.3825, + "step": 47370 + }, + { + "epoch": 929.0, + "eval_loss": 0.3878856301307678, + "eval_runtime": 2.2175, + "eval_samples_per_second": 1027.725, + "eval_steps_per_second": 4.059, + "step": 47379 + }, + { + "epoch": 929.02, + "learning_rate": 1.9958146234802504e-05, + "loss": 0.3804, + "step": 47380 + }, + { + "epoch": 929.22, + "learning_rate": 1.9930626153630614e-05, + "loss": 0.3808, + "step": 47390 + }, + { + "epoch": 929.41, + "learning_rate": 1.9903122151487725e-05, + "loss": 0.3789, + "step": 47400 + }, + { + "epoch": 929.61, + "learning_rate": 1.9875634236404388e-05, + "loss": 0.382, + "step": 47410 + }, + { + "epoch": 929.8, + "learning_rate": 1.984816241640648e-05, + "loss": 0.3786, + "step": 47420 + }, + { + "epoch": 930.0, + "learning_rate": 1.982070669951513e-05, + "loss": 0.3793, + "step": 47430 + }, + { + "epoch": 930.0, + "eval_loss": 0.39214441180229187, + "eval_runtime": 2.3109, + "eval_samples_per_second": 986.177, + "eval_steps_per_second": 3.895, + "step": 47430 + }, + { + "epoch": 930.2, + "learning_rate": 1.9793267093746814e-05, + "loss": 0.3809, + "step": 47440 + }, + { + "epoch": 930.39, + "learning_rate": 1.976584360711326e-05, + "loss": 0.3851, + "step": 47450 + }, + { + "epoch": 930.59, + "learning_rate": 1.9738436247621536e-05, + "loss": 0.3829, + "step": 47460 + }, + { + "epoch": 930.78, + "learning_rate": 1.971104502327399e-05, + "loss": 0.3799, + "step": 47470 + }, + { + "epoch": 930.98, + "learning_rate": 1.968366994206822e-05, + "loss": 0.3836, + "step": 47480 + }, + { + "epoch": 931.0, + "eval_loss": 0.38933873176574707, + "eval_runtime": 2.3912, + "eval_samples_per_second": 953.091, + "eval_steps_per_second": 3.764, + "step": 47481 + }, + { + "epoch": 931.18, + "learning_rate": 1.9656311011997168e-05, + "loss": 0.3799, + "step": 47490 + }, + { + "epoch": 931.37, + "learning_rate": 1.9628968241049e-05, + "loss": 0.3838, + "step": 47500 + }, + { + "epoch": 931.57, + "learning_rate": 1.9601641637207204e-05, + "loss": 0.3878, + "step": 47510 + }, + { + "epoch": 931.76, + "learning_rate": 1.9574331208450575e-05, + "loss": 0.3797, + "step": 47520 + }, + { + "epoch": 931.96, + "learning_rate": 1.9547036962753097e-05, + "loss": 0.3858, + "step": 47530 + }, + { + "epoch": 932.0, + "eval_loss": 0.38738271594047546, + "eval_runtime": 2.2401, + "eval_samples_per_second": 1017.363, + "eval_steps_per_second": 4.018, + "step": 47532 + }, + { + "epoch": 932.16, + "learning_rate": 1.9519758908084132e-05, + "loss": 0.3813, + "step": 47540 + }, + { + "epoch": 932.35, + "learning_rate": 1.9492497052408204e-05, + "loss": 0.3814, + "step": 47550 + }, + { + "epoch": 932.55, + "learning_rate": 1.9465251403685207e-05, + "loss": 0.3827, + "step": 47560 + }, + { + "epoch": 932.75, + "learning_rate": 1.9438021969870248e-05, + "loss": 0.3792, + "step": 47570 + }, + { + "epoch": 932.94, + "learning_rate": 1.9410808758913747e-05, + "loss": 0.387, + "step": 47580 + }, + { + "epoch": 933.0, + "eval_loss": 0.3881475627422333, + "eval_runtime": 2.3304, + "eval_samples_per_second": 977.924, + "eval_steps_per_second": 3.862, + "step": 47583 + }, + { + "epoch": 933.14, + "learning_rate": 1.938361177876133e-05, + "loss": 0.3814, + "step": 47590 + }, + { + "epoch": 933.33, + "learning_rate": 1.935643103735389e-05, + "loss": 0.3805, + "step": 47600 + }, + { + "epoch": 933.53, + "learning_rate": 1.9329266542627614e-05, + "loss": 0.3807, + "step": 47610 + }, + { + "epoch": 933.73, + "learning_rate": 1.9302118302513926e-05, + "loss": 0.3796, + "step": 47620 + }, + { + "epoch": 933.92, + "learning_rate": 1.927498632493953e-05, + "loss": 0.3855, + "step": 47630 + }, + { + "epoch": 934.0, + "eval_loss": 0.3862844705581665, + "eval_runtime": 2.2897, + "eval_samples_per_second": 995.32, + "eval_steps_per_second": 3.931, + "step": 47634 + }, + { + "epoch": 934.12, + "learning_rate": 1.9247870617826323e-05, + "loss": 0.3807, + "step": 47640 + }, + { + "epoch": 934.31, + "learning_rate": 1.9220771189091515e-05, + "loss": 0.3788, + "step": 47650 + }, + { + "epoch": 934.51, + "learning_rate": 1.919368804664751e-05, + "loss": 0.3823, + "step": 47660 + }, + { + "epoch": 934.71, + "learning_rate": 1.9166621198401992e-05, + "loss": 0.3811, + "step": 47670 + }, + { + "epoch": 934.9, + "learning_rate": 1.9139570652257897e-05, + "loss": 0.3813, + "step": 47680 + }, + { + "epoch": 935.0, + "eval_loss": 0.38326093554496765, + "eval_runtime": 2.2607, + "eval_samples_per_second": 1008.081, + "eval_steps_per_second": 3.981, + "step": 47685 + }, + { + "epoch": 935.1, + "learning_rate": 1.911253641611334e-05, + "loss": 0.379, + "step": 47690 + }, + { + "epoch": 935.29, + "learning_rate": 1.9085518497861766e-05, + "loss": 0.3842, + "step": 47700 + }, + { + "epoch": 935.49, + "learning_rate": 1.9058516905391757e-05, + "loss": 0.378, + "step": 47710 + }, + { + "epoch": 935.69, + "learning_rate": 1.9031531646587185e-05, + "loss": 0.3811, + "step": 47720 + }, + { + "epoch": 935.88, + "learning_rate": 1.9004562729327182e-05, + "loss": 0.3787, + "step": 47730 + }, + { + "epoch": 936.0, + "eval_loss": 0.387604683637619, + "eval_runtime": 2.2353, + "eval_samples_per_second": 1019.552, + "eval_steps_per_second": 4.026, + "step": 47736 + }, + { + "epoch": 936.08, + "learning_rate": 1.897761016148602e-05, + "loss": 0.3787, + "step": 47740 + }, + { + "epoch": 936.27, + "learning_rate": 1.8950673950933296e-05, + "loss": 0.3826, + "step": 47750 + }, + { + "epoch": 936.47, + "learning_rate": 1.8923754105533733e-05, + "loss": 0.3797, + "step": 47760 + }, + { + "epoch": 936.67, + "learning_rate": 1.889685063314734e-05, + "loss": 0.3848, + "step": 47770 + }, + { + "epoch": 936.86, + "learning_rate": 1.8869963541629353e-05, + "loss": 0.3834, + "step": 47780 + }, + { + "epoch": 937.0, + "eval_loss": 0.38703593611717224, + "eval_runtime": 2.28, + "eval_samples_per_second": 999.577, + "eval_steps_per_second": 3.947, + "step": 47787 + }, + { + "epoch": 937.06, + "learning_rate": 1.8843092838830206e-05, + "loss": 0.381, + "step": 47790 + }, + { + "epoch": 937.25, + "learning_rate": 1.8816238532595532e-05, + "loss": 0.3802, + "step": 47800 + }, + { + "epoch": 937.45, + "learning_rate": 1.8789400630766168e-05, + "loss": 0.3742, + "step": 47810 + }, + { + "epoch": 937.65, + "learning_rate": 1.8762579141178198e-05, + "loss": 0.3801, + "step": 47820 + }, + { + "epoch": 937.84, + "learning_rate": 1.87357740716629e-05, + "loss": 0.3807, + "step": 47830 + }, + { + "epoch": 938.0, + "eval_loss": 0.3838607668876648, + "eval_runtime": 2.2246, + "eval_samples_per_second": 1024.454, + "eval_steps_per_second": 4.046, + "step": 47838 + }, + { + "epoch": 938.04, + "learning_rate": 1.8708985430046785e-05, + "loss": 0.3805, + "step": 47840 + }, + { + "epoch": 938.24, + "learning_rate": 1.868221322415149e-05, + "loss": 0.3789, + "step": 47850 + }, + { + "epoch": 938.43, + "learning_rate": 1.8655457461793947e-05, + "loss": 0.3759, + "step": 47860 + }, + { + "epoch": 938.63, + "learning_rate": 1.8628718150786196e-05, + "loss": 0.3823, + "step": 47870 + }, + { + "epoch": 938.82, + "learning_rate": 1.8601995298935548e-05, + "loss": 0.3788, + "step": 47880 + }, + { + "epoch": 939.0, + "eval_loss": 0.3863469660282135, + "eval_runtime": 2.3287, + "eval_samples_per_second": 978.651, + "eval_steps_per_second": 3.865, + "step": 47889 + }, + { + "epoch": 939.02, + "learning_rate": 1.8575288914044497e-05, + "loss": 0.3787, + "step": 47890 + }, + { + "epoch": 939.22, + "learning_rate": 1.8548599003910664e-05, + "loss": 0.3816, + "step": 47900 + }, + { + "epoch": 939.41, + "learning_rate": 1.8521925576326955e-05, + "loss": 0.3785, + "step": 47910 + }, + { + "epoch": 939.61, + "learning_rate": 1.8495268639081373e-05, + "loss": 0.384, + "step": 47920 + }, + { + "epoch": 939.8, + "learning_rate": 1.846862819995718e-05, + "loss": 0.3861, + "step": 47930 + }, + { + "epoch": 940.0, + "learning_rate": 1.8442004266732787e-05, + "loss": 0.3788, + "step": 47940 + }, + { + "epoch": 940.0, + "eval_loss": 0.38470685482025146, + "eval_runtime": 2.281, + "eval_samples_per_second": 999.111, + "eval_steps_per_second": 3.946, + "step": 47940 + }, + { + "epoch": 940.2, + "learning_rate": 1.8415396847181766e-05, + "loss": 0.3786, + "step": 47950 + }, + { + "epoch": 940.39, + "learning_rate": 1.838880594907294e-05, + "loss": 0.3824, + "step": 47960 + }, + { + "epoch": 940.59, + "learning_rate": 1.8362231580170186e-05, + "loss": 0.3843, + "step": 47970 + }, + { + "epoch": 940.78, + "learning_rate": 1.8335673748232674e-05, + "loss": 0.3859, + "step": 47980 + }, + { + "epoch": 940.98, + "learning_rate": 1.8309132461014688e-05, + "loss": 0.3819, + "step": 47990 + }, + { + "epoch": 941.0, + "eval_loss": 0.3876339793205261, + "eval_runtime": 2.2305, + "eval_samples_per_second": 1021.726, + "eval_steps_per_second": 4.035, + "step": 47991 + }, + { + "epoch": 941.18, + "learning_rate": 1.8282607726265716e-05, + "loss": 0.3788, + "step": 48000 + }, + { + "epoch": 941.37, + "learning_rate": 1.825609955173037e-05, + "loss": 0.3834, + "step": 48010 + }, + { + "epoch": 941.57, + "learning_rate": 1.822960794514842e-05, + "loss": 0.3777, + "step": 48020 + }, + { + "epoch": 941.76, + "learning_rate": 1.8203132914254847e-05, + "loss": 0.3827, + "step": 48030 + }, + { + "epoch": 941.96, + "learning_rate": 1.817667446677977e-05, + "loss": 0.3814, + "step": 48040 + }, + { + "epoch": 942.0, + "eval_loss": 0.38454288244247437, + "eval_runtime": 2.3427, + "eval_samples_per_second": 972.806, + "eval_steps_per_second": 3.842, + "step": 48042 + }, + { + "epoch": 942.16, + "learning_rate": 1.8150232610448492e-05, + "loss": 0.3836, + "step": 48050 + }, + { + "epoch": 942.35, + "learning_rate": 1.812380735298139e-05, + "loss": 0.3805, + "step": 48060 + }, + { + "epoch": 942.55, + "learning_rate": 1.8097398702094106e-05, + "loss": 0.3773, + "step": 48070 + }, + { + "epoch": 942.75, + "learning_rate": 1.8071006665497327e-05, + "loss": 0.3781, + "step": 48080 + }, + { + "epoch": 942.94, + "learning_rate": 1.8044631250896958e-05, + "loss": 0.3817, + "step": 48090 + }, + { + "epoch": 943.0, + "eval_loss": 0.3829639256000519, + "eval_runtime": 2.3281, + "eval_samples_per_second": 978.891, + "eval_steps_per_second": 3.866, + "step": 48093 + }, + { + "epoch": 943.14, + "learning_rate": 1.8018272465994058e-05, + "loss": 0.3792, + "step": 48100 + }, + { + "epoch": 943.33, + "learning_rate": 1.7991930318484763e-05, + "loss": 0.3781, + "step": 48110 + }, + { + "epoch": 943.53, + "learning_rate": 1.7965604816060436e-05, + "loss": 0.3822, + "step": 48120 + }, + { + "epoch": 943.73, + "learning_rate": 1.7939295966407478e-05, + "loss": 0.3778, + "step": 48130 + }, + { + "epoch": 943.92, + "learning_rate": 1.7913003777207533e-05, + "loss": 0.3838, + "step": 48140 + }, + { + "epoch": 944.0, + "eval_loss": 0.388039231300354, + "eval_runtime": 2.3308, + "eval_samples_per_second": 977.761, + "eval_steps_per_second": 3.861, + "step": 48144 + }, + { + "epoch": 944.12, + "learning_rate": 1.7886728256137345e-05, + "loss": 0.3834, + "step": 48150 + }, + { + "epoch": 944.31, + "learning_rate": 1.786046941086873e-05, + "loss": 0.3813, + "step": 48160 + }, + { + "epoch": 944.51, + "learning_rate": 1.783422724906873e-05, + "loss": 0.3789, + "step": 48170 + }, + { + "epoch": 944.71, + "learning_rate": 1.7808001778399432e-05, + "loss": 0.3765, + "step": 48180 + }, + { + "epoch": 944.9, + "learning_rate": 1.7781793006518112e-05, + "loss": 0.3787, + "step": 48190 + }, + { + "epoch": 945.0, + "eval_loss": 0.3880222737789154, + "eval_runtime": 2.2771, + "eval_samples_per_second": 1000.828, + "eval_steps_per_second": 3.952, + "step": 48195 + }, + { + "epoch": 945.1, + "learning_rate": 1.7755600941077165e-05, + "loss": 0.3771, + "step": 48200 + }, + { + "epoch": 945.29, + "learning_rate": 1.772942558972405e-05, + "loss": 0.3782, + "step": 48210 + }, + { + "epoch": 945.49, + "learning_rate": 1.7703266960101425e-05, + "loss": 0.3793, + "step": 48220 + }, + { + "epoch": 945.69, + "learning_rate": 1.7677125059846983e-05, + "loss": 0.3813, + "step": 48230 + }, + { + "epoch": 945.88, + "learning_rate": 1.7650999896593602e-05, + "loss": 0.3812, + "step": 48240 + }, + { + "epoch": 946.0, + "eval_loss": 0.38842880725860596, + "eval_runtime": 2.3222, + "eval_samples_per_second": 981.39, + "eval_steps_per_second": 3.876, + "step": 48246 + }, + { + "epoch": 946.08, + "learning_rate": 1.7624891477969244e-05, + "loss": 0.3814, + "step": 48250 + }, + { + "epoch": 946.27, + "learning_rate": 1.7598799811597004e-05, + "loss": 0.3784, + "step": 48260 + }, + { + "epoch": 946.47, + "learning_rate": 1.7572724905095058e-05, + "loss": 0.3796, + "step": 48270 + }, + { + "epoch": 946.67, + "learning_rate": 1.7546666766076655e-05, + "loss": 0.3824, + "step": 48280 + }, + { + "epoch": 946.86, + "learning_rate": 1.7520625402150225e-05, + "loss": 0.3806, + "step": 48290 + }, + { + "epoch": 947.0, + "eval_loss": 0.38914692401885986, + "eval_runtime": 2.3124, + "eval_samples_per_second": 985.566, + "eval_steps_per_second": 3.892, + "step": 48297 + }, + { + "epoch": 947.06, + "learning_rate": 1.7494600820919264e-05, + "loss": 0.3746, + "step": 48300 + }, + { + "epoch": 947.25, + "learning_rate": 1.746859302998239e-05, + "loss": 0.3777, + "step": 48310 + }, + { + "epoch": 947.45, + "learning_rate": 1.7442602036933252e-05, + "loss": 0.3768, + "step": 48320 + }, + { + "epoch": 947.65, + "learning_rate": 1.7416627849360695e-05, + "loss": 0.3803, + "step": 48330 + }, + { + "epoch": 947.84, + "learning_rate": 1.7390670474848538e-05, + "loss": 0.3816, + "step": 48340 + }, + { + "epoch": 948.0, + "eval_loss": 0.3855222165584564, + "eval_runtime": 2.2246, + "eval_samples_per_second": 1024.441, + "eval_steps_per_second": 4.046, + "step": 48348 + }, + { + "epoch": 948.04, + "learning_rate": 1.73647299209758e-05, + "loss": 0.3813, + "step": 48350 + }, + { + "epoch": 948.24, + "learning_rate": 1.7338806195316555e-05, + "loss": 0.3799, + "step": 48360 + }, + { + "epoch": 948.43, + "learning_rate": 1.7312899305439903e-05, + "loss": 0.3819, + "step": 48370 + }, + { + "epoch": 948.63, + "learning_rate": 1.728700925891013e-05, + "loss": 0.3781, + "step": 48380 + }, + { + "epoch": 948.82, + "learning_rate": 1.7261136063286505e-05, + "loss": 0.3813, + "step": 48390 + }, + { + "epoch": 949.0, + "eval_loss": 0.38467514514923096, + "eval_runtime": 2.2999, + "eval_samples_per_second": 990.903, + "eval_steps_per_second": 3.913, + "step": 48399 + }, + { + "epoch": 949.02, + "learning_rate": 1.7235279726123456e-05, + "loss": 0.3769, + "step": 48400 + }, + { + "epoch": 949.22, + "learning_rate": 1.7209440254970467e-05, + "loss": 0.3814, + "step": 48410 + }, + { + "epoch": 949.41, + "learning_rate": 1.7183617657372047e-05, + "loss": 0.3837, + "step": 48420 + }, + { + "epoch": 949.61, + "learning_rate": 1.715781194086786e-05, + "loss": 0.3795, + "step": 48430 + }, + { + "epoch": 949.8, + "learning_rate": 1.713202311299256e-05, + "loss": 0.378, + "step": 48440 + }, + { + "epoch": 950.0, + "learning_rate": 1.7106251181275932e-05, + "loss": 0.3811, + "step": 48450 + }, + { + "epoch": 950.0, + "eval_loss": 0.38474026322364807, + "eval_runtime": 2.3034, + "eval_samples_per_second": 989.396, + "eval_steps_per_second": 3.907, + "step": 48450 + }, + { + "epoch": 950.2, + "learning_rate": 1.7080496153242798e-05, + "loss": 0.3811, + "step": 48460 + }, + { + "epoch": 950.39, + "learning_rate": 1.7054758036413086e-05, + "loss": 0.3793, + "step": 48470 + }, + { + "epoch": 950.59, + "learning_rate": 1.7029036838301716e-05, + "loss": 0.378, + "step": 48480 + }, + { + "epoch": 950.78, + "learning_rate": 1.700333256641869e-05, + "loss": 0.3775, + "step": 48490 + }, + { + "epoch": 950.98, + "learning_rate": 1.6977645228269106e-05, + "loss": 0.3776, + "step": 48500 + }, + { + "epoch": 951.0, + "eval_loss": 0.38311800360679626, + "eval_runtime": 2.3155, + "eval_samples_per_second": 984.221, + "eval_steps_per_second": 3.887, + "step": 48501 + }, + { + "epoch": 951.18, + "learning_rate": 1.6951974831353092e-05, + "loss": 0.3787, + "step": 48510 + }, + { + "epoch": 951.37, + "learning_rate": 1.6926321383165852e-05, + "loss": 0.3782, + "step": 48520 + }, + { + "epoch": 951.57, + "learning_rate": 1.6900684891197576e-05, + "loss": 0.378, + "step": 48530 + }, + { + "epoch": 951.76, + "learning_rate": 1.6875065362933595e-05, + "loss": 0.378, + "step": 48540 + }, + { + "epoch": 951.96, + "learning_rate": 1.684946280585419e-05, + "loss": 0.3794, + "step": 48550 + }, + { + "epoch": 952.0, + "eval_loss": 0.38669443130493164, + "eval_runtime": 2.36, + "eval_samples_per_second": 965.687, + "eval_steps_per_second": 3.814, + "step": 48552 + }, + { + "epoch": 952.16, + "learning_rate": 1.6823877227434774e-05, + "loss": 0.3798, + "step": 48560 + }, + { + "epoch": 952.35, + "learning_rate": 1.6798308635145765e-05, + "loss": 0.3816, + "step": 48570 + }, + { + "epoch": 952.55, + "learning_rate": 1.677275703645259e-05, + "loss": 0.3793, + "step": 48580 + }, + { + "epoch": 952.75, + "learning_rate": 1.674722243881579e-05, + "loss": 0.3783, + "step": 48590 + }, + { + "epoch": 952.94, + "learning_rate": 1.672170484969086e-05, + "loss": 0.3782, + "step": 48600 + }, + { + "epoch": 953.0, + "eval_loss": 0.38120561838150024, + "eval_runtime": 2.3996, + "eval_samples_per_second": 949.743, + "eval_steps_per_second": 3.751, + "step": 48603 + }, + { + "epoch": 953.14, + "learning_rate": 1.6696204276528375e-05, + "loss": 0.3754, + "step": 48610 + }, + { + "epoch": 953.33, + "learning_rate": 1.6670720726773965e-05, + "loss": 0.3837, + "step": 48620 + }, + { + "epoch": 953.53, + "learning_rate": 1.6645254207868203e-05, + "loss": 0.38, + "step": 48630 + }, + { + "epoch": 953.73, + "learning_rate": 1.661980472724681e-05, + "loss": 0.3779, + "step": 48640 + }, + { + "epoch": 953.92, + "learning_rate": 1.6594372292340403e-05, + "loss": 0.3834, + "step": 48650 + }, + { + "epoch": 954.0, + "eval_loss": 0.38515594601631165, + "eval_runtime": 2.3227, + "eval_samples_per_second": 981.17, + "eval_steps_per_second": 3.875, + "step": 48654 + }, + { + "epoch": 954.12, + "learning_rate": 1.6568956910574713e-05, + "loss": 0.3766, + "step": 48660 + }, + { + "epoch": 954.31, + "learning_rate": 1.6543558589370472e-05, + "loss": 0.3815, + "step": 48670 + }, + { + "epoch": 954.51, + "learning_rate": 1.6518177336143434e-05, + "loss": 0.383, + "step": 48680 + }, + { + "epoch": 954.71, + "learning_rate": 1.6492813158304344e-05, + "loss": 0.3795, + "step": 48690 + }, + { + "epoch": 954.9, + "learning_rate": 1.6467466063258956e-05, + "loss": 0.3785, + "step": 48700 + }, + { + "epoch": 955.0, + "eval_loss": 0.3830149173736572, + "eval_runtime": 2.3291, + "eval_samples_per_second": 978.479, + "eval_steps_per_second": 3.864, + "step": 48705 + }, + { + "epoch": 955.1, + "learning_rate": 1.6442136058408073e-05, + "loss": 0.3811, + "step": 48710 + }, + { + "epoch": 955.29, + "learning_rate": 1.6416823151147498e-05, + "loss": 0.3839, + "step": 48720 + }, + { + "epoch": 955.49, + "learning_rate": 1.6391527348868047e-05, + "loss": 0.3788, + "step": 48730 + }, + { + "epoch": 955.69, + "learning_rate": 1.6366248658955496e-05, + "loss": 0.3777, + "step": 48740 + }, + { + "epoch": 955.88, + "learning_rate": 1.6340987088790696e-05, + "loss": 0.3789, + "step": 48750 + }, + { + "epoch": 956.0, + "eval_loss": 0.3851874768733978, + "eval_runtime": 2.2442, + "eval_samples_per_second": 1015.509, + "eval_steps_per_second": 4.01, + "step": 48756 + }, + { + "epoch": 956.08, + "learning_rate": 1.6315742645749423e-05, + "loss": 0.3794, + "step": 48760 + }, + { + "epoch": 956.27, + "learning_rate": 1.6290515337202516e-05, + "loss": 0.3801, + "step": 48770 + }, + { + "epoch": 956.47, + "learning_rate": 1.6265305170515798e-05, + "loss": 0.3787, + "step": 48780 + }, + { + "epoch": 956.67, + "learning_rate": 1.6240112153050038e-05, + "loss": 0.3808, + "step": 48790 + }, + { + "epoch": 956.86, + "learning_rate": 1.6214936292161072e-05, + "loss": 0.3801, + "step": 48800 + }, + { + "epoch": 957.0, + "eval_loss": 0.38819777965545654, + "eval_runtime": 2.2645, + "eval_samples_per_second": 1006.408, + "eval_steps_per_second": 3.974, + "step": 48807 + }, + { + "epoch": 957.06, + "learning_rate": 1.6189777595199663e-05, + "loss": 0.3766, + "step": 48810 + }, + { + "epoch": 957.25, + "learning_rate": 1.6164636069511606e-05, + "loss": 0.3797, + "step": 48820 + }, + { + "epoch": 957.45, + "learning_rate": 1.613951172243767e-05, + "loss": 0.3823, + "step": 48830 + }, + { + "epoch": 957.65, + "learning_rate": 1.6114404561313583e-05, + "loss": 0.3768, + "step": 48840 + }, + { + "epoch": 957.84, + "learning_rate": 1.60893145934701e-05, + "loss": 0.3771, + "step": 48850 + }, + { + "epoch": 958.0, + "eval_loss": 0.38420116901397705, + "eval_runtime": 2.2468, + "eval_samples_per_second": 1014.35, + "eval_steps_per_second": 4.006, + "step": 48858 + }, + { + "epoch": 958.04, + "learning_rate": 1.60642418262329e-05, + "loss": 0.3806, + "step": 48860 + }, + { + "epoch": 958.24, + "learning_rate": 1.6039186266922693e-05, + "loss": 0.3787, + "step": 48870 + }, + { + "epoch": 958.43, + "learning_rate": 1.6014147922855168e-05, + "loss": 0.3784, + "step": 48880 + }, + { + "epoch": 958.63, + "learning_rate": 1.5989126801340914e-05, + "loss": 0.3739, + "step": 48890 + }, + { + "epoch": 958.82, + "learning_rate": 1.5964122909685588e-05, + "loss": 0.3808, + "step": 48900 + }, + { + "epoch": 959.0, + "eval_loss": 0.3839624524116516, + "eval_runtime": 2.2667, + "eval_samples_per_second": 1005.427, + "eval_steps_per_second": 3.971, + "step": 48909 + }, + { + "epoch": 959.02, + "learning_rate": 1.5939136255189715e-05, + "loss": 0.3793, + "step": 48910 + }, + { + "epoch": 959.22, + "learning_rate": 1.5914166845148876e-05, + "loss": 0.3776, + "step": 48920 + }, + { + "epoch": 959.41, + "learning_rate": 1.5889214686853578e-05, + "loss": 0.3781, + "step": 48930 + }, + { + "epoch": 959.61, + "learning_rate": 1.5864279787589306e-05, + "loss": 0.3814, + "step": 48940 + }, + { + "epoch": 959.8, + "learning_rate": 1.5839362154636485e-05, + "loss": 0.3793, + "step": 48950 + }, + { + "epoch": 960.0, + "learning_rate": 1.581446179527049e-05, + "loss": 0.3762, + "step": 48960 + }, + { + "epoch": 960.0, + "eval_loss": 0.3849249482154846, + "eval_runtime": 2.1949, + "eval_samples_per_second": 1038.309, + "eval_steps_per_second": 4.1, + "step": 48960 + }, + { + "epoch": 960.2, + "learning_rate": 1.578957871676168e-05, + "loss": 0.3752, + "step": 48970 + }, + { + "epoch": 960.39, + "learning_rate": 1.5764712926375365e-05, + "loss": 0.3755, + "step": 48980 + }, + { + "epoch": 960.59, + "learning_rate": 1.5739864431371816e-05, + "loss": 0.3833, + "step": 48990 + }, + { + "epoch": 960.78, + "learning_rate": 1.5715033239006214e-05, + "loss": 0.3765, + "step": 49000 + }, + { + "epoch": 960.98, + "learning_rate": 1.5690219356528737e-05, + "loss": 0.3777, + "step": 49010 + }, + { + "epoch": 961.0, + "eval_loss": 0.38420653343200684, + "eval_runtime": 2.4626, + "eval_samples_per_second": 925.439, + "eval_steps_per_second": 3.655, + "step": 49011 + }, + { + "epoch": 961.18, + "learning_rate": 1.566542279118445e-05, + "loss": 0.3807, + "step": 49020 + }, + { + "epoch": 961.37, + "learning_rate": 1.564064355021342e-05, + "loss": 0.3801, + "step": 49030 + }, + { + "epoch": 961.57, + "learning_rate": 1.5615881640850652e-05, + "loss": 0.3761, + "step": 49040 + }, + { + "epoch": 961.76, + "learning_rate": 1.5591137070326027e-05, + "loss": 0.379, + "step": 49050 + }, + { + "epoch": 961.96, + "learning_rate": 1.5566409845864454e-05, + "loss": 0.3781, + "step": 49060 + }, + { + "epoch": 962.0, + "eval_loss": 0.3874445855617523, + "eval_runtime": 2.1852, + "eval_samples_per_second": 1042.935, + "eval_steps_per_second": 4.119, + "step": 49062 + }, + { + "epoch": 962.16, + "learning_rate": 1.554169997468569e-05, + "loss": 0.3793, + "step": 49070 + }, + { + "epoch": 962.35, + "learning_rate": 1.5517007464004482e-05, + "loss": 0.3771, + "step": 49080 + }, + { + "epoch": 962.55, + "learning_rate": 1.5492332321030517e-05, + "loss": 0.3779, + "step": 49090 + }, + { + "epoch": 962.75, + "learning_rate": 1.546767455296834e-05, + "loss": 0.3808, + "step": 49100 + }, + { + "epoch": 962.94, + "learning_rate": 1.5443034167017524e-05, + "loss": 0.3781, + "step": 49110 + }, + { + "epoch": 963.0, + "eval_loss": 0.3837532699108124, + "eval_runtime": 2.345, + "eval_samples_per_second": 971.842, + "eval_steps_per_second": 3.838, + "step": 49113 + }, + { + "epoch": 963.14, + "learning_rate": 1.5418411170372452e-05, + "loss": 0.3771, + "step": 49120 + }, + { + "epoch": 963.33, + "learning_rate": 1.5393805570222524e-05, + "loss": 0.3781, + "step": 49130 + }, + { + "epoch": 963.53, + "learning_rate": 1.5369217373752023e-05, + "loss": 0.3766, + "step": 49140 + }, + { + "epoch": 963.73, + "learning_rate": 1.534464658814017e-05, + "loss": 0.3766, + "step": 49150 + }, + { + "epoch": 963.92, + "learning_rate": 1.532009322056107e-05, + "loss": 0.376, + "step": 49160 + }, + { + "epoch": 964.0, + "eval_loss": 0.3862951695919037, + "eval_runtime": 2.2938, + "eval_samples_per_second": 993.556, + "eval_steps_per_second": 3.924, + "step": 49164 + }, + { + "epoch": 964.12, + "learning_rate": 1.529555727818374e-05, + "loss": 0.3788, + "step": 49170 + }, + { + "epoch": 964.31, + "learning_rate": 1.5271038768172144e-05, + "loss": 0.3788, + "step": 49180 + }, + { + "epoch": 964.51, + "learning_rate": 1.5246537697685143e-05, + "loss": 0.3767, + "step": 49190 + }, + { + "epoch": 964.71, + "learning_rate": 1.5222054073876527e-05, + "loss": 0.3779, + "step": 49200 + }, + { + "epoch": 964.9, + "learning_rate": 1.5197587903894929e-05, + "loss": 0.3777, + "step": 49210 + }, + { + "epoch": 965.0, + "eval_loss": 0.3826569616794586, + "eval_runtime": 2.3906, + "eval_samples_per_second": 953.331, + "eval_steps_per_second": 3.765, + "step": 49215 + }, + { + "epoch": 965.1, + "learning_rate": 1.5173139194883948e-05, + "loss": 0.3756, + "step": 49220 + }, + { + "epoch": 965.29, + "learning_rate": 1.5148707953982036e-05, + "loss": 0.3765, + "step": 49230 + }, + { + "epoch": 965.49, + "learning_rate": 1.5124294188322594e-05, + "loss": 0.3776, + "step": 49240 + }, + { + "epoch": 965.69, + "learning_rate": 1.5099897905033904e-05, + "loss": 0.3812, + "step": 49250 + }, + { + "epoch": 965.88, + "learning_rate": 1.5075519111239106e-05, + "loss": 0.3808, + "step": 49260 + }, + { + "epoch": 966.0, + "eval_loss": 0.38532519340515137, + "eval_runtime": 2.2587, + "eval_samples_per_second": 1008.974, + "eval_steps_per_second": 3.985, + "step": 49266 + }, + { + "epoch": 966.08, + "learning_rate": 1.5051157814056303e-05, + "loss": 0.3771, + "step": 49270 + }, + { + "epoch": 966.27, + "learning_rate": 1.502681402059841e-05, + "loss": 0.3805, + "step": 49280 + }, + { + "epoch": 966.47, + "learning_rate": 1.5002487737973293e-05, + "loss": 0.3763, + "step": 49290 + }, + { + "epoch": 966.67, + "learning_rate": 1.4978178973283703e-05, + "loss": 0.3812, + "step": 49300 + }, + { + "epoch": 966.86, + "learning_rate": 1.4953887733627213e-05, + "loss": 0.3835, + "step": 49310 + }, + { + "epoch": 967.0, + "eval_loss": 0.3868524432182312, + "eval_runtime": 2.3298, + "eval_samples_per_second": 978.194, + "eval_steps_per_second": 3.863, + "step": 49317 + }, + { + "epoch": 967.06, + "learning_rate": 1.4929614026096365e-05, + "loss": 0.3793, + "step": 49320 + }, + { + "epoch": 967.25, + "learning_rate": 1.4905357857778499e-05, + "loss": 0.3769, + "step": 49330 + }, + { + "epoch": 967.45, + "learning_rate": 1.488111923575591e-05, + "loss": 0.3771, + "step": 49340 + }, + { + "epoch": 967.65, + "learning_rate": 1.4856898167105707e-05, + "loss": 0.381, + "step": 49350 + }, + { + "epoch": 967.84, + "learning_rate": 1.4832694658899946e-05, + "loss": 0.3801, + "step": 49360 + }, + { + "epoch": 968.0, + "eval_loss": 0.3859129548072815, + "eval_runtime": 2.3191, + "eval_samples_per_second": 982.701, + "eval_steps_per_second": 3.881, + "step": 49368 + }, + { + "epoch": 968.04, + "learning_rate": 1.480850871820549e-05, + "loss": 0.3765, + "step": 49370 + }, + { + "epoch": 968.24, + "learning_rate": 1.4784340352084062e-05, + "loss": 0.3785, + "step": 49380 + }, + { + "epoch": 968.43, + "learning_rate": 1.4760189567592304e-05, + "loss": 0.3763, + "step": 49390 + }, + { + "epoch": 968.63, + "learning_rate": 1.4736056371781723e-05, + "loss": 0.3771, + "step": 49400 + }, + { + "epoch": 968.82, + "learning_rate": 1.4711940771698686e-05, + "loss": 0.3839, + "step": 49410 + }, + { + "epoch": 969.0, + "eval_loss": 0.38414880633354187, + "eval_runtime": 2.2645, + "eval_samples_per_second": 1006.397, + "eval_steps_per_second": 3.974, + "step": 49419 + }, + { + "epoch": 969.02, + "learning_rate": 1.4687842774384365e-05, + "loss": 0.3796, + "step": 49420 + }, + { + "epoch": 969.22, + "learning_rate": 1.4663762386874883e-05, + "loss": 0.3757, + "step": 49430 + }, + { + "epoch": 969.41, + "learning_rate": 1.4639699616201133e-05, + "loss": 0.3808, + "step": 49440 + }, + { + "epoch": 969.61, + "learning_rate": 1.4615654469388938e-05, + "loss": 0.3708, + "step": 49450 + }, + { + "epoch": 969.8, + "learning_rate": 1.4591626953458955e-05, + "loss": 0.38, + "step": 49460 + }, + { + "epoch": 970.0, + "learning_rate": 1.4567617075426638e-05, + "loss": 0.3768, + "step": 49470 + }, + { + "epoch": 970.0, + "eval_loss": 0.38485315442085266, + "eval_runtime": 2.244, + "eval_samples_per_second": 1015.601, + "eval_steps_per_second": 4.011, + "step": 49470 + }, + { + "epoch": 970.2, + "learning_rate": 1.4543624842302382e-05, + "loss": 0.3777, + "step": 49480 + }, + { + "epoch": 970.39, + "learning_rate": 1.4519650261091347e-05, + "loss": 0.376, + "step": 49490 + }, + { + "epoch": 970.59, + "learning_rate": 1.4495693338793595e-05, + "loss": 0.3779, + "step": 49500 + }, + { + "epoch": 970.78, + "learning_rate": 1.447175408240403e-05, + "loss": 0.3753, + "step": 49510 + }, + { + "epoch": 970.98, + "learning_rate": 1.4447832498912335e-05, + "loss": 0.3797, + "step": 49520 + }, + { + "epoch": 971.0, + "eval_loss": 0.38439249992370605, + "eval_runtime": 2.2074, + "eval_samples_per_second": 1032.431, + "eval_steps_per_second": 4.077, + "step": 49521 + }, + { + "epoch": 971.18, + "learning_rate": 1.442392859530313e-05, + "loss": 0.3765, + "step": 49530 + }, + { + "epoch": 971.37, + "learning_rate": 1.4400042378555784e-05, + "loss": 0.3805, + "step": 49540 + }, + { + "epoch": 971.57, + "learning_rate": 1.4376173855644544e-05, + "loss": 0.3766, + "step": 49550 + }, + { + "epoch": 971.76, + "learning_rate": 1.4352323033538523e-05, + "loss": 0.3776, + "step": 49560 + }, + { + "epoch": 971.96, + "learning_rate": 1.432848991920158e-05, + "loss": 0.3763, + "step": 49570 + }, + { + "epoch": 972.0, + "eval_loss": 0.38550448417663574, + "eval_runtime": 2.3189, + "eval_samples_per_second": 982.804, + "eval_steps_per_second": 3.881, + "step": 49572 + }, + { + "epoch": 972.16, + "learning_rate": 1.4304674519592496e-05, + "loss": 0.3719, + "step": 49580 + }, + { + "epoch": 972.35, + "learning_rate": 1.4280876841664793e-05, + "loss": 0.3774, + "step": 49590 + }, + { + "epoch": 972.55, + "learning_rate": 1.425709689236688e-05, + "loss": 0.3792, + "step": 49600 + }, + { + "epoch": 972.75, + "learning_rate": 1.4233334678641984e-05, + "loss": 0.3802, + "step": 49610 + }, + { + "epoch": 972.94, + "learning_rate": 1.4209590207428148e-05, + "loss": 0.3788, + "step": 49620 + }, + { + "epoch": 973.0, + "eval_loss": 0.3832288980484009, + "eval_runtime": 2.3426, + "eval_samples_per_second": 972.871, + "eval_steps_per_second": 3.842, + "step": 49623 + }, + { + "epoch": 973.14, + "learning_rate": 1.418586348565821e-05, + "loss": 0.3797, + "step": 49630 + }, + { + "epoch": 973.33, + "learning_rate": 1.4162154520259839e-05, + "loss": 0.376, + "step": 49640 + }, + { + "epoch": 973.53, + "learning_rate": 1.4138463318155527e-05, + "loss": 0.3818, + "step": 49650 + }, + { + "epoch": 973.73, + "learning_rate": 1.4114789886262576e-05, + "loss": 0.3774, + "step": 49660 + }, + { + "epoch": 973.92, + "learning_rate": 1.4091134231493131e-05, + "loss": 0.374, + "step": 49670 + }, + { + "epoch": 974.0, + "eval_loss": 0.3858170211315155, + "eval_runtime": 2.3194, + "eval_samples_per_second": 982.599, + "eval_steps_per_second": 3.88, + "step": 49674 + }, + { + "epoch": 974.12, + "learning_rate": 1.4067496360754078e-05, + "loss": 0.3829, + "step": 49680 + }, + { + "epoch": 974.31, + "learning_rate": 1.4043876280947175e-05, + "loss": 0.3772, + "step": 49690 + }, + { + "epoch": 974.51, + "learning_rate": 1.4020273998968918e-05, + "loss": 0.376, + "step": 49700 + }, + { + "epoch": 974.71, + "learning_rate": 1.3996689521710683e-05, + "loss": 0.373, + "step": 49710 + }, + { + "epoch": 974.9, + "learning_rate": 1.3973122856058614e-05, + "loss": 0.3785, + "step": 49720 + }, + { + "epoch": 975.0, + "eval_loss": 0.38047897815704346, + "eval_runtime": 2.3373, + "eval_samples_per_second": 975.043, + "eval_steps_per_second": 3.851, + "step": 49725 + }, + { + "epoch": 975.1, + "learning_rate": 1.3949574008893629e-05, + "loss": 0.3775, + "step": 49730 + }, + { + "epoch": 975.29, + "learning_rate": 1.392604298709149e-05, + "loss": 0.3805, + "step": 49740 + }, + { + "epoch": 975.49, + "learning_rate": 1.39025297975227e-05, + "loss": 0.3765, + "step": 49750 + }, + { + "epoch": 975.69, + "learning_rate": 1.3879034447052597e-05, + "loss": 0.3808, + "step": 49760 + }, + { + "epoch": 975.88, + "learning_rate": 1.3855556942541333e-05, + "loss": 0.3752, + "step": 49770 + }, + { + "epoch": 976.0, + "eval_loss": 0.38550615310668945, + "eval_runtime": 2.3057, + "eval_samples_per_second": 988.423, + "eval_steps_per_second": 3.903, + "step": 49776 + }, + { + "epoch": 976.08, + "learning_rate": 1.383209729084377e-05, + "loss": 0.3783, + "step": 49780 + }, + { + "epoch": 976.27, + "learning_rate": 1.3808655498809638e-05, + "loss": 0.379, + "step": 49790 + }, + { + "epoch": 976.47, + "learning_rate": 1.3785231573283379e-05, + "loss": 0.3753, + "step": 49800 + }, + { + "epoch": 976.67, + "learning_rate": 1.376182552110428e-05, + "loss": 0.3774, + "step": 49810 + }, + { + "epoch": 976.86, + "learning_rate": 1.3738437349106384e-05, + "loss": 0.3752, + "step": 49820 + }, + { + "epoch": 977.0, + "eval_loss": 0.38268429040908813, + "eval_runtime": 2.3909, + "eval_samples_per_second": 953.211, + "eval_steps_per_second": 3.764, + "step": 49827 + }, + { + "epoch": 977.06, + "learning_rate": 1.3715067064118537e-05, + "loss": 0.381, + "step": 49830 + }, + { + "epoch": 977.25, + "learning_rate": 1.3691714672964322e-05, + "loss": 0.3796, + "step": 49840 + }, + { + "epoch": 977.45, + "learning_rate": 1.36683801824621e-05, + "loss": 0.3758, + "step": 49850 + }, + { + "epoch": 977.65, + "learning_rate": 1.364506359942503e-05, + "loss": 0.3765, + "step": 49860 + }, + { + "epoch": 977.84, + "learning_rate": 1.362176493066104e-05, + "loss": 0.3779, + "step": 49870 + }, + { + "epoch": 978.0, + "eval_loss": 0.3826252222061157, + "eval_runtime": 2.2244, + "eval_samples_per_second": 1024.528, + "eval_steps_per_second": 4.046, + "step": 49878 + }, + { + "epoch": 978.04, + "learning_rate": 1.3598484182972844e-05, + "loss": 0.3764, + "step": 49880 + }, + { + "epoch": 978.24, + "learning_rate": 1.3575221363157866e-05, + "loss": 0.377, + "step": 49890 + }, + { + "epoch": 978.43, + "learning_rate": 1.3551976478008356e-05, + "loss": 0.3756, + "step": 49900 + }, + { + "epoch": 978.63, + "learning_rate": 1.3528749534311279e-05, + "loss": 0.3736, + "step": 49910 + }, + { + "epoch": 978.82, + "learning_rate": 1.3505540538848384e-05, + "loss": 0.3769, + "step": 49920 + }, + { + "epoch": 979.0, + "eval_loss": 0.3824384808540344, + "eval_runtime": 2.2551, + "eval_samples_per_second": 1010.599, + "eval_steps_per_second": 3.991, + "step": 49929 + }, + { + "epoch": 979.02, + "learning_rate": 1.3482349498396224e-05, + "loss": 0.3763, + "step": 49930 + }, + { + "epoch": 979.22, + "learning_rate": 1.3459176419726004e-05, + "loss": 0.3792, + "step": 49940 + }, + { + "epoch": 979.41, + "learning_rate": 1.3436021309603806e-05, + "loss": 0.3759, + "step": 49950 + }, + { + "epoch": 979.61, + "learning_rate": 1.341288417479035e-05, + "loss": 0.3765, + "step": 49960 + }, + { + "epoch": 979.8, + "learning_rate": 1.3389765022041191e-05, + "loss": 0.3754, + "step": 49970 + }, + { + "epoch": 980.0, + "learning_rate": 1.3366663858106618e-05, + "loss": 0.3778, + "step": 49980 + }, + { + "epoch": 980.0, + "eval_loss": 0.38484612107276917, + "eval_runtime": 2.2844, + "eval_samples_per_second": 997.63, + "eval_steps_per_second": 3.94, + "step": 49980 + }, + { + "epoch": 980.2, + "learning_rate": 1.3343580689731632e-05, + "loss": 0.3776, + "step": 49990 + }, + { + "epoch": 980.39, + "learning_rate": 1.3320515523656019e-05, + "loss": 0.382, + "step": 50000 + }, + { + "epoch": 980.59, + "learning_rate": 1.3297468366614281e-05, + "loss": 0.3809, + "step": 50010 + }, + { + "epoch": 980.78, + "learning_rate": 1.3274439225335673e-05, + "loss": 0.3788, + "step": 50020 + }, + { + "epoch": 980.98, + "learning_rate": 1.3251428106544202e-05, + "loss": 0.3749, + "step": 50030 + }, + { + "epoch": 981.0, + "eval_loss": 0.38307544589042664, + "eval_runtime": 2.3077, + "eval_samples_per_second": 987.572, + "eval_steps_per_second": 3.9, + "step": 50031 + }, + { + "epoch": 981.18, + "learning_rate": 1.3228435016958609e-05, + "loss": 0.3727, + "step": 50040 + }, + { + "epoch": 981.37, + "learning_rate": 1.3205459963292357e-05, + "loss": 0.377, + "step": 50050 + }, + { + "epoch": 981.57, + "learning_rate": 1.3182502952253621e-05, + "loss": 0.3802, + "step": 50060 + }, + { + "epoch": 981.76, + "learning_rate": 1.3159563990545366e-05, + "loss": 0.3794, + "step": 50070 + }, + { + "epoch": 981.96, + "learning_rate": 1.3136643084865242e-05, + "loss": 0.3756, + "step": 50080 + }, + { + "epoch": 982.0, + "eval_loss": 0.38791292905807495, + "eval_runtime": 2.2435, + "eval_samples_per_second": 1015.827, + "eval_steps_per_second": 4.012, + "step": 50082 + }, + { + "epoch": 982.16, + "learning_rate": 1.3113740241905671e-05, + "loss": 0.38, + "step": 50090 + }, + { + "epoch": 982.35, + "learning_rate": 1.3090855468353736e-05, + "loss": 0.377, + "step": 50100 + }, + { + "epoch": 982.55, + "learning_rate": 1.3067988770891319e-05, + "loss": 0.3827, + "step": 50110 + }, + { + "epoch": 982.75, + "learning_rate": 1.3045140156194936e-05, + "loss": 0.3768, + "step": 50120 + }, + { + "epoch": 982.94, + "learning_rate": 1.3022309630935901e-05, + "loss": 0.3739, + "step": 50130 + }, + { + "epoch": 983.0, + "eval_loss": 0.38304463028907776, + "eval_runtime": 2.2767, + "eval_samples_per_second": 1001.022, + "eval_steps_per_second": 3.953, + "step": 50133 + }, + { + "epoch": 983.14, + "learning_rate": 1.299949720178024e-05, + "loss": 0.3792, + "step": 50140 + }, + { + "epoch": 983.33, + "learning_rate": 1.2976702875388633e-05, + "loss": 0.3716, + "step": 50150 + }, + { + "epoch": 983.53, + "learning_rate": 1.295392665841655e-05, + "loss": 0.3773, + "step": 50160 + }, + { + "epoch": 983.73, + "learning_rate": 1.2931168557514094e-05, + "loss": 0.3751, + "step": 50170 + }, + { + "epoch": 983.92, + "learning_rate": 1.2908428579326158e-05, + "loss": 0.3769, + "step": 50180 + }, + { + "epoch": 984.0, + "eval_loss": 0.3844551146030426, + "eval_runtime": 2.1862, + "eval_samples_per_second": 1042.456, + "eval_steps_per_second": 4.117, + "step": 50184 + }, + { + "epoch": 984.12, + "learning_rate": 1.2885706730492316e-05, + "loss": 0.3767, + "step": 50190 + }, + { + "epoch": 984.31, + "learning_rate": 1.2863003017646809e-05, + "loss": 0.3797, + "step": 50200 + }, + { + "epoch": 984.51, + "learning_rate": 1.2840317447418652e-05, + "loss": 0.379, + "step": 50210 + }, + { + "epoch": 984.71, + "learning_rate": 1.2817650026431481e-05, + "loss": 0.3756, + "step": 50220 + }, + { + "epoch": 984.9, + "learning_rate": 1.2795000761303708e-05, + "loss": 0.3737, + "step": 50230 + }, + { + "epoch": 985.0, + "eval_loss": 0.3893979787826538, + "eval_runtime": 2.2314, + "eval_samples_per_second": 1021.322, + "eval_steps_per_second": 4.033, + "step": 50235 + }, + { + "epoch": 985.1, + "learning_rate": 1.277236965864842e-05, + "loss": 0.38, + "step": 50240 + }, + { + "epoch": 985.29, + "learning_rate": 1.2749756725073365e-05, + "loss": 0.3791, + "step": 50250 + }, + { + "epoch": 985.49, + "learning_rate": 1.2727161967181043e-05, + "loss": 0.3772, + "step": 50260 + }, + { + "epoch": 985.69, + "learning_rate": 1.2704585391568594e-05, + "loss": 0.3743, + "step": 50270 + }, + { + "epoch": 985.88, + "learning_rate": 1.2682027004827888e-05, + "loss": 0.3769, + "step": 50280 + }, + { + "epoch": 986.0, + "eval_loss": 0.38151878118515015, + "eval_runtime": 2.232, + "eval_samples_per_second": 1021.062, + "eval_steps_per_second": 4.032, + "step": 50286 + }, + { + "epoch": 986.08, + "learning_rate": 1.2659486813545472e-05, + "loss": 0.3764, + "step": 50290 + }, + { + "epoch": 986.27, + "learning_rate": 1.2636964824302597e-05, + "loss": 0.3798, + "step": 50300 + }, + { + "epoch": 986.47, + "learning_rate": 1.2614461043675164e-05, + "loss": 0.3708, + "step": 50310 + }, + { + "epoch": 986.67, + "learning_rate": 1.2591975478233749e-05, + "loss": 0.3742, + "step": 50320 + }, + { + "epoch": 986.86, + "learning_rate": 1.2569508134543666e-05, + "loss": 0.373, + "step": 50330 + }, + { + "epoch": 987.0, + "eval_loss": 0.37965089082717896, + "eval_runtime": 2.2126, + "eval_samples_per_second": 1030.024, + "eval_steps_per_second": 4.068, + "step": 50337 + }, + { + "epoch": 987.06, + "learning_rate": 1.2547059019164868e-05, + "loss": 0.3739, + "step": 50340 + }, + { + "epoch": 987.25, + "learning_rate": 1.2524628138652021e-05, + "loss": 0.3752, + "step": 50350 + }, + { + "epoch": 987.45, + "learning_rate": 1.2502215499554411e-05, + "loss": 0.3725, + "step": 50360 + }, + { + "epoch": 987.65, + "learning_rate": 1.2479821108416044e-05, + "loss": 0.3808, + "step": 50370 + }, + { + "epoch": 987.84, + "learning_rate": 1.2457444971775565e-05, + "loss": 0.374, + "step": 50380 + }, + { + "epoch": 988.0, + "eval_loss": 0.38273051381111145, + "eval_runtime": 2.3252, + "eval_samples_per_second": 980.135, + "eval_steps_per_second": 3.871, + "step": 50388 + }, + { + "epoch": 988.04, + "learning_rate": 1.2435087096166324e-05, + "loss": 0.3735, + "step": 50390 + }, + { + "epoch": 988.24, + "learning_rate": 1.2412747488116332e-05, + "loss": 0.3778, + "step": 50400 + }, + { + "epoch": 988.43, + "learning_rate": 1.2390426154148228e-05, + "loss": 0.372, + "step": 50410 + }, + { + "epoch": 988.63, + "learning_rate": 1.2368123100779376e-05, + "loss": 0.379, + "step": 50420 + }, + { + "epoch": 988.82, + "learning_rate": 1.2345838334521724e-05, + "loss": 0.3778, + "step": 50430 + }, + { + "epoch": 989.0, + "eval_loss": 0.38441118597984314, + "eval_runtime": 2.235, + "eval_samples_per_second": 1019.7, + "eval_steps_per_second": 4.027, + "step": 50439 + }, + { + "epoch": 989.02, + "learning_rate": 1.2323571861881967e-05, + "loss": 0.3758, + "step": 50440 + }, + { + "epoch": 989.22, + "learning_rate": 1.2301323689361423e-05, + "loss": 0.3726, + "step": 50450 + }, + { + "epoch": 989.41, + "learning_rate": 1.2279093823456019e-05, + "loss": 0.3765, + "step": 50460 + }, + { + "epoch": 989.61, + "learning_rate": 1.2256882270656429e-05, + "loss": 0.3738, + "step": 50470 + }, + { + "epoch": 989.8, + "learning_rate": 1.2234689037447892e-05, + "loss": 0.3775, + "step": 50480 + }, + { + "epoch": 990.0, + "learning_rate": 1.2212514130310358e-05, + "loss": 0.3773, + "step": 50490 + }, + { + "epoch": 990.0, + "eval_loss": 0.38456442952156067, + "eval_runtime": 2.2877, + "eval_samples_per_second": 996.205, + "eval_steps_per_second": 3.934, + "step": 50490 + }, + { + "epoch": 990.2, + "learning_rate": 1.2190357555718388e-05, + "loss": 0.3756, + "step": 50500 + }, + { + "epoch": 990.39, + "learning_rate": 1.216821932014125e-05, + "loss": 0.3724, + "step": 50510 + }, + { + "epoch": 990.59, + "learning_rate": 1.2146099430042782e-05, + "loss": 0.3797, + "step": 50520 + }, + { + "epoch": 990.78, + "learning_rate": 1.2123997891881485e-05, + "loss": 0.3732, + "step": 50530 + }, + { + "epoch": 990.98, + "learning_rate": 1.2101914712110536e-05, + "loss": 0.3759, + "step": 50540 + }, + { + "epoch": 991.0, + "eval_loss": 0.3825616240501404, + "eval_runtime": 2.2692, + "eval_samples_per_second": 1004.299, + "eval_steps_per_second": 3.966, + "step": 50541 + }, + { + "epoch": 991.18, + "learning_rate": 1.2079849897177721e-05, + "loss": 0.3776, + "step": 50550 + }, + { + "epoch": 991.37, + "learning_rate": 1.2057803453525502e-05, + "loss": 0.3784, + "step": 50560 + }, + { + "epoch": 991.57, + "learning_rate": 1.2035775387590915e-05, + "loss": 0.3768, + "step": 50570 + }, + { + "epoch": 991.76, + "learning_rate": 1.201376570580569e-05, + "loss": 0.3741, + "step": 50580 + }, + { + "epoch": 991.96, + "learning_rate": 1.1991774414596126e-05, + "loss": 0.3752, + "step": 50590 + }, + { + "epoch": 992.0, + "eval_loss": 0.38430219888687134, + "eval_runtime": 2.3526, + "eval_samples_per_second": 968.721, + "eval_steps_per_second": 3.826, + "step": 50592 + }, + { + "epoch": 992.16, + "learning_rate": 1.196980152038322e-05, + "loss": 0.3784, + "step": 50600 + }, + { + "epoch": 992.35, + "learning_rate": 1.1947847029582578e-05, + "loss": 0.3768, + "step": 50610 + }, + { + "epoch": 992.55, + "learning_rate": 1.1925910948604376e-05, + "loss": 0.3751, + "step": 50620 + }, + { + "epoch": 992.75, + "learning_rate": 1.1903993283853516e-05, + "loss": 0.3772, + "step": 50630 + }, + { + "epoch": 992.94, + "learning_rate": 1.1882094041729423e-05, + "loss": 0.3747, + "step": 50640 + }, + { + "epoch": 993.0, + "eval_loss": 0.381651371717453, + "eval_runtime": 2.2123, + "eval_samples_per_second": 1030.155, + "eval_steps_per_second": 4.068, + "step": 50643 + }, + { + "epoch": 993.14, + "learning_rate": 1.1860213228626198e-05, + "loss": 0.3762, + "step": 50650 + }, + { + "epoch": 993.33, + "learning_rate": 1.1838350850932578e-05, + "loss": 0.3774, + "step": 50660 + }, + { + "epoch": 993.53, + "learning_rate": 1.1816506915031845e-05, + "loss": 0.3776, + "step": 50670 + }, + { + "epoch": 993.73, + "learning_rate": 1.1794681427301986e-05, + "loss": 0.3762, + "step": 50680 + }, + { + "epoch": 993.92, + "learning_rate": 1.1772874394115519e-05, + "loss": 0.3781, + "step": 50690 + }, + { + "epoch": 994.0, + "eval_loss": 0.3783932626247406, + "eval_runtime": 2.3211, + "eval_samples_per_second": 981.883, + "eval_steps_per_second": 3.878, + "step": 50694 + }, + { + "epoch": 994.12, + "learning_rate": 1.175108582183962e-05, + "loss": 0.3763, + "step": 50700 + }, + { + "epoch": 994.31, + "learning_rate": 1.1729315716836083e-05, + "loss": 0.3761, + "step": 50710 + }, + { + "epoch": 994.51, + "learning_rate": 1.1707564085461295e-05, + "loss": 0.3792, + "step": 50720 + }, + { + "epoch": 994.71, + "learning_rate": 1.168583093406624e-05, + "loss": 0.3752, + "step": 50730 + }, + { + "epoch": 994.9, + "learning_rate": 1.1664116268996488e-05, + "loss": 0.3751, + "step": 50740 + }, + { + "epoch": 995.0, + "eval_loss": 0.3832464814186096, + "eval_runtime": 2.2172, + "eval_samples_per_second": 1027.873, + "eval_steps_per_second": 4.059, + "step": 50745 + }, + { + "epoch": 995.1, + "learning_rate": 1.1642420096592258e-05, + "loss": 0.3765, + "step": 50750 + }, + { + "epoch": 995.29, + "learning_rate": 1.1620742423188354e-05, + "loss": 0.3787, + "step": 50760 + }, + { + "epoch": 995.49, + "learning_rate": 1.1599083255114175e-05, + "loss": 0.3764, + "step": 50770 + }, + { + "epoch": 995.69, + "learning_rate": 1.1577442598693699e-05, + "loss": 0.3781, + "step": 50780 + }, + { + "epoch": 995.88, + "learning_rate": 1.1555820460245535e-05, + "loss": 0.3758, + "step": 50790 + }, + { + "epoch": 996.0, + "eval_loss": 0.37998104095458984, + "eval_runtime": 2.2526, + "eval_samples_per_second": 1011.735, + "eval_steps_per_second": 3.995, + "step": 50796 + }, + { + "epoch": 996.08, + "learning_rate": 1.1534216846082845e-05, + "loss": 0.3749, + "step": 50800 + }, + { + "epoch": 996.27, + "learning_rate": 1.1512631762513405e-05, + "loss": 0.3793, + "step": 50810 + }, + { + "epoch": 996.47, + "learning_rate": 1.14910652158396e-05, + "loss": 0.3785, + "step": 50820 + }, + { + "epoch": 996.67, + "learning_rate": 1.1469517212358354e-05, + "loss": 0.3736, + "step": 50830 + }, + { + "epoch": 996.86, + "learning_rate": 1.144798775836123e-05, + "loss": 0.3718, + "step": 50840 + }, + { + "epoch": 997.0, + "eval_loss": 0.38368964195251465, + "eval_runtime": 2.2857, + "eval_samples_per_second": 997.047, + "eval_steps_per_second": 3.937, + "step": 50847 + }, + { + "epoch": 997.06, + "learning_rate": 1.1426476860134318e-05, + "loss": 0.3746, + "step": 50850 + }, + { + "epoch": 997.25, + "learning_rate": 1.1404984523958335e-05, + "loss": 0.3727, + "step": 50860 + }, + { + "epoch": 997.45, + "learning_rate": 1.138351075610858e-05, + "loss": 0.3771, + "step": 50870 + }, + { + "epoch": 997.65, + "learning_rate": 1.1362055562854877e-05, + "loss": 0.3787, + "step": 50880 + }, + { + "epoch": 997.84, + "learning_rate": 1.1340618950461708e-05, + "loss": 0.3745, + "step": 50890 + }, + { + "epoch": 998.0, + "eval_loss": 0.382259726524353, + "eval_runtime": 2.3112, + "eval_samples_per_second": 986.086, + "eval_steps_per_second": 3.894, + "step": 50898 + }, + { + "epoch": 998.04, + "learning_rate": 1.1319200925188049e-05, + "loss": 0.3769, + "step": 50900 + }, + { + "epoch": 998.24, + "learning_rate": 1.1297801493287497e-05, + "loss": 0.3799, + "step": 50910 + }, + { + "epoch": 998.43, + "learning_rate": 1.1276420661008231e-05, + "loss": 0.3787, + "step": 50920 + }, + { + "epoch": 998.63, + "learning_rate": 1.1255058434592939e-05, + "loss": 0.3742, + "step": 50930 + }, + { + "epoch": 998.82, + "learning_rate": 1.123371482027895e-05, + "loss": 0.3757, + "step": 50940 + }, + { + "epoch": 999.0, + "eval_loss": 0.3797883093357086, + "eval_runtime": 2.2661, + "eval_samples_per_second": 1005.687, + "eval_steps_per_second": 3.972, + "step": 50949 + }, + { + "epoch": 999.02, + "learning_rate": 1.1212389824298093e-05, + "loss": 0.3729, + "step": 50950 + }, + { + "epoch": 999.22, + "learning_rate": 1.1191083452876806e-05, + "loss": 0.3754, + "step": 50960 + }, + { + "epoch": 999.41, + "learning_rate": 1.116979571223607e-05, + "loss": 0.3697, + "step": 50970 + }, + { + "epoch": 999.61, + "learning_rate": 1.114852660859145e-05, + "loss": 0.3744, + "step": 50980 + }, + { + "epoch": 999.8, + "learning_rate": 1.1127276148153039e-05, + "loss": 0.373, + "step": 50990 + }, + { + "epoch": 1000.0, + "learning_rate": 1.1106044337125478e-05, + "loss": 0.3786, + "step": 51000 + }, + { + "epoch": 1000.0, + "eval_loss": 0.37940987944602966, + "eval_runtime": 2.3813, + "eval_samples_per_second": 957.056, + "eval_steps_per_second": 3.78, + "step": 51000 + }, + { + "epoch": 1000.2, + "learning_rate": 1.108483118170799e-05, + "loss": 0.3717, + "step": 51010 + }, + { + "epoch": 1000.39, + "learning_rate": 1.1063636688094354e-05, + "loss": 0.3741, + "step": 51020 + }, + { + "epoch": 1000.59, + "learning_rate": 1.1042460862472905e-05, + "loss": 0.3779, + "step": 51030 + }, + { + "epoch": 1000.78, + "learning_rate": 1.1021303711026468e-05, + "loss": 0.3747, + "step": 51040 + }, + { + "epoch": 1000.98, + "learning_rate": 1.1000165239932507e-05, + "loss": 0.3738, + "step": 51050 + }, + { + "epoch": 1001.0, + "eval_loss": 0.37811383605003357, + "eval_runtime": 2.1934, + "eval_samples_per_second": 1039.022, + "eval_steps_per_second": 4.103, + "step": 51051 + }, + { + "epoch": 1001.18, + "learning_rate": 1.0979045455362948e-05, + "loss": 0.3778, + "step": 51060 + }, + { + "epoch": 1001.37, + "learning_rate": 1.09579443634843e-05, + "loss": 0.3713, + "step": 51070 + }, + { + "epoch": 1001.57, + "learning_rate": 1.0936861970457644e-05, + "loss": 0.3763, + "step": 51080 + }, + { + "epoch": 1001.76, + "learning_rate": 1.0915798282438531e-05, + "loss": 0.3775, + "step": 51090 + }, + { + "epoch": 1001.96, + "learning_rate": 1.0894753305577116e-05, + "loss": 0.3779, + "step": 51100 + }, + { + "epoch": 1002.0, + "eval_loss": 0.38506320118904114, + "eval_runtime": 2.3112, + "eval_samples_per_second": 986.049, + "eval_steps_per_second": 3.894, + "step": 51102 + }, + { + "epoch": 1002.16, + "learning_rate": 1.0873727046018036e-05, + "loss": 0.3746, + "step": 51110 + }, + { + "epoch": 1002.35, + "learning_rate": 1.085271950990051e-05, + "loss": 0.3796, + "step": 51120 + }, + { + "epoch": 1002.55, + "learning_rate": 1.0831730703358265e-05, + "loss": 0.3764, + "step": 51130 + }, + { + "epoch": 1002.75, + "learning_rate": 1.081076063251956e-05, + "loss": 0.3795, + "step": 51140 + }, + { + "epoch": 1002.94, + "learning_rate": 1.0789809303507205e-05, + "loss": 0.3735, + "step": 51150 + }, + { + "epoch": 1003.0, + "eval_loss": 0.3844279646873474, + "eval_runtime": 2.3307, + "eval_samples_per_second": 977.811, + "eval_steps_per_second": 3.861, + "step": 51153 + }, + { + "epoch": 1003.14, + "learning_rate": 1.0768876722438487e-05, + "loss": 0.376, + "step": 51160 + }, + { + "epoch": 1003.33, + "learning_rate": 1.0747962895425272e-05, + "loss": 0.3744, + "step": 51170 + }, + { + "epoch": 1003.53, + "learning_rate": 1.0727067828573937e-05, + "loss": 0.376, + "step": 51180 + }, + { + "epoch": 1003.73, + "learning_rate": 1.0706191527985389e-05, + "loss": 0.3746, + "step": 51190 + }, + { + "epoch": 1003.92, + "learning_rate": 1.0685333999755017e-05, + "loss": 0.3753, + "step": 51200 + }, + { + "epoch": 1004.0, + "eval_loss": 0.3840962052345276, + "eval_runtime": 2.2584, + "eval_samples_per_second": 1009.122, + "eval_steps_per_second": 3.985, + "step": 51204 + }, + { + "epoch": 1004.12, + "learning_rate": 1.0664495249972749e-05, + "loss": 0.3796, + "step": 51210 + }, + { + "epoch": 1004.31, + "learning_rate": 1.0643675284723043e-05, + "loss": 0.3747, + "step": 51220 + }, + { + "epoch": 1004.51, + "learning_rate": 1.0622874110084873e-05, + "loss": 0.3754, + "step": 51230 + }, + { + "epoch": 1004.71, + "learning_rate": 1.0602091732131727e-05, + "loss": 0.3759, + "step": 51240 + }, + { + "epoch": 1004.9, + "learning_rate": 1.0581328156931559e-05, + "loss": 0.3701, + "step": 51250 + }, + { + "epoch": 1005.0, + "eval_loss": 0.3804880380630493, + "eval_runtime": 2.256, + "eval_samples_per_second": 1010.217, + "eval_steps_per_second": 3.989, + "step": 51255 + }, + { + "epoch": 1005.1, + "learning_rate": 1.0560583390546923e-05, + "loss": 0.3763, + "step": 51260 + }, + { + "epoch": 1005.29, + "learning_rate": 1.053985743903477e-05, + "loss": 0.3766, + "step": 51270 + }, + { + "epoch": 1005.49, + "learning_rate": 1.0519150308446655e-05, + "loss": 0.3795, + "step": 51280 + }, + { + "epoch": 1005.69, + "learning_rate": 1.0498462004828598e-05, + "loss": 0.3769, + "step": 51290 + }, + { + "epoch": 1005.88, + "learning_rate": 1.04777925342211e-05, + "loss": 0.3738, + "step": 51300 + }, + { + "epoch": 1006.0, + "eval_loss": 0.3825666904449463, + "eval_runtime": 2.2233, + "eval_samples_per_second": 1025.034, + "eval_steps_per_second": 4.048, + "step": 51306 + }, + { + "epoch": 1006.08, + "learning_rate": 1.0457141902659208e-05, + "loss": 0.373, + "step": 51310 + }, + { + "epoch": 1006.27, + "learning_rate": 1.0436510116172425e-05, + "loss": 0.374, + "step": 51320 + }, + { + "epoch": 1006.47, + "learning_rate": 1.0415897180784774e-05, + "loss": 0.375, + "step": 51330 + }, + { + "epoch": 1006.67, + "learning_rate": 1.0395303102514807e-05, + "loss": 0.3782, + "step": 51340 + }, + { + "epoch": 1006.86, + "learning_rate": 1.0374727887375481e-05, + "loss": 0.3729, + "step": 51350 + }, + { + "epoch": 1007.0, + "eval_loss": 0.3792899549007416, + "eval_runtime": 2.2748, + "eval_samples_per_second": 1001.838, + "eval_steps_per_second": 3.956, + "step": 51357 + }, + { + "epoch": 1007.06, + "learning_rate": 1.0354171541374356e-05, + "loss": 0.3748, + "step": 51360 + }, + { + "epoch": 1007.25, + "learning_rate": 1.0333634070513375e-05, + "loss": 0.3737, + "step": 51370 + }, + { + "epoch": 1007.45, + "learning_rate": 1.0313115480789047e-05, + "loss": 0.3785, + "step": 51380 + }, + { + "epoch": 1007.65, + "learning_rate": 1.0292615778192348e-05, + "loss": 0.3722, + "step": 51390 + }, + { + "epoch": 1007.84, + "learning_rate": 1.027213496870874e-05, + "loss": 0.3765, + "step": 51400 + }, + { + "epoch": 1008.0, + "eval_loss": 0.38250917196273804, + "eval_runtime": 2.4031, + "eval_samples_per_second": 948.347, + "eval_steps_per_second": 3.745, + "step": 51408 + }, + { + "epoch": 1008.04, + "learning_rate": 1.0251673058318147e-05, + "loss": 0.3738, + "step": 51410 + }, + { + "epoch": 1008.24, + "learning_rate": 1.0231230052994974e-05, + "loss": 0.3751, + "step": 51420 + }, + { + "epoch": 1008.43, + "learning_rate": 1.0210805958708145e-05, + "loss": 0.3736, + "step": 51430 + }, + { + "epoch": 1008.63, + "learning_rate": 1.0190400781421035e-05, + "loss": 0.3752, + "step": 51440 + }, + { + "epoch": 1008.82, + "learning_rate": 1.0170014527091524e-05, + "loss": 0.3725, + "step": 51450 + }, + { + "epoch": 1009.0, + "eval_loss": 0.38174739480018616, + "eval_runtime": 2.3742, + "eval_samples_per_second": 959.913, + "eval_steps_per_second": 3.791, + "step": 51459 + }, + { + "epoch": 1009.02, + "learning_rate": 1.0149647201671904e-05, + "loss": 0.3754, + "step": 51460 + }, + { + "epoch": 1009.22, + "learning_rate": 1.0129298811109015e-05, + "loss": 0.3773, + "step": 51470 + }, + { + "epoch": 1009.41, + "learning_rate": 1.0108969361344099e-05, + "loss": 0.3749, + "step": 51480 + }, + { + "epoch": 1009.61, + "learning_rate": 1.0088658858312914e-05, + "loss": 0.3774, + "step": 51490 + }, + { + "epoch": 1009.8, + "learning_rate": 1.0068367307945702e-05, + "loss": 0.3746, + "step": 51500 + }, + { + "epoch": 1010.0, + "learning_rate": 1.0048094716167095e-05, + "loss": 0.3766, + "step": 51510 + }, + { + "epoch": 1010.0, + "eval_loss": 0.38128504157066345, + "eval_runtime": 2.2597, + "eval_samples_per_second": 1008.534, + "eval_steps_per_second": 3.983, + "step": 51510 + }, + { + "epoch": 1010.2, + "learning_rate": 1.0027841088896289e-05, + "loss": 0.3742, + "step": 51520 + }, + { + "epoch": 1010.39, + "learning_rate": 1.0007606432046846e-05, + "loss": 0.3758, + "step": 51530 + }, + { + "epoch": 1010.59, + "learning_rate": 9.987390751526855e-06, + "loss": 0.3777, + "step": 51540 + }, + { + "epoch": 1010.78, + "learning_rate": 9.96719405323885e-06, + "loss": 0.3797, + "step": 51550 + }, + { + "epoch": 1010.98, + "learning_rate": 9.947016343079806e-06, + "loss": 0.3736, + "step": 51560 + }, + { + "epoch": 1011.0, + "eval_loss": 0.38342854380607605, + "eval_runtime": 2.256, + "eval_samples_per_second": 1010.174, + "eval_steps_per_second": 3.989, + "step": 51561 + }, + { + "epoch": 1011.18, + "learning_rate": 9.926857626941176e-06, + "loss": 0.3724, + "step": 51570 + }, + { + "epoch": 1011.37, + "learning_rate": 9.906717910708828e-06, + "loss": 0.3753, + "step": 51580 + }, + { + "epoch": 1011.57, + "learning_rate": 9.886597200263132e-06, + "loss": 0.3776, + "step": 51590 + }, + { + "epoch": 1011.76, + "learning_rate": 9.866495501478891e-06, + "loss": 0.3789, + "step": 51600 + }, + { + "epoch": 1011.96, + "learning_rate": 9.846412820225358e-06, + "loss": 0.3747, + "step": 51610 + }, + { + "epoch": 1012.0, + "eval_loss": 0.38004985451698303, + "eval_runtime": 2.2958, + "eval_samples_per_second": 992.701, + "eval_steps_per_second": 3.92, + "step": 51612 + }, + { + "epoch": 1012.16, + "learning_rate": 9.82634916236621e-06, + "loss": 0.375, + "step": 51620 + }, + { + "epoch": 1012.35, + "learning_rate": 9.806304533759576e-06, + "loss": 0.3746, + "step": 51630 + }, + { + "epoch": 1012.55, + "learning_rate": 9.78627894025806e-06, + "loss": 0.3714, + "step": 51640 + }, + { + "epoch": 1012.75, + "learning_rate": 9.766272387708693e-06, + "loss": 0.3699, + "step": 51650 + }, + { + "epoch": 1012.94, + "learning_rate": 9.746284881952942e-06, + "loss": 0.3726, + "step": 51660 + }, + { + "epoch": 1013.0, + "eval_loss": 0.381724089384079, + "eval_runtime": 2.2926, + "eval_samples_per_second": 994.086, + "eval_steps_per_second": 3.926, + "step": 51663 + }, + { + "epoch": 1013.14, + "learning_rate": 9.726316428826717e-06, + "loss": 0.3761, + "step": 51670 + }, + { + "epoch": 1013.33, + "learning_rate": 9.706367034160326e-06, + "loss": 0.3737, + "step": 51680 + }, + { + "epoch": 1013.53, + "learning_rate": 9.686436703778577e-06, + "loss": 0.3765, + "step": 51690 + }, + { + "epoch": 1013.73, + "learning_rate": 9.666525443500667e-06, + "loss": 0.3723, + "step": 51700 + }, + { + "epoch": 1013.92, + "learning_rate": 9.646633259140276e-06, + "loss": 0.3819, + "step": 51710 + }, + { + "epoch": 1014.0, + "eval_loss": 0.3839859068393707, + "eval_runtime": 2.2503, + "eval_samples_per_second": 1012.774, + "eval_steps_per_second": 4.0, + "step": 51714 + }, + { + "epoch": 1014.12, + "learning_rate": 9.626760156505429e-06, + "loss": 0.3707, + "step": 51720 + }, + { + "epoch": 1014.31, + "learning_rate": 9.60690614139867e-06, + "loss": 0.3738, + "step": 51730 + }, + { + "epoch": 1014.51, + "learning_rate": 9.587071219616918e-06, + "loss": 0.3731, + "step": 51740 + }, + { + "epoch": 1014.71, + "learning_rate": 9.567255396951478e-06, + "loss": 0.3739, + "step": 51750 + }, + { + "epoch": 1014.9, + "learning_rate": 9.5474586791882e-06, + "loss": 0.3799, + "step": 51760 + }, + { + "epoch": 1015.0, + "eval_loss": 0.38338810205459595, + "eval_runtime": 2.2465, + "eval_samples_per_second": 1014.485, + "eval_steps_per_second": 4.006, + "step": 51765 + }, + { + "epoch": 1015.1, + "learning_rate": 9.527681072107249e-06, + "loss": 0.379, + "step": 51770 + }, + { + "epoch": 1015.29, + "learning_rate": 9.507922581483257e-06, + "loss": 0.3778, + "step": 51780 + }, + { + "epoch": 1015.49, + "learning_rate": 9.488183213085243e-06, + "loss": 0.3747, + "step": 51790 + }, + { + "epoch": 1015.69, + "learning_rate": 9.46846297267668e-06, + "loss": 0.3726, + "step": 51800 + }, + { + "epoch": 1015.88, + "learning_rate": 9.448761866015445e-06, + "loss": 0.3754, + "step": 51810 + }, + { + "epoch": 1016.0, + "eval_loss": 0.3817760944366455, + "eval_runtime": 2.3863, + "eval_samples_per_second": 955.027, + "eval_steps_per_second": 3.771, + "step": 51816 + }, + { + "epoch": 1016.08, + "learning_rate": 9.429079898853795e-06, + "loss": 0.3722, + "step": 51820 + }, + { + "epoch": 1016.27, + "learning_rate": 9.409417076938457e-06, + "loss": 0.3708, + "step": 51830 + }, + { + "epoch": 1016.47, + "learning_rate": 9.389773406010509e-06, + "loss": 0.37, + "step": 51840 + }, + { + "epoch": 1016.67, + "learning_rate": 9.370148891805467e-06, + "loss": 0.373, + "step": 51850 + }, + { + "epoch": 1016.86, + "learning_rate": 9.350543540053268e-06, + "loss": 0.3762, + "step": 51860 + }, + { + "epoch": 1017.0, + "eval_loss": 0.37691184878349304, + "eval_runtime": 2.2427, + "eval_samples_per_second": 1016.193, + "eval_steps_per_second": 4.013, + "step": 51867 + }, + { + "epoch": 1017.06, + "learning_rate": 9.330957356478248e-06, + "loss": 0.3805, + "step": 51870 + }, + { + "epoch": 1017.25, + "learning_rate": 9.311390346799114e-06, + "loss": 0.3737, + "step": 51880 + }, + { + "epoch": 1017.45, + "learning_rate": 9.29184251672899e-06, + "loss": 0.3775, + "step": 51890 + }, + { + "epoch": 1017.65, + "learning_rate": 9.27231387197541e-06, + "loss": 0.3714, + "step": 51900 + }, + { + "epoch": 1017.84, + "learning_rate": 9.252804418240312e-06, + "loss": 0.3718, + "step": 51910 + }, + { + "epoch": 1018.0, + "eval_loss": 0.3794402480125427, + "eval_runtime": 2.2234, + "eval_samples_per_second": 1025.004, + "eval_steps_per_second": 4.048, + "step": 51918 + }, + { + "epoch": 1018.04, + "learning_rate": 9.23331416122004e-06, + "loss": 0.3731, + "step": 51920 + }, + { + "epoch": 1018.24, + "learning_rate": 9.213843106605267e-06, + "loss": 0.3782, + "step": 51930 + }, + { + "epoch": 1018.43, + "learning_rate": 9.194391260081163e-06, + "loss": 0.3725, + "step": 51940 + }, + { + "epoch": 1018.63, + "learning_rate": 9.174958627327191e-06, + "loss": 0.3746, + "step": 51950 + }, + { + "epoch": 1018.82, + "learning_rate": 9.155545214017232e-06, + "loss": 0.3785, + "step": 51960 + }, + { + "epoch": 1019.0, + "eval_loss": 0.3825004994869232, + "eval_runtime": 2.2245, + "eval_samples_per_second": 1024.505, + "eval_steps_per_second": 4.046, + "step": 51969 + }, + { + "epoch": 1019.02, + "learning_rate": 9.136151025819633e-06, + "loss": 0.3729, + "step": 51970 + }, + { + "epoch": 1019.22, + "learning_rate": 9.116776068397006e-06, + "loss": 0.3754, + "step": 51980 + }, + { + "epoch": 1019.41, + "learning_rate": 9.097420347406442e-06, + "loss": 0.3786, + "step": 51990 + }, + { + "epoch": 1019.61, + "learning_rate": 9.078083868499356e-06, + "loss": 0.3759, + "step": 52000 + }, + { + "epoch": 1019.8, + "learning_rate": 9.05876663732158e-06, + "loss": 0.3697, + "step": 52010 + }, + { + "epoch": 1020.0, + "learning_rate": 9.039468659513327e-06, + "loss": 0.3754, + "step": 52020 + }, + { + "epoch": 1020.0, + "eval_loss": 0.38265079259872437, + "eval_runtime": 2.199, + "eval_samples_per_second": 1036.381, + "eval_steps_per_second": 4.093, + "step": 52020 + }, + { + "epoch": 1020.2, + "learning_rate": 9.02018994070914e-06, + "loss": 0.376, + "step": 52030 + }, + { + "epoch": 1020.39, + "learning_rate": 9.000930486538026e-06, + "loss": 0.3739, + "step": 52040 + }, + { + "epoch": 1020.59, + "learning_rate": 8.981690302623263e-06, + "loss": 0.3717, + "step": 52050 + }, + { + "epoch": 1020.78, + "learning_rate": 8.962469394582587e-06, + "loss": 0.3768, + "step": 52060 + }, + { + "epoch": 1020.98, + "learning_rate": 8.943267768028068e-06, + "loss": 0.374, + "step": 52070 + }, + { + "epoch": 1021.0, + "eval_loss": 0.3817632496356964, + "eval_runtime": 2.293, + "eval_samples_per_second": 993.877, + "eval_steps_per_second": 3.925, + "step": 52071 + }, + { + "epoch": 1021.18, + "learning_rate": 8.924085428566163e-06, + "loss": 0.3702, + "step": 52080 + }, + { + "epoch": 1021.37, + "learning_rate": 8.904922381797677e-06, + "loss": 0.3725, + "step": 52090 + }, + { + "epoch": 1021.57, + "learning_rate": 8.885778633317783e-06, + "loss": 0.3782, + "step": 52100 + }, + { + "epoch": 1021.76, + "learning_rate": 8.866654188716035e-06, + "loss": 0.3683, + "step": 52110 + }, + { + "epoch": 1021.96, + "learning_rate": 8.847549053576342e-06, + "loss": 0.3785, + "step": 52120 + }, + { + "epoch": 1022.0, + "eval_loss": 0.3780389428138733, + "eval_runtime": 2.2927, + "eval_samples_per_second": 994.038, + "eval_steps_per_second": 3.926, + "step": 52122 + }, + { + "epoch": 1022.16, + "learning_rate": 8.828463233477e-06, + "loss": 0.3754, + "step": 52130 + }, + { + "epoch": 1022.35, + "learning_rate": 8.809396733990615e-06, + "loss": 0.3757, + "step": 52140 + }, + { + "epoch": 1022.55, + "learning_rate": 8.790349560684203e-06, + "loss": 0.3749, + "step": 52150 + }, + { + "epoch": 1022.75, + "learning_rate": 8.771321719119101e-06, + "loss": 0.3733, + "step": 52160 + }, + { + "epoch": 1022.94, + "learning_rate": 8.75231321485098e-06, + "loss": 0.3735, + "step": 52170 + }, + { + "epoch": 1023.0, + "eval_loss": 0.3814985752105713, + "eval_runtime": 2.374, + "eval_samples_per_second": 959.969, + "eval_steps_per_second": 3.791, + "step": 52173 + }, + { + "epoch": 1023.14, + "learning_rate": 8.733324053429963e-06, + "loss": 0.3719, + "step": 52180 + }, + { + "epoch": 1023.33, + "learning_rate": 8.71435424040042e-06, + "loss": 0.3746, + "step": 52190 + }, + { + "epoch": 1023.53, + "learning_rate": 8.695403781301144e-06, + "loss": 0.3718, + "step": 52200 + }, + { + "epoch": 1023.73, + "learning_rate": 8.676472681665208e-06, + "loss": 0.3755, + "step": 52210 + }, + { + "epoch": 1023.92, + "learning_rate": 8.657560947020093e-06, + "loss": 0.3726, + "step": 52220 + }, + { + "epoch": 1024.0, + "eval_loss": 0.3794108033180237, + "eval_runtime": 2.2464, + "eval_samples_per_second": 1014.494, + "eval_steps_per_second": 4.006, + "step": 52224 + }, + { + "epoch": 1024.12, + "learning_rate": 8.63866858288762e-06, + "loss": 0.3789, + "step": 52230 + }, + { + "epoch": 1024.31, + "learning_rate": 8.619795594783896e-06, + "loss": 0.3744, + "step": 52240 + }, + { + "epoch": 1024.51, + "learning_rate": 8.600941988219453e-06, + "loss": 0.375, + "step": 52250 + }, + { + "epoch": 1024.71, + "learning_rate": 8.582107768699098e-06, + "loss": 0.3722, + "step": 52260 + }, + { + "epoch": 1024.9, + "learning_rate": 8.563292941722004e-06, + "loss": 0.3798, + "step": 52270 + }, + { + "epoch": 1025.0, + "eval_loss": 0.378730446100235, + "eval_runtime": 2.3701, + "eval_samples_per_second": 961.574, + "eval_steps_per_second": 3.797, + "step": 52275 + }, + { + "epoch": 1025.1, + "learning_rate": 8.544497512781697e-06, + "loss": 0.3745, + "step": 52280 + }, + { + "epoch": 1025.29, + "learning_rate": 8.525721487366027e-06, + "loss": 0.3725, + "step": 52290 + }, + { + "epoch": 1025.49, + "learning_rate": 8.506964870957159e-06, + "loss": 0.3762, + "step": 52300 + }, + { + "epoch": 1025.69, + "learning_rate": 8.488227669031594e-06, + "loss": 0.3732, + "step": 52310 + }, + { + "epoch": 1025.88, + "learning_rate": 8.4695098870602e-06, + "loss": 0.3714, + "step": 52320 + }, + { + "epoch": 1026.0, + "eval_loss": 0.3809713125228882, + "eval_runtime": 2.3518, + "eval_samples_per_second": 969.036, + "eval_steps_per_second": 3.827, + "step": 52326 + }, + { + "epoch": 1026.08, + "learning_rate": 8.450811530508136e-06, + "loss": 0.3731, + "step": 52330 + }, + { + "epoch": 1026.27, + "learning_rate": 8.432132604834938e-06, + "loss": 0.3736, + "step": 52340 + }, + { + "epoch": 1026.47, + "learning_rate": 8.413473115494407e-06, + "loss": 0.3721, + "step": 52350 + }, + { + "epoch": 1026.67, + "learning_rate": 8.394833067934687e-06, + "loss": 0.378, + "step": 52360 + }, + { + "epoch": 1026.86, + "learning_rate": 8.37621246759829e-06, + "loss": 0.3776, + "step": 52370 + }, + { + "epoch": 1027.0, + "eval_loss": 0.3787022829055786, + "eval_runtime": 2.2896, + "eval_samples_per_second": 995.38, + "eval_steps_per_second": 3.931, + "step": 52377 + }, + { + "epoch": 1027.06, + "learning_rate": 8.357611319921967e-06, + "loss": 0.3712, + "step": 52380 + }, + { + "epoch": 1027.25, + "learning_rate": 8.3390296303369e-06, + "loss": 0.3721, + "step": 52390 + }, + { + "epoch": 1027.45, + "learning_rate": 8.320467404268479e-06, + "loss": 0.3758, + "step": 52400 + }, + { + "epoch": 1027.65, + "learning_rate": 8.301924647136499e-06, + "loss": 0.3751, + "step": 52410 + }, + { + "epoch": 1027.84, + "learning_rate": 8.283401364354999e-06, + "loss": 0.3688, + "step": 52420 + }, + { + "epoch": 1028.0, + "eval_loss": 0.37706291675567627, + "eval_runtime": 2.2642, + "eval_samples_per_second": 1006.53, + "eval_steps_per_second": 3.975, + "step": 52428 + }, + { + "epoch": 1028.04, + "learning_rate": 8.264897561332357e-06, + "loss": 0.3715, + "step": 52430 + }, + { + "epoch": 1028.24, + "learning_rate": 8.246413243471315e-06, + "loss": 0.3757, + "step": 52440 + }, + { + "epoch": 1028.43, + "learning_rate": 8.22794841616884e-06, + "loss": 0.3712, + "step": 52450 + }, + { + "epoch": 1028.63, + "learning_rate": 8.209503084816285e-06, + "loss": 0.3777, + "step": 52460 + }, + { + "epoch": 1028.82, + "learning_rate": 8.191077254799244e-06, + "loss": 0.375, + "step": 52470 + }, + { + "epoch": 1029.0, + "eval_loss": 0.3775680661201477, + "eval_runtime": 2.3853, + "eval_samples_per_second": 955.43, + "eval_steps_per_second": 3.773, + "step": 52479 + }, + { + "epoch": 1029.02, + "learning_rate": 8.172670931497655e-06, + "loss": 0.3781, + "step": 52480 + }, + { + "epoch": 1029.22, + "learning_rate": 8.154284120285775e-06, + "loss": 0.3723, + "step": 52490 + }, + { + "epoch": 1029.41, + "learning_rate": 8.135916826532112e-06, + "loss": 0.3737, + "step": 52500 + }, + { + "epoch": 1029.61, + "learning_rate": 8.117569055599543e-06, + "loss": 0.3723, + "step": 52510 + }, + { + "epoch": 1029.8, + "learning_rate": 8.099240812845173e-06, + "loss": 0.3709, + "step": 52520 + }, + { + "epoch": 1030.0, + "learning_rate": 8.080932103620446e-06, + "loss": 0.372, + "step": 52530 + }, + { + "epoch": 1030.0, + "eval_loss": 0.3795132339000702, + "eval_runtime": 2.3155, + "eval_samples_per_second": 984.231, + "eval_steps_per_second": 3.887, + "step": 52530 + }, + { + "epoch": 1030.2, + "learning_rate": 8.062642933271104e-06, + "loss": 0.3769, + "step": 52540 + }, + { + "epoch": 1030.39, + "learning_rate": 8.044373307137201e-06, + "loss": 0.3697, + "step": 52550 + }, + { + "epoch": 1030.59, + "learning_rate": 8.026123230553033e-06, + "loss": 0.3753, + "step": 52560 + }, + { + "epoch": 1030.78, + "learning_rate": 8.0078927088472e-06, + "loss": 0.3695, + "step": 52570 + }, + { + "epoch": 1030.98, + "learning_rate": 7.98968174734265e-06, + "loss": 0.3736, + "step": 52580 + }, + { + "epoch": 1031.0, + "eval_loss": 0.3780902624130249, + "eval_runtime": 2.2445, + "eval_samples_per_second": 1015.369, + "eval_steps_per_second": 4.01, + "step": 52581 + }, + { + "epoch": 1031.18, + "learning_rate": 7.971490351356521e-06, + "loss": 0.3718, + "step": 52590 + }, + { + "epoch": 1031.37, + "learning_rate": 7.953318526200358e-06, + "loss": 0.3723, + "step": 52600 + }, + { + "epoch": 1031.57, + "learning_rate": 7.935166277179884e-06, + "loss": 0.3737, + "step": 52610 + }, + { + "epoch": 1031.76, + "learning_rate": 7.91703360959518e-06, + "loss": 0.3744, + "step": 52620 + }, + { + "epoch": 1031.96, + "learning_rate": 7.898920528740566e-06, + "loss": 0.3713, + "step": 52630 + }, + { + "epoch": 1032.0, + "eval_loss": 0.3815433084964752, + "eval_runtime": 2.225, + "eval_samples_per_second": 1024.291, + "eval_steps_per_second": 4.045, + "step": 52632 + }, + { + "epoch": 1032.16, + "learning_rate": 7.880827039904633e-06, + "loss": 0.3725, + "step": 52640 + }, + { + "epoch": 1032.35, + "learning_rate": 7.862753148370331e-06, + "loss": 0.3678, + "step": 52650 + }, + { + "epoch": 1032.55, + "learning_rate": 7.844698859414783e-06, + "loss": 0.3763, + "step": 52660 + }, + { + "epoch": 1032.75, + "learning_rate": 7.826664178309477e-06, + "loss": 0.3692, + "step": 52670 + }, + { + "epoch": 1032.94, + "learning_rate": 7.808649110320111e-06, + "loss": 0.3772, + "step": 52680 + }, + { + "epoch": 1033.0, + "eval_loss": 0.38015732169151306, + "eval_runtime": 2.2304, + "eval_samples_per_second": 1021.789, + "eval_steps_per_second": 4.035, + "step": 52683 + }, + { + "epoch": 1033.14, + "learning_rate": 7.790653660706686e-06, + "loss": 0.3714, + "step": 52690 + }, + { + "epoch": 1033.33, + "learning_rate": 7.772677834723498e-06, + "loss": 0.3774, + "step": 52700 + }, + { + "epoch": 1033.53, + "learning_rate": 7.75472163761905e-06, + "loss": 0.3751, + "step": 52710 + }, + { + "epoch": 1033.73, + "learning_rate": 7.736785074636179e-06, + "loss": 0.3657, + "step": 52720 + }, + { + "epoch": 1033.92, + "learning_rate": 7.71886815101194e-06, + "loss": 0.375, + "step": 52730 + }, + { + "epoch": 1034.0, + "eval_loss": 0.37879452109336853, + "eval_runtime": 2.2594, + "eval_samples_per_second": 1008.66, + "eval_steps_per_second": 3.983, + "step": 52734 + }, + { + "epoch": 1034.12, + "learning_rate": 7.700970871977687e-06, + "loss": 0.3721, + "step": 52740 + }, + { + "epoch": 1034.31, + "learning_rate": 7.68309324275902e-06, + "loss": 0.375, + "step": 52750 + }, + { + "epoch": 1034.51, + "learning_rate": 7.665235268575835e-06, + "loss": 0.3738, + "step": 52760 + }, + { + "epoch": 1034.71, + "learning_rate": 7.647396954642235e-06, + "loss": 0.3723, + "step": 52770 + }, + { + "epoch": 1034.9, + "learning_rate": 7.629578306166607e-06, + "loss": 0.3725, + "step": 52780 + }, + { + "epoch": 1035.0, + "eval_loss": 0.3818568289279938, + "eval_runtime": 2.272, + "eval_samples_per_second": 1003.103, + "eval_steps_per_second": 3.961, + "step": 52785 + }, + { + "epoch": 1035.1, + "learning_rate": 7.6117793283516196e-06, + "loss": 0.3697, + "step": 52790 + }, + { + "epoch": 1035.29, + "learning_rate": 7.594000026394134e-06, + "loss": 0.3768, + "step": 52800 + }, + { + "epoch": 1035.49, + "learning_rate": 7.576240405485373e-06, + "loss": 0.3731, + "step": 52810 + }, + { + "epoch": 1035.69, + "learning_rate": 7.558500470810697e-06, + "loss": 0.3749, + "step": 52820 + }, + { + "epoch": 1035.88, + "learning_rate": 7.540780227549811e-06, + "loss": 0.3696, + "step": 52830 + }, + { + "epoch": 1036.0, + "eval_loss": 0.38364723324775696, + "eval_runtime": 2.3835, + "eval_samples_per_second": 956.169, + "eval_steps_per_second": 3.776, + "step": 52836 + }, + { + "epoch": 1036.08, + "learning_rate": 7.523079680876612e-06, + "loss": 0.372, + "step": 52840 + }, + { + "epoch": 1036.27, + "learning_rate": 7.50539883595924e-06, + "loss": 0.3726, + "step": 52850 + }, + { + "epoch": 1036.47, + "learning_rate": 7.487737697960155e-06, + "loss": 0.3733, + "step": 52860 + }, + { + "epoch": 1036.67, + "learning_rate": 7.470096272035978e-06, + "loss": 0.3738, + "step": 52870 + }, + { + "epoch": 1036.86, + "learning_rate": 7.452474563337643e-06, + "loss": 0.3741, + "step": 52880 + }, + { + "epoch": 1037.0, + "eval_loss": 0.3813818693161011, + "eval_runtime": 2.2919, + "eval_samples_per_second": 994.355, + "eval_steps_per_second": 3.927, + "step": 52887 + }, + { + "epoch": 1037.06, + "learning_rate": 7.43487257701027e-06, + "loss": 0.3728, + "step": 52890 + }, + { + "epoch": 1037.25, + "learning_rate": 7.417290318193247e-06, + "loss": 0.3696, + "step": 52900 + }, + { + "epoch": 1037.45, + "learning_rate": 7.399727792020235e-06, + "loss": 0.3743, + "step": 52910 + }, + { + "epoch": 1037.65, + "learning_rate": 7.382185003619048e-06, + "loss": 0.3764, + "step": 52920 + }, + { + "epoch": 1037.84, + "learning_rate": 7.364661958111839e-06, + "loss": 0.3734, + "step": 52930 + }, + { + "epoch": 1038.0, + "eval_loss": 0.37986841797828674, + "eval_runtime": 2.3307, + "eval_samples_per_second": 977.807, + "eval_steps_per_second": 3.861, + "step": 52938 + }, + { + "epoch": 1038.04, + "learning_rate": 7.347158660614907e-06, + "loss": 0.3711, + "step": 52940 + }, + { + "epoch": 1038.24, + "learning_rate": 7.3296751162388475e-06, + "loss": 0.3734, + "step": 52950 + }, + { + "epoch": 1038.43, + "learning_rate": 7.3122113300884525e-06, + "loss": 0.3741, + "step": 52960 + }, + { + "epoch": 1038.63, + "learning_rate": 7.294767307262784e-06, + "loss": 0.373, + "step": 52970 + }, + { + "epoch": 1038.82, + "learning_rate": 7.277343052855084e-06, + "loss": 0.3759, + "step": 52980 + }, + { + "epoch": 1039.0, + "eval_loss": 0.3788532614707947, + "eval_runtime": 2.3042, + "eval_samples_per_second": 989.052, + "eval_steps_per_second": 3.906, + "step": 52989 + }, + { + "epoch": 1039.02, + "learning_rate": 7.259938571952833e-06, + "loss": 0.3736, + "step": 52990 + }, + { + "epoch": 1039.22, + "learning_rate": 7.242553869637793e-06, + "loss": 0.3782, + "step": 53000 + }, + { + "epoch": 1039.41, + "learning_rate": 7.225188950985852e-06, + "loss": 0.3711, + "step": 53010 + }, + { + "epoch": 1039.61, + "learning_rate": 7.207843821067239e-06, + "loss": 0.3719, + "step": 53020 + }, + { + "epoch": 1039.8, + "learning_rate": 7.190518484946309e-06, + "loss": 0.376, + "step": 53030 + }, + { + "epoch": 1040.0, + "learning_rate": 7.173212947681692e-06, + "loss": 0.3726, + "step": 53040 + }, + { + "epoch": 1040.0, + "eval_loss": 0.38017430901527405, + "eval_runtime": 2.2059, + "eval_samples_per_second": 1033.148, + "eval_steps_per_second": 4.08, + "step": 53040 + }, + { + "epoch": 1040.2, + "learning_rate": 7.155927214326213e-06, + "loss": 0.3687, + "step": 53050 + }, + { + "epoch": 1040.39, + "learning_rate": 7.138661289926892e-06, + "loss": 0.3779, + "step": 53060 + }, + { + "epoch": 1040.59, + "learning_rate": 7.121415179525039e-06, + "loss": 0.3764, + "step": 53070 + }, + { + "epoch": 1040.78, + "learning_rate": 7.104188888156109e-06, + "loss": 0.3772, + "step": 53080 + }, + { + "epoch": 1040.98, + "learning_rate": 7.086982420849812e-06, + "loss": 0.3693, + "step": 53090 + }, + { + "epoch": 1041.0, + "eval_loss": 0.37691381573677063, + "eval_runtime": 2.2297, + "eval_samples_per_second": 1022.133, + "eval_steps_per_second": 4.037, + "step": 53091 + }, + { + "epoch": 1041.18, + "learning_rate": 7.069795782630039e-06, + "loss": 0.3744, + "step": 53100 + }, + { + "epoch": 1041.37, + "learning_rate": 7.0526289785148824e-06, + "loss": 0.3716, + "step": 53110 + }, + { + "epoch": 1041.57, + "learning_rate": 7.035482013516716e-06, + "loss": 0.3705, + "step": 53120 + }, + { + "epoch": 1041.76, + "learning_rate": 7.018354892642028e-06, + "loss": 0.3755, + "step": 53130 + }, + { + "epoch": 1041.96, + "learning_rate": 7.001247620891592e-06, + "loss": 0.3705, + "step": 53140 + }, + { + "epoch": 1042.0, + "eval_loss": 0.3811741769313812, + "eval_runtime": 2.3416, + "eval_samples_per_second": 973.249, + "eval_steps_per_second": 3.843, + "step": 53142 + }, + { + "epoch": 1042.16, + "learning_rate": 6.984160203260323e-06, + "loss": 0.3728, + "step": 53150 + }, + { + "epoch": 1042.35, + "learning_rate": 6.967092644737368e-06, + "loss": 0.3718, + "step": 53160 + }, + { + "epoch": 1042.55, + "learning_rate": 6.950044950306094e-06, + "loss": 0.3709, + "step": 53170 + }, + { + "epoch": 1042.75, + "learning_rate": 6.9330171249440184e-06, + "loss": 0.374, + "step": 53180 + }, + { + "epoch": 1042.94, + "learning_rate": 6.916009173622914e-06, + "loss": 0.3691, + "step": 53190 + }, + { + "epoch": 1043.0, + "eval_loss": 0.3806150257587433, + "eval_runtime": 2.3941, + "eval_samples_per_second": 951.93, + "eval_steps_per_second": 3.759, + "step": 53193 + }, + { + "epoch": 1043.14, + "learning_rate": 6.899021101308699e-06, + "loss": 0.3748, + "step": 53200 + }, + { + "epoch": 1043.33, + "learning_rate": 6.882052912961533e-06, + "loss": 0.3745, + "step": 53210 + }, + { + "epoch": 1043.53, + "learning_rate": 6.865104613535718e-06, + "loss": 0.3736, + "step": 53220 + }, + { + "epoch": 1043.73, + "learning_rate": 6.848176207979822e-06, + "loss": 0.3703, + "step": 53230 + }, + { + "epoch": 1043.92, + "learning_rate": 6.83126770123654e-06, + "loss": 0.3736, + "step": 53240 + }, + { + "epoch": 1044.0, + "eval_loss": 0.3796224892139435, + "eval_runtime": 2.3263, + "eval_samples_per_second": 979.664, + "eval_steps_per_second": 3.869, + "step": 53244 + }, + { + "epoch": 1044.12, + "learning_rate": 6.814379098242773e-06, + "loss": 0.3684, + "step": 53250 + }, + { + "epoch": 1044.31, + "learning_rate": 6.7975104039296266e-06, + "loss": 0.372, + "step": 53260 + }, + { + "epoch": 1044.51, + "learning_rate": 6.780661623222361e-06, + "loss": 0.3715, + "step": 53270 + }, + { + "epoch": 1044.71, + "learning_rate": 6.763832761040483e-06, + "loss": 0.3704, + "step": 53280 + }, + { + "epoch": 1044.9, + "learning_rate": 6.747023822297612e-06, + "loss": 0.3707, + "step": 53290 + }, + { + "epoch": 1045.0, + "eval_loss": 0.3784136474132538, + "eval_runtime": 2.3444, + "eval_samples_per_second": 972.094, + "eval_steps_per_second": 3.839, + "step": 53295 + }, + { + "epoch": 1045.1, + "learning_rate": 6.730234811901614e-06, + "loss": 0.3733, + "step": 53300 + }, + { + "epoch": 1045.29, + "learning_rate": 6.713465734754475e-06, + "loss": 0.3748, + "step": 53310 + }, + { + "epoch": 1045.49, + "learning_rate": 6.696716595752388e-06, + "loss": 0.3723, + "step": 53320 + }, + { + "epoch": 1045.69, + "learning_rate": 6.679987399785766e-06, + "loss": 0.3714, + "step": 53330 + }, + { + "epoch": 1045.88, + "learning_rate": 6.663278151739135e-06, + "loss": 0.3735, + "step": 53340 + }, + { + "epoch": 1046.0, + "eval_loss": 0.3752482831478119, + "eval_runtime": 2.3265, + "eval_samples_per_second": 979.576, + "eval_steps_per_second": 3.868, + "step": 53346 + }, + { + "epoch": 1046.08, + "learning_rate": 6.646588856491234e-06, + "loss": 0.3691, + "step": 53350 + }, + { + "epoch": 1046.27, + "learning_rate": 6.629919518914939e-06, + "loss": 0.3676, + "step": 53360 + }, + { + "epoch": 1046.47, + "learning_rate": 6.61327014387735e-06, + "loss": 0.3704, + "step": 53370 + }, + { + "epoch": 1046.67, + "learning_rate": 6.59664073623972e-06, + "loss": 0.3789, + "step": 53380 + }, + { + "epoch": 1046.86, + "learning_rate": 6.580031300857438e-06, + "loss": 0.3773, + "step": 53390 + }, + { + "epoch": 1047.0, + "eval_loss": 0.38012462854385376, + "eval_runtime": 2.3216, + "eval_samples_per_second": 981.637, + "eval_steps_per_second": 3.877, + "step": 53397 + }, + { + "epoch": 1047.06, + "learning_rate": 6.563441842580111e-06, + "loss": 0.3798, + "step": 53400 + }, + { + "epoch": 1047.25, + "learning_rate": 6.54687236625148e-06, + "loss": 0.3741, + "step": 53410 + }, + { + "epoch": 1047.45, + "learning_rate": 6.530322876709465e-06, + "loss": 0.3705, + "step": 53420 + }, + { + "epoch": 1047.65, + "learning_rate": 6.513793378786136e-06, + "loss": 0.3742, + "step": 53430 + }, + { + "epoch": 1047.84, + "learning_rate": 6.4972838773077655e-06, + "loss": 0.3714, + "step": 53440 + }, + { + "epoch": 1048.0, + "eval_loss": 0.38000649213790894, + "eval_runtime": 2.2751, + "eval_samples_per_second": 1001.725, + "eval_steps_per_second": 3.956, + "step": 53448 + }, + { + "epoch": 1048.04, + "learning_rate": 6.4807943770947475e-06, + "loss": 0.3758, + "step": 53450 + }, + { + "epoch": 1048.24, + "learning_rate": 6.46432488296163e-06, + "loss": 0.3754, + "step": 53460 + }, + { + "epoch": 1048.43, + "learning_rate": 6.4478753997171675e-06, + "loss": 0.3754, + "step": 53470 + }, + { + "epoch": 1048.63, + "learning_rate": 6.4314459321642e-06, + "loss": 0.3703, + "step": 53480 + }, + { + "epoch": 1048.82, + "learning_rate": 6.415036485099825e-06, + "loss": 0.3747, + "step": 53490 + }, + { + "epoch": 1049.0, + "eval_loss": 0.3787485957145691, + "eval_runtime": 2.204, + "eval_samples_per_second": 1034.047, + "eval_steps_per_second": 4.084, + "step": 53499 + }, + { + "epoch": 1049.02, + "learning_rate": 6.3986470633151845e-06, + "loss": 0.3682, + "step": 53500 + }, + { + "epoch": 1049.22, + "learning_rate": 6.382277671595659e-06, + "loss": 0.368, + "step": 53510 + }, + { + "epoch": 1049.41, + "learning_rate": 6.365928314720725e-06, + "loss": 0.3749, + "step": 53520 + }, + { + "epoch": 1049.61, + "learning_rate": 6.349598997464015e-06, + "loss": 0.3713, + "step": 53530 + }, + { + "epoch": 1049.8, + "learning_rate": 6.333289724593363e-06, + "loss": 0.3732, + "step": 53540 + }, + { + "epoch": 1050.0, + "learning_rate": 6.317000500870687e-06, + "loss": 0.3735, + "step": 53550 + }, + { + "epoch": 1050.0, + "eval_loss": 0.3775447905063629, + "eval_runtime": 2.2154, + "eval_samples_per_second": 1028.709, + "eval_steps_per_second": 4.062, + "step": 53550 + }, + { + "epoch": 1050.2, + "learning_rate": 6.3007313310520975e-06, + "loss": 0.367, + "step": 53560 + }, + { + "epoch": 1050.39, + "learning_rate": 6.2844822198878046e-06, + "loss": 0.3703, + "step": 53570 + }, + { + "epoch": 1050.59, + "learning_rate": 6.268253172122204e-06, + "loss": 0.3666, + "step": 53580 + }, + { + "epoch": 1050.78, + "learning_rate": 6.252044192493813e-06, + "loss": 0.3735, + "step": 53590 + }, + { + "epoch": 1050.98, + "learning_rate": 6.235855285735289e-06, + "loss": 0.3727, + "step": 53600 + }, + { + "epoch": 1051.0, + "eval_loss": 0.37708601355552673, + "eval_runtime": 2.3593, + "eval_samples_per_second": 965.956, + "eval_steps_per_second": 3.815, + "step": 53601 + }, + { + "epoch": 1051.18, + "learning_rate": 6.219686456573434e-06, + "loss": 0.374, + "step": 53610 + }, + { + "epoch": 1051.37, + "learning_rate": 6.203537709729178e-06, + "loss": 0.3726, + "step": 53620 + }, + { + "epoch": 1051.57, + "learning_rate": 6.187409049917611e-06, + "loss": 0.3717, + "step": 53630 + }, + { + "epoch": 1051.76, + "learning_rate": 6.171300481847905e-06, + "loss": 0.3703, + "step": 53640 + }, + { + "epoch": 1051.96, + "learning_rate": 6.155212010223457e-06, + "loss": 0.3736, + "step": 53650 + }, + { + "epoch": 1052.0, + "eval_loss": 0.38328659534454346, + "eval_runtime": 2.4265, + "eval_samples_per_second": 939.22, + "eval_steps_per_second": 3.709, + "step": 53652 + }, + { + "epoch": 1052.16, + "learning_rate": 6.1391436397417084e-06, + "loss": 0.3724, + "step": 53660 + }, + { + "epoch": 1052.35, + "learning_rate": 6.123095375094267e-06, + "loss": 0.3723, + "step": 53670 + }, + { + "epoch": 1052.55, + "learning_rate": 6.107067220966874e-06, + "loss": 0.3691, + "step": 53680 + }, + { + "epoch": 1052.75, + "learning_rate": 6.0910591820393705e-06, + "loss": 0.3719, + "step": 53690 + }, + { + "epoch": 1052.94, + "learning_rate": 6.0750712629858005e-06, + "loss": 0.3676, + "step": 53700 + }, + { + "epoch": 1053.0, + "eval_loss": 0.37962618470191956, + "eval_runtime": 2.3919, + "eval_samples_per_second": 952.79, + "eval_steps_per_second": 3.763, + "step": 53703 + }, + { + "epoch": 1053.14, + "learning_rate": 6.059103468474222e-06, + "loss": 0.372, + "step": 53710 + }, + { + "epoch": 1053.33, + "learning_rate": 6.043155803166921e-06, + "loss": 0.3712, + "step": 53720 + }, + { + "epoch": 1053.53, + "learning_rate": 6.027228271720233e-06, + "loss": 0.3705, + "step": 53730 + }, + { + "epoch": 1053.73, + "learning_rate": 6.011320878784629e-06, + "loss": 0.3735, + "step": 53740 + }, + { + "epoch": 1053.92, + "learning_rate": 5.99543362900475e-06, + "loss": 0.3688, + "step": 53750 + }, + { + "epoch": 1054.0, + "eval_loss": 0.3757660686969757, + "eval_runtime": 2.255, + "eval_samples_per_second": 1010.632, + "eval_steps_per_second": 3.991, + "step": 53754 + }, + { + "epoch": 1054.12, + "learning_rate": 5.979566527019289e-06, + "loss": 0.3707, + "step": 53760 + }, + { + "epoch": 1054.31, + "learning_rate": 5.963719577461112e-06, + "loss": 0.3738, + "step": 53770 + }, + { + "epoch": 1054.51, + "learning_rate": 5.947892784957162e-06, + "loss": 0.3766, + "step": 53780 + }, + { + "epoch": 1054.71, + "learning_rate": 5.932086154128474e-06, + "loss": 0.3759, + "step": 53790 + }, + { + "epoch": 1054.9, + "learning_rate": 5.916299689590298e-06, + "loss": 0.369, + "step": 53800 + }, + { + "epoch": 1055.0, + "eval_loss": 0.3774784207344055, + "eval_runtime": 2.2073, + "eval_samples_per_second": 1032.485, + "eval_steps_per_second": 4.077, + "step": 53805 + }, + { + "epoch": 1055.1, + "learning_rate": 5.900533395951881e-06, + "loss": 0.3718, + "step": 53810 + }, + { + "epoch": 1055.29, + "learning_rate": 5.884787277816649e-06, + "loss": 0.3693, + "step": 53820 + }, + { + "epoch": 1055.49, + "learning_rate": 5.869061339782116e-06, + "loss": 0.372, + "step": 53830 + }, + { + "epoch": 1055.69, + "learning_rate": 5.853355586439901e-06, + "loss": 0.374, + "step": 53840 + }, + { + "epoch": 1055.88, + "learning_rate": 5.837670022375734e-06, + "loss": 0.3696, + "step": 53850 + }, + { + "epoch": 1056.0, + "eval_loss": 0.38110822439193726, + "eval_runtime": 2.2544, + "eval_samples_per_second": 1010.892, + "eval_steps_per_second": 3.992, + "step": 53856 + }, + { + "epoch": 1056.08, + "learning_rate": 5.822004652169445e-06, + "loss": 0.3728, + "step": 53860 + }, + { + "epoch": 1056.27, + "learning_rate": 5.806359480394992e-06, + "loss": 0.3676, + "step": 53870 + }, + { + "epoch": 1056.47, + "learning_rate": 5.790734511620387e-06, + "loss": 0.3764, + "step": 53880 + }, + { + "epoch": 1056.67, + "learning_rate": 5.775129750407806e-06, + "loss": 0.3732, + "step": 53890 + }, + { + "epoch": 1056.86, + "learning_rate": 5.759545201313445e-06, + "loss": 0.3707, + "step": 53900 + }, + { + "epoch": 1057.0, + "eval_loss": 0.3776305615901947, + "eval_runtime": 2.2002, + "eval_samples_per_second": 1035.8, + "eval_steps_per_second": 4.09, + "step": 53907 + }, + { + "epoch": 1057.06, + "learning_rate": 5.743980868887699e-06, + "loss": 0.3742, + "step": 53910 + }, + { + "epoch": 1057.25, + "learning_rate": 5.728436757674981e-06, + "loss": 0.3709, + "step": 53920 + }, + { + "epoch": 1057.45, + "learning_rate": 5.712912872213812e-06, + "loss": 0.3692, + "step": 53930 + }, + { + "epoch": 1057.65, + "learning_rate": 5.6974092170368414e-06, + "loss": 0.3766, + "step": 53940 + }, + { + "epoch": 1057.84, + "learning_rate": 5.681925796670756e-06, + "loss": 0.3765, + "step": 53950 + }, + { + "epoch": 1058.0, + "eval_loss": 0.3803638219833374, + "eval_runtime": 2.3012, + "eval_samples_per_second": 990.343, + "eval_steps_per_second": 3.911, + "step": 53958 + }, + { + "epoch": 1058.04, + "learning_rate": 5.666462615636422e-06, + "loss": 0.3715, + "step": 53960 + }, + { + "epoch": 1058.24, + "learning_rate": 5.6510196784487125e-06, + "loss": 0.3725, + "step": 53970 + }, + { + "epoch": 1058.43, + "learning_rate": 5.635596989616628e-06, + "loss": 0.3727, + "step": 53980 + }, + { + "epoch": 1058.63, + "learning_rate": 5.620194553643243e-06, + "loss": 0.372, + "step": 53990 + }, + { + "epoch": 1058.82, + "learning_rate": 5.604812375025708e-06, + "loss": 0.3697, + "step": 54000 + }, + { + "epoch": 1059.0, + "eval_loss": 0.3813176155090332, + "eval_runtime": 2.2044, + "eval_samples_per_second": 1033.843, + "eval_steps_per_second": 4.083, + "step": 54009 + }, + { + "epoch": 1059.02, + "learning_rate": 5.589450458255324e-06, + "loss": 0.3749, + "step": 54010 + }, + { + "epoch": 1059.22, + "learning_rate": 5.574108807817384e-06, + "loss": 0.373, + "step": 54020 + }, + { + "epoch": 1059.41, + "learning_rate": 5.558787428191341e-06, + "loss": 0.3729, + "step": 54030 + }, + { + "epoch": 1059.61, + "learning_rate": 5.543486323850666e-06, + "loss": 0.3722, + "step": 54040 + }, + { + "epoch": 1059.8, + "learning_rate": 5.528205499262958e-06, + "loss": 0.3719, + "step": 54050 + }, + { + "epoch": 1060.0, + "learning_rate": 5.512944958889867e-06, + "loss": 0.3718, + "step": 54060 + }, + { + "epoch": 1060.0, + "eval_loss": 0.3722068667411804, + "eval_runtime": 2.3645, + "eval_samples_per_second": 963.858, + "eval_steps_per_second": 3.806, + "step": 54060 + }, + { + "epoch": 1060.2, + "learning_rate": 5.497704707187137e-06, + "loss": 0.3722, + "step": 54070 + }, + { + "epoch": 1060.39, + "learning_rate": 5.482484748604598e-06, + "loss": 0.3718, + "step": 54080 + }, + { + "epoch": 1060.59, + "learning_rate": 5.467285087586107e-06, + "loss": 0.3773, + "step": 54090 + }, + { + "epoch": 1060.78, + "learning_rate": 5.452105728569644e-06, + "loss": 0.3672, + "step": 54100 + }, + { + "epoch": 1060.98, + "learning_rate": 5.436946675987225e-06, + "loss": 0.3699, + "step": 54110 + }, + { + "epoch": 1061.0, + "eval_loss": 0.37705689668655396, + "eval_runtime": 2.2209, + "eval_samples_per_second": 1026.164, + "eval_steps_per_second": 4.052, + "step": 54111 + }, + { + "epoch": 1061.18, + "learning_rate": 5.4218079342649906e-06, + "loss": 0.3643, + "step": 54120 + }, + { + "epoch": 1061.37, + "learning_rate": 5.4066895078230894e-06, + "loss": 0.3655, + "step": 54130 + }, + { + "epoch": 1061.57, + "learning_rate": 5.391591401075765e-06, + "loss": 0.3724, + "step": 54140 + }, + { + "epoch": 1061.76, + "learning_rate": 5.376513618431349e-06, + "loss": 0.3729, + "step": 54150 + }, + { + "epoch": 1061.96, + "learning_rate": 5.361456164292171e-06, + "loss": 0.3725, + "step": 54160 + }, + { + "epoch": 1062.0, + "eval_loss": 0.3779694437980652, + "eval_runtime": 2.3417, + "eval_samples_per_second": 973.217, + "eval_steps_per_second": 3.843, + "step": 54162 + }, + { + "epoch": 1062.16, + "learning_rate": 5.346419043054731e-06, + "loss": 0.3711, + "step": 54170 + }, + { + "epoch": 1062.35, + "learning_rate": 5.331402259109491e-06, + "loss": 0.3718, + "step": 54180 + }, + { + "epoch": 1062.55, + "learning_rate": 5.316405816841035e-06, + "loss": 0.375, + "step": 54190 + }, + { + "epoch": 1062.75, + "learning_rate": 5.3014297206279945e-06, + "loss": 0.375, + "step": 54200 + }, + { + "epoch": 1062.94, + "learning_rate": 5.286473974843022e-06, + "loss": 0.3705, + "step": 54210 + }, + { + "epoch": 1063.0, + "eval_loss": 0.37669458985328674, + "eval_runtime": 2.2851, + "eval_samples_per_second": 997.336, + "eval_steps_per_second": 3.939, + "step": 54213 + }, + { + "epoch": 1063.14, + "learning_rate": 5.271538583852908e-06, + "loss": 0.3732, + "step": 54220 + }, + { + "epoch": 1063.33, + "learning_rate": 5.256623552018421e-06, + "loss": 0.3708, + "step": 54230 + }, + { + "epoch": 1063.53, + "learning_rate": 5.241728883694446e-06, + "loss": 0.3735, + "step": 54240 + }, + { + "epoch": 1063.73, + "learning_rate": 5.226854583229853e-06, + "loss": 0.3736, + "step": 54250 + }, + { + "epoch": 1063.92, + "learning_rate": 5.2120006549676516e-06, + "loss": 0.3698, + "step": 54260 + }, + { + "epoch": 1064.0, + "eval_loss": 0.3782898783683777, + "eval_runtime": 2.2588, + "eval_samples_per_second": 1008.939, + "eval_steps_per_second": 3.984, + "step": 54264 + }, + { + "epoch": 1064.12, + "learning_rate": 5.197167103244823e-06, + "loss": 0.375, + "step": 54270 + }, + { + "epoch": 1064.31, + "learning_rate": 5.182353932392435e-06, + "loss": 0.3668, + "step": 54280 + }, + { + "epoch": 1064.51, + "learning_rate": 5.1675611467356385e-06, + "loss": 0.3702, + "step": 54290 + }, + { + "epoch": 1064.71, + "learning_rate": 5.152788750593559e-06, + "loss": 0.3717, + "step": 54300 + }, + { + "epoch": 1064.9, + "learning_rate": 5.138036748279431e-06, + "loss": 0.374, + "step": 54310 + }, + { + "epoch": 1065.0, + "eval_loss": 0.3775031268596649, + "eval_runtime": 2.3721, + "eval_samples_per_second": 960.751, + "eval_steps_per_second": 3.794, + "step": 54315 + }, + { + "epoch": 1065.1, + "learning_rate": 5.123305144100467e-06, + "loss": 0.3726, + "step": 54320 + }, + { + "epoch": 1065.29, + "learning_rate": 5.108593942358036e-06, + "loss": 0.3703, + "step": 54330 + }, + { + "epoch": 1065.49, + "learning_rate": 5.0939031473474336e-06, + "loss": 0.3685, + "step": 54340 + }, + { + "epoch": 1065.69, + "learning_rate": 5.079232763358046e-06, + "loss": 0.3701, + "step": 54350 + }, + { + "epoch": 1065.88, + "learning_rate": 5.0645827946733215e-06, + "loss": 0.3665, + "step": 54360 + }, + { + "epoch": 1066.0, + "eval_loss": 0.3812878131866455, + "eval_runtime": 2.3196, + "eval_samples_per_second": 982.514, + "eval_steps_per_second": 3.88, + "step": 54366 + }, + { + "epoch": 1066.08, + "learning_rate": 5.04995324557069e-06, + "loss": 0.3756, + "step": 54370 + }, + { + "epoch": 1066.27, + "learning_rate": 5.035344120321691e-06, + "loss": 0.3716, + "step": 54380 + }, + { + "epoch": 1066.47, + "learning_rate": 5.020755423191839e-06, + "loss": 0.3706, + "step": 54390 + }, + { + "epoch": 1066.67, + "learning_rate": 5.006187158440716e-06, + "loss": 0.371, + "step": 54400 + }, + { + "epoch": 1066.86, + "learning_rate": 4.991639330321939e-06, + "loss": 0.3695, + "step": 54410 + }, + { + "epoch": 1067.0, + "eval_loss": 0.38005512952804565, + "eval_runtime": 2.33, + "eval_samples_per_second": 978.108, + "eval_steps_per_second": 3.863, + "step": 54417 + }, + { + "epoch": 1067.06, + "learning_rate": 4.977111943083118e-06, + "loss": 0.3717, + "step": 54420 + }, + { + "epoch": 1067.25, + "learning_rate": 4.962605000965958e-06, + "loss": 0.3732, + "step": 54430 + }, + { + "epoch": 1067.45, + "learning_rate": 4.948118508206156e-06, + "loss": 0.3696, + "step": 54440 + }, + { + "epoch": 1067.65, + "learning_rate": 4.933652469033444e-06, + "loss": 0.3717, + "step": 54450 + }, + { + "epoch": 1067.84, + "learning_rate": 4.9192068876715704e-06, + "loss": 0.3705, + "step": 54460 + }, + { + "epoch": 1068.0, + "eval_loss": 0.38045910000801086, + "eval_runtime": 2.2201, + "eval_samples_per_second": 1026.519, + "eval_steps_per_second": 4.054, + "step": 54468 + }, + { + "epoch": 1068.04, + "learning_rate": 4.904781768338342e-06, + "loss": 0.3715, + "step": 54470 + }, + { + "epoch": 1068.24, + "learning_rate": 4.8903771152455505e-06, + "loss": 0.3755, + "step": 54480 + }, + { + "epoch": 1068.43, + "learning_rate": 4.875992932599046e-06, + "loss": 0.3715, + "step": 54490 + }, + { + "epoch": 1068.63, + "learning_rate": 4.861629224598695e-06, + "loss": 0.3718, + "step": 54500 + }, + { + "epoch": 1068.82, + "learning_rate": 4.847285995438369e-06, + "loss": 0.3709, + "step": 54510 + }, + { + "epoch": 1069.0, + "eval_loss": 0.3779762387275696, + "eval_runtime": 2.3441, + "eval_samples_per_second": 972.235, + "eval_steps_per_second": 3.839, + "step": 54519 + }, + { + "epoch": 1069.02, + "learning_rate": 4.832963249305982e-06, + "loss": 0.3705, + "step": 54520 + }, + { + "epoch": 1069.22, + "learning_rate": 4.818660990383441e-06, + "loss": 0.3656, + "step": 54530 + }, + { + "epoch": 1069.41, + "learning_rate": 4.804379222846696e-06, + "loss": 0.3681, + "step": 54540 + }, + { + "epoch": 1069.61, + "learning_rate": 4.790117950865713e-06, + "loss": 0.3762, + "step": 54550 + }, + { + "epoch": 1069.8, + "learning_rate": 4.775877178604442e-06, + "loss": 0.3734, + "step": 54560 + }, + { + "epoch": 1070.0, + "learning_rate": 4.761656910220901e-06, + "loss": 0.3762, + "step": 54570 + }, + { + "epoch": 1070.0, + "eval_loss": 0.37581372261047363, + "eval_runtime": 2.3052, + "eval_samples_per_second": 988.632, + "eval_steps_per_second": 3.904, + "step": 54570 + }, + { + "epoch": 1070.2, + "learning_rate": 4.747457149867051e-06, + "loss": 0.3741, + "step": 54580 + }, + { + "epoch": 1070.39, + "learning_rate": 4.733277901688951e-06, + "loss": 0.3705, + "step": 54590 + }, + { + "epoch": 1070.59, + "learning_rate": 4.719119169826605e-06, + "loss": 0.3679, + "step": 54600 + }, + { + "epoch": 1070.78, + "learning_rate": 4.704980958414031e-06, + "loss": 0.3715, + "step": 54610 + }, + { + "epoch": 1070.98, + "learning_rate": 4.690863271579304e-06, + "loss": 0.3718, + "step": 54620 + }, + { + "epoch": 1071.0, + "eval_loss": 0.38009241223335266, + "eval_runtime": 2.2507, + "eval_samples_per_second": 1012.56, + "eval_steps_per_second": 3.999, + "step": 54621 + }, + { + "epoch": 1071.18, + "learning_rate": 4.676766113444425e-06, + "loss": 0.3686, + "step": 54630 + }, + { + "epoch": 1071.37, + "learning_rate": 4.662689488125509e-06, + "loss": 0.3716, + "step": 54640 + }, + { + "epoch": 1071.57, + "learning_rate": 4.648633399732571e-06, + "loss": 0.37, + "step": 54650 + }, + { + "epoch": 1071.76, + "learning_rate": 4.6345978523697094e-06, + "loss": 0.3754, + "step": 54660 + }, + { + "epoch": 1071.96, + "learning_rate": 4.620582850134971e-06, + "loss": 0.3736, + "step": 54670 + }, + { + "epoch": 1072.0, + "eval_loss": 0.3768666684627533, + "eval_runtime": 2.3236, + "eval_samples_per_second": 980.799, + "eval_steps_per_second": 3.873, + "step": 54672 + }, + { + "epoch": 1072.16, + "learning_rate": 4.606588397120417e-06, + "loss": 0.3676, + "step": 54680 + }, + { + "epoch": 1072.35, + "learning_rate": 4.592614497412128e-06, + "loss": 0.3706, + "step": 54690 + }, + { + "epoch": 1072.55, + "learning_rate": 4.5786611550901655e-06, + "loss": 0.373, + "step": 54700 + }, + { + "epoch": 1072.75, + "learning_rate": 4.564728374228613e-06, + "loss": 0.3732, + "step": 54710 + }, + { + "epoch": 1072.94, + "learning_rate": 4.5508161588954986e-06, + "loss": 0.3702, + "step": 54720 + }, + { + "epoch": 1073.0, + "eval_loss": 0.37629246711730957, + "eval_runtime": 2.2337, + "eval_samples_per_second": 1020.259, + "eval_steps_per_second": 4.029, + "step": 54723 + }, + { + "epoch": 1073.14, + "learning_rate": 4.536924513152915e-06, + "loss": 0.371, + "step": 54730 + }, + { + "epoch": 1073.33, + "learning_rate": 4.523053441056876e-06, + "loss": 0.3752, + "step": 54740 + }, + { + "epoch": 1073.53, + "learning_rate": 4.509202946657442e-06, + "loss": 0.3692, + "step": 54750 + }, + { + "epoch": 1073.73, + "learning_rate": 4.49537303399867e-06, + "loss": 0.3694, + "step": 54760 + }, + { + "epoch": 1073.92, + "learning_rate": 4.481563707118554e-06, + "loss": 0.3716, + "step": 54770 + }, + { + "epoch": 1074.0, + "eval_loss": 0.3790897727012634, + "eval_runtime": 2.3133, + "eval_samples_per_second": 985.167, + "eval_steps_per_second": 3.891, + "step": 54774 + }, + { + "epoch": 1074.12, + "learning_rate": 4.467774970049129e-06, + "loss": 0.3717, + "step": 54780 + }, + { + "epoch": 1074.31, + "learning_rate": 4.454006826816373e-06, + "loss": 0.3748, + "step": 54790 + }, + { + "epoch": 1074.51, + "learning_rate": 4.440259281440311e-06, + "loss": 0.3725, + "step": 54800 + }, + { + "epoch": 1074.71, + "learning_rate": 4.42653233793491e-06, + "loss": 0.3704, + "step": 54810 + }, + { + "epoch": 1074.9, + "learning_rate": 4.412826000308111e-06, + "loss": 0.3684, + "step": 54820 + }, + { + "epoch": 1075.0, + "eval_loss": 0.37449750304222107, + "eval_runtime": 2.2223, + "eval_samples_per_second": 1025.497, + "eval_steps_per_second": 4.05, + "step": 54825 + }, + { + "epoch": 1075.1, + "learning_rate": 4.399140272561882e-06, + "loss": 0.3727, + "step": 54830 + }, + { + "epoch": 1075.29, + "learning_rate": 4.3854751586921255e-06, + "loss": 0.37, + "step": 54840 + }, + { + "epoch": 1075.49, + "learning_rate": 4.3718306626887825e-06, + "loss": 0.367, + "step": 54850 + }, + { + "epoch": 1075.69, + "learning_rate": 4.3582067885357175e-06, + "loss": 0.3697, + "step": 54860 + }, + { + "epoch": 1075.88, + "learning_rate": 4.344603540210814e-06, + "loss": 0.3682, + "step": 54870 + }, + { + "epoch": 1076.0, + "eval_loss": 0.3796183168888092, + "eval_runtime": 2.2676, + "eval_samples_per_second": 1005.038, + "eval_steps_per_second": 3.969, + "step": 54876 + }, + { + "epoch": 1076.08, + "learning_rate": 4.3310209216859126e-06, + "loss": 0.3734, + "step": 54880 + }, + { + "epoch": 1076.27, + "learning_rate": 4.317458936926816e-06, + "loss": 0.3704, + "step": 54890 + }, + { + "epoch": 1076.47, + "learning_rate": 4.303917589893338e-06, + "loss": 0.3739, + "step": 54900 + }, + { + "epoch": 1076.67, + "learning_rate": 4.290396884539243e-06, + "loss": 0.3692, + "step": 54910 + }, + { + "epoch": 1076.86, + "learning_rate": 4.276896824812298e-06, + "loss": 0.3699, + "step": 54920 + }, + { + "epoch": 1077.0, + "eval_loss": 0.37840622663497925, + "eval_runtime": 2.2821, + "eval_samples_per_second": 998.656, + "eval_steps_per_second": 3.944, + "step": 54927 + }, + { + "epoch": 1077.06, + "learning_rate": 4.263417414654191e-06, + "loss": 0.369, + "step": 54930 + }, + { + "epoch": 1077.25, + "learning_rate": 4.2499586580006324e-06, + "loss": 0.3675, + "step": 54940 + }, + { + "epoch": 1077.45, + "learning_rate": 4.236520558781245e-06, + "loss": 0.3737, + "step": 54950 + }, + { + "epoch": 1077.65, + "learning_rate": 4.223103120919683e-06, + "loss": 0.3716, + "step": 54960 + }, + { + "epoch": 1077.84, + "learning_rate": 4.209706348333544e-06, + "loss": 0.3745, + "step": 54970 + }, + { + "epoch": 1078.0, + "eval_loss": 0.3793691396713257, + "eval_runtime": 2.2701, + "eval_samples_per_second": 1003.93, + "eval_steps_per_second": 3.965, + "step": 54978 + }, + { + "epoch": 1078.04, + "learning_rate": 4.1963302449343595e-06, + "loss": 0.3693, + "step": 54980 + }, + { + "epoch": 1078.24, + "learning_rate": 4.182974814627688e-06, + "loss": 0.3722, + "step": 54990 + }, + { + "epoch": 1078.43, + "learning_rate": 4.169640061312968e-06, + "loss": 0.3692, + "step": 55000 + }, + { + "epoch": 1078.63, + "learning_rate": 4.156325988883702e-06, + "loss": 0.371, + "step": 55010 + }, + { + "epoch": 1078.82, + "learning_rate": 4.143032601227281e-06, + "loss": 0.3721, + "step": 55020 + }, + { + "epoch": 1079.0, + "eval_loss": 0.37800872325897217, + "eval_runtime": 2.2342, + "eval_samples_per_second": 1020.035, + "eval_steps_per_second": 4.028, + "step": 55029 + }, + { + "epoch": 1079.02, + "learning_rate": 4.129759902225066e-06, + "loss": 0.3687, + "step": 55030 + }, + { + "epoch": 1079.22, + "learning_rate": 4.116507895752408e-06, + "loss": 0.3647, + "step": 55040 + }, + { + "epoch": 1079.41, + "learning_rate": 4.103276585678578e-06, + "loss": 0.3695, + "step": 55050 + }, + { + "epoch": 1079.61, + "learning_rate": 4.090065975866843e-06, + "loss": 0.371, + "step": 55060 + }, + { + "epoch": 1079.8, + "learning_rate": 4.076876070174395e-06, + "loss": 0.3698, + "step": 55070 + }, + { + "epoch": 1080.0, + "learning_rate": 4.063706872452402e-06, + "loss": 0.3758, + "step": 55080 + }, + { + "epoch": 1080.0, + "eval_loss": 0.3792489469051361, + "eval_runtime": 2.2691, + "eval_samples_per_second": 1004.359, + "eval_steps_per_second": 3.966, + "step": 55080 + }, + { + "epoch": 1080.2, + "learning_rate": 4.0505583865459714e-06, + "loss": 0.3715, + "step": 55090 + }, + { + "epoch": 1080.39, + "learning_rate": 4.037430616294157e-06, + "loss": 0.3664, + "step": 55100 + }, + { + "epoch": 1080.59, + "learning_rate": 4.024323565529977e-06, + "loss": 0.3658, + "step": 55110 + }, + { + "epoch": 1080.78, + "learning_rate": 4.011237238080412e-06, + "loss": 0.371, + "step": 55120 + }, + { + "epoch": 1080.98, + "learning_rate": 3.998171637766379e-06, + "loss": 0.3742, + "step": 55130 + }, + { + "epoch": 1081.0, + "eval_loss": 0.37813621759414673, + "eval_runtime": 2.3613, + "eval_samples_per_second": 965.161, + "eval_steps_per_second": 3.812, + "step": 55131 + }, + { + "epoch": 1081.18, + "learning_rate": 3.985126768402719e-06, + "loss": 0.3805, + "step": 55140 + }, + { + "epoch": 1081.37, + "learning_rate": 3.972102633798277e-06, + "loss": 0.3703, + "step": 55150 + }, + { + "epoch": 1081.57, + "learning_rate": 3.95909923775577e-06, + "loss": 0.3729, + "step": 55160 + }, + { + "epoch": 1081.76, + "learning_rate": 3.946116584071926e-06, + "loss": 0.3742, + "step": 55170 + }, + { + "epoch": 1081.96, + "learning_rate": 3.933154676537389e-06, + "loss": 0.3693, + "step": 55180 + }, + { + "epoch": 1082.0, + "eval_loss": 0.38186749815940857, + "eval_runtime": 2.2165, + "eval_samples_per_second": 1028.2, + "eval_steps_per_second": 4.06, + "step": 55182 + }, + { + "epoch": 1082.16, + "learning_rate": 3.920213518936732e-06, + "loss": 0.3734, + "step": 55190 + }, + { + "epoch": 1082.35, + "learning_rate": 3.907293115048507e-06, + "loss": 0.3682, + "step": 55200 + }, + { + "epoch": 1082.55, + "learning_rate": 3.894393468645163e-06, + "loss": 0.3734, + "step": 55210 + }, + { + "epoch": 1082.75, + "learning_rate": 3.881514583493111e-06, + "loss": 0.3684, + "step": 55220 + }, + { + "epoch": 1082.94, + "learning_rate": 3.868656463352721e-06, + "loss": 0.3676, + "step": 55230 + }, + { + "epoch": 1083.0, + "eval_loss": 0.37459880113601685, + "eval_runtime": 2.3079, + "eval_samples_per_second": 987.465, + "eval_steps_per_second": 3.9, + "step": 55233 + }, + { + "epoch": 1083.14, + "learning_rate": 3.8558191119782536e-06, + "loss": 0.3701, + "step": 55240 + }, + { + "epoch": 1083.33, + "learning_rate": 3.843002533117937e-06, + "loss": 0.3731, + "step": 55250 + }, + { + "epoch": 1083.53, + "learning_rate": 3.83020673051391e-06, + "loss": 0.3737, + "step": 55260 + }, + { + "epoch": 1083.73, + "learning_rate": 3.817431707902293e-06, + "loss": 0.3684, + "step": 55270 + }, + { + "epoch": 1083.92, + "learning_rate": 3.8046774690131037e-06, + "loss": 0.3684, + "step": 55280 + }, + { + "epoch": 1084.0, + "eval_loss": 0.3811744451522827, + "eval_runtime": 2.351, + "eval_samples_per_second": 969.387, + "eval_steps_per_second": 3.828, + "step": 55284 + }, + { + "epoch": 1084.12, + "learning_rate": 3.7919440175702615e-06, + "loss": 0.3702, + "step": 55290 + }, + { + "epoch": 1084.31, + "learning_rate": 3.779231357291684e-06, + "loss": 0.371, + "step": 55300 + }, + { + "epoch": 1084.51, + "learning_rate": 3.76653949188917e-06, + "loss": 0.3724, + "step": 55310 + }, + { + "epoch": 1084.71, + "learning_rate": 3.7538684250684626e-06, + "loss": 0.3679, + "step": 55320 + }, + { + "epoch": 1084.9, + "learning_rate": 3.7412181605292275e-06, + "loss": 0.3727, + "step": 55330 + }, + { + "epoch": 1085.0, + "eval_loss": 0.3744555711746216, + "eval_runtime": 2.2572, + "eval_samples_per_second": 1009.673, + "eval_steps_per_second": 3.987, + "step": 55335 + }, + { + "epoch": 1085.1, + "learning_rate": 3.728588701965077e-06, + "loss": 0.3642, + "step": 55340 + }, + { + "epoch": 1085.29, + "learning_rate": 3.715980053063519e-06, + "loss": 0.3697, + "step": 55350 + }, + { + "epoch": 1085.49, + "learning_rate": 3.703392217505985e-06, + "loss": 0.3695, + "step": 55360 + }, + { + "epoch": 1085.69, + "learning_rate": 3.6908251989678504e-06, + "loss": 0.3724, + "step": 55370 + }, + { + "epoch": 1085.88, + "learning_rate": 3.6782790011184228e-06, + "loss": 0.3689, + "step": 55380 + }, + { + "epoch": 1086.0, + "eval_loss": 0.3743017315864563, + "eval_runtime": 2.3114, + "eval_samples_per_second": 985.971, + "eval_steps_per_second": 3.894, + "step": 55386 + }, + { + "epoch": 1086.08, + "learning_rate": 3.665753627620896e-06, + "loss": 0.3701, + "step": 55390 + }, + { + "epoch": 1086.27, + "learning_rate": 3.653249082132395e-06, + "loss": 0.3729, + "step": 55400 + }, + { + "epoch": 1086.47, + "learning_rate": 3.6407653683039913e-06, + "loss": 0.3704, + "step": 55410 + }, + { + "epoch": 1086.67, + "learning_rate": 3.6283024897806185e-06, + "loss": 0.375, + "step": 55420 + }, + { + "epoch": 1086.86, + "learning_rate": 3.6158604502011744e-06, + "loss": 0.3704, + "step": 55430 + }, + { + "epoch": 1087.0, + "eval_loss": 0.37848153710365295, + "eval_runtime": 2.2484, + "eval_samples_per_second": 1013.59, + "eval_steps_per_second": 4.003, + "step": 55437 + }, + { + "epoch": 1087.06, + "learning_rate": 3.60343925319847e-06, + "loss": 0.3661, + "step": 55440 + }, + { + "epoch": 1087.25, + "learning_rate": 3.591038902399196e-06, + "loss": 0.3673, + "step": 55450 + }, + { + "epoch": 1087.45, + "learning_rate": 3.5786594014239973e-06, + "loss": 0.3714, + "step": 55460 + }, + { + "epoch": 1087.65, + "learning_rate": 3.5663007538873828e-06, + "loss": 0.3684, + "step": 55470 + }, + { + "epoch": 1087.84, + "learning_rate": 3.553962963397841e-06, + "loss": 0.3664, + "step": 55480 + }, + { + "epoch": 1088.0, + "eval_loss": 0.3773548901081085, + "eval_runtime": 2.3896, + "eval_samples_per_second": 953.703, + "eval_steps_per_second": 3.766, + "step": 55488 + }, + { + "epoch": 1088.04, + "learning_rate": 3.541646033557716e-06, + "loss": 0.3713, + "step": 55490 + }, + { + "epoch": 1088.24, + "learning_rate": 3.529349967963263e-06, + "loss": 0.3721, + "step": 55500 + }, + { + "epoch": 1088.43, + "learning_rate": 3.5170747702046782e-06, + "loss": 0.3678, + "step": 55510 + }, + { + "epoch": 1088.63, + "learning_rate": 3.5048204438660273e-06, + "loss": 0.3711, + "step": 55520 + }, + { + "epoch": 1088.82, + "learning_rate": 3.492586992525306e-06, + "loss": 0.3704, + "step": 55530 + }, + { + "epoch": 1089.0, + "eval_loss": 0.3757428526878357, + "eval_runtime": 2.3436, + "eval_samples_per_second": 972.42, + "eval_steps_per_second": 3.84, + "step": 55539 + }, + { + "epoch": 1089.02, + "learning_rate": 3.480374419754417e-06, + "loss": 0.3732, + "step": 55540 + }, + { + "epoch": 1089.22, + "learning_rate": 3.468182729119157e-06, + "loss": 0.3694, + "step": 55550 + }, + { + "epoch": 1089.41, + "learning_rate": 3.456011924179236e-06, + "loss": 0.3751, + "step": 55560 + }, + { + "epoch": 1089.61, + "learning_rate": 3.4438620084882294e-06, + "loss": 0.3696, + "step": 55570 + }, + { + "epoch": 1089.8, + "learning_rate": 3.431732985593666e-06, + "loss": 0.3698, + "step": 55580 + }, + { + "epoch": 1090.0, + "learning_rate": 3.4196248590369373e-06, + "loss": 0.3702, + "step": 55590 + }, + { + "epoch": 1090.0, + "eval_loss": 0.3789558708667755, + "eval_runtime": 2.3648, + "eval_samples_per_second": 963.724, + "eval_steps_per_second": 3.806, + "step": 55590 + }, + { + "epoch": 1090.2, + "learning_rate": 3.407537632353366e-06, + "loss": 0.3698, + "step": 55600 + }, + { + "epoch": 1090.39, + "learning_rate": 3.395471309072137e-06, + "loss": 0.3665, + "step": 55610 + }, + { + "epoch": 1090.59, + "learning_rate": 3.383425892716349e-06, + "loss": 0.3658, + "step": 55620 + }, + { + "epoch": 1090.78, + "learning_rate": 3.3714013868029883e-06, + "loss": 0.3726, + "step": 55630 + }, + { + "epoch": 1090.98, + "learning_rate": 3.3593977948429467e-06, + "loss": 0.3747, + "step": 55640 + }, + { + "epoch": 1091.0, + "eval_loss": 0.37976905703544617, + "eval_runtime": 2.348, + "eval_samples_per_second": 970.598, + "eval_steps_per_second": 3.833, + "step": 55641 + }, + { + "epoch": 1091.18, + "learning_rate": 3.347415120341029e-06, + "loss": 0.3697, + "step": 55650 + }, + { + "epoch": 1091.37, + "learning_rate": 3.3354533667958706e-06, + "loss": 0.3685, + "step": 55660 + }, + { + "epoch": 1091.57, + "learning_rate": 3.3235125377000597e-06, + "loss": 0.3715, + "step": 55670 + }, + { + "epoch": 1091.76, + "learning_rate": 3.3115926365400336e-06, + "loss": 0.3706, + "step": 55680 + }, + { + "epoch": 1091.96, + "learning_rate": 3.299693666796174e-06, + "loss": 0.3704, + "step": 55690 + }, + { + "epoch": 1092.0, + "eval_loss": 0.37564924359321594, + "eval_runtime": 2.3319, + "eval_samples_per_second": 977.321, + "eval_steps_per_second": 3.86, + "step": 55692 + }, + { + "epoch": 1092.16, + "learning_rate": 3.2878156319426864e-06, + "loss": 0.3706, + "step": 55700 + }, + { + "epoch": 1092.35, + "learning_rate": 3.275958535447687e-06, + "loss": 0.3666, + "step": 55710 + }, + { + "epoch": 1092.55, + "learning_rate": 3.264122380773207e-06, + "loss": 0.3663, + "step": 55720 + }, + { + "epoch": 1092.75, + "learning_rate": 3.2523071713751154e-06, + "loss": 0.3703, + "step": 55730 + }, + { + "epoch": 1092.94, + "learning_rate": 3.2405129107032023e-06, + "loss": 0.3749, + "step": 55740 + }, + { + "epoch": 1093.0, + "eval_loss": 0.3782815933227539, + "eval_runtime": 2.2261, + "eval_samples_per_second": 1023.743, + "eval_steps_per_second": 4.043, + "step": 55743 + }, + { + "epoch": 1093.14, + "learning_rate": 3.228739602201122e-06, + "loss": 0.3711, + "step": 55750 + }, + { + "epoch": 1093.33, + "learning_rate": 3.216987249306441e-06, + "loss": 0.3737, + "step": 55760 + }, + { + "epoch": 1093.53, + "learning_rate": 3.205255855450564e-06, + "loss": 0.3723, + "step": 55770 + }, + { + "epoch": 1093.73, + "learning_rate": 3.1935454240587854e-06, + "loss": 0.3728, + "step": 55780 + }, + { + "epoch": 1093.92, + "learning_rate": 3.181855958550311e-06, + "loss": 0.3686, + "step": 55790 + }, + { + "epoch": 1094.0, + "eval_loss": 0.37587156891822815, + "eval_runtime": 2.2492, + "eval_samples_per_second": 1013.239, + "eval_steps_per_second": 4.001, + "step": 55794 + }, + { + "epoch": 1094.12, + "learning_rate": 3.170187462338186e-06, + "loss": 0.3728, + "step": 55800 + }, + { + "epoch": 1094.31, + "learning_rate": 3.158539938829377e-06, + "loss": 0.3706, + "step": 55810 + }, + { + "epoch": 1094.51, + "learning_rate": 3.1469133914246797e-06, + "loss": 0.3695, + "step": 55820 + }, + { + "epoch": 1094.71, + "learning_rate": 3.135307823518796e-06, + "loss": 0.3723, + "step": 55830 + }, + { + "epoch": 1094.9, + "learning_rate": 3.123723238500289e-06, + "loss": 0.369, + "step": 55840 + }, + { + "epoch": 1095.0, + "eval_loss": 0.3761863708496094, + "eval_runtime": 2.3314, + "eval_samples_per_second": 977.508, + "eval_steps_per_second": 3.86, + "step": 55845 + }, + { + "epoch": 1095.1, + "learning_rate": 3.112159639751588e-06, + "loss": 0.3698, + "step": 55850 + }, + { + "epoch": 1095.29, + "learning_rate": 3.100617030649033e-06, + "loss": 0.3705, + "step": 55860 + }, + { + "epoch": 1095.49, + "learning_rate": 3.0890954145627868e-06, + "loss": 0.3729, + "step": 55870 + }, + { + "epoch": 1095.69, + "learning_rate": 3.0775947948569165e-06, + "loss": 0.366, + "step": 55880 + }, + { + "epoch": 1095.88, + "learning_rate": 3.066115174889336e-06, + "loss": 0.3671, + "step": 55890 + }, + { + "epoch": 1096.0, + "eval_loss": 0.3782743811607361, + "eval_runtime": 2.3706, + "eval_samples_per_second": 961.358, + "eval_steps_per_second": 3.796, + "step": 55896 + }, + { + "epoch": 1096.08, + "learning_rate": 3.0546565580118393e-06, + "loss": 0.3672, + "step": 55900 + }, + { + "epoch": 1096.27, + "learning_rate": 3.0432189475701003e-06, + "loss": 0.3673, + "step": 55910 + }, + { + "epoch": 1096.47, + "learning_rate": 3.0318023469036225e-06, + "loss": 0.3749, + "step": 55920 + }, + { + "epoch": 1096.67, + "learning_rate": 3.020406759345831e-06, + "loss": 0.3692, + "step": 55930 + }, + { + "epoch": 1096.86, + "learning_rate": 3.0090321882239477e-06, + "loss": 0.3686, + "step": 55940 + }, + { + "epoch": 1097.0, + "eval_loss": 0.3780481815338135, + "eval_runtime": 2.4515, + "eval_samples_per_second": 929.638, + "eval_steps_per_second": 3.671, + "step": 55947 + }, + { + "epoch": 1097.06, + "learning_rate": 2.997678636859116e-06, + "loss": 0.3673, + "step": 55950 + }, + { + "epoch": 1097.25, + "learning_rate": 2.986346108566326e-06, + "loss": 0.3729, + "step": 55960 + }, + { + "epoch": 1097.45, + "learning_rate": 2.975034606654397e-06, + "loss": 0.3734, + "step": 55970 + }, + { + "epoch": 1097.65, + "learning_rate": 2.963744134426063e-06, + "loss": 0.3739, + "step": 55980 + }, + { + "epoch": 1097.84, + "learning_rate": 2.95247469517787e-06, + "loss": 0.3693, + "step": 55990 + }, + { + "epoch": 1098.0, + "eval_loss": 0.3777942657470703, + "eval_runtime": 2.3668, + "eval_samples_per_second": 962.905, + "eval_steps_per_second": 3.803, + "step": 55998 + }, + { + "epoch": 1098.04, + "learning_rate": 2.941226292200244e-06, + "loss": 0.3665, + "step": 56000 + }, + { + "epoch": 1098.24, + "learning_rate": 2.929998928777483e-06, + "loss": 0.374, + "step": 56010 + }, + { + "epoch": 1098.43, + "learning_rate": 2.9187926081877146e-06, + "loss": 0.3745, + "step": 56020 + }, + { + "epoch": 1098.63, + "learning_rate": 2.9076073337029464e-06, + "loss": 0.3685, + "step": 56030 + }, + { + "epoch": 1098.82, + "learning_rate": 2.896443108589008e-06, + "loss": 0.3728, + "step": 56040 + }, + { + "epoch": 1099.0, + "eval_loss": 0.375933438539505, + "eval_runtime": 2.4335, + "eval_samples_per_second": 936.505, + "eval_steps_per_second": 3.698, + "step": 56049 + }, + { + "epoch": 1099.02, + "learning_rate": 2.8852999361056173e-06, + "loss": 0.3733, + "step": 56050 + }, + { + "epoch": 1099.22, + "learning_rate": 2.8741778195063377e-06, + "loss": 0.3732, + "step": 56060 + }, + { + "epoch": 1099.41, + "learning_rate": 2.8630767620385713e-06, + "loss": 0.3675, + "step": 56070 + }, + { + "epoch": 1099.61, + "learning_rate": 2.851996766943576e-06, + "loss": 0.3739, + "step": 56080 + }, + { + "epoch": 1099.8, + "learning_rate": 2.8409378374564806e-06, + "loss": 0.3705, + "step": 56090 + }, + { + "epoch": 1100.0, + "learning_rate": 2.829899976806219e-06, + "loss": 0.3715, + "step": 56100 + }, + { + "epoch": 1100.0, + "eval_loss": 0.3777158260345459, + "eval_runtime": 2.2906, + "eval_samples_per_second": 994.948, + "eval_steps_per_second": 3.929, + "step": 56100 + }, + { + "epoch": 1100.2, + "learning_rate": 2.8188831882156205e-06, + "loss": 0.3721, + "step": 56110 + }, + { + "epoch": 1100.39, + "learning_rate": 2.8078874749013463e-06, + "loss": 0.367, + "step": 56120 + }, + { + "epoch": 1100.59, + "learning_rate": 2.79691284007387e-06, + "loss": 0.3694, + "step": 56130 + }, + { + "epoch": 1100.78, + "learning_rate": 2.785959286937578e-06, + "loss": 0.3682, + "step": 56140 + }, + { + "epoch": 1100.98, + "learning_rate": 2.775026818690629e-06, + "loss": 0.3712, + "step": 56150 + }, + { + "epoch": 1101.0, + "eval_loss": 0.37754717469215393, + "eval_runtime": 2.2472, + "eval_samples_per_second": 1014.149, + "eval_steps_per_second": 4.005, + "step": 56151 + }, + { + "epoch": 1101.18, + "learning_rate": 2.7641154385250772e-06, + "loss": 0.3703, + "step": 56160 + }, + { + "epoch": 1101.37, + "learning_rate": 2.753225149626809e-06, + "loss": 0.3712, + "step": 56170 + }, + { + "epoch": 1101.57, + "learning_rate": 2.7423559551755376e-06, + "loss": 0.3735, + "step": 56180 + }, + { + "epoch": 1101.76, + "learning_rate": 2.7315078583448254e-06, + "loss": 0.368, + "step": 56190 + }, + { + "epoch": 1101.96, + "learning_rate": 2.7206808623020633e-06, + "loss": 0.3695, + "step": 56200 + }, + { + "epoch": 1102.0, + "eval_loss": 0.3767015337944031, + "eval_runtime": 2.277, + "eval_samples_per_second": 1000.886, + "eval_steps_per_second": 3.953, + "step": 56202 + }, + { + "epoch": 1102.16, + "learning_rate": 2.7098749702085142e-06, + "loss": 0.3718, + "step": 56210 + }, + { + "epoch": 1102.35, + "learning_rate": 2.699090185219238e-06, + "loss": 0.3709, + "step": 56220 + }, + { + "epoch": 1102.55, + "learning_rate": 2.6883265104831743e-06, + "loss": 0.3696, + "step": 56230 + }, + { + "epoch": 1102.75, + "learning_rate": 2.677583949143067e-06, + "loss": 0.3741, + "step": 56240 + }, + { + "epoch": 1102.94, + "learning_rate": 2.666862504335482e-06, + "loss": 0.3715, + "step": 56250 + }, + { + "epoch": 1103.0, + "eval_loss": 0.37621647119522095, + "eval_runtime": 2.3576, + "eval_samples_per_second": 966.651, + "eval_steps_per_second": 3.817, + "step": 56253 + }, + { + "epoch": 1103.14, + "learning_rate": 2.6561621791908655e-06, + "loss": 0.3712, + "step": 56260 + }, + { + "epoch": 1103.33, + "learning_rate": 2.6454829768334686e-06, + "loss": 0.3662, + "step": 56270 + }, + { + "epoch": 1103.53, + "learning_rate": 2.6348249003813883e-06, + "loss": 0.3704, + "step": 56280 + }, + { + "epoch": 1103.73, + "learning_rate": 2.6241879529465273e-06, + "loss": 0.3718, + "step": 56290 + }, + { + "epoch": 1103.92, + "learning_rate": 2.6135721376346592e-06, + "loss": 0.3728, + "step": 56300 + }, + { + "epoch": 1104.0, + "eval_loss": 0.3774849474430084, + "eval_runtime": 2.2419, + "eval_samples_per_second": 1016.552, + "eval_steps_per_second": 4.014, + "step": 56304 + }, + { + "epoch": 1104.12, + "learning_rate": 2.602977457545338e-06, + "loss": 0.3664, + "step": 56310 + }, + { + "epoch": 1104.31, + "learning_rate": 2.59240391577199e-06, + "loss": 0.3665, + "step": 56320 + }, + { + "epoch": 1104.51, + "learning_rate": 2.58185151540187e-06, + "loss": 0.3712, + "step": 56330 + }, + { + "epoch": 1104.71, + "learning_rate": 2.571320259516005e-06, + "loss": 0.3702, + "step": 56340 + }, + { + "epoch": 1104.9, + "learning_rate": 2.560810151189327e-06, + "loss": 0.368, + "step": 56350 + }, + { + "epoch": 1105.0, + "eval_loss": 0.37828439474105835, + "eval_runtime": 2.2755, + "eval_samples_per_second": 1001.518, + "eval_steps_per_second": 3.955, + "step": 56355 + }, + { + "epoch": 1105.1, + "learning_rate": 2.550321193490523e-06, + "loss": 0.3763, + "step": 56360 + }, + { + "epoch": 1105.29, + "learning_rate": 2.5398533894821437e-06, + "loss": 0.3696, + "step": 56370 + }, + { + "epoch": 1105.49, + "learning_rate": 2.5294067422205606e-06, + "loss": 0.366, + "step": 56380 + }, + { + "epoch": 1105.69, + "learning_rate": 2.5189812547559586e-06, + "loss": 0.3672, + "step": 56390 + }, + { + "epoch": 1105.88, + "learning_rate": 2.508576930132344e-06, + "loss": 0.3705, + "step": 56400 + }, + { + "epoch": 1106.0, + "eval_loss": 0.37971001863479614, + "eval_runtime": 2.2636, + "eval_samples_per_second": 1006.793, + "eval_steps_per_second": 3.976, + "step": 56406 + }, + { + "epoch": 1106.08, + "learning_rate": 2.498193771387547e-06, + "loss": 0.3741, + "step": 56410 + }, + { + "epoch": 1106.27, + "learning_rate": 2.487831781553223e-06, + "loss": 0.3693, + "step": 56420 + }, + { + "epoch": 1106.47, + "learning_rate": 2.477490963654846e-06, + "loss": 0.3726, + "step": 56430 + }, + { + "epoch": 1106.67, + "learning_rate": 2.4671713207117e-06, + "loss": 0.3714, + "step": 56440 + }, + { + "epoch": 1106.86, + "learning_rate": 2.456872855736891e-06, + "loss": 0.3705, + "step": 56450 + }, + { + "epoch": 1107.0, + "eval_loss": 0.37706848978996277, + "eval_runtime": 2.2675, + "eval_samples_per_second": 1005.061, + "eval_steps_per_second": 3.969, + "step": 56457 + }, + { + "epoch": 1107.06, + "learning_rate": 2.446595571737331e-06, + "loss": 0.3651, + "step": 56460 + }, + { + "epoch": 1107.25, + "learning_rate": 2.4363394717137608e-06, + "loss": 0.3696, + "step": 56470 + }, + { + "epoch": 1107.45, + "learning_rate": 2.4261045586607435e-06, + "loss": 0.3731, + "step": 56480 + }, + { + "epoch": 1107.65, + "learning_rate": 2.415890835566647e-06, + "loss": 0.3712, + "step": 56490 + }, + { + "epoch": 1107.84, + "learning_rate": 2.4056983054136365e-06, + "loss": 0.3734, + "step": 56500 + }, + { + "epoch": 1108.0, + "eval_loss": 0.375370591878891, + "eval_runtime": 2.2889, + "eval_samples_per_second": 995.683, + "eval_steps_per_second": 3.932, + "step": 56508 + }, + { + "epoch": 1108.04, + "learning_rate": 2.3955269711777218e-06, + "loss": 0.3741, + "step": 56510 + }, + { + "epoch": 1108.24, + "learning_rate": 2.3853768358286786e-06, + "loss": 0.3698, + "step": 56520 + }, + { + "epoch": 1108.43, + "learning_rate": 2.3752479023301434e-06, + "loss": 0.3717, + "step": 56530 + }, + { + "epoch": 1108.63, + "learning_rate": 2.365140173639535e-06, + "loss": 0.3698, + "step": 56540 + }, + { + "epoch": 1108.82, + "learning_rate": 2.3550536527080748e-06, + "loss": 0.3701, + "step": 56550 + }, + { + "epoch": 1109.0, + "eval_loss": 0.37934526801109314, + "eval_runtime": 2.3521, + "eval_samples_per_second": 968.928, + "eval_steps_per_second": 3.826, + "step": 56559 + }, + { + "epoch": 1109.02, + "learning_rate": 2.344988342480825e-06, + "loss": 0.3704, + "step": 56560 + }, + { + "epoch": 1109.22, + "learning_rate": 2.3349442458965917e-06, + "loss": 0.3732, + "step": 56570 + }, + { + "epoch": 1109.41, + "learning_rate": 2.3249213658880635e-06, + "loss": 0.3692, + "step": 56580 + }, + { + "epoch": 1109.61, + "learning_rate": 2.3149197053816822e-06, + "loss": 0.3661, + "step": 56590 + }, + { + "epoch": 1109.8, + "learning_rate": 2.3049392672977117e-06, + "loss": 0.3711, + "step": 56600 + }, + { + "epoch": 1110.0, + "learning_rate": 2.294980054550222e-06, + "loss": 0.3707, + "step": 56610 + }, + { + "epoch": 1110.0, + "eval_loss": 0.3728983700275421, + "eval_runtime": 2.3629, + "eval_samples_per_second": 964.494, + "eval_steps_per_second": 3.809, + "step": 56610 + }, + { + "epoch": 1110.2, + "learning_rate": 2.2850420700470773e-06, + "loss": 0.3716, + "step": 56620 + }, + { + "epoch": 1110.39, + "learning_rate": 2.275125316689941e-06, + "loss": 0.3699, + "step": 56630 + }, + { + "epoch": 1110.59, + "learning_rate": 2.265229797374296e-06, + "loss": 0.367, + "step": 56640 + }, + { + "epoch": 1110.78, + "learning_rate": 2.2553555149893987e-06, + "loss": 0.3701, + "step": 56650 + }, + { + "epoch": 1110.98, + "learning_rate": 2.2455024724183424e-06, + "loss": 0.3677, + "step": 56660 + }, + { + "epoch": 1111.0, + "eval_loss": 0.376329243183136, + "eval_runtime": 2.3664, + "eval_samples_per_second": 963.065, + "eval_steps_per_second": 3.803, + "step": 56661 + }, + { + "epoch": 1111.18, + "learning_rate": 2.2356706725379675e-06, + "loss": 0.366, + "step": 56670 + }, + { + "epoch": 1111.37, + "learning_rate": 2.2258601182189526e-06, + "loss": 0.3732, + "step": 56680 + }, + { + "epoch": 1111.57, + "learning_rate": 2.216070812325774e-06, + "loss": 0.3684, + "step": 56690 + }, + { + "epoch": 1111.76, + "learning_rate": 2.206302757716677e-06, + "loss": 0.3669, + "step": 56700 + }, + { + "epoch": 1111.96, + "learning_rate": 2.1965559572437147e-06, + "loss": 0.3734, + "step": 56710 + }, + { + "epoch": 1112.0, + "eval_loss": 0.3813353478908539, + "eval_runtime": 2.431, + "eval_samples_per_second": 937.488, + "eval_steps_per_second": 3.702, + "step": 56712 + }, + { + "epoch": 1112.16, + "learning_rate": 2.1868304137527354e-06, + "loss": 0.3718, + "step": 56720 + }, + { + "epoch": 1112.35, + "learning_rate": 2.177126130083384e-06, + "loss": 0.3694, + "step": 56730 + }, + { + "epoch": 1112.55, + "learning_rate": 2.167443109069103e-06, + "loss": 0.3718, + "step": 56740 + }, + { + "epoch": 1112.75, + "learning_rate": 2.157781353537105e-06, + "loss": 0.3679, + "step": 56750 + }, + { + "epoch": 1112.94, + "learning_rate": 2.1481408663084094e-06, + "loss": 0.3714, + "step": 56760 + }, + { + "epoch": 1113.0, + "eval_loss": 0.3772488236427307, + "eval_runtime": 2.3362, + "eval_samples_per_second": 975.496, + "eval_steps_per_second": 3.852, + "step": 56763 + }, + { + "epoch": 1113.14, + "learning_rate": 2.1385216501978384e-06, + "loss": 0.3687, + "step": 56770 + }, + { + "epoch": 1113.33, + "learning_rate": 2.128923708013963e-06, + "loss": 0.3669, + "step": 56780 + }, + { + "epoch": 1113.53, + "learning_rate": 2.119347042559191e-06, + "loss": 0.3702, + "step": 56790 + }, + { + "epoch": 1113.73, + "learning_rate": 2.1097916566296863e-06, + "loss": 0.369, + "step": 56800 + }, + { + "epoch": 1113.92, + "learning_rate": 2.1002575530153996e-06, + "loss": 0.3654, + "step": 56810 + }, + { + "epoch": 1114.0, + "eval_loss": 0.3765362501144409, + "eval_runtime": 2.2523, + "eval_samples_per_second": 1011.853, + "eval_steps_per_second": 3.996, + "step": 56814 + }, + { + "epoch": 1114.12, + "learning_rate": 2.0907447345000967e-06, + "loss": 0.3689, + "step": 56820 + }, + { + "epoch": 1114.31, + "learning_rate": 2.081253203861288e-06, + "loss": 0.3746, + "step": 56830 + }, + { + "epoch": 1114.51, + "learning_rate": 2.0717829638703e-06, + "loss": 0.3751, + "step": 56840 + }, + { + "epoch": 1114.71, + "learning_rate": 2.062334017292236e-06, + "loss": 0.3689, + "step": 56850 + }, + { + "epoch": 1114.9, + "learning_rate": 2.0529063668859646e-06, + "loss": 0.3692, + "step": 56860 + }, + { + "epoch": 1115.0, + "eval_loss": 0.3756592273712158, + "eval_runtime": 2.3455, + "eval_samples_per_second": 971.645, + "eval_steps_per_second": 3.837, + "step": 56865 + }, + { + "epoch": 1115.1, + "learning_rate": 2.0435000154041674e-06, + "loss": 0.3747, + "step": 56870 + }, + { + "epoch": 1115.29, + "learning_rate": 2.034114965593264e-06, + "loss": 0.3657, + "step": 56880 + }, + { + "epoch": 1115.49, + "learning_rate": 2.0247512201934964e-06, + "loss": 0.3661, + "step": 56890 + }, + { + "epoch": 1115.69, + "learning_rate": 2.01540878193886e-06, + "loss": 0.3702, + "step": 56900 + }, + { + "epoch": 1115.88, + "learning_rate": 2.0060876535571564e-06, + "loss": 0.3721, + "step": 56910 + }, + { + "epoch": 1116.0, + "eval_loss": 0.37494155764579773, + "eval_runtime": 2.2377, + "eval_samples_per_second": 1018.439, + "eval_steps_per_second": 4.022, + "step": 56916 + }, + { + "epoch": 1116.08, + "learning_rate": 1.996787837769942e-06, + "loss": 0.3612, + "step": 56920 + }, + { + "epoch": 1116.27, + "learning_rate": 1.9875093372925367e-06, + "loss": 0.3668, + "step": 56930 + }, + { + "epoch": 1116.47, + "learning_rate": 1.9782521548340645e-06, + "loss": 0.3695, + "step": 56940 + }, + { + "epoch": 1116.67, + "learning_rate": 1.969016293097422e-06, + "loss": 0.3713, + "step": 56950 + }, + { + "epoch": 1116.86, + "learning_rate": 1.959801754779286e-06, + "loss": 0.3741, + "step": 56960 + }, + { + "epoch": 1117.0, + "eval_loss": 0.3769468665122986, + "eval_runtime": 2.3568, + "eval_samples_per_second": 966.997, + "eval_steps_per_second": 3.819, + "step": 56967 + }, + { + "epoch": 1117.06, + "learning_rate": 1.95060854257007e-06, + "loss": 0.3681, + "step": 56970 + }, + { + "epoch": 1117.25, + "learning_rate": 1.9414366591540108e-06, + "loss": 0.3726, + "step": 56980 + }, + { + "epoch": 1117.45, + "learning_rate": 1.9322861072090746e-06, + "loss": 0.3707, + "step": 56990 + }, + { + "epoch": 1117.65, + "learning_rate": 1.9231568894070238e-06, + "loss": 0.3666, + "step": 57000 + }, + { + "epoch": 1117.84, + "learning_rate": 1.9140490084134013e-06, + "loss": 0.3649, + "step": 57010 + }, + { + "epoch": 1118.0, + "eval_loss": 0.3805931508541107, + "eval_runtime": 2.2501, + "eval_samples_per_second": 1012.856, + "eval_steps_per_second": 4.0, + "step": 57018 + }, + { + "epoch": 1118.04, + "learning_rate": 1.9049624668874885e-06, + "loss": 0.3691, + "step": 57020 + }, + { + "epoch": 1118.24, + "learning_rate": 1.8958972674823546e-06, + "loss": 0.3667, + "step": 57030 + }, + { + "epoch": 1118.43, + "learning_rate": 1.886853412844841e-06, + "loss": 0.3674, + "step": 57040 + }, + { + "epoch": 1118.63, + "learning_rate": 1.8778309056155433e-06, + "loss": 0.368, + "step": 57050 + }, + { + "epoch": 1118.82, + "learning_rate": 1.8688297484288544e-06, + "loss": 0.3709, + "step": 57060 + }, + { + "epoch": 1119.0, + "eval_loss": 0.37201598286628723, + "eval_runtime": 2.3445, + "eval_samples_per_second": 972.048, + "eval_steps_per_second": 3.839, + "step": 57069 + }, + { + "epoch": 1119.02, + "learning_rate": 1.8598499439128806e-06, + "loss": 0.3719, + "step": 57070 + }, + { + "epoch": 1119.22, + "learning_rate": 1.8508914946895492e-06, + "loss": 0.3698, + "step": 57080 + }, + { + "epoch": 1119.41, + "learning_rate": 1.8419544033745099e-06, + "loss": 0.3725, + "step": 57090 + }, + { + "epoch": 1119.61, + "learning_rate": 1.8330386725772e-06, + "loss": 0.3702, + "step": 57100 + }, + { + "epoch": 1119.8, + "learning_rate": 1.8241443049008208e-06, + "loss": 0.3702, + "step": 57110 + }, + { + "epoch": 1120.0, + "learning_rate": 1.8152713029423283e-06, + "loss": 0.3721, + "step": 57120 + }, + { + "epoch": 1120.0, + "eval_loss": 0.37935277819633484, + "eval_runtime": 2.3207, + "eval_samples_per_second": 982.051, + "eval_steps_per_second": 3.878, + "step": 57120 + }, + { + "epoch": 1120.2, + "learning_rate": 1.8064196692924416e-06, + "loss": 0.3701, + "step": 57130 + }, + { + "epoch": 1120.39, + "learning_rate": 1.7975894065356266e-06, + "loss": 0.3746, + "step": 57140 + }, + { + "epoch": 1120.59, + "learning_rate": 1.7887805172501295e-06, + "loss": 0.3694, + "step": 57150 + }, + { + "epoch": 1120.78, + "learning_rate": 1.7799930040079597e-06, + "loss": 0.3692, + "step": 57160 + }, + { + "epoch": 1120.98, + "learning_rate": 1.7712268693748727e-06, + "loss": 0.3701, + "step": 57170 + }, + { + "epoch": 1121.0, + "eval_loss": 0.3747633099555969, + "eval_runtime": 2.2596, + "eval_samples_per_second": 1008.585, + "eval_steps_per_second": 3.983, + "step": 57171 + }, + { + "epoch": 1121.18, + "learning_rate": 1.7624821159103714e-06, + "loss": 0.3697, + "step": 57180 + }, + { + "epoch": 1121.37, + "learning_rate": 1.7537587461677383e-06, + "loss": 0.3703, + "step": 57190 + }, + { + "epoch": 1121.57, + "learning_rate": 1.7450567626940026e-06, + "loss": 0.3674, + "step": 57200 + }, + { + "epoch": 1121.76, + "learning_rate": 1.7363761680299487e-06, + "loss": 0.3717, + "step": 57210 + }, + { + "epoch": 1121.96, + "learning_rate": 1.7277169647101157e-06, + "loss": 0.3674, + "step": 57220 + }, + { + "epoch": 1122.0, + "eval_loss": 0.3787176012992859, + "eval_runtime": 2.2673, + "eval_samples_per_second": 1005.16, + "eval_steps_per_second": 3.969, + "step": 57222 + }, + { + "epoch": 1122.16, + "learning_rate": 1.719079155262798e-06, + "loss": 0.3674, + "step": 57230 + }, + { + "epoch": 1122.35, + "learning_rate": 1.710462742210053e-06, + "loss": 0.3702, + "step": 57240 + }, + { + "epoch": 1122.55, + "learning_rate": 1.7018677280676601e-06, + "loss": 0.3698, + "step": 57250 + }, + { + "epoch": 1122.75, + "learning_rate": 1.6932941153451785e-06, + "loss": 0.3665, + "step": 57260 + }, + { + "epoch": 1122.94, + "learning_rate": 1.6847419065459306e-06, + "loss": 0.3669, + "step": 57270 + }, + { + "epoch": 1123.0, + "eval_loss": 0.37363022565841675, + "eval_runtime": 2.3303, + "eval_samples_per_second": 977.968, + "eval_steps_per_second": 3.862, + "step": 57273 + }, + { + "epoch": 1123.14, + "learning_rate": 1.6762111041669523e-06, + "loss": 0.3675, + "step": 57280 + }, + { + "epoch": 1123.33, + "learning_rate": 1.6677017106990597e-06, + "loss": 0.3688, + "step": 57290 + }, + { + "epoch": 1123.53, + "learning_rate": 1.6592137286267904e-06, + "loss": 0.3718, + "step": 57300 + }, + { + "epoch": 1123.73, + "learning_rate": 1.650747160428445e-06, + "loss": 0.3675, + "step": 57310 + }, + { + "epoch": 1123.92, + "learning_rate": 1.6423020085760963e-06, + "loss": 0.3726, + "step": 57320 + }, + { + "epoch": 1124.0, + "eval_loss": 0.37890663743019104, + "eval_runtime": 2.2695, + "eval_samples_per_second": 1004.198, + "eval_steps_per_second": 3.966, + "step": 57324 + }, + { + "epoch": 1124.12, + "learning_rate": 1.6338782755355218e-06, + "loss": 0.3729, + "step": 57330 + }, + { + "epoch": 1124.31, + "learning_rate": 1.6254759637662706e-06, + "loss": 0.3732, + "step": 57340 + }, + { + "epoch": 1124.51, + "learning_rate": 1.6170950757216223e-06, + "loss": 0.3684, + "step": 57350 + }, + { + "epoch": 1124.71, + "learning_rate": 1.6087356138486106e-06, + "loss": 0.3683, + "step": 57360 + }, + { + "epoch": 1124.9, + "learning_rate": 1.6003975805880171e-06, + "loss": 0.3672, + "step": 57370 + }, + { + "epoch": 1125.0, + "eval_loss": 0.3774057626724243, + "eval_runtime": 2.3395, + "eval_samples_per_second": 974.139, + "eval_steps_per_second": 3.847, + "step": 57375 + }, + { + "epoch": 1125.1, + "learning_rate": 1.5920809783743689e-06, + "loss": 0.3688, + "step": 57380 + }, + { + "epoch": 1125.29, + "learning_rate": 1.5837858096359151e-06, + "loss": 0.3712, + "step": 57390 + }, + { + "epoch": 1125.49, + "learning_rate": 1.5755120767946604e-06, + "loss": 0.3693, + "step": 57400 + }, + { + "epoch": 1125.69, + "learning_rate": 1.5672597822663557e-06, + "loss": 0.37, + "step": 57410 + }, + { + "epoch": 1125.88, + "learning_rate": 1.55902892846049e-06, + "loss": 0.3674, + "step": 57420 + }, + { + "epoch": 1126.0, + "eval_loss": 0.3777696490287781, + "eval_runtime": 2.2811, + "eval_samples_per_second": 999.061, + "eval_steps_per_second": 3.945, + "step": 57426 + }, + { + "epoch": 1126.08, + "learning_rate": 1.550819517780283e-06, + "loss": 0.3695, + "step": 57430 + }, + { + "epoch": 1126.27, + "learning_rate": 1.5426315526227e-06, + "loss": 0.3663, + "step": 57440 + }, + { + "epoch": 1126.47, + "learning_rate": 1.534465035378446e-06, + "loss": 0.3745, + "step": 57450 + }, + { + "epoch": 1126.67, + "learning_rate": 1.526319968431955e-06, + "loss": 0.3704, + "step": 57460 + }, + { + "epoch": 1126.86, + "learning_rate": 1.5181963541614161e-06, + "loss": 0.3702, + "step": 57470 + }, + { + "epoch": 1127.0, + "eval_loss": 0.37724199891090393, + "eval_runtime": 2.2933, + "eval_samples_per_second": 993.78, + "eval_steps_per_second": 3.925, + "step": 57477 + }, + { + "epoch": 1127.06, + "learning_rate": 1.5100941949387406e-06, + "loss": 0.3692, + "step": 57480 + }, + { + "epoch": 1127.25, + "learning_rate": 1.502013493129578e-06, + "loss": 0.3721, + "step": 57490 + }, + { + "epoch": 1127.45, + "learning_rate": 1.4939542510933072e-06, + "loss": 0.3684, + "step": 57500 + }, + { + "epoch": 1127.65, + "learning_rate": 1.4859164711830546e-06, + "loss": 0.3699, + "step": 57510 + }, + { + "epoch": 1127.84, + "learning_rate": 1.4779001557456593e-06, + "loss": 0.3717, + "step": 57520 + }, + { + "epoch": 1128.0, + "eval_loss": 0.376617968082428, + "eval_runtime": 2.3749, + "eval_samples_per_second": 959.606, + "eval_steps_per_second": 3.79, + "step": 57528 + }, + { + "epoch": 1128.04, + "learning_rate": 1.4699053071217326e-06, + "loss": 0.3666, + "step": 57530 + }, + { + "epoch": 1128.24, + "learning_rate": 1.461931927645557e-06, + "loss": 0.3652, + "step": 57540 + }, + { + "epoch": 1128.43, + "learning_rate": 1.4539800196452206e-06, + "loss": 0.3693, + "step": 57550 + }, + { + "epoch": 1128.63, + "learning_rate": 1.4460495854424659e-06, + "loss": 0.366, + "step": 57560 + }, + { + "epoch": 1128.82, + "learning_rate": 1.4381406273528239e-06, + "loss": 0.3703, + "step": 57570 + }, + { + "epoch": 1129.0, + "eval_loss": 0.3757016360759735, + "eval_runtime": 2.2478, + "eval_samples_per_second": 1013.876, + "eval_steps_per_second": 4.004, + "step": 57579 + }, + { + "epoch": 1129.02, + "learning_rate": 1.4302531476855312e-06, + "loss": 0.3674, + "step": 57580 + }, + { + "epoch": 1129.22, + "learning_rate": 1.4223871487435618e-06, + "loss": 0.3746, + "step": 57590 + }, + { + "epoch": 1129.41, + "learning_rate": 1.4145426328236036e-06, + "loss": 0.3699, + "step": 57600 + }, + { + "epoch": 1129.61, + "learning_rate": 1.406719602216075e-06, + "loss": 0.3729, + "step": 57610 + }, + { + "epoch": 1129.8, + "learning_rate": 1.3989180592051313e-06, + "loss": 0.3675, + "step": 57620 + }, + { + "epoch": 1130.0, + "learning_rate": 1.3911380060686593e-06, + "loss": 0.3695, + "step": 57630 + }, + { + "epoch": 1130.0, + "eval_loss": 0.3807949423789978, + "eval_runtime": 2.3206, + "eval_samples_per_second": 982.078, + "eval_steps_per_second": 3.878, + "step": 57630 + }, + { + "epoch": 1130.2, + "learning_rate": 1.3833794450782504e-06, + "loss": 0.3742, + "step": 57640 + }, + { + "epoch": 1130.39, + "learning_rate": 1.3756423784992253e-06, + "loss": 0.369, + "step": 57650 + }, + { + "epoch": 1130.59, + "learning_rate": 1.3679268085906608e-06, + "loss": 0.3698, + "step": 57660 + }, + { + "epoch": 1130.78, + "learning_rate": 1.3602327376052963e-06, + "loss": 0.3692, + "step": 57670 + }, + { + "epoch": 1130.98, + "learning_rate": 1.3525601677896513e-06, + "loss": 0.3729, + "step": 57680 + }, + { + "epoch": 1131.0, + "eval_loss": 0.37210267782211304, + "eval_runtime": 2.3446, + "eval_samples_per_second": 972.041, + "eval_steps_per_second": 3.839, + "step": 57681 + }, + { + "epoch": 1131.18, + "learning_rate": 1.3449091013839426e-06, + "loss": 0.3687, + "step": 57690 + }, + { + "epoch": 1131.37, + "learning_rate": 1.3372795406221076e-06, + "loss": 0.3689, + "step": 57700 + }, + { + "epoch": 1131.57, + "learning_rate": 1.3296714877318148e-06, + "loss": 0.3752, + "step": 57710 + }, + { + "epoch": 1131.76, + "learning_rate": 1.322084944934429e-06, + "loss": 0.3695, + "step": 57720 + }, + { + "epoch": 1131.96, + "learning_rate": 1.3145199144450613e-06, + "loss": 0.3657, + "step": 57730 + }, + { + "epoch": 1132.0, + "eval_loss": 0.37843912839889526, + "eval_runtime": 2.2806, + "eval_samples_per_second": 999.309, + "eval_steps_per_second": 3.946, + "step": 57732 + }, + { + "epoch": 1132.16, + "learning_rate": 1.3069763984725452e-06, + "loss": 0.3704, + "step": 57740 + }, + { + "epoch": 1132.35, + "learning_rate": 1.2994543992193935e-06, + "loss": 0.3709, + "step": 57750 + }, + { + "epoch": 1132.55, + "learning_rate": 1.2919539188818828e-06, + "loss": 0.3705, + "step": 57760 + }, + { + "epoch": 1132.75, + "learning_rate": 1.2844749596499782e-06, + "loss": 0.3691, + "step": 57770 + }, + { + "epoch": 1132.94, + "learning_rate": 1.277017523707366e-06, + "loss": 0.3676, + "step": 57780 + }, + { + "epoch": 1133.0, + "eval_loss": 0.3793400228023529, + "eval_runtime": 2.3746, + "eval_samples_per_second": 959.724, + "eval_steps_per_second": 3.79, + "step": 57783 + }, + { + "epoch": 1133.14, + "learning_rate": 1.2695816132314545e-06, + "loss": 0.3724, + "step": 57790 + }, + { + "epoch": 1133.33, + "learning_rate": 1.2621672303933738e-06, + "loss": 0.3746, + "step": 57800 + }, + { + "epoch": 1133.53, + "learning_rate": 1.254774377357942e-06, + "loss": 0.3699, + "step": 57810 + }, + { + "epoch": 1133.73, + "learning_rate": 1.2474030562837162e-06, + "loss": 0.3683, + "step": 57820 + }, + { + "epoch": 1133.92, + "learning_rate": 1.2400532693229493e-06, + "loss": 0.3684, + "step": 57830 + }, + { + "epoch": 1134.0, + "eval_loss": 0.37969231605529785, + "eval_runtime": 2.3069, + "eval_samples_per_second": 987.914, + "eval_steps_per_second": 3.901, + "step": 57834 + }, + { + "epoch": 1134.12, + "learning_rate": 1.2327250186216248e-06, + "loss": 0.3688, + "step": 57840 + }, + { + "epoch": 1134.31, + "learning_rate": 1.2254183063194312e-06, + "loss": 0.3696, + "step": 57850 + }, + { + "epoch": 1134.51, + "learning_rate": 1.2181331345497453e-06, + "loss": 0.3682, + "step": 57860 + }, + { + "epoch": 1134.71, + "learning_rate": 1.2108695054396988e-06, + "loss": 0.3666, + "step": 57870 + }, + { + "epoch": 1134.9, + "learning_rate": 1.2036274211100955e-06, + "loss": 0.3703, + "step": 57880 + }, + { + "epoch": 1135.0, + "eval_loss": 0.3771282136440277, + "eval_runtime": 2.257, + "eval_samples_per_second": 1009.765, + "eval_steps_per_second": 3.988, + "step": 57885 + }, + { + "epoch": 1135.1, + "learning_rate": 1.1964068836754687e-06, + "loss": 0.3722, + "step": 57890 + }, + { + "epoch": 1135.29, + "learning_rate": 1.189207895244057e-06, + "loss": 0.3746, + "step": 57900 + }, + { + "epoch": 1135.49, + "learning_rate": 1.182030457917796e-06, + "loss": 0.3681, + "step": 57910 + }, + { + "epoch": 1135.69, + "learning_rate": 1.174874573792342e-06, + "loss": 0.3683, + "step": 57920 + }, + { + "epoch": 1135.88, + "learning_rate": 1.167740244957041e-06, + "loss": 0.3705, + "step": 57930 + }, + { + "epoch": 1136.0, + "eval_loss": 0.37516215443611145, + "eval_runtime": 2.2628, + "eval_samples_per_second": 1007.181, + "eval_steps_per_second": 3.977, + "step": 57936 + }, + { + "epoch": 1136.08, + "learning_rate": 1.1606274734949766e-06, + "loss": 0.3713, + "step": 57940 + }, + { + "epoch": 1136.27, + "learning_rate": 1.1535362614829208e-06, + "loss": 0.3704, + "step": 57950 + }, + { + "epoch": 1136.47, + "learning_rate": 1.1464666109913256e-06, + "loss": 0.3653, + "step": 57960 + }, + { + "epoch": 1136.67, + "learning_rate": 1.1394185240843983e-06, + "loss": 0.3673, + "step": 57970 + }, + { + "epoch": 1136.86, + "learning_rate": 1.1323920028200096e-06, + "loss": 0.3691, + "step": 57980 + }, + { + "epoch": 1137.0, + "eval_loss": 0.3772904872894287, + "eval_runtime": 2.2617, + "eval_samples_per_second": 1007.636, + "eval_steps_per_second": 3.979, + "step": 57987 + }, + { + "epoch": 1137.06, + "learning_rate": 1.125387049249743e-06, + "loss": 0.3706, + "step": 57990 + }, + { + "epoch": 1137.25, + "learning_rate": 1.1184036654188877e-06, + "loss": 0.3736, + "step": 58000 + }, + { + "epoch": 1137.45, + "learning_rate": 1.1114418533664626e-06, + "loss": 0.3705, + "step": 58010 + }, + { + "epoch": 1137.65, + "learning_rate": 1.1045016151251335e-06, + "loss": 0.3681, + "step": 58020 + }, + { + "epoch": 1137.84, + "learning_rate": 1.0975829527212959e-06, + "loss": 0.3673, + "step": 58030 + }, + { + "epoch": 1138.0, + "eval_loss": 0.3766086995601654, + "eval_runtime": 2.3006, + "eval_samples_per_second": 990.6, + "eval_steps_per_second": 3.912, + "step": 58038 + }, + { + "epoch": 1138.04, + "learning_rate": 1.0906858681750508e-06, + "loss": 0.3686, + "step": 58040 + }, + { + "epoch": 1138.24, + "learning_rate": 1.0838103635002038e-06, + "loss": 0.3697, + "step": 58050 + }, + { + "epoch": 1138.43, + "learning_rate": 1.0769564407042407e-06, + "loss": 0.3697, + "step": 58060 + }, + { + "epoch": 1138.63, + "learning_rate": 1.0701241017883526e-06, + "loss": 0.3656, + "step": 58070 + }, + { + "epoch": 1138.82, + "learning_rate": 1.0633133487474189e-06, + "loss": 0.3715, + "step": 58080 + }, + { + "epoch": 1139.0, + "eval_loss": 0.3779025971889496, + "eval_runtime": 2.3816, + "eval_samples_per_second": 956.928, + "eval_steps_per_second": 3.779, + "step": 58089 + }, + { + "epoch": 1139.02, + "learning_rate": 1.0565241835700482e-06, + "loss": 0.3668, + "step": 58090 + }, + { + "epoch": 1139.22, + "learning_rate": 1.049756608238514e-06, + "loss": 0.3665, + "step": 58100 + }, + { + "epoch": 1139.41, + "learning_rate": 1.0430106247288018e-06, + "loss": 0.3684, + "step": 58110 + }, + { + "epoch": 1139.61, + "learning_rate": 1.036286235010586e-06, + "loss": 0.3738, + "step": 58120 + }, + { + "epoch": 1139.8, + "learning_rate": 1.0295834410472382e-06, + "loss": 0.3737, + "step": 58130 + }, + { + "epoch": 1140.0, + "learning_rate": 1.0229022447958256e-06, + "loss": 0.37, + "step": 58140 + }, + { + "epoch": 1140.0, + "eval_loss": 0.37501007318496704, + "eval_runtime": 2.2685, + "eval_samples_per_second": 1004.642, + "eval_steps_per_second": 3.967, + "step": 58140 + }, + { + "epoch": 1140.2, + "learning_rate": 1.016242648207105e-06, + "loss": 0.3705, + "step": 58150 + }, + { + "epoch": 1140.39, + "learning_rate": 1.0096046532255374e-06, + "loss": 0.3679, + "step": 58160 + }, + { + "epoch": 1140.59, + "learning_rate": 1.0029882617892643e-06, + "loss": 0.3662, + "step": 58170 + }, + { + "epoch": 1140.78, + "learning_rate": 9.963934758301235e-07, + "loss": 0.3689, + "step": 58180 + }, + { + "epoch": 1140.98, + "learning_rate": 9.898202972736497e-07, + "loss": 0.3709, + "step": 58190 + }, + { + "epoch": 1141.0, + "eval_loss": 0.37856853008270264, + "eval_runtime": 2.2632, + "eval_samples_per_second": 1006.96, + "eval_steps_per_second": 3.977, + "step": 58191 + }, + { + "epoch": 1141.18, + "learning_rate": 9.832687280390578e-07, + "loss": 0.3671, + "step": 58200 + }, + { + "epoch": 1141.37, + "learning_rate": 9.767387700392675e-07, + "loss": 0.3687, + "step": 58210 + }, + { + "epoch": 1141.57, + "learning_rate": 9.702304251808707e-07, + "loss": 0.3715, + "step": 58220 + }, + { + "epoch": 1141.76, + "learning_rate": 9.637436953641803e-07, + "loss": 0.3679, + "step": 58230 + }, + { + "epoch": 1141.96, + "learning_rate": 9.57278582483148e-07, + "loss": 0.3696, + "step": 58240 + }, + { + "epoch": 1142.0, + "eval_loss": 0.37759509682655334, + "eval_runtime": 2.2953, + "eval_samples_per_second": 992.889, + "eval_steps_per_second": 3.921, + "step": 58242 + }, + { + "epoch": 1142.16, + "learning_rate": 9.50835088425464e-07, + "loss": 0.3672, + "step": 58250 + }, + { + "epoch": 1142.35, + "learning_rate": 9.444132150724732e-07, + "loss": 0.3663, + "step": 58260 + }, + { + "epoch": 1142.55, + "learning_rate": 9.380129642992257e-07, + "loss": 0.367, + "step": 58270 + }, + { + "epoch": 1142.75, + "learning_rate": 9.316343379744517e-07, + "loss": 0.3706, + "step": 58280 + }, + { + "epoch": 1142.94, + "learning_rate": 9.252773379605616e-07, + "loss": 0.3752, + "step": 58290 + }, + { + "epoch": 1143.0, + "eval_loss": 0.3757573366165161, + "eval_runtime": 2.3475, + "eval_samples_per_second": 970.804, + "eval_steps_per_second": 3.834, + "step": 58293 + }, + { + "epoch": 1143.14, + "learning_rate": 9.189419661136621e-07, + "loss": 0.3684, + "step": 58300 + }, + { + "epoch": 1143.33, + "learning_rate": 9.126282242835487e-07, + "loss": 0.3688, + "step": 58310 + }, + { + "epoch": 1143.53, + "learning_rate": 9.0633611431368e-07, + "loss": 0.3681, + "step": 58320 + }, + { + "epoch": 1143.73, + "learning_rate": 9.000656380412114e-07, + "loss": 0.3728, + "step": 58330 + }, + { + "epoch": 1143.92, + "learning_rate": 8.93816797296995e-07, + "loss": 0.3675, + "step": 58340 + }, + { + "epoch": 1144.0, + "eval_loss": 0.37619441747665405, + "eval_runtime": 2.2518, + "eval_samples_per_second": 1012.06, + "eval_steps_per_second": 3.997, + "step": 58344 + }, + { + "epoch": 1144.12, + "learning_rate": 8.875895939055466e-07, + "loss": 0.3683, + "step": 58350 + }, + { + "epoch": 1144.31, + "learning_rate": 8.813840296850616e-07, + "loss": 0.3677, + "step": 58360 + }, + { + "epoch": 1144.51, + "learning_rate": 8.752001064474407e-07, + "loss": 0.3693, + "step": 58370 + }, + { + "epoch": 1144.71, + "learning_rate": 8.690378259982478e-07, + "loss": 0.3662, + "step": 58380 + }, + { + "epoch": 1144.9, + "learning_rate": 8.628971901367271e-07, + "loss": 0.3681, + "step": 58390 + }, + { + "epoch": 1145.0, + "eval_loss": 0.3740864396095276, + "eval_runtime": 2.2362, + "eval_samples_per_second": 1019.118, + "eval_steps_per_second": 4.025, + "step": 58395 + }, + { + "epoch": 1145.1, + "learning_rate": 8.567782006558027e-07, + "loss": 0.3709, + "step": 58400 + }, + { + "epoch": 1145.29, + "learning_rate": 8.506808593420955e-07, + "loss": 0.3683, + "step": 58410 + }, + { + "epoch": 1145.49, + "learning_rate": 8.446051679758814e-07, + "loss": 0.3729, + "step": 58420 + }, + { + "epoch": 1145.69, + "learning_rate": 8.38551128331133e-07, + "loss": 0.3716, + "step": 58430 + }, + { + "epoch": 1145.88, + "learning_rate": 8.325187421755031e-07, + "loss": 0.3684, + "step": 58440 + }, + { + "epoch": 1146.0, + "eval_loss": 0.3794369399547577, + "eval_runtime": 2.2787, + "eval_samples_per_second": 1000.144, + "eval_steps_per_second": 3.95, + "step": 58446 + }, + { + "epoch": 1146.08, + "learning_rate": 8.265080112702993e-07, + "loss": 0.371, + "step": 58450 + }, + { + "epoch": 1146.27, + "learning_rate": 8.205189373705262e-07, + "loss": 0.3694, + "step": 58460 + }, + { + "epoch": 1146.47, + "learning_rate": 8.145515222248599e-07, + "loss": 0.3674, + "step": 58470 + }, + { + "epoch": 1146.67, + "learning_rate": 8.086057675756569e-07, + "loss": 0.3671, + "step": 58480 + }, + { + "epoch": 1146.86, + "learning_rate": 8.026816751589366e-07, + "loss": 0.3663, + "step": 58490 + }, + { + "epoch": 1147.0, + "eval_loss": 0.3720145523548126, + "eval_runtime": 2.2643, + "eval_samples_per_second": 1006.479, + "eval_steps_per_second": 3.975, + "step": 58497 + }, + { + "epoch": 1147.06, + "learning_rate": 7.96779246704407e-07, + "loss": 0.3702, + "step": 58500 + }, + { + "epoch": 1147.25, + "learning_rate": 7.908984839354482e-07, + "loss": 0.3687, + "step": 58510 + }, + { + "epoch": 1147.45, + "learning_rate": 7.850393885691031e-07, + "loss": 0.3683, + "step": 58520 + }, + { + "epoch": 1147.65, + "learning_rate": 7.792019623161116e-07, + "loss": 0.3691, + "step": 58530 + }, + { + "epoch": 1147.84, + "learning_rate": 7.733862068808521e-07, + "loss": 0.3712, + "step": 58540 + }, + { + "epoch": 1148.0, + "eval_loss": 0.3741941452026367, + "eval_runtime": 2.3347, + "eval_samples_per_second": 976.132, + "eval_steps_per_second": 3.855, + "step": 58548 + }, + { + "epoch": 1148.04, + "learning_rate": 7.675921239614164e-07, + "loss": 0.3694, + "step": 58550 + }, + { + "epoch": 1148.24, + "learning_rate": 7.618197152495258e-07, + "loss": 0.3693, + "step": 58560 + }, + { + "epoch": 1148.43, + "learning_rate": 7.560689824306076e-07, + "loss": 0.3691, + "step": 58570 + }, + { + "epoch": 1148.63, + "learning_rate": 7.503399271837518e-07, + "loss": 0.3709, + "step": 58580 + }, + { + "epoch": 1148.82, + "learning_rate": 7.446325511817119e-07, + "loss": 0.3672, + "step": 58590 + }, + { + "epoch": 1149.0, + "eval_loss": 0.3786185681819916, + "eval_runtime": 2.3832, + "eval_samples_per_second": 956.28, + "eval_steps_per_second": 3.776, + "step": 58599 + }, + { + "epoch": 1149.02, + "learning_rate": 7.389468560909051e-07, + "loss": 0.3692, + "step": 58600 + }, + { + "epoch": 1149.22, + "learning_rate": 7.332828435714366e-07, + "loss": 0.3673, + "step": 58610 + }, + { + "epoch": 1149.41, + "learning_rate": 7.276405152770671e-07, + "loss": 0.3684, + "step": 58620 + }, + { + "epoch": 1149.61, + "learning_rate": 7.220198728552368e-07, + "loss": 0.3733, + "step": 58630 + }, + { + "epoch": 1149.8, + "learning_rate": 7.164209179470415e-07, + "loss": 0.3687, + "step": 58640 + }, + { + "epoch": 1150.0, + "learning_rate": 7.108436521872568e-07, + "loss": 0.369, + "step": 58650 + }, + { + "epoch": 1150.0, + "eval_loss": 0.37368664145469666, + "eval_runtime": 2.4044, + "eval_samples_per_second": 947.86, + "eval_steps_per_second": 3.743, + "step": 58650 + }, + { + "epoch": 1150.2, + "learning_rate": 7.052880772043134e-07, + "loss": 0.371, + "step": 58660 + }, + { + "epoch": 1150.39, + "learning_rate": 6.997541946203139e-07, + "loss": 0.3704, + "step": 58670 + }, + { + "epoch": 1150.59, + "learning_rate": 6.942420060510406e-07, + "loss": 0.3694, + "step": 58680 + }, + { + "epoch": 1150.78, + "learning_rate": 6.887515131059229e-07, + "loss": 0.3674, + "step": 58690 + }, + { + "epoch": 1150.98, + "learning_rate": 6.832827173880618e-07, + "loss": 0.3648, + "step": 58700 + }, + { + "epoch": 1151.0, + "eval_loss": 0.37666937708854675, + "eval_runtime": 2.4198, + "eval_samples_per_second": 941.831, + "eval_steps_per_second": 3.719, + "step": 58701 + }, + { + "epoch": 1151.18, + "learning_rate": 6.778356204942214e-07, + "loss": 0.3701, + "step": 58710 + }, + { + "epoch": 1151.37, + "learning_rate": 6.724102240148299e-07, + "loss": 0.3712, + "step": 58720 + }, + { + "epoch": 1151.57, + "learning_rate": 6.670065295339866e-07, + "loss": 0.3712, + "step": 58730 + }, + { + "epoch": 1151.76, + "learning_rate": 6.616245386294627e-07, + "loss": 0.3705, + "step": 58740 + }, + { + "epoch": 1151.96, + "learning_rate": 6.562642528726597e-07, + "loss": 0.3704, + "step": 58750 + }, + { + "epoch": 1152.0, + "eval_loss": 0.37399017810821533, + "eval_runtime": 2.2859, + "eval_samples_per_second": 996.982, + "eval_steps_per_second": 3.937, + "step": 58752 + }, + { + "epoch": 1152.16, + "learning_rate": 6.509256738286672e-07, + "loss": 0.3687, + "step": 58760 + }, + { + "epoch": 1152.35, + "learning_rate": 6.4560880305623e-07, + "loss": 0.3616, + "step": 58770 + }, + { + "epoch": 1152.55, + "learning_rate": 6.403136421077565e-07, + "loss": 0.3695, + "step": 58780 + }, + { + "epoch": 1152.75, + "learning_rate": 6.350401925293264e-07, + "loss": 0.3713, + "step": 58790 + }, + { + "epoch": 1152.94, + "learning_rate": 6.29788455860658e-07, + "loss": 0.3695, + "step": 58800 + }, + { + "epoch": 1153.0, + "eval_loss": 0.3780522346496582, + "eval_runtime": 2.3433, + "eval_samples_per_second": 972.56, + "eval_steps_per_second": 3.841, + "step": 58803 + }, + { + "epoch": 1153.14, + "learning_rate": 6.245584336351417e-07, + "loss": 0.3701, + "step": 58810 + }, + { + "epoch": 1153.33, + "learning_rate": 6.193501273798307e-07, + "loss": 0.3689, + "step": 58820 + }, + { + "epoch": 1153.53, + "learning_rate": 6.141635386154253e-07, + "loss": 0.3688, + "step": 58830 + }, + { + "epoch": 1153.73, + "learning_rate": 6.089986688563143e-07, + "loss": 0.3695, + "step": 58840 + }, + { + "epoch": 1153.92, + "learning_rate": 6.038555196105077e-07, + "loss": 0.3707, + "step": 58850 + }, + { + "epoch": 1154.0, + "eval_loss": 0.37525415420532227, + "eval_runtime": 2.319, + "eval_samples_per_second": 982.731, + "eval_steps_per_second": 3.881, + "step": 58854 + }, + { + "epoch": 1154.12, + "learning_rate": 5.987340923796879e-07, + "loss": 0.3646, + "step": 58860 + }, + { + "epoch": 1154.31, + "learning_rate": 5.936343886592087e-07, + "loss": 0.3709, + "step": 58870 + }, + { + "epoch": 1154.51, + "learning_rate": 5.885564099380624e-07, + "loss": 0.3701, + "step": 58880 + }, + { + "epoch": 1154.71, + "learning_rate": 5.835001576989129e-07, + "loss": 0.3689, + "step": 58890 + }, + { + "epoch": 1154.9, + "learning_rate": 5.784656334180709e-07, + "loss": 0.3661, + "step": 58900 + }, + { + "epoch": 1155.0, + "eval_loss": 0.37740692496299744, + "eval_runtime": 2.2596, + "eval_samples_per_second": 1008.586, + "eval_steps_per_second": 3.983, + "step": 58905 + }, + { + "epoch": 1155.1, + "learning_rate": 5.73452838565494e-07, + "loss": 0.3704, + "step": 58910 + }, + { + "epoch": 1155.29, + "learning_rate": 5.684617746048198e-07, + "loss": 0.3677, + "step": 58920 + }, + { + "epoch": 1155.49, + "learning_rate": 5.634924429933241e-07, + "loss": 0.3684, + "step": 58930 + }, + { + "epoch": 1155.69, + "learning_rate": 5.585448451819296e-07, + "loss": 0.3633, + "step": 58940 + }, + { + "epoch": 1155.88, + "learning_rate": 5.536189826152476e-07, + "loss": 0.367, + "step": 58950 + }, + { + "epoch": 1156.0, + "eval_loss": 0.3763006031513214, + "eval_runtime": 2.2788, + "eval_samples_per_second": 1000.081, + "eval_steps_per_second": 3.949, + "step": 58956 + }, + { + "epoch": 1156.08, + "learning_rate": 5.487148567315026e-07, + "loss": 0.3703, + "step": 58960 + }, + { + "epoch": 1156.27, + "learning_rate": 5.438324689625989e-07, + "loss": 0.3678, + "step": 58970 + }, + { + "epoch": 1156.47, + "learning_rate": 5.389718207340716e-07, + "loss": 0.368, + "step": 58980 + }, + { + "epoch": 1156.67, + "learning_rate": 5.341329134651351e-07, + "loss": 0.3682, + "step": 58990 + }, + { + "epoch": 1156.86, + "learning_rate": 5.293157485686428e-07, + "loss": 0.3657, + "step": 59000 + }, + { + "epoch": 1157.0, + "eval_loss": 0.3766930103302002, + "eval_runtime": 2.379, + "eval_samples_per_second": 957.978, + "eval_steps_per_second": 3.783, + "step": 59007 + }, + { + "epoch": 1157.06, + "learning_rate": 5.245203274510862e-07, + "loss": 0.3731, + "step": 59010 + }, + { + "epoch": 1157.25, + "learning_rate": 5.197466515126369e-07, + "loss": 0.364, + "step": 59020 + }, + { + "epoch": 1157.45, + "learning_rate": 5.149947221470885e-07, + "loss": 0.3691, + "step": 59030 + }, + { + "epoch": 1157.65, + "learning_rate": 5.102645407419059e-07, + "loss": 0.3675, + "step": 59040 + }, + { + "epoch": 1157.84, + "learning_rate": 5.055561086781928e-07, + "loss": 0.3638, + "step": 59050 + }, + { + "epoch": 1158.0, + "eval_loss": 0.37378421425819397, + "eval_runtime": 2.2244, + "eval_samples_per_second": 1024.566, + "eval_steps_per_second": 4.046, + "step": 59058 + }, + { + "epoch": 1158.04, + "learning_rate": 5.008694273307162e-07, + "loss": 0.3721, + "step": 59060 + }, + { + "epoch": 1158.24, + "learning_rate": 4.962044980678731e-07, + "loss": 0.3723, + "step": 59070 + }, + { + "epoch": 1158.43, + "learning_rate": 4.915613222517156e-07, + "loss": 0.3693, + "step": 59080 + }, + { + "epoch": 1158.63, + "learning_rate": 4.86939901237951e-07, + "loss": 0.3684, + "step": 59090 + }, + { + "epoch": 1158.82, + "learning_rate": 4.823402363759416e-07, + "loss": 0.3728, + "step": 59100 + }, + { + "epoch": 1159.0, + "eval_loss": 0.373190313577652, + "eval_runtime": 2.2286, + "eval_samples_per_second": 1022.623, + "eval_steps_per_second": 4.038, + "step": 59109 + }, + { + "epoch": 1159.02, + "learning_rate": 4.777623290086713e-07, + "loss": 0.3714, + "step": 59110 + }, + { + "epoch": 1159.22, + "learning_rate": 4.73206180472796e-07, + "loss": 0.3674, + "step": 59120 + }, + { + "epoch": 1159.41, + "learning_rate": 4.6867179209860995e-07, + "loss": 0.3685, + "step": 59130 + }, + { + "epoch": 1159.61, + "learning_rate": 4.641591652100457e-07, + "loss": 0.3673, + "step": 59140 + }, + { + "epoch": 1159.8, + "learning_rate": 4.59668301124691e-07, + "loss": 0.3684, + "step": 59150 + }, + { + "epoch": 1160.0, + "learning_rate": 4.551992011537886e-07, + "loss": 0.3748, + "step": 59160 + }, + { + "epoch": 1160.0, + "eval_loss": 0.37865450978279114, + "eval_runtime": 2.3374, + "eval_samples_per_second": 974.999, + "eval_steps_per_second": 3.85, + "step": 59160 + }, + { + "epoch": 1160.2, + "learning_rate": 4.507518666022114e-07, + "loss": 0.3704, + "step": 59170 + }, + { + "epoch": 1160.39, + "learning_rate": 4.463262987684707e-07, + "loss": 0.3662, + "step": 59180 + }, + { + "epoch": 1160.59, + "learning_rate": 4.419224989447495e-07, + "loss": 0.3682, + "step": 59190 + }, + { + "epoch": 1160.78, + "learning_rate": 4.3754046841685264e-07, + "loss": 0.3694, + "step": 59200 + }, + { + "epoch": 1160.98, + "learning_rate": 4.3318020846423163e-07, + "loss": 0.3753, + "step": 59210 + }, + { + "epoch": 1161.0, + "eval_loss": 0.37434232234954834, + "eval_runtime": 2.3184, + "eval_samples_per_second": 982.993, + "eval_steps_per_second": 3.882, + "step": 59211 + }, + { + "epoch": 1161.18, + "learning_rate": 4.288417203599848e-07, + "loss": 0.3667, + "step": 59220 + }, + { + "epoch": 1161.37, + "learning_rate": 4.245250053708654e-07, + "loss": 0.3721, + "step": 59230 + }, + { + "epoch": 1161.57, + "learning_rate": 4.2023006475724874e-07, + "loss": 0.3694, + "step": 59240 + }, + { + "epoch": 1161.76, + "learning_rate": 4.159568997731566e-07, + "loss": 0.3677, + "step": 59250 + }, + { + "epoch": 1161.96, + "learning_rate": 4.11705511666266e-07, + "loss": 0.3663, + "step": 59260 + }, + { + "epoch": 1162.0, + "eval_loss": 0.3757706880569458, + "eval_runtime": 2.2639, + "eval_samples_per_second": 1006.674, + "eval_steps_per_second": 3.975, + "step": 59262 + }, + { + "epoch": 1162.16, + "learning_rate": 4.074759016778839e-07, + "loss": 0.3725, + "step": 59270 + }, + { + "epoch": 1162.35, + "learning_rate": 4.0326807104297255e-07, + "loss": 0.3687, + "step": 59280 + }, + { + "epoch": 1162.55, + "learning_rate": 3.990820209901074e-07, + "loss": 0.3672, + "step": 59290 + }, + { + "epoch": 1162.75, + "learning_rate": 3.9491775274153594e-07, + "loss": 0.3649, + "step": 59300 + }, + { + "epoch": 1162.94, + "learning_rate": 3.9077526751312705e-07, + "loss": 0.3694, + "step": 59310 + }, + { + "epoch": 1163.0, + "eval_loss": 0.3772204518318176, + "eval_runtime": 2.2967, + "eval_samples_per_second": 992.275, + "eval_steps_per_second": 3.919, + "step": 59313 + }, + { + "epoch": 1163.14, + "learning_rate": 3.866545665143883e-07, + "loss": 0.3713, + "step": 59320 + }, + { + "epoch": 1163.33, + "learning_rate": 3.8255565094847393e-07, + "loss": 0.3747, + "step": 59330 + }, + { + "epoch": 1163.53, + "learning_rate": 3.7847852201218496e-07, + "loss": 0.3671, + "step": 59340 + }, + { + "epoch": 1163.73, + "learning_rate": 3.7442318089594416e-07, + "loss": 0.3704, + "step": 59350 + }, + { + "epoch": 1163.92, + "learning_rate": 3.7038962878382094e-07, + "loss": 0.3657, + "step": 59360 + }, + { + "epoch": 1164.0, + "eval_loss": 0.37626099586486816, + "eval_runtime": 2.2934, + "eval_samples_per_second": 993.736, + "eval_steps_per_second": 3.924, + "step": 59364 + }, + { + "epoch": 1164.12, + "learning_rate": 3.663778668535233e-07, + "loss": 0.3707, + "step": 59370 + }, + { + "epoch": 1164.31, + "learning_rate": 3.6238789627640596e-07, + "loss": 0.3678, + "step": 59380 + }, + { + "epoch": 1164.51, + "learning_rate": 3.5841971821742863e-07, + "loss": 0.3652, + "step": 59390 + }, + { + "epoch": 1164.71, + "learning_rate": 3.5447333383523123e-07, + "loss": 0.3686, + "step": 59400 + }, + { + "epoch": 1164.9, + "learning_rate": 3.50548744282067e-07, + "loss": 0.3643, + "step": 59410 + }, + { + "epoch": 1165.0, + "eval_loss": 0.3769838213920593, + "eval_runtime": 2.2497, + "eval_samples_per_second": 1013.009, + "eval_steps_per_second": 4.0, + "step": 59415 + }, + { + "epoch": 1165.1, + "learning_rate": 3.466459507038277e-07, + "loss": 0.3708, + "step": 59420 + }, + { + "epoch": 1165.29, + "learning_rate": 3.427649542400351e-07, + "loss": 0.3676, + "step": 59430 + }, + { + "epoch": 1165.49, + "learning_rate": 3.389057560238578e-07, + "loss": 0.3694, + "step": 59440 + }, + { + "epoch": 1165.69, + "learning_rate": 3.3506835718209447e-07, + "loss": 0.369, + "step": 59450 + }, + { + "epoch": 1165.88, + "learning_rate": 3.312527588351821e-07, + "loss": 0.3679, + "step": 59460 + }, + { + "epoch": 1166.0, + "eval_loss": 0.3771611452102661, + "eval_runtime": 2.3268, + "eval_samples_per_second": 979.474, + "eval_steps_per_second": 3.868, + "step": 59466 + }, + { + "epoch": 1166.08, + "learning_rate": 3.274589620971879e-07, + "loss": 0.3707, + "step": 59470 + }, + { + "epoch": 1166.27, + "learning_rate": 3.236869680758175e-07, + "loss": 0.3632, + "step": 59480 + }, + { + "epoch": 1166.47, + "learning_rate": 3.1993677787241487e-07, + "loss": 0.3704, + "step": 59490 + }, + { + "epoch": 1166.67, + "learning_rate": 3.162083925819375e-07, + "loss": 0.3704, + "step": 59500 + }, + { + "epoch": 1166.86, + "learning_rate": 3.1250181329300626e-07, + "loss": 0.37, + "step": 59510 + }, + { + "epoch": 1167.0, + "eval_loss": 0.37242335081100464, + "eval_runtime": 2.238, + "eval_samples_per_second": 1018.333, + "eval_steps_per_second": 4.022, + "step": 59517 + }, + { + "epoch": 1167.06, + "learning_rate": 3.088170410878471e-07, + "loss": 0.3691, + "step": 59520 + }, + { + "epoch": 1167.25, + "learning_rate": 3.051540770423411e-07, + "loss": 0.3692, + "step": 59530 + }, + { + "epoch": 1167.45, + "learning_rate": 3.01512922225991e-07, + "loss": 0.374, + "step": 59540 + }, + { + "epoch": 1167.65, + "learning_rate": 2.9789357770192147e-07, + "loss": 0.369, + "step": 59550 + }, + { + "epoch": 1167.84, + "learning_rate": 2.942960445269121e-07, + "loss": 0.3693, + "step": 59560 + }, + { + "epoch": 1168.0, + "eval_loss": 0.3752315938472748, + "eval_runtime": 2.3291, + "eval_samples_per_second": 978.487, + "eval_steps_per_second": 3.864, + "step": 59568 + }, + { + "epoch": 1168.04, + "learning_rate": 2.9072032375136445e-07, + "loss": 0.3643, + "step": 59570 + }, + { + "epoch": 1168.24, + "learning_rate": 2.871664164192933e-07, + "loss": 0.3731, + "step": 59580 + }, + { + "epoch": 1168.43, + "learning_rate": 2.836343235683769e-07, + "loss": 0.3728, + "step": 59590 + }, + { + "epoch": 1168.63, + "learning_rate": 2.8012404622989873e-07, + "loss": 0.3686, + "step": 59600 + }, + { + "epoch": 1168.82, + "learning_rate": 2.766355854287888e-07, + "loss": 0.3705, + "step": 59610 + }, + { + "epoch": 1169.0, + "eval_loss": 0.37323248386383057, + "eval_runtime": 2.2636, + "eval_samples_per_second": 1006.791, + "eval_steps_per_second": 3.976, + "step": 59619 + }, + { + "epoch": 1169.02, + "learning_rate": 2.731689421835909e-07, + "loss": 0.3702, + "step": 59620 + }, + { + "epoch": 1169.22, + "learning_rate": 2.697241175064868e-07, + "loss": 0.3682, + "step": 59630 + }, + { + "epoch": 1169.41, + "learning_rate": 2.6630111240330543e-07, + "loss": 0.3734, + "step": 59640 + }, + { + "epoch": 1169.61, + "learning_rate": 2.6289992787347224e-07, + "loss": 0.3712, + "step": 59650 + }, + { + "epoch": 1169.8, + "learning_rate": 2.5952056491005126e-07, + "loss": 0.3656, + "step": 59660 + }, + { + "epoch": 1170.0, + "learning_rate": 2.5616302449976145e-07, + "loss": 0.3671, + "step": 59670 + }, + { + "epoch": 1170.0, + "eval_loss": 0.37673383951187134, + "eval_runtime": 2.2475, + "eval_samples_per_second": 1013.999, + "eval_steps_per_second": 4.004, + "step": 59670 + }, + { + "epoch": 1170.2, + "learning_rate": 2.528273076229187e-07, + "loss": 0.3696, + "step": 59680 + }, + { + "epoch": 1170.39, + "learning_rate": 2.495134152534772e-07, + "loss": 0.3731, + "step": 59690 + }, + { + "epoch": 1170.59, + "learning_rate": 2.4622134835901307e-07, + "loss": 0.3657, + "step": 59700 + }, + { + "epoch": 1170.78, + "learning_rate": 2.429511079007573e-07, + "loss": 0.3659, + "step": 59710 + }, + { + "epoch": 1170.98, + "learning_rate": 2.397026948335212e-07, + "loss": 0.3729, + "step": 59720 + }, + { + "epoch": 1171.0, + "eval_loss": 0.37225037813186646, + "eval_runtime": 2.3919, + "eval_samples_per_second": 952.798, + "eval_steps_per_second": 3.763, + "step": 59721 + }, + { + "epoch": 1171.18, + "learning_rate": 2.364761101057877e-07, + "loss": 0.3698, + "step": 59730 + }, + { + "epoch": 1171.37, + "learning_rate": 2.3327135465964487e-07, + "loss": 0.3671, + "step": 59740 + }, + { + "epoch": 1171.57, + "learning_rate": 2.3008842943080253e-07, + "loss": 0.3752, + "step": 59750 + }, + { + "epoch": 1171.76, + "learning_rate": 2.2692733534860896e-07, + "loss": 0.371, + "step": 59760 + }, + { + "epoch": 1171.96, + "learning_rate": 2.2378807333603421e-07, + "loss": 0.3701, + "step": 59770 + }, + { + "epoch": 1172.0, + "eval_loss": 0.3768444359302521, + "eval_runtime": 2.2603, + "eval_samples_per_second": 1008.295, + "eval_steps_per_second": 3.982, + "step": 59772 + }, + { + "epoch": 1172.16, + "learning_rate": 2.2067064430967007e-07, + "loss": 0.3688, + "step": 59780 + }, + { + "epoch": 1172.35, + "learning_rate": 2.1757504917973013e-07, + "loss": 0.3652, + "step": 59790 + }, + { + "epoch": 1172.55, + "learning_rate": 2.1450128885007468e-07, + "loss": 0.3655, + "step": 59800 + }, + { + "epoch": 1172.75, + "learning_rate": 2.1144936421816083e-07, + "loss": 0.3707, + "step": 59810 + }, + { + "epoch": 1172.94, + "learning_rate": 2.0841927617508415e-07, + "loss": 0.3717, + "step": 59820 + }, + { + "epoch": 1173.0, + "eval_loss": 0.37819233536720276, + "eval_runtime": 2.391, + "eval_samples_per_second": 953.142, + "eval_steps_per_second": 3.764, + "step": 59823 + }, + { + "epoch": 1173.14, + "learning_rate": 2.0541102560556188e-07, + "loss": 0.3676, + "step": 59830 + }, + { + "epoch": 1173.33, + "learning_rate": 2.0242461338794979e-07, + "loss": 0.3685, + "step": 59840 + }, + { + "epoch": 1173.53, + "learning_rate": 1.9946004039420037e-07, + "loss": 0.3697, + "step": 59850 + }, + { + "epoch": 1173.73, + "learning_rate": 1.9651730748989624e-07, + "loss": 0.37, + "step": 59860 + }, + { + "epoch": 1173.92, + "learning_rate": 1.9359641553426675e-07, + "loss": 0.3716, + "step": 59870 + }, + { + "epoch": 1174.0, + "eval_loss": 0.37209558486938477, + "eval_runtime": 2.2294, + "eval_samples_per_second": 1022.25, + "eval_steps_per_second": 4.037, + "step": 59874 + }, + { + "epoch": 1174.12, + "learning_rate": 1.906973653801297e-07, + "loss": 0.3686, + "step": 59880 + }, + { + "epoch": 1174.31, + "learning_rate": 1.878201578739663e-07, + "loss": 0.368, + "step": 59890 + }, + { + "epoch": 1174.51, + "learning_rate": 1.8496479385583797e-07, + "loss": 0.3702, + "step": 59900 + }, + { + "epoch": 1174.71, + "learning_rate": 1.821312741594444e-07, + "loss": 0.3686, + "step": 59910 + }, + { + "epoch": 1174.9, + "learning_rate": 1.7931959961213215e-07, + "loss": 0.3723, + "step": 59920 + }, + { + "epoch": 1175.0, + "eval_loss": 0.3712206780910492, + "eval_runtime": 2.2517, + "eval_samples_per_second": 1012.146, + "eval_steps_per_second": 3.997, + "step": 59925 + }, + { + "epoch": 1175.1, + "learning_rate": 1.765297710348196e-07, + "loss": 0.3677, + "step": 59930 + }, + { + "epoch": 1175.29, + "learning_rate": 1.7376178924209673e-07, + "loss": 0.3676, + "step": 59940 + }, + { + "epoch": 1175.49, + "learning_rate": 1.710156550421421e-07, + "loss": 0.3669, + "step": 59950 + }, + { + "epoch": 1175.69, + "learning_rate": 1.6829136923677268e-07, + "loss": 0.3649, + "step": 59960 + }, + { + "epoch": 1175.88, + "learning_rate": 1.6558893262141048e-07, + "loss": 0.3674, + "step": 59970 + }, + { + "epoch": 1176.0, + "eval_loss": 0.37456443905830383, + "eval_runtime": 2.291, + "eval_samples_per_second": 994.763, + "eval_steps_per_second": 3.928, + "step": 59976 + }, + { + "epoch": 1176.08, + "learning_rate": 1.629083459851077e-07, + "loss": 0.3682, + "step": 59980 + }, + { + "epoch": 1176.27, + "learning_rate": 1.602496101105466e-07, + "loss": 0.3701, + "step": 59990 + }, + { + "epoch": 1176.47, + "learning_rate": 1.576127257740062e-07, + "loss": 0.3719, + "step": 60000 + }, + { + "epoch": 1176.67, + "learning_rate": 1.5499769374540394e-07, + "loss": 0.3672, + "step": 60010 + }, + { + "epoch": 1176.86, + "learning_rate": 1.5240451478826244e-07, + "loss": 0.365, + "step": 60020 + }, + { + "epoch": 1177.0, + "eval_loss": 0.37678277492523193, + "eval_runtime": 2.2264, + "eval_samples_per_second": 1023.605, + "eval_steps_per_second": 4.042, + "step": 60027 + }, + { + "epoch": 1177.06, + "learning_rate": 1.4983318965974267e-07, + "loss": 0.3681, + "step": 60030 + }, + { + "epoch": 1177.25, + "learning_rate": 1.4728371911061909e-07, + "loss": 0.3704, + "step": 60040 + }, + { + "epoch": 1177.45, + "learning_rate": 1.4475610388526294e-07, + "loss": 0.3682, + "step": 60050 + }, + { + "epoch": 1177.65, + "learning_rate": 1.4225034472169216e-07, + "loss": 0.3694, + "step": 60060 + }, + { + "epoch": 1177.84, + "learning_rate": 1.3976644235153823e-07, + "loss": 0.3725, + "step": 60070 + }, + { + "epoch": 1178.0, + "eval_loss": 0.3759779632091522, + "eval_runtime": 2.316, + "eval_samples_per_second": 984.038, + "eval_steps_per_second": 3.886, + "step": 60078 + }, + { + "epoch": 1178.04, + "learning_rate": 1.373043975000293e-07, + "loss": 0.3739, + "step": 60080 + }, + { + "epoch": 1178.24, + "learning_rate": 1.3486421088604038e-07, + "loss": 0.3693, + "step": 60090 + }, + { + "epoch": 1178.43, + "learning_rate": 1.324458832220432e-07, + "loss": 0.3683, + "step": 60100 + }, + { + "epoch": 1178.63, + "learning_rate": 1.300494152141396e-07, + "loss": 0.3689, + "step": 60110 + }, + { + "epoch": 1178.82, + "learning_rate": 1.2767480756205318e-07, + "loss": 0.3679, + "step": 60120 + }, + { + "epoch": 1179.0, + "eval_loss": 0.37421050667762756, + "eval_runtime": 2.2922, + "eval_samples_per_second": 994.254, + "eval_steps_per_second": 3.926, + "step": 60129 + }, + { + "epoch": 1179.02, + "learning_rate": 1.2532206095909604e-07, + "loss": 0.3686, + "step": 60130 + }, + { + "epoch": 1179.22, + "learning_rate": 1.229911760922353e-07, + "loss": 0.3644, + "step": 60140 + }, + { + "epoch": 1179.41, + "learning_rate": 1.2068215364203493e-07, + "loss": 0.3666, + "step": 60150 + }, + { + "epoch": 1179.61, + "learning_rate": 1.183949942826723e-07, + "loss": 0.3658, + "step": 60160 + }, + { + "epoch": 1179.8, + "learning_rate": 1.1612969868195488e-07, + "loss": 0.3681, + "step": 60170 + }, + { + "epoch": 1180.0, + "learning_rate": 1.1388626750128693e-07, + "loss": 0.3707, + "step": 60180 + }, + { + "epoch": 1180.0, + "eval_loss": 0.37527692317962646, + "eval_runtime": 2.3791, + "eval_samples_per_second": 957.916, + "eval_steps_per_second": 3.783, + "step": 60180 + }, + { + "epoch": 1180.2, + "learning_rate": 1.1166470139570282e-07, + "loss": 0.3689, + "step": 60190 + }, + { + "epoch": 1180.39, + "learning_rate": 1.0946500101385869e-07, + "loss": 0.3663, + "step": 60200 + }, + { + "epoch": 1180.59, + "learning_rate": 1.0728716699801576e-07, + "loss": 0.366, + "step": 60210 + }, + { + "epoch": 1180.78, + "learning_rate": 1.0513119998404873e-07, + "loss": 0.3691, + "step": 60220 + }, + { + "epoch": 1180.98, + "learning_rate": 1.0299710060144572e-07, + "loss": 0.3698, + "step": 60230 + }, + { + "epoch": 1181.0, + "eval_loss": 0.373009592294693, + "eval_runtime": 2.3903, + "eval_samples_per_second": 953.454, + "eval_steps_per_second": 3.765, + "step": 60231 + }, + { + "epoch": 1181.18, + "learning_rate": 1.008848694733333e-07, + "loss": 0.3702, + "step": 60240 + }, + { + "epoch": 1181.37, + "learning_rate": 9.879450721642645e-08, + "loss": 0.3721, + "step": 60250 + }, + { + "epoch": 1181.57, + "learning_rate": 9.672601444106198e-08, + "loss": 0.3669, + "step": 60260 + }, + { + "epoch": 1181.76, + "learning_rate": 9.467939175119843e-08, + "loss": 0.3679, + "step": 60270 + }, + { + "epoch": 1181.96, + "learning_rate": 9.265463974439947e-08, + "loss": 0.3697, + "step": 60280 + }, + { + "epoch": 1182.0, + "eval_loss": 0.3748382329940796, + "eval_runtime": 2.359, + "eval_samples_per_second": 966.069, + "eval_steps_per_second": 3.815, + "step": 60282 + }, + { + "epoch": 1182.16, + "learning_rate": 9.065175901185052e-08, + "loss": 0.3702, + "step": 60290 + }, + { + "epoch": 1182.35, + "learning_rate": 8.867075013834213e-08, + "loss": 0.3686, + "step": 60300 + }, + { + "epoch": 1182.55, + "learning_rate": 8.671161370229496e-08, + "loss": 0.3706, + "step": 60310 + }, + { + "epoch": 1182.75, + "learning_rate": 8.477435027572643e-08, + "loss": 0.373, + "step": 60320 + }, + { + "epoch": 1182.94, + "learning_rate": 8.285896042427576e-08, + "loss": 0.368, + "step": 60330 + }, + { + "epoch": 1183.0, + "eval_loss": 0.37221765518188477, + "eval_runtime": 2.3579, + "eval_samples_per_second": 966.524, + "eval_steps_per_second": 3.817, + "step": 60333 + }, + { + "epoch": 1183.14, + "learning_rate": 8.096544470719557e-08, + "loss": 0.37, + "step": 60340 + }, + { + "epoch": 1183.33, + "learning_rate": 7.909380367735197e-08, + "loss": 0.3684, + "step": 60350 + }, + { + "epoch": 1183.53, + "learning_rate": 7.724403788121614e-08, + "loss": 0.3696, + "step": 60360 + }, + { + "epoch": 1183.73, + "learning_rate": 7.541614785888105e-08, + "loss": 0.3703, + "step": 60370 + }, + { + "epoch": 1183.92, + "learning_rate": 7.361013414405315e-08, + "loss": 0.3689, + "step": 60380 + }, + { + "epoch": 1184.0, + "eval_loss": 0.3724251091480255, + "eval_runtime": 2.2633, + "eval_samples_per_second": 1006.959, + "eval_steps_per_second": 3.977, + "step": 60384 + }, + { + "epoch": 1184.12, + "learning_rate": 7.182599726404393e-08, + "loss": 0.3701, + "step": 60390 + }, + { + "epoch": 1184.31, + "learning_rate": 7.006373773977836e-08, + "loss": 0.3723, + "step": 60400 + }, + { + "epoch": 1184.51, + "learning_rate": 6.832335608581152e-08, + "loss": 0.3658, + "step": 60410 + }, + { + "epoch": 1184.71, + "learning_rate": 6.660485281027861e-08, + "loss": 0.3747, + "step": 60420 + }, + { + "epoch": 1184.9, + "learning_rate": 6.490822841495324e-08, + "loss": 0.3667, + "step": 60430 + }, + { + "epoch": 1185.0, + "eval_loss": 0.3731216490268707, + "eval_runtime": 2.4824, + "eval_samples_per_second": 918.05, + "eval_steps_per_second": 3.625, + "step": 60435 + }, + { + "epoch": 1185.1, + "learning_rate": 6.323348339521416e-08, + "loss": 0.366, + "step": 60440 + }, + { + "epoch": 1185.29, + "learning_rate": 6.15806182400369e-08, + "loss": 0.3657, + "step": 60450 + }, + { + "epoch": 1185.49, + "learning_rate": 5.99496334320354e-08, + "loss": 0.3673, + "step": 60460 + }, + { + "epoch": 1185.69, + "learning_rate": 5.8340529447420403e-08, + "loss": 0.3743, + "step": 60470 + }, + { + "epoch": 1185.88, + "learning_rate": 5.675330675600775e-08, + "loss": 0.3708, + "step": 60480 + }, + { + "epoch": 1186.0, + "eval_loss": 0.3785109519958496, + "eval_runtime": 2.3042, + "eval_samples_per_second": 989.051, + "eval_steps_per_second": 3.906, + "step": 60486 + }, + { + "epoch": 1186.08, + "learning_rate": 5.5187965821226755e-08, + "loss": 0.3672, + "step": 60490 + }, + { + "epoch": 1186.27, + "learning_rate": 5.3644507100128466e-08, + "loss": 0.3723, + "step": 60500 + }, + { + "epoch": 1186.47, + "learning_rate": 5.212293104337739e-08, + "loss": 0.3683, + "step": 60510 + }, + { + "epoch": 1186.67, + "learning_rate": 5.062323809522651e-08, + "loss": 0.3655, + "step": 60520 + }, + { + "epoch": 1186.86, + "learning_rate": 4.9145428693550536e-08, + "loss": 0.3684, + "step": 60530 + }, + { + "epoch": 1187.0, + "eval_loss": 0.3754968047142029, + "eval_runtime": 2.2349, + "eval_samples_per_second": 1019.716, + "eval_steps_per_second": 4.027, + "step": 60537 + }, + { + "epoch": 1187.06, + "learning_rate": 4.7689503269846e-08, + "loss": 0.3687, + "step": 60540 + }, + { + "epoch": 1187.25, + "learning_rate": 4.6255462249214505e-08, + "loss": 0.3706, + "step": 60550 + }, + { + "epoch": 1187.45, + "learning_rate": 4.484330605034614e-08, + "loss": 0.3703, + "step": 60560 + }, + { + "epoch": 1187.65, + "learning_rate": 4.345303508557774e-08, + "loss": 0.3698, + "step": 60570 + }, + { + "epoch": 1187.84, + "learning_rate": 4.208464976082626e-08, + "loss": 0.3701, + "step": 60580 + }, + { + "epoch": 1188.0, + "eval_loss": 0.3774392902851105, + "eval_runtime": 2.3768, + "eval_samples_per_second": 958.862, + "eval_steps_per_second": 3.787, + "step": 60588 + }, + { + "epoch": 1188.04, + "learning_rate": 4.073815047563878e-08, + "loss": 0.3723, + "step": 60590 + }, + { + "epoch": 1188.24, + "learning_rate": 3.94135376231508e-08, + "loss": 0.3624, + "step": 60600 + }, + { + "epoch": 1188.43, + "learning_rate": 3.811081159013629e-08, + "loss": 0.3694, + "step": 60610 + }, + { + "epoch": 1188.63, + "learning_rate": 3.6829972756941e-08, + "loss": 0.3766, + "step": 60620 + }, + { + "epoch": 1188.82, + "learning_rate": 3.5571021497557415e-08, + "loss": 0.3685, + "step": 60630 + }, + { + "epoch": 1189.0, + "eval_loss": 0.3732983469963074, + "eval_runtime": 2.243, + "eval_samples_per_second": 1016.052, + "eval_steps_per_second": 4.012, + "step": 60639 + }, + { + "epoch": 1189.02, + "learning_rate": 3.4333958179566526e-08, + "loss": 0.3683, + "step": 60640 + }, + { + "epoch": 1189.22, + "learning_rate": 3.311878316416272e-08, + "loss": 0.3726, + "step": 60650 + }, + { + "epoch": 1189.41, + "learning_rate": 3.192549680615386e-08, + "loss": 0.3681, + "step": 60660 + }, + { + "epoch": 1189.61, + "learning_rate": 3.075409945394458e-08, + "loss": 0.3716, + "step": 60670 + }, + { + "epoch": 1189.8, + "learning_rate": 2.9604591449569614e-08, + "loss": 0.3659, + "step": 60680 + }, + { + "epoch": 1190.0, + "learning_rate": 2.8476973128643832e-08, + "loss": 0.37, + "step": 60690 + }, + { + "epoch": 1190.0, + "eval_loss": 0.3773120641708374, + "eval_runtime": 2.3368, + "eval_samples_per_second": 975.255, + "eval_steps_per_second": 3.851, + "step": 60690 + }, + { + "epoch": 1190.2, + "learning_rate": 2.7371244820420524e-08, + "loss": 0.3676, + "step": 60700 + }, + { + "epoch": 1190.39, + "learning_rate": 2.6287406847733115e-08, + "loss": 0.3668, + "step": 60710 + }, + { + "epoch": 1190.59, + "learning_rate": 2.522545952705346e-08, + "loss": 0.3667, + "step": 60720 + }, + { + "epoch": 1190.78, + "learning_rate": 2.4185403168441863e-08, + "loss": 0.372, + "step": 60730 + }, + { + "epoch": 1190.98, + "learning_rate": 2.3167238075563754e-08, + "loss": 0.372, + "step": 60740 + }, + { + "epoch": 1191.0, + "eval_loss": 0.37607088685035706, + "eval_runtime": 2.2766, + "eval_samples_per_second": 1001.046, + "eval_steps_per_second": 3.953, + "step": 60741 + }, + { + "epoch": 1191.18, + "learning_rate": 2.2170964545714653e-08, + "loss": 0.3716, + "step": 60750 + }, + { + "epoch": 1191.37, + "learning_rate": 2.1196582869770217e-08, + "loss": 0.3714, + "step": 60760 + }, + { + "epoch": 1191.57, + "learning_rate": 2.0244093332227874e-08, + "loss": 0.3682, + "step": 60770 + }, + { + "epoch": 1191.76, + "learning_rate": 1.9313496211206813e-08, + "loss": 0.3675, + "step": 60780 + }, + { + "epoch": 1191.96, + "learning_rate": 1.8404791778414697e-08, + "loss": 0.3677, + "step": 60790 + }, + { + "epoch": 1192.0, + "eval_loss": 0.37334564328193665, + "eval_runtime": 2.2903, + "eval_samples_per_second": 995.079, + "eval_steps_per_second": 3.93, + "step": 60792 + }, + { + "epoch": 1192.16, + "learning_rate": 1.7517980299172618e-08, + "loss": 0.3719, + "step": 60800 + }, + { + "epoch": 1192.35, + "learning_rate": 1.6653062032406796e-08, + "loss": 0.3668, + "step": 60810 + }, + { + "epoch": 1192.55, + "learning_rate": 1.5810037230648553e-08, + "loss": 0.3675, + "step": 60820 + }, + { + "epoch": 1192.75, + "learning_rate": 1.498890614005932e-08, + "loss": 0.3713, + "step": 60830 + }, + { + "epoch": 1192.94, + "learning_rate": 1.4189669000380654e-08, + "loss": 0.367, + "step": 60840 + }, + { + "epoch": 1193.0, + "eval_loss": 0.37703248858451843, + "eval_runtime": 2.3099, + "eval_samples_per_second": 986.616, + "eval_steps_per_second": 3.896, + "step": 60843 + }, + { + "epoch": 1193.14, + "learning_rate": 1.3412326044967559e-08, + "loss": 0.3731, + "step": 60850 + }, + { + "epoch": 1193.33, + "learning_rate": 1.2656877500796803e-08, + "loss": 0.3671, + "step": 60860 + }, + { + "epoch": 1193.53, + "learning_rate": 1.192332358843362e-08, + "loss": 0.3688, + "step": 60870 + }, + { + "epoch": 1193.73, + "learning_rate": 1.1211664522065012e-08, + "loss": 0.3713, + "step": 60880 + }, + { + "epoch": 1193.92, + "learning_rate": 1.0521900509474768e-08, + "loss": 0.3641, + "step": 60890 + }, + { + "epoch": 1194.0, + "eval_loss": 0.3731459081172943, + "eval_runtime": 2.3405, + "eval_samples_per_second": 973.728, + "eval_steps_per_second": 3.845, + "step": 60894 + }, + { + "epoch": 1194.12, + "learning_rate": 9.854031752068447e-09, + "loss": 0.3639, + "step": 60900 + }, + { + "epoch": 1194.31, + "learning_rate": 9.208058444840072e-09, + "loss": 0.371, + "step": 60910 + }, + { + "epoch": 1194.51, + "learning_rate": 8.583980776397104e-09, + "loss": 0.3663, + "step": 60920 + }, + { + "epoch": 1194.71, + "learning_rate": 7.981798928968775e-09, + "loss": 0.3714, + "step": 60930 + }, + { + "epoch": 1194.9, + "learning_rate": 7.401513078364452e-09, + "loss": 0.3679, + "step": 60940 + }, + { + "epoch": 1195.0, + "eval_loss": 0.37386608123779297, + "eval_runtime": 2.2252, + "eval_samples_per_second": 1024.179, + "eval_steps_per_second": 4.045, + "step": 60945 + }, + { + "epoch": 1195.1, + "learning_rate": 6.843123394023598e-09, + "loss": 0.3725, + "step": 60950 + }, + { + "epoch": 1195.29, + "learning_rate": 6.306630038982463e-09, + "loss": 0.3694, + "step": 60960 + }, + { + "epoch": 1195.49, + "learning_rate": 5.792033169882415e-09, + "loss": 0.3669, + "step": 60970 + }, + { + "epoch": 1195.69, + "learning_rate": 5.299332936969935e-09, + "loss": 0.3689, + "step": 60980 + }, + { + "epoch": 1195.88, + "learning_rate": 4.8285294841132745e-09, + "loss": 0.3709, + "step": 60990 + }, + { + "epoch": 1196.0, + "eval_loss": 0.37305885553359985, + "eval_runtime": 2.3119, + "eval_samples_per_second": 985.777, + "eval_steps_per_second": 3.893, + "step": 60996 + }, + { + "epoch": 1196.08, + "learning_rate": 4.3796229487774725e-09, + "loss": 0.3687, + "step": 61000 + }, + { + "epoch": 1196.27, + "learning_rate": 3.95261346201603e-09, + "loss": 0.3692, + "step": 61010 + }, + { + "epoch": 1196.47, + "learning_rate": 3.5475011485208703e-09, + "loss": 0.3659, + "step": 61020 + }, + { + "epoch": 1196.67, + "learning_rate": 3.1642861265723794e-09, + "loss": 0.3662, + "step": 61030 + }, + { + "epoch": 1196.86, + "learning_rate": 2.802968508064385e-09, + "loss": 0.3668, + "step": 61040 + }, + { + "epoch": 1197.0, + "eval_loss": 0.37835466861724854, + "eval_runtime": 2.3758, + "eval_samples_per_second": 959.262, + "eval_steps_per_second": 3.788, + "step": 61047 + }, + { + "epoch": 1197.06, + "learning_rate": 2.4635483984875025e-09, + "loss": 0.3672, + "step": 61050 + }, + { + "epoch": 1197.25, + "learning_rate": 2.146025896945791e-09, + "loss": 0.3718, + "step": 61060 + }, + { + "epoch": 1197.45, + "learning_rate": 1.8504010961484238e-09, + "loss": 0.3628, + "step": 61070 + }, + { + "epoch": 1197.65, + "learning_rate": 1.576674082418017e-09, + "loss": 0.3723, + "step": 61080 + }, + { + "epoch": 1197.84, + "learning_rate": 1.3248449356739743e-09, + "loss": 0.3678, + "step": 61090 + }, + { + "epoch": 1198.0, + "eval_loss": 0.37536975741386414, + "eval_runtime": 2.2755, + "eval_samples_per_second": 1001.548, + "eval_steps_per_second": 3.955, + "step": 61098 + }, + { + "epoch": 1198.04, + "learning_rate": 1.0949137294324895e-09, + "loss": 0.365, + "step": 61100 + }, + { + "epoch": 1198.24, + "learning_rate": 8.868805308481775e-10, + "loss": 0.3705, + "step": 61110 + }, + { + "epoch": 1198.43, + "learning_rate": 7.007454006474623e-10, + "loss": 0.3716, + "step": 61120 + }, + { + "epoch": 1198.63, + "learning_rate": 5.365083931785363e-10, + "loss": 0.3701, + "step": 61130 + }, + { + "epoch": 1198.82, + "learning_rate": 3.9416955640303447e-10, + "loss": 0.3642, + "step": 61140 + }, + { + "epoch": 1199.0, + "eval_loss": 0.3795470595359802, + "eval_runtime": 2.3862, + "eval_samples_per_second": 955.076, + "eval_steps_per_second": 3.772, + "step": 61149 + }, + { + "epoch": 1199.02, + "learning_rate": 2.7372893187938003e-10, + "loss": 0.3652, + "step": 61150 + }, + { + "epoch": 1199.22, + "learning_rate": 1.751865547627851e-10, + "loss": 0.3687, + "step": 61160 + }, + { + "epoch": 1199.41, + "learning_rate": 9.854245383855708e-11, + "loss": 0.3639, + "step": 61170 + }, + { + "epoch": 1199.61, + "learning_rate": 4.379665148046552e-11, + "loss": 0.3695, + "step": 61180 + }, + { + "epoch": 1199.8, + "learning_rate": 1.0949163667395289e-11, + "loss": 0.3652, + "step": 61190 + }, + { + "epoch": 1200.0, + "learning_rate": 0.0, + "loss": 0.3717, + "step": 61200 + }, + { + "epoch": 1200.0, + "eval_loss": 0.3766399323940277, + "eval_runtime": 2.4193, + "eval_samples_per_second": 942.017, + "eval_steps_per_second": 3.72, + "step": 61200 + }, + { + "epoch": 1200.0, + "step": 61200, + "total_flos": 1.6045884532435452e+21, + "train_loss": 0.47761312031667996, + "train_runtime": 27118.5193, + "train_samples_per_second": 571.314, + "train_steps_per_second": 2.257 + } + ], + "max_steps": 61200, + "num_train_epochs": 1200, + "total_flos": 1.6045884532435452e+21, + "trial_name": null, + "trial_params": null +}