diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,104736 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.00680275, + "global_step": 1700000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 2.0179, + "step": 100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.9412, + "step": 200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.918, + "step": 300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.906, + "step": 400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8979, + "step": 500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8955, + "step": 600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.887, + "step": 700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8873, + "step": 800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8844, + "step": 900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8842, + "step": 1000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.872, + "step": 1100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8773, + "step": 1200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8775, + "step": 1300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8627, + "step": 1400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.875, + "step": 1500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8666, + "step": 1600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.869, + "step": 1700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8593, + "step": 1800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8715, + "step": 1900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8704, + "step": 2000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8658, + "step": 2100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8662, + "step": 2200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8616, + "step": 2300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8675, + "step": 2400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8675, + "step": 2500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8636, + "step": 2600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8626, + "step": 2700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8652, + "step": 2800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8607, + "step": 2900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8548, + "step": 3000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8571, + "step": 3100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8524, + "step": 3200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8551, + "step": 3300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.866, + "step": 3400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8453, + "step": 3500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8572, + "step": 3600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8596, + "step": 3700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8607, + "step": 3800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.859, + "step": 3900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8548, + "step": 4000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8657, + "step": 4100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8452, + "step": 4200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8466, + "step": 4300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8484, + "step": 4400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8459, + "step": 4500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8452, + "step": 4600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8515, + "step": 4700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8554, + "step": 4800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8528, + "step": 4900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8565, + "step": 5000 + }, + { + "epoch": 0.0, + "eval_loss": 0.7948816418647766, + "eval_runtime": 204.858, + "eval_samples_per_second": 244.072, + "eval_steps_per_second": 1.909, + "step": 5000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8474, + "step": 5100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8506, + "step": 5200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8418, + "step": 5300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8567, + "step": 5400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8483, + "step": 5500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8473, + "step": 5600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8454, + "step": 5700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8499, + "step": 5800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.841, + "step": 5900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8545, + "step": 6000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8393, + "step": 6100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8434, + "step": 6200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8405, + "step": 6300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8464, + "step": 6400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8456, + "step": 6500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8466, + "step": 6600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8451, + "step": 6700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8452, + "step": 6800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.844, + "step": 6900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8416, + "step": 7000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8453, + "step": 7100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8544, + "step": 7200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8442, + "step": 7300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.838, + "step": 7400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8398, + "step": 7500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8431, + "step": 7600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8431, + "step": 7700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8531, + "step": 7800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8326, + "step": 7900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8417, + "step": 8000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8506, + "step": 8100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.848, + "step": 8200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.84, + "step": 8300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8362, + "step": 8400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8445, + "step": 8500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8392, + "step": 8600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8361, + "step": 8700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8354, + "step": 8800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8373, + "step": 8900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8268, + "step": 9000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8424, + "step": 9100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8322, + "step": 9200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8407, + "step": 9300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8364, + "step": 9400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8346, + "step": 9500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8368, + "step": 9600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8377, + "step": 9700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8372, + "step": 9800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8392, + "step": 9900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8439, + "step": 10000 + }, + { + "epoch": 0.0, + "eval_loss": 0.7838985323905945, + "eval_runtime": 204.5164, + "eval_samples_per_second": 244.479, + "eval_steps_per_second": 1.912, + "step": 10000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.834, + "step": 10100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8392, + "step": 10200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8345, + "step": 10300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8403, + "step": 10400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8346, + "step": 10500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8348, + "step": 10600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8424, + "step": 10700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8338, + "step": 10800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8361, + "step": 10900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8398, + "step": 11000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8307, + "step": 11100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8302, + "step": 11200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8402, + "step": 11300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8289, + "step": 11400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8347, + "step": 11500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8312, + "step": 11600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.834, + "step": 11700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8324, + "step": 11800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8428, + "step": 11900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8356, + "step": 12000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8369, + "step": 12100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.83, + "step": 12200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8272, + "step": 12300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8345, + "step": 12400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8301, + "step": 12500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8286, + "step": 12600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8309, + "step": 12700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8253, + "step": 12800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8319, + "step": 12900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8258, + "step": 13000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.835, + "step": 13100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8318, + "step": 13200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8337, + "step": 13300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8283, + "step": 13400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.832, + "step": 13500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8352, + "step": 13600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8319, + "step": 13700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8359, + "step": 13800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8321, + "step": 13900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8316, + "step": 14000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8304, + "step": 14100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.828, + "step": 14200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8294, + "step": 14300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8345, + "step": 14400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8276, + "step": 14500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8254, + "step": 14600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8362, + "step": 14700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8353, + "step": 14800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8346, + "step": 14900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8313, + "step": 15000 + }, + { + "epoch": 0.0, + "eval_loss": 0.7768447399139404, + "eval_runtime": 202.8477, + "eval_samples_per_second": 246.49, + "eval_steps_per_second": 1.928, + "step": 15000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8296, + "step": 15100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8347, + "step": 15200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.829, + "step": 15300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8262, + "step": 15400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8319, + "step": 15500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8256, + "step": 15600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8267, + "step": 15700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.824, + "step": 15800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8318, + "step": 15900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8257, + "step": 16000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.825, + "step": 16100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8244, + "step": 16200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8158, + "step": 16300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8251, + "step": 16400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8259, + "step": 16500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8262, + "step": 16600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8228, + "step": 16700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.823, + "step": 16800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8242, + "step": 16900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8274, + "step": 17000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8256, + "step": 17100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8273, + "step": 17200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8195, + "step": 17300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8223, + "step": 17400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8245, + "step": 17500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8207, + "step": 17600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.82, + "step": 17700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8275, + "step": 17800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8329, + "step": 17900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8157, + "step": 18000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8308, + "step": 18100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8239, + "step": 18200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8254, + "step": 18300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.821, + "step": 18400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8293, + "step": 18500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.821, + "step": 18600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8142, + "step": 18700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.829, + "step": 18800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8251, + "step": 18900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8339, + "step": 19000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8252, + "step": 19100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8246, + "step": 19200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8289, + "step": 19300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8295, + "step": 19400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8255, + "step": 19500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8204, + "step": 19600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.815, + "step": 19700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8156, + "step": 19800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8201, + "step": 19900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8209, + "step": 20000 + }, + { + "epoch": 0.0, + "eval_loss": 0.7698022723197937, + "eval_runtime": 205.3954, + "eval_samples_per_second": 243.433, + "eval_steps_per_second": 1.904, + "step": 20000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8159, + "step": 20100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8257, + "step": 20200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8302, + "step": 20300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8291, + "step": 20400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8157, + "step": 20500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8214, + "step": 20600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8203, + "step": 20700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8253, + "step": 20800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8116, + "step": 20900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8218, + "step": 21000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8173, + "step": 21100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8247, + "step": 21200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8152, + "step": 21300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8124, + "step": 21400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8185, + "step": 21500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8243, + "step": 21600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8289, + "step": 21700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8208, + "step": 21800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8195, + "step": 21900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8239, + "step": 22000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8157, + "step": 22100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8215, + "step": 22200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8205, + "step": 22300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8174, + "step": 22400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8265, + "step": 22500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8162, + "step": 22600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8143, + "step": 22700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8233, + "step": 22800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8104, + "step": 22900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8176, + "step": 23000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8196, + "step": 23100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8301, + "step": 23200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8213, + "step": 23300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8205, + "step": 23400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8099, + "step": 23500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8203, + "step": 23600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8176, + "step": 23700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.822, + "step": 23800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8208, + "step": 23900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8172, + "step": 24000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8251, + "step": 24100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8111, + "step": 24200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8215, + "step": 24300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8133, + "step": 24400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.818, + "step": 24500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8173, + "step": 24600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8145, + "step": 24700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8226, + "step": 24800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8236, + "step": 24900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8213, + "step": 25000 + }, + { + "epoch": 0.0, + "eval_loss": 0.7671163082122803, + "eval_runtime": 204.3887, + "eval_samples_per_second": 244.632, + "eval_steps_per_second": 1.913, + "step": 25000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8148, + "step": 25100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8203, + "step": 25200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8242, + "step": 25300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8179, + "step": 25400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8132, + "step": 25500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8109, + "step": 25600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8088, + "step": 25700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8106, + "step": 25800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8136, + "step": 25900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.817, + "step": 26000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.812, + "step": 26100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.813, + "step": 26200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8125, + "step": 26300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8123, + "step": 26400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8154, + "step": 26500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8067, + "step": 26600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8138, + "step": 26700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8248, + "step": 26800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.812, + "step": 26900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8133, + "step": 27000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.822, + "step": 27100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8189, + "step": 27200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8089, + "step": 27300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8136, + "step": 27400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8129, + "step": 27500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8155, + "step": 27600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8175, + "step": 27700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 27800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8169, + "step": 27900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8102, + "step": 28000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8208, + "step": 28100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8125, + "step": 28200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8127, + "step": 28300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8126, + "step": 28400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8104, + "step": 28500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8122, + "step": 28600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8179, + "step": 28700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8117, + "step": 28800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8073, + "step": 28900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8094, + "step": 29000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8165, + "step": 29100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8109, + "step": 29200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8134, + "step": 29300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8132, + "step": 29400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8163, + "step": 29500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8153, + "step": 29600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8097, + "step": 29700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8138, + "step": 29800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8085, + "step": 29900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8121, + "step": 30000 + }, + { + "epoch": 0.0, + "eval_loss": 0.7610729932785034, + "eval_runtime": 206.7211, + "eval_samples_per_second": 241.872, + "eval_steps_per_second": 1.891, + "step": 30000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8162, + "step": 30100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8095, + "step": 30200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8105, + "step": 30300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8175, + "step": 30400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8164, + "step": 30500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8066, + "step": 30600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8135, + "step": 30700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8173, + "step": 30800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8058, + "step": 30900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8207, + "step": 31000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.812, + "step": 31100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8066, + "step": 31200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8089, + "step": 31300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8145, + "step": 31400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8066, + "step": 31500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8147, + "step": 31600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8141, + "step": 31700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8064, + "step": 31800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.814, + "step": 31900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.818, + "step": 32000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8059, + "step": 32100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.815, + "step": 32200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8129, + "step": 32300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8103, + "step": 32400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8152, + "step": 32500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.808, + "step": 32600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8147, + "step": 32700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8097, + "step": 32800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8152, + "step": 32900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8054, + "step": 33000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8183, + "step": 33100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8089, + "step": 33200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8135, + "step": 33300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8109, + "step": 33400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 33500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.814, + "step": 33600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8031, + "step": 33700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8024, + "step": 33800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8113, + "step": 33900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8127, + "step": 34000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8107, + "step": 34100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 34200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8112, + "step": 34300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8042, + "step": 34400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8091, + "step": 34500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8047, + "step": 34600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8091, + "step": 34700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8105, + "step": 34800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8144, + "step": 34900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8092, + "step": 35000 + }, + { + "epoch": 0.0, + "eval_loss": 0.7591666579246521, + "eval_runtime": 205.1234, + "eval_samples_per_second": 243.756, + "eval_steps_per_second": 1.906, + "step": 35000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8158, + "step": 35100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8112, + "step": 35200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.811, + "step": 35300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8102, + "step": 35400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8084, + "step": 35500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8089, + "step": 35600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8135, + "step": 35700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8075, + "step": 35800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8118, + "step": 35900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8034, + "step": 36000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8166, + "step": 36100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.818, + "step": 36200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.807, + "step": 36300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8169, + "step": 36400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8164, + "step": 36500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8101, + "step": 36600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8079, + "step": 36700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8126, + "step": 36800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8072, + "step": 36900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8053, + "step": 37000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8063, + "step": 37100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8075, + "step": 37200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8049, + "step": 37300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8071, + "step": 37400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8033, + "step": 37500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8092, + "step": 37600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.81, + "step": 37700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8122, + "step": 37800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8084, + "step": 37900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.813, + "step": 38000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8036, + "step": 38100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8048, + "step": 38200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8038, + "step": 38300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8009, + "step": 38400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8074, + "step": 38500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8083, + "step": 38600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.7932, + "step": 38700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8065, + "step": 38800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.7998, + "step": 38900 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8116, + "step": 39000 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8023, + "step": 39100 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8137, + "step": 39200 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8076, + "step": 39300 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8098, + "step": 39400 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8028, + "step": 39500 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8041, + "step": 39600 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.7989, + "step": 39700 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8083, + "step": 39800 + }, + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 0.8003, + "step": 39900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.808, + "step": 40000 + }, + { + "epoch": 0.01, + "eval_loss": 0.757146418094635, + "eval_runtime": 205.0093, + "eval_samples_per_second": 243.891, + "eval_steps_per_second": 1.907, + "step": 40000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8112, + "step": 40100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7983, + "step": 40200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8066, + "step": 40300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8092, + "step": 40400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8075, + "step": 40500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8061, + "step": 40600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8054, + "step": 40700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8071, + "step": 40800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8105, + "step": 40900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8053, + "step": 41000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8064, + "step": 41100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8088, + "step": 41200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.809, + "step": 41300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8029, + "step": 41400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8052, + "step": 41500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7991, + "step": 41600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8055, + "step": 41700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8038, + "step": 41800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8041, + "step": 41900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8033, + "step": 42000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8125, + "step": 42100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8087, + "step": 42200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8015, + "step": 42300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8064, + "step": 42400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8043, + "step": 42500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8082, + "step": 42600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8053, + "step": 42700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8012, + "step": 42800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8082, + "step": 42900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8114, + "step": 43000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8012, + "step": 43100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8011, + "step": 43200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8035, + "step": 43300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8099, + "step": 43400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8049, + "step": 43500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8112, + "step": 43600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8115, + "step": 43700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.804, + "step": 43800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8126, + "step": 43900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8053, + "step": 44000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8046, + "step": 44100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7977, + "step": 44200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8066, + "step": 44300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8042, + "step": 44400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7989, + "step": 44500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8102, + "step": 44600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8105, + "step": 44700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8098, + "step": 44800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8042, + "step": 44900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7997, + "step": 45000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7535812258720398, + "eval_runtime": 203.0127, + "eval_samples_per_second": 246.29, + "eval_steps_per_second": 1.926, + "step": 45000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7972, + "step": 45100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8005, + "step": 45200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8084, + "step": 45300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8072, + "step": 45400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8104, + "step": 45500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7954, + "step": 45600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8027, + "step": 45700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8109, + "step": 45800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8098, + "step": 45900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7953, + "step": 46000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8015, + "step": 46100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8039, + "step": 46200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8016, + "step": 46300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8059, + "step": 46400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8043, + "step": 46500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8015, + "step": 46600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8012, + "step": 46700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7971, + "step": 46800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8022, + "step": 46900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7949, + "step": 47000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8082, + "step": 47100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8008, + "step": 47200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8084, + "step": 47300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7934, + "step": 47400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8008, + "step": 47500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8089, + "step": 47600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8104, + "step": 47700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8055, + "step": 47800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8078, + "step": 47900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8056, + "step": 48000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7976, + "step": 48100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8091, + "step": 48200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7998, + "step": 48300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8086, + "step": 48400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8002, + "step": 48500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8044, + "step": 48600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8104, + "step": 48700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7998, + "step": 48800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.802, + "step": 48900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8096, + "step": 49000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8043, + "step": 49100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8009, + "step": 49200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7989, + "step": 49300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7979, + "step": 49400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7973, + "step": 49500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8013, + "step": 49600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7988, + "step": 49700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8051, + "step": 49800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7975, + "step": 49900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 50000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7516446113586426, + "eval_runtime": 205.3381, + "eval_samples_per_second": 243.501, + "eval_steps_per_second": 1.904, + "step": 50000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8062, + "step": 50100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8114, + "step": 50200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7993, + "step": 50300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8058, + "step": 50400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8062, + "step": 50500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7987, + "step": 50600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8012, + "step": 50700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.81, + "step": 50800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8095, + "step": 50900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7983, + "step": 51000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8015, + "step": 51100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8014, + "step": 51200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8092, + "step": 51300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8071, + "step": 51400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8085, + "step": 51500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8012, + "step": 51600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8012, + "step": 51700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7944, + "step": 51800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8111, + "step": 51900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8136, + "step": 52000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.791, + "step": 52100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8037, + "step": 52200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.798, + "step": 52300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8099, + "step": 52400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8014, + "step": 52500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7988, + "step": 52600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8044, + "step": 52700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8009, + "step": 52800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8058, + "step": 52900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7934, + "step": 53000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8045, + "step": 53100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8063, + "step": 53200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8086, + "step": 53300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7966, + "step": 53400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7961, + "step": 53500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.803, + "step": 53600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8001, + "step": 53700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8063, + "step": 53800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8047, + "step": 53900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.807, + "step": 54000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8009, + "step": 54100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8008, + "step": 54200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7991, + "step": 54300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 54400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8073, + "step": 54500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7919, + "step": 54600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7955, + "step": 54700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8035, + "step": 54800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7994, + "step": 54900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8085, + "step": 55000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7525084614753723, + "eval_runtime": 204.3549, + "eval_samples_per_second": 244.672, + "eval_steps_per_second": 1.913, + "step": 55000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 55100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8059, + "step": 55200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7999, + "step": 55300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 55400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7997, + "step": 55500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.797, + "step": 55600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7938, + "step": 55700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7983, + "step": 55800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 55900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8123, + "step": 56000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8031, + "step": 56100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8058, + "step": 56200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8066, + "step": 56300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8023, + "step": 56400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8018, + "step": 56500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7935, + "step": 56600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8068, + "step": 56700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7936, + "step": 56800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 56900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7978, + "step": 57000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.801, + "step": 57100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8017, + "step": 57200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7992, + "step": 57300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7999, + "step": 57400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7954, + "step": 57500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8033, + "step": 57600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7991, + "step": 57700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8016, + "step": 57800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7995, + "step": 57900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8027, + "step": 58000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7939, + "step": 58100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7999, + "step": 58200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7989, + "step": 58300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7873, + "step": 58400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8025, + "step": 58500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8011, + "step": 58600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8035, + "step": 58700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7941, + "step": 58800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8069, + "step": 58900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7931, + "step": 59000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8056, + "step": 59100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8015, + "step": 59200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.794, + "step": 59300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8043, + "step": 59400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7985, + "step": 59500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7937, + "step": 59600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8014, + "step": 59700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7991, + "step": 59800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8088, + "step": 59900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7926, + "step": 60000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7483974099159241, + "eval_runtime": 204.1773, + "eval_samples_per_second": 244.885, + "eval_steps_per_second": 1.915, + "step": 60000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7991, + "step": 60100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7959, + "step": 60200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8101, + "step": 60300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8033, + "step": 60400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7919, + "step": 60500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7908, + "step": 60600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7964, + "step": 60700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.798, + "step": 60800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.801, + "step": 60900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7974, + "step": 61000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.801, + "step": 61100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7997, + "step": 61200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.791, + "step": 61300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 61400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7883, + "step": 61500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 61600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7994, + "step": 61700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7966, + "step": 61800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7973, + "step": 61900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8029, + "step": 62000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 62100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7996, + "step": 62200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8, + "step": 62300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7922, + "step": 62400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7997, + "step": 62500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7978, + "step": 62600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.797, + "step": 62700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8031, + "step": 62800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8005, + "step": 62900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 63000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.797, + "step": 63100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7999, + "step": 63200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 63300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7905, + "step": 63400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8, + "step": 63500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8013, + "step": 63600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7962, + "step": 63700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7984, + "step": 63800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7936, + "step": 63900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7919, + "step": 64000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7932, + "step": 64100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7889, + "step": 64200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8014, + "step": 64300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8003, + "step": 64400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 64500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7923, + "step": 64600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7996, + "step": 64700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7984, + "step": 64800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 64900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8047, + "step": 65000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7496934533119202, + "eval_runtime": 203.6389, + "eval_samples_per_second": 245.533, + "eval_steps_per_second": 1.92, + "step": 65000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7968, + "step": 65100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7978, + "step": 65200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8041, + "step": 65300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7922, + "step": 65400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7993, + "step": 65500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 65600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7943, + "step": 65700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8029, + "step": 65800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7929, + "step": 65900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 66000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8007, + "step": 66100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7981, + "step": 66200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7972, + "step": 66300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8035, + "step": 66400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7933, + "step": 66500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7889, + "step": 66600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8019, + "step": 66700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7898, + "step": 66800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7928, + "step": 66900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8028, + "step": 67000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7983, + "step": 67100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 67200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8012, + "step": 67300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7961, + "step": 67400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 67500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7905, + "step": 67600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7996, + "step": 67700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7909, + "step": 67800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8064, + "step": 67900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7966, + "step": 68000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 68100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8021, + "step": 68200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7942, + "step": 68300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8015, + "step": 68400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 68500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7991, + "step": 68600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7998, + "step": 68700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 68800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7916, + "step": 68900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8064, + "step": 69000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7984, + "step": 69100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 69200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 69300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.794, + "step": 69400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7953, + "step": 69500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7908, + "step": 69600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7963, + "step": 69700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8021, + "step": 69800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.812, + "step": 69900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7998, + "step": 70000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7458716630935669, + "eval_runtime": 204.0562, + "eval_samples_per_second": 245.031, + "eval_steps_per_second": 1.916, + "step": 70000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7952, + "step": 70100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7979, + "step": 70200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7973, + "step": 70300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7874, + "step": 70400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.792, + "step": 70500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7923, + "step": 70600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7941, + "step": 70700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7941, + "step": 70800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7937, + "step": 70900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7899, + "step": 71000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7943, + "step": 71100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7962, + "step": 71200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7981, + "step": 71300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7916, + "step": 71400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7987, + "step": 71500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7944, + "step": 71600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7968, + "step": 71700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7915, + "step": 71800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 71900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7902, + "step": 72000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7905, + "step": 72100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8053, + "step": 72200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7997, + "step": 72300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7984, + "step": 72400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8014, + "step": 72500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 72600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7943, + "step": 72700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7996, + "step": 72800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7937, + "step": 72900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7917, + "step": 73000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7936, + "step": 73100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7936, + "step": 73200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7964, + "step": 73300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8022, + "step": 73400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8045, + "step": 73500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7955, + "step": 73600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7955, + "step": 73700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 73800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8026, + "step": 73900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 74000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7829, + "step": 74100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.796, + "step": 74200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 74300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7963, + "step": 74400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7941, + "step": 74500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7968, + "step": 74600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.806, + "step": 74700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7971, + "step": 74800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 74900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7974, + "step": 75000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7442731857299805, + "eval_runtime": 204.653, + "eval_samples_per_second": 244.316, + "eval_steps_per_second": 1.911, + "step": 75000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8016, + "step": 75100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7977, + "step": 75200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7925, + "step": 75300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7942, + "step": 75400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7965, + "step": 75500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7906, + "step": 75600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 75700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7933, + "step": 75800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 75900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 76000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7935, + "step": 76100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7996, + "step": 76200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7923, + "step": 76300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7881, + "step": 76400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 76500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7923, + "step": 76600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7879, + "step": 76700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7823, + "step": 76800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7982, + "step": 76900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7953, + "step": 77000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 77100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7944, + "step": 77200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 77300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7973, + "step": 77400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 77500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7931, + "step": 77600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7964, + "step": 77700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7833, + "step": 77800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7942, + "step": 77900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7909, + "step": 78000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7972, + "step": 78100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7961, + "step": 78200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7862, + "step": 78300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7856, + "step": 78400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.785, + "step": 78500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 78600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.79, + "step": 78700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7959, + "step": 78800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7872, + "step": 78900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7946, + "step": 79000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7914, + "step": 79100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 79200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7928, + "step": 79300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7967, + "step": 79400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7967, + "step": 79500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 79600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 79700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7918, + "step": 79800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8018, + "step": 79900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7919, + "step": 80000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7424513101577759, + "eval_runtime": 204.0239, + "eval_samples_per_second": 245.069, + "eval_steps_per_second": 1.916, + "step": 80000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7984, + "step": 80100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 80200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7914, + "step": 80300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7921, + "step": 80400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7906, + "step": 80500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 80600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7894, + "step": 80700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7833, + "step": 80800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 80900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7951, + "step": 81000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8013, + "step": 81100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7946, + "step": 81200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 81300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7908, + "step": 81400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 81500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7882, + "step": 81600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7883, + "step": 81700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 81800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7848, + "step": 81900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 82000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.795, + "step": 82100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 82200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7923, + "step": 82300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.791, + "step": 82400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7897, + "step": 82500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7909, + "step": 82600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7911, + "step": 82700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7941, + "step": 82800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7937, + "step": 82900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7911, + "step": 83000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7926, + "step": 83100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7931, + "step": 83200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7892, + "step": 83300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7917, + "step": 83400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7979, + "step": 83500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7878, + "step": 83600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7998, + "step": 83700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7898, + "step": 83800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7911, + "step": 83900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7857, + "step": 84000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 84100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7859, + "step": 84200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 84300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 84400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7867, + "step": 84500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7917, + "step": 84600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7816, + "step": 84700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7862, + "step": 84800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7913, + "step": 84900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 85000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7407427430152893, + "eval_runtime": 206.2207, + "eval_samples_per_second": 242.459, + "eval_steps_per_second": 1.896, + "step": 85000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7913, + "step": 85100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 85200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7898, + "step": 85300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 85400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7855, + "step": 85500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7911, + "step": 85600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7868, + "step": 85700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7966, + "step": 85800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7986, + "step": 85900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7903, + "step": 86000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7953, + "step": 86100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7933, + "step": 86200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.793, + "step": 86300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 86400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 86500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7908, + "step": 86600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7871, + "step": 86700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.786, + "step": 86800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7923, + "step": 86900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7861, + "step": 87000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7923, + "step": 87100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7873, + "step": 87200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 87300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7884, + "step": 87400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7862, + "step": 87500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7909, + "step": 87600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7956, + "step": 87700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7911, + "step": 87800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7837, + "step": 87900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.79, + "step": 88000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 88100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.793, + "step": 88200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 88300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7824, + "step": 88400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8, + "step": 88500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7927, + "step": 88600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 88700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 88800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 88900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7935, + "step": 89000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7906, + "step": 89100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 89200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 89300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7985, + "step": 89400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7823, + "step": 89500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7899, + "step": 89600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 89700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7882, + "step": 89800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7897, + "step": 89900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.788, + "step": 90000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7397116422653198, + "eval_runtime": 204.1911, + "eval_samples_per_second": 244.869, + "eval_steps_per_second": 1.915, + "step": 90000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7879, + "step": 90100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8019, + "step": 90200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8016, + "step": 90300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7867, + "step": 90400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7935, + "step": 90500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7913, + "step": 90600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7854, + "step": 90700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7868, + "step": 90800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7922, + "step": 90900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 91000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7914, + "step": 91100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7935, + "step": 91200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7901, + "step": 91300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 91400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7911, + "step": 91500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7822, + "step": 91600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7895, + "step": 91700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7835, + "step": 91800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 91900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 92000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7936, + "step": 92100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7904, + "step": 92200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7886, + "step": 92300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.788, + "step": 92400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7873, + "step": 92500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7993, + "step": 92600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 92700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7862, + "step": 92800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7897, + "step": 92900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7884, + "step": 93000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7897, + "step": 93100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 93200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7905, + "step": 93300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7856, + "step": 93400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7925, + "step": 93500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7918, + "step": 93600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7912, + "step": 93700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 93800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7897, + "step": 93900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7838, + "step": 94000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.8, + "step": 94100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7821, + "step": 94200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.786, + "step": 94300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 94400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 94500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7892, + "step": 94600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7881, + "step": 94700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7844, + "step": 94800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 94900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7907, + "step": 95000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7405564188957214, + "eval_runtime": 203.9554, + "eval_samples_per_second": 245.152, + "eval_steps_per_second": 1.917, + "step": 95000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7864, + "step": 95100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7899, + "step": 95200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 95300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7872, + "step": 95400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 95500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 95600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 95700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7874, + "step": 95800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7854, + "step": 95900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7832, + "step": 96000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 96100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7955, + "step": 96200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7921, + "step": 96300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7901, + "step": 96400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 96500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7971, + "step": 96600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7873, + "step": 96700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.79, + "step": 96800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7875, + "step": 96900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7945, + "step": 97000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 97100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 97200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7901, + "step": 97300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7942, + "step": 97400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7868, + "step": 97500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 97600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7901, + "step": 97700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7951, + "step": 97800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7856, + "step": 97900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7875, + "step": 98000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 98100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7894, + "step": 98200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7927, + "step": 98300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7877, + "step": 98400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7875, + "step": 98500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7915, + "step": 98600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7875, + "step": 98700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 98800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7878, + "step": 98900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7829, + "step": 99000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7895, + "step": 99100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 99200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.786, + "step": 99300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7873, + "step": 99400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7921, + "step": 99500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7947, + "step": 99600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7904, + "step": 99700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7848, + "step": 99800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7896, + "step": 99900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 100000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7385464310646057, + "eval_runtime": 205.6058, + "eval_samples_per_second": 243.184, + "eval_steps_per_second": 1.902, + "step": 100000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 100100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.794, + "step": 100200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7825, + "step": 100300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7868, + "step": 100400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 100500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7949, + "step": 100600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 100700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 100800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 100900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 101000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.787, + "step": 101100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7921, + "step": 101200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7916, + "step": 101300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 101400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7878, + "step": 101500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7896, + "step": 101600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.791, + "step": 101700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 101800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7953, + "step": 101900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 102000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7973, + "step": 102100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 102200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 102300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7865, + "step": 102400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7948, + "step": 102500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7767, + "step": 102600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 102700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.792, + "step": 102800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 102900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.79, + "step": 103000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 103100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7889, + "step": 103200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7825, + "step": 103300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7825, + "step": 103400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.79, + "step": 103500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7844, + "step": 103600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 103700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 103800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7903, + "step": 103900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 104000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 104100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7918, + "step": 104200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7906, + "step": 104300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7818, + "step": 104400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 104500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7919, + "step": 104600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 104700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7941, + "step": 104800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 104900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7874, + "step": 105000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7402731776237488, + "eval_runtime": 199.5477, + "eval_samples_per_second": 250.567, + "eval_steps_per_second": 1.959, + "step": 105000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7854, + "step": 105100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 105200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7921, + "step": 105300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 105400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7895, + "step": 105500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 105600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.792, + "step": 105700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 105800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7822, + "step": 105900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.794, + "step": 106000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 106100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 106200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7805, + "step": 106300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7792, + "step": 106400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7894, + "step": 106500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7873, + "step": 106600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 106700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.787, + "step": 106800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7787, + "step": 106900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7799, + "step": 107000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 107100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7807, + "step": 107200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7847, + "step": 107300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7839, + "step": 107400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7834, + "step": 107500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7882, + "step": 107600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7877, + "step": 107700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7872, + "step": 107800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7859, + "step": 107900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 108000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7904, + "step": 108100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 108200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7874, + "step": 108300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7892, + "step": 108400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7875, + "step": 108500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7897, + "step": 108600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7872, + "step": 108700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7899, + "step": 108800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7835, + "step": 108900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 109000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7823, + "step": 109100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7827, + "step": 109200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7844, + "step": 109300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 109400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7925, + "step": 109500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7805, + "step": 109600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7775, + "step": 109700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 109800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7934, + "step": 109900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7832, + "step": 110000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7355017066001892, + "eval_runtime": 203.8186, + "eval_samples_per_second": 245.316, + "eval_steps_per_second": 1.918, + "step": 110000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 110100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7848, + "step": 110200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 110300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 110400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 110500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7855, + "step": 110600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 110700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7922, + "step": 110800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 110900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 111000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 111100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 111200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7831, + "step": 111300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7861, + "step": 111400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7901, + "step": 111500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7832, + "step": 111600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7812, + "step": 111700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 111800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 111900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7947, + "step": 112000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7887, + "step": 112100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 112200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 112300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 112400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 112500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 112600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7868, + "step": 112700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 112800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 112900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7834, + "step": 113000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 113100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 113200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7799, + "step": 113300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 113400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 113500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7831, + "step": 113600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7854, + "step": 113700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7823, + "step": 113800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.788, + "step": 113900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7886, + "step": 114000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7779, + "step": 114100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 114200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7921, + "step": 114300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7831, + "step": 114400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7921, + "step": 114500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 114600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7825, + "step": 114700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7901, + "step": 114800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7936, + "step": 114900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7924, + "step": 115000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7361956834793091, + "eval_runtime": 203.9139, + "eval_samples_per_second": 245.202, + "eval_steps_per_second": 1.917, + "step": 115000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7854, + "step": 115100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7807, + "step": 115200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 115300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 115400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7912, + "step": 115500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 115600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 115700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7833, + "step": 115800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 115900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.792, + "step": 116000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7802, + "step": 116100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 116200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 116300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 116400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 116500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 116600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 116700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7801, + "step": 116800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7832, + "step": 116900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7817, + "step": 117000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 117100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7916, + "step": 117200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 117300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 117400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7803, + "step": 117500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7829, + "step": 117600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7851, + "step": 117700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 117800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 117900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7822, + "step": 118000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 118100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7752, + "step": 118200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7904, + "step": 118300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7801, + "step": 118400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7879, + "step": 118500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 118600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7902, + "step": 118700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 118800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7944, + "step": 118900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7765, + "step": 119000 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 119100 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 119200 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7854, + "step": 119300 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 119400 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7841, + "step": 119500 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 119600 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 119700 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7914, + "step": 119800 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 119900 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 0.7792, + "step": 120000 + }, + { + "epoch": 0.01, + "eval_loss": 0.7344588041305542, + "eval_runtime": 205.7304, + "eval_samples_per_second": 243.037, + "eval_steps_per_second": 1.901, + "step": 120000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7824, + "step": 120100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7891, + "step": 120200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 120300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 120400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 120500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7837, + "step": 120600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 120700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 120800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 120900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7878, + "step": 121000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 121100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 121200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 121300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7888, + "step": 121400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7844, + "step": 121500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7836, + "step": 121600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7884, + "step": 121700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 121800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 121900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 122000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7929, + "step": 122100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 122200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 122300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 122400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.783, + "step": 122500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 122600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7829, + "step": 122700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7836, + "step": 122800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7773, + "step": 122900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7855, + "step": 123000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 123100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7793, + "step": 123200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 123300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 123400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7897, + "step": 123500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 123600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 123700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 123800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 123900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7801, + "step": 124000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7884, + "step": 124100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 124200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 124300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7827, + "step": 124400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7864, + "step": 124500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7777, + "step": 124600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 124700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 124800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7812, + "step": 124900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 125000 + }, + { + "epoch": 0.02, + "eval_loss": 0.729735255241394, + "eval_runtime": 204.6756, + "eval_samples_per_second": 244.289, + "eval_steps_per_second": 1.91, + "step": 125000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7817, + "step": 125100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7793, + "step": 125200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 125300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.78, + "step": 125400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7805, + "step": 125500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7812, + "step": 125600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7816, + "step": 125700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 125800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 125900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7816, + "step": 126000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7877, + "step": 126100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7786, + "step": 126200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7926, + "step": 126300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 126400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7772, + "step": 126500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 126600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7841, + "step": 126700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7837, + "step": 126800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 126900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7838, + "step": 127000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7859, + "step": 127100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 127200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7807, + "step": 127300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7901, + "step": 127400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7821, + "step": 127500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 127600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 127700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7885, + "step": 127800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7818, + "step": 127900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7894, + "step": 128000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7835, + "step": 128100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7812, + "step": 128200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 128300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7798, + "step": 128400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 128500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 128600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 128700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7799, + "step": 128800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7839, + "step": 128900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 129000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 129100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7798, + "step": 129200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 129300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7882, + "step": 129400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7819, + "step": 129500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 129600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7847, + "step": 129700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7814, + "step": 129800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7831, + "step": 129900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 130000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7291039824485779, + "eval_runtime": 204.4531, + "eval_samples_per_second": 244.555, + "eval_steps_per_second": 1.912, + "step": 130000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7752, + "step": 130100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7824, + "step": 130200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 130300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7784, + "step": 130400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7819, + "step": 130500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 130600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 130700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 130800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.789, + "step": 130900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 131000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7837, + "step": 131100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 131200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 131300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7792, + "step": 131400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 131500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7827, + "step": 131600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7775, + "step": 131700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 131800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 131900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7829, + "step": 132000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 132100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 132200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7873, + "step": 132300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 132400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 132500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7778, + "step": 132600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 132700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7802, + "step": 132800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 132900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 133000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7775, + "step": 133100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7767, + "step": 133200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7807, + "step": 133300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 133400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 133500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7899, + "step": 133600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 133700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7824, + "step": 133800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7861, + "step": 133900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7855, + "step": 134000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 134100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7871, + "step": 134200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 134300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7819, + "step": 134400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7874, + "step": 134500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7743, + "step": 134600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7834, + "step": 134700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.781, + "step": 134800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 134900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 135000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7318330407142639, + "eval_runtime": 204.4753, + "eval_samples_per_second": 244.528, + "eval_steps_per_second": 1.912, + "step": 135000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 135100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7832, + "step": 135200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 135300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7821, + "step": 135400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7853, + "step": 135500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7847, + "step": 135600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 135700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 135800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 135900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7825, + "step": 136000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7784, + "step": 136100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 136200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 136300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7814, + "step": 136400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 136500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 136600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7807, + "step": 136700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7801, + "step": 136800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7793, + "step": 136900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 137000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 137100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7783, + "step": 137200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 137300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 137400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 137500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7749, + "step": 137600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7803, + "step": 137700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 137800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7836, + "step": 137900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 138000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 138100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 138200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7783, + "step": 138300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7911, + "step": 138400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7792, + "step": 138500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 138600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7869, + "step": 138700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 138800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 138900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7876, + "step": 139000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7749, + "step": 139100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7831, + "step": 139200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7871, + "step": 139300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 139400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7813, + "step": 139500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 139600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 139700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 139800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 139900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 140000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7310429811477661, + "eval_runtime": 204.6514, + "eval_samples_per_second": 244.318, + "eval_steps_per_second": 1.911, + "step": 140000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7814, + "step": 140100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7825, + "step": 140200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7843, + "step": 140300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7859, + "step": 140400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.785, + "step": 140500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 140600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7833, + "step": 140700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7838, + "step": 140800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7798, + "step": 140900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 141000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7736, + "step": 141100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7801, + "step": 141200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7792, + "step": 141300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7881, + "step": 141400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 141500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7847, + "step": 141600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7801, + "step": 141700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 141800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 141900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7818, + "step": 142000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 142100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7821, + "step": 142200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 142300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 142400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 142500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7779, + "step": 142600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 142700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7854, + "step": 142800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7786, + "step": 142900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 143000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 143100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 143200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 143300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 143400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 143500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7779, + "step": 143600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 143700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 143800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 143900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 144000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7777, + "step": 144100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 144200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7892, + "step": 144300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 144400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 144500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 144600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.781, + "step": 144700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 144800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7751, + "step": 144900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7842, + "step": 145000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7320641279220581, + "eval_runtime": 204.3932, + "eval_samples_per_second": 244.627, + "eval_steps_per_second": 1.913, + "step": 145000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 145100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 145200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7778, + "step": 145300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7782, + "step": 145400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 145500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 145600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 145700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 145800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 145900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 146000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7798, + "step": 146100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7823, + "step": 146200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7803, + "step": 146300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 146400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 146500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 146600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7749, + "step": 146700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7818, + "step": 146800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 146900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 147000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 147100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 147200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 147300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 147400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7819, + "step": 147500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7825, + "step": 147600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 147700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7737, + "step": 147800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7757, + "step": 147900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7838, + "step": 148000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7777, + "step": 148100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 148200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 148300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7785, + "step": 148400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7799, + "step": 148500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7765, + "step": 148600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7819, + "step": 148700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 148800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 148900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 149000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.781, + "step": 149100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 149200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7844, + "step": 149300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7784, + "step": 149400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 149500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 149600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7822, + "step": 149700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 149800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7805, + "step": 149900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 150000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7317348718643188, + "eval_runtime": 204.4686, + "eval_samples_per_second": 244.536, + "eval_steps_per_second": 1.912, + "step": 150000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7819, + "step": 150100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 150200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 150300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 150400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7839, + "step": 150500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.777, + "step": 150600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 150700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 150800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 150900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7785, + "step": 151000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 151100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 151200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7834, + "step": 151300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7782, + "step": 151400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 151500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7827, + "step": 151600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7778, + "step": 151700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 151800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 151900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7757, + "step": 152000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7819, + "step": 152100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 152200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 152300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 152400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 152500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7736, + "step": 152600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.78, + "step": 152700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7863, + "step": 152800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7872, + "step": 152900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 153000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 153100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7748, + "step": 153200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7802, + "step": 153300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7824, + "step": 153400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.78, + "step": 153500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 153600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 153700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7751, + "step": 153800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 153900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7833, + "step": 154000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7772, + "step": 154100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 154200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 154300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 154400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 154500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 154600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 154700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 154800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7767, + "step": 154900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 155000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7296523451805115, + "eval_runtime": 204.9064, + "eval_samples_per_second": 244.014, + "eval_steps_per_second": 1.908, + "step": 155000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 155100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 155200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 155300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 155400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 155500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 155600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7862, + "step": 155700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7757, + "step": 155800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 155900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.784, + "step": 156000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 156100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 156200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 156300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 156400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7875, + "step": 156500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 156600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7828, + "step": 156700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 156800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 156900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 157000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 157100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 157200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 157300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 157400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7789, + "step": 157500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 157600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7786, + "step": 157700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 157800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7801, + "step": 157900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 158000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 158100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 158200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 158300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 158400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 158500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7831, + "step": 158600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 158700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7765, + "step": 158800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 158900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7822, + "step": 159000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7835, + "step": 159100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7845, + "step": 159200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 159300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 159400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 159500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 159600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 159700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7858, + "step": 159800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7816, + "step": 159900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 160000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7311965823173523, + "eval_runtime": 204.2454, + "eval_samples_per_second": 244.804, + "eval_steps_per_second": 1.914, + "step": 160000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7818, + "step": 160100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7772, + "step": 160200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 160300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 160400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 160500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7814, + "step": 160600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 160700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 160800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 160900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.782, + "step": 161000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7811, + "step": 161100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 161200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7732, + "step": 161300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 161400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 161500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7719, + "step": 161600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7816, + "step": 161700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7737, + "step": 161800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7777, + "step": 161900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 162000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 162100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7839, + "step": 162200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 162300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 162400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 162500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 162600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 162700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 162800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 162900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 163000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 163100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 163200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7778, + "step": 163300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 163400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 163500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 163600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7851, + "step": 163700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7682, + "step": 163800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 163900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 164000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 164100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7749, + "step": 164200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 164300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 164400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 164500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 164600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 164700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 164800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7772, + "step": 164900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 165000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7293242812156677, + "eval_runtime": 203.969, + "eval_samples_per_second": 245.135, + "eval_steps_per_second": 1.917, + "step": 165000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7751, + "step": 165100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7839, + "step": 165200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 165300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 165400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 165500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.78, + "step": 165600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7807, + "step": 165700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7743, + "step": 165800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 165900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 166000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 166100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 166200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 166300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 166400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 166500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7772, + "step": 166600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 166700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 166800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 166900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 167000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 167100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7877, + "step": 167200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 167300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7815, + "step": 167400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 167500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 167600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7767, + "step": 167700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 167800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 167900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 168000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 168100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7793, + "step": 168200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 168300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 168400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7779, + "step": 168500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7778, + "step": 168600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 168700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 168800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7748, + "step": 168900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 169000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 169100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7775, + "step": 169200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 169300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 169400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 169500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 169600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 169700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7834, + "step": 169800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 169900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 170000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7290298938751221, + "eval_runtime": 204.777, + "eval_samples_per_second": 244.168, + "eval_steps_per_second": 1.909, + "step": 170000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 170100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 170200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 170300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7737, + "step": 170400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 170500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 170600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7751, + "step": 170700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 170800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 170900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 171000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 171100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7787, + "step": 171200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 171300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7804, + "step": 171400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 171500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7802, + "step": 171600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 171700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7719, + "step": 171800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 171900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 172000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 172100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 172200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 172300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7837, + "step": 172400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 172500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 172600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 172700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 172800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 172900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 173000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 173100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 173200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 173300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 173400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 173500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 173600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 173700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 173800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 173900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 174000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7851, + "step": 174100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 174200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7775, + "step": 174300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 174400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 174500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7793, + "step": 174600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 174700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 174800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7795, + "step": 174900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 175000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7283743023872375, + "eval_runtime": 201.412, + "eval_samples_per_second": 248.247, + "eval_steps_per_second": 1.941, + "step": 175000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 175100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 175200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 175300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 175400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7719, + "step": 175500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 175600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 175700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 175800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7751, + "step": 175900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 176000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 176100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 176200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 176300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 176400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7787, + "step": 176500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 176600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 176700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 176800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 176900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 177000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 177100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7794, + "step": 177200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 177300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 177400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 177500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 177600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.783, + "step": 177700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7805, + "step": 177800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 177900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 178000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7798, + "step": 178100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 178200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7787, + "step": 178300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.78, + "step": 178400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 178500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 178600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 178700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 178800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 178900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7813, + "step": 179000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 179100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 179200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7788, + "step": 179300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7777, + "step": 179400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.78, + "step": 179500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 179600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 179700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 179800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 179900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 180000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7283092737197876, + "eval_runtime": 201.9007, + "eval_samples_per_second": 247.647, + "eval_steps_per_second": 1.937, + "step": 180000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 180100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 180200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 180300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 180400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 180500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 180600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 180700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7767, + "step": 180800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 180900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 181000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 181100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 181200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 181300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7736, + "step": 181400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 181500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 181600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 181700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7725, + "step": 181800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 181900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 182000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 182100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 182200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7682, + "step": 182300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 182400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 182500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 182600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 182700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.778, + "step": 182800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7703, + "step": 182900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 183000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 183100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 183200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 183300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 183400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 183500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 183600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 183700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7736, + "step": 183800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 183900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 184000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 184100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 184200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 184300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 184400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7772, + "step": 184500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7779, + "step": 184600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 184700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 184800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 184900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 185000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7254592180252075, + "eval_runtime": 204.0508, + "eval_samples_per_second": 245.037, + "eval_steps_per_second": 1.916, + "step": 185000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7773, + "step": 185100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7836, + "step": 185200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 185300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 185400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 185500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.777, + "step": 185600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.777, + "step": 185700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7719, + "step": 185800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7826, + "step": 185900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 186000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 186100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 186200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7775, + "step": 186300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 186400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 186500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 186600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 186700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 186800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 186900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 187000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 187100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 187200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7752, + "step": 187300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 187400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 187500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 187600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 187700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 187800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 187900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7859, + "step": 188000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 188100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7719, + "step": 188200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 188300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7736, + "step": 188400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 188500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 188600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 188700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 188800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 188900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 189000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 189100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 189200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 189300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 189400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 189500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 189600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 189700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7807, + "step": 189800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 189900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7783, + "step": 190000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7265344262123108, + "eval_runtime": 203.2088, + "eval_samples_per_second": 246.052, + "eval_steps_per_second": 1.924, + "step": 190000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 190100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 190200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 190300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 190400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 190500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 190600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 190700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 190800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7779, + "step": 190900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 191000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 191100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 191200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7752, + "step": 191300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 191400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 191500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 191600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 191700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 191800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 191900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7761, + "step": 192000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 192100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7773, + "step": 192200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 192300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 192400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 192500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 192600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7738, + "step": 192700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 192800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7808, + "step": 192900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 193000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 193100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 193200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 193300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 193400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7824, + "step": 193500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 193600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 193700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 193800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 193900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 194000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 194100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 194200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 194300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 194400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 194500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 194600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7797, + "step": 194700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 194800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 194900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7805, + "step": 195000 + }, + { + "epoch": 0.02, + "eval_loss": 0.7270874381065369, + "eval_runtime": 195.6141, + "eval_samples_per_second": 255.605, + "eval_steps_per_second": 1.999, + "step": 195000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 195100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 195200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 195300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 195400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 195500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 195600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 195700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 195800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 195900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 196000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 196100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7787, + "step": 196200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 196300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 196400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 196500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 196600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 196700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 196800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 196900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 197000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 197100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 197200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7816, + "step": 197300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 197400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 197500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 197600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7703, + "step": 197700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 197800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 197900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 198000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 198100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 198200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 198300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 198400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 198500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 198600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 198700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 198800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 198900 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 199000 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 199100 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 199200 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7764, + "step": 199300 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 199400 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7777, + "step": 199500 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7793, + "step": 199600 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7765, + "step": 199700 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 199800 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 199900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 200000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7222184538841248, + "eval_runtime": 194.621, + "eval_samples_per_second": 256.91, + "eval_steps_per_second": 2.009, + "step": 200000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 200100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 200200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 200300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7751, + "step": 200400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 200500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 200600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 200700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 200800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 200900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 201000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 201100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 201200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 201300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 201400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 201500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7703, + "step": 201600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 201700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 201800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 201900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 202000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7765, + "step": 202100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 202200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 202300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7772, + "step": 202400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7725, + "step": 202500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7809, + "step": 202600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 202700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 202800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 202900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 203000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 203100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 203200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 203300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 203400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 203500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 203600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 203700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 203800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 203900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 204000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 204100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 204200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 204300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 204400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 204500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 204600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 204700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 204800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 204900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 205000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7252817749977112, + "eval_runtime": 194.5281, + "eval_samples_per_second": 257.032, + "eval_steps_per_second": 2.01, + "step": 205000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 205100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 205200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 205300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 205400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 205500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 205600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7771, + "step": 205700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 205800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7787, + "step": 205900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7748, + "step": 206000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 206100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 206200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 206300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 206400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 206500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 206600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7806, + "step": 206700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 206800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 206900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7703, + "step": 207000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 207100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 207200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 207300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7773, + "step": 207400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 207500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 207600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 207700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 207800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 207900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 208000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 208100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 208200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 208300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 208400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 208500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 208600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 208700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 208800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 208900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 209000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 209100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 209200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 209300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 209400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 209500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7682, + "step": 209600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 209700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 209800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 209900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7672, + "step": 210000 + }, + { + "epoch": 0.03, + "eval_loss": 0.723176896572113, + "eval_runtime": 194.7259, + "eval_samples_per_second": 256.771, + "eval_steps_per_second": 2.008, + "step": 210000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 210100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7682, + "step": 210200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 210300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 210400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 210500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 210600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 210700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 210800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 210900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 211000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 211100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 211200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 211300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 211400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 211500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 211600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 211700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 211800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 211900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 212000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 212100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 212200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 212300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 212400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 212500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 212600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 212700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 212800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7738, + "step": 212900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 213000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 213100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 213200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 213300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 213400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 213500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 213600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 213700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 213800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 213900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 214000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 214100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 214200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 214300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 214400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7791, + "step": 214500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 214600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7775, + "step": 214700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 214800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 214900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 215000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7234128713607788, + "eval_runtime": 194.8336, + "eval_samples_per_second": 256.629, + "eval_steps_per_second": 2.007, + "step": 215000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 215100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 215200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 215300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 215400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 215500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 215600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 215700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 215800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 215900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 216000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 216100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 216200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 216300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 216400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 216500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 216600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 216700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 216800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 216900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 217000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 217100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 217200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 217300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 217400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 217500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 217600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7768, + "step": 217700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7738, + "step": 217800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 217900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 218000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 218100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.777, + "step": 218200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 218300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 218400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 218500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 218600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 218700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.776, + "step": 218800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 218900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 219000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 219100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7748, + "step": 219200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 219300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 219400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 219500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 219600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 219700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 219800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7682, + "step": 219900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 220000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7229312658309937, + "eval_runtime": 194.6418, + "eval_samples_per_second": 256.882, + "eval_steps_per_second": 2.009, + "step": 220000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7765, + "step": 220100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 220200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 220300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 220400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 220500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 220600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 220700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 220800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 220900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 221000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 221100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7743, + "step": 221200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 221300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7776, + "step": 221400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 221500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7737, + "step": 221600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 221700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 221800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 221900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7672, + "step": 222000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 222100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 222200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 222300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 222400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 222500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 222600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 222700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 222800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 222900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 223000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 223100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 223200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 223300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 223400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 223500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 223600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7672, + "step": 223700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 223800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 223900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 224000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 224100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 224200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 224300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 224400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 224500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 224600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7787, + "step": 224700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 224800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 224900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 225000 + }, + { + "epoch": 0.03, + "eval_loss": 0.722387969493866, + "eval_runtime": 194.7501, + "eval_samples_per_second": 256.739, + "eval_steps_per_second": 2.008, + "step": 225000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 225100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7738, + "step": 225200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 225300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 225400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 225500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 225600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 225700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 225800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 225900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 226000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 226100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 226200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 226300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 226400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 226500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 226600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7769, + "step": 226700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 226800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 226900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 227000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 227100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 227200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 227300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 227400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 227500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 227600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 227700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 227800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 227900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 228000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 228100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 228200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 228300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 228400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 228500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 228600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 228700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 228800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 228900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 229000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 229100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 229200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 229300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 229400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 229500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 229600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 229700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 229800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 229900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 230000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7221835851669312, + "eval_runtime": 194.8361, + "eval_samples_per_second": 256.626, + "eval_steps_per_second": 2.007, + "step": 230000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 230100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7736, + "step": 230200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 230300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 230400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 230500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 230600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 230700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 230800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 230900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 231000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 231100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 231200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7725, + "step": 231300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 231400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 231500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 231600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 231700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 231800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 231900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 232000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 232100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 232200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 232300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.779, + "step": 232400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 232500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 232600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 232700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 232800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 232900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 233000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 233100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 233200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7725, + "step": 233300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 233400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 233500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 233600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 233700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 233800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 233900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7755, + "step": 234000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 234100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7732, + "step": 234200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 234300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7758, + "step": 234400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 234500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 234600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 234700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 234800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 234900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 235000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7221626043319702, + "eval_runtime": 194.6545, + "eval_samples_per_second": 256.865, + "eval_steps_per_second": 2.009, + "step": 235000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 235100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 235200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 235300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 235400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 235500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 235600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 235700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 235800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 235900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 236000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 236100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 236200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 236300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 236400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 236500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 236600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 236700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 236800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 236900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 237000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 237100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 237200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 237300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 237400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 237500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 237600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 237700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 237800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 237900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 238000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 238100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 238200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 238300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 238400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 238500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 238600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 238700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 238800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 238900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 239000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 239100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 239200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 239300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 239400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 239500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7752, + "step": 239600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 239700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 239800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 239900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 240000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7220859527587891, + "eval_runtime": 194.5722, + "eval_samples_per_second": 256.974, + "eval_steps_per_second": 2.01, + "step": 240000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 240100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 240200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 240300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 240400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 240500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 240600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 240700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 240800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 240900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 241000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 241100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7732, + "step": 241200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 241300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 241400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 241500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 241600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 241700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 241800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 241900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7742, + "step": 242000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 242100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 242200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 242300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 242400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 242500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 242600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 242700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 242800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 242900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 243000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 243100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 243200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 243300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7781, + "step": 243400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 243500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 243600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 243700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 243800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 243900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 244000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 244100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 244200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 244300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 244400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 244500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 244600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 244700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 244800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 244900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 245000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7194216251373291, + "eval_runtime": 194.6455, + "eval_samples_per_second": 256.877, + "eval_steps_per_second": 2.009, + "step": 245000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 245100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 245200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 245300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 245400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 245500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 245600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7784, + "step": 245700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 245800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 245900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 246000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.777, + "step": 246100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 246200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 246300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 246400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 246500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 246600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 246700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7719, + "step": 246800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 246900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 247000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 247100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 247200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 247300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 247400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7732, + "step": 247500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 247600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 247700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 247800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 247900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 248000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 248100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 248200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 248300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 248400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 248500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 248600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7724, + "step": 248700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 248800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 248900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 249000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7749, + "step": 249100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 249200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 249300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 249400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 249500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 249600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 249700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 249800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 249900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 250000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7245768904685974, + "eval_runtime": 194.4227, + "eval_samples_per_second": 257.172, + "eval_steps_per_second": 2.011, + "step": 250000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 250100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 250200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 250300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 250400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 250500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7752, + "step": 250600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 250700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 250800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 250900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 251000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 251100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 251200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 251300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 251400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 251500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7737, + "step": 251600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 251700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7757, + "step": 251800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 251900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7747, + "step": 252000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 252100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 252200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 252300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 252400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 252500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 252600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 252700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 252800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 252900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 253000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 253100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 253200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 253300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 253400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7732, + "step": 253500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 253600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7715, + "step": 253700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 253800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 253900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 254000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 254100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 254200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 254300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 254400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 254500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 254600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 254700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 254800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 254900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 255000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7205132246017456, + "eval_runtime": 195.1462, + "eval_samples_per_second": 256.218, + "eval_steps_per_second": 2.004, + "step": 255000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 255100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 255200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 255300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 255400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 255500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 255600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7735, + "step": 255700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 255800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 255900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 256000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 256100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 256200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 256300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7713, + "step": 256400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 256500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 256600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 256700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 256800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 256900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 257000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7741, + "step": 257100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 257200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 257300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 257400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 257500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7751, + "step": 257600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7732, + "step": 257700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 257800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 257900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 258000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 258100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 258200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 258300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 258400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 258500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 258600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 258700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 258800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 258900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 259000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 259100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 259200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 259300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 259400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 259500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 259600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 259700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 259800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 259900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7763, + "step": 260000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7205927968025208, + "eval_runtime": 194.8057, + "eval_samples_per_second": 256.666, + "eval_steps_per_second": 2.007, + "step": 260000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 260100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 260200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 260300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 260400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 260500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 260600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 260700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 260800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 260900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 261000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 261100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 261200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.775, + "step": 261300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 261400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 261500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 261600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 261700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 261800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 261900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 262000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 262100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 262200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 262300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 262400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 262500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 262600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 262700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 262800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 262900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 263000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 263100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7757, + "step": 263200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 263300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 263400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 263500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 263600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 263700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 263800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 263900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 264000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 264100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 264200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 264300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 264400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 264500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 264600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 264700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 264800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 264900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 265000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7199035286903381, + "eval_runtime": 194.6875, + "eval_samples_per_second": 256.822, + "eval_steps_per_second": 2.008, + "step": 265000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 265100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 265200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 265300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 265400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 265500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 265600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 265700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 265800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 265900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 266000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 266100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.777, + "step": 266200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 266300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 266400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 266500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 266600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 266700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 266800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7753, + "step": 266900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 267000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 267100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 267200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 267300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7739, + "step": 267400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 267500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 267600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 267700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 267800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 267900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 268000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 268100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7754, + "step": 268200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 268300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 268400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 268500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 268600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 268700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 268800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 268900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 269000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 269100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 269200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 269300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 269400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 269500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 269600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 269700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 269800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7774, + "step": 269900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7745, + "step": 270000 + }, + { + "epoch": 0.03, + "eval_loss": 0.7194676399230957, + "eval_runtime": 194.2339, + "eval_samples_per_second": 257.422, + "eval_steps_per_second": 2.013, + "step": 270000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 270100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 270200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 270300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 270400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 270500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 270600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 270700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 270800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7734, + "step": 270900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 271000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7736, + "step": 271100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 271200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 271300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 271400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7703, + "step": 271500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 271600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 271700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 271800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 271900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 272000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 272100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 272200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 272300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 272400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 272500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 272600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 272700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7682, + "step": 272800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 272900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 273000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 273100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 273200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 273300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 273400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 273500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 273600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 273700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 273800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 273900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 274000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 274100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 274200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 274300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 274400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 274500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 274600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 274700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 274800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7746, + "step": 274900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 275000 + }, + { + "epoch": 0.03, + "eval_loss": 0.720512330532074, + "eval_runtime": 194.8789, + "eval_samples_per_second": 256.57, + "eval_steps_per_second": 2.006, + "step": 275000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 275100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 275200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 275300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 275400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 275500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 275600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 275700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 275800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 275900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 276000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 276100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 276200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 276300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 276400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 276500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7731, + "step": 276600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7732, + "step": 276700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 276800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 276900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 277000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 277100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 277200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 277300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 277400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 277500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 277600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7777, + "step": 277700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 277800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 277900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7733, + "step": 278000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 278100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 278200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7749, + "step": 278300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 278400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 278500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 278600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 278700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 278800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 278900 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 279000 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 279100 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 279200 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 279300 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 279400 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 279500 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 279600 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 279700 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.772, + "step": 279800 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 279900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 280000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7187725901603699, + "eval_runtime": 195.0674, + "eval_samples_per_second": 256.322, + "eval_steps_per_second": 2.004, + "step": 280000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 280100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 280200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 280300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 280400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 280500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7759, + "step": 280600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 280700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 280800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7752, + "step": 280900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7719, + "step": 281000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 281100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 281200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 281300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 281400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 281500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 281600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7737, + "step": 281700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 281800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 281900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 282000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 282100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 282200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 282300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 282400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 282500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7716, + "step": 282600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 282700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 282800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 282900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 283000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 283100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 283200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 283300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 283400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 283500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 283600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 283700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 283800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 283900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 284000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 284100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7762, + "step": 284200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 284300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 284400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 284500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 284600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 284700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 284800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 284900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 285000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7185755968093872, + "eval_runtime": 194.5575, + "eval_samples_per_second": 256.993, + "eval_steps_per_second": 2.01, + "step": 285000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 285100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 285200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 285300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 285400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 285500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 285600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 285700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 285800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 285900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 286000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 286100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 286200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 286300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7703, + "step": 286400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 286500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 286600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 286700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 286800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 286900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 287000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 287100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 287200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 287300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 287400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 287500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 287600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 287700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7737, + "step": 287800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 287900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 288000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 288100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 288200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 288300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 288400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 288500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 288600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 288700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 288800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 288900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7703, + "step": 289000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7726, + "step": 289100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 289200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 289300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 289400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 289500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7727, + "step": 289600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 289700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 289800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 289900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 290000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7190911173820496, + "eval_runtime": 194.7742, + "eval_samples_per_second": 256.708, + "eval_steps_per_second": 2.007, + "step": 290000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 290100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 290200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 290300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 290400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 290500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 290600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 290700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 290800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 290900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 291000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 291100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 291200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 291300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 291400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 291500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 291600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7714, + "step": 291700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 291800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 291900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 292000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 292100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 292200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 292300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 292400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 292500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 292600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 292700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 292800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 292900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 293000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 293100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 293200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 293300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 293400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 293500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 293600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7717, + "step": 293700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7743, + "step": 293800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 293900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 294000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 294100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 294200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 294300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7709, + "step": 294400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 294500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 294600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 294700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 294800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 294900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 295000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7187118530273438, + "eval_runtime": 194.5781, + "eval_samples_per_second": 256.966, + "eval_steps_per_second": 2.009, + "step": 295000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 295100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 295200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7766, + "step": 295300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 295400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7706, + "step": 295500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 295600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 295700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 295800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 295900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 296000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 296100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 296200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 296300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 296400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 296500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7723, + "step": 296600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 296700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 296800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 296900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 297000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 297100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 297200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 297300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 297400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 297500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 297600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 297700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 297800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 297900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 298000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 298100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 298200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 298300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 298400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 298500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 298600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 298700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 298800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 298900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 299000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 299100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 299200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 299300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 299400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 299500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 299600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 299700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 299800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 299900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 300000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7169745564460754, + "eval_runtime": 194.6865, + "eval_samples_per_second": 256.823, + "eval_steps_per_second": 2.008, + "step": 300000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 300100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 300200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 300300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 300400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 300500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 300600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 300700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 300800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 300900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 301000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 301100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 301200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 301300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 301400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 301500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 301600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 301700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 301800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 301900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 302000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 302100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 302200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7705, + "step": 302300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 302400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 302500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 302600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 302700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 302800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 302900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 303000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 303100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 303200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 303300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.769, + "step": 303400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 303500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 303600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 303700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 303800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 303900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7699, + "step": 304000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 304100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 304200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 304300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 304400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 304500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7722, + "step": 304600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 304700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 304800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 304900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 305000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7181566953659058, + "eval_runtime": 194.5398, + "eval_samples_per_second": 257.017, + "eval_steps_per_second": 2.01, + "step": 305000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 305100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 305200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 305300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 305400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 305500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 305600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 305700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 305800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 305900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 306000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 306100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 306200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 306300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 306400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 306500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 306600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 306700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 306800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 306900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 307000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 307100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7718, + "step": 307200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 307300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 307400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 307500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 307600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 307700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 307800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 307900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 308000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7678, + "step": 308100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 308200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 308300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7674, + "step": 308400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 308500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 308600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 308700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 308800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 308900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 309000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 309100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 309200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 309300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 309400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 309500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 309600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 309700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 309800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 309900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 310000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7167325615882874, + "eval_runtime": 194.6222, + "eval_samples_per_second": 256.908, + "eval_steps_per_second": 2.009, + "step": 310000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 310100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 310200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 310300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 310400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 310500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 310600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 310700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 310800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 310900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 311000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 311100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7672, + "step": 311200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 311300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 311400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 311500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 311600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 311700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 311800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 311900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 312000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 312100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 312200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 312300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 312400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 312500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 312600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 312700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 312800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7744, + "step": 312900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 313000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 313100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 313200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7707, + "step": 313300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 313400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 313500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 313600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 313700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 313800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 313900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 314000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 314100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 314200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.773, + "step": 314300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 314400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 314500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 314600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 314700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 314800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 314900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 315000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7169172167778015, + "eval_runtime": 194.9797, + "eval_samples_per_second": 256.437, + "eval_steps_per_second": 2.005, + "step": 315000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 315100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 315200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 315300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 315400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 315500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 315600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 315700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 315800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 315900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 316000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 316100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 316200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 316300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 316400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 316500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 316600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 316700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 316800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 316900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 317000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7672, + "step": 317100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 317200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 317300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 317400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 317500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 317600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 317700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 317800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 317900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 318000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 318100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 318200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 318300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 318400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 318500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 318600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 318700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 318800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 318900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 319000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 319100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 319200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 319300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 319400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 319500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 319600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 319700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7721, + "step": 319800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 319900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 320000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7167797088623047, + "eval_runtime": 194.6632, + "eval_samples_per_second": 256.854, + "eval_steps_per_second": 2.009, + "step": 320000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7698, + "step": 320100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7712, + "step": 320200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 320300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 320400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 320500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 320600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 320700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 320800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 320900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 321000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 321100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 321200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 321300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 321400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 321500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 321600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 321700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 321800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 321900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 322000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 322100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 322200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 322300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 322400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 322500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 322600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 322700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 322800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 322900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 323000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 323100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 323200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 323300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 323400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 323500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 323600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 323700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 323800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 323900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 324000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 324100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 324200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 324300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 324400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 324500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 324600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 324700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 324800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 324900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 325000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7153588533401489, + "eval_runtime": 202.3735, + "eval_samples_per_second": 247.068, + "eval_steps_per_second": 1.932, + "step": 325000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 325100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 325200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 325300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 325400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 325500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 325600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 325700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 325800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 325900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 326000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 326100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 326200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 326300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 326400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 326500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 326600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 326700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 326800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 326900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 327000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 327100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7685, + "step": 327200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 327300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 327400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 327500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 327600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 327700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 327800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 327900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 328000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 328100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 328200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 328300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 328400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 328500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 328600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 328700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 328800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 328900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 329000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 329100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 329200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 329300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 329400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 329500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 329600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 329700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 329800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 329900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 330000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7145840525627136, + "eval_runtime": 203.3706, + "eval_samples_per_second": 245.857, + "eval_steps_per_second": 1.923, + "step": 330000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 330100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 330200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 330300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 330400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 330500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 330600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 330700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 330800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 330900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 331000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 331100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 331200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 331300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 331400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 331500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 331600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 331700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 331800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 331900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 332000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 332100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 332200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 332300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 332400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 332500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 332600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 332700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 332800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 332900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 333000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 333100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 333200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 333300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7702, + "step": 333400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 333500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 333600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 333700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 333800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 333900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 334000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 334100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 334200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 334300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 334400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 334500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 334600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 334700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 334800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 334900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 335000 + }, + { + "epoch": 0.04, + "eval_loss": 0.713698148727417, + "eval_runtime": 201.6244, + "eval_samples_per_second": 247.986, + "eval_steps_per_second": 1.939, + "step": 335000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 335100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 335200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 335300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 335400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 335500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 335600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 335700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 335800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 335900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 336000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 336100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 336200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 336300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 336400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 336500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 336600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 336700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 336800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 336900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 337000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 337100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 337200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 337300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 337400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 337500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 337600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 337700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 337800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 337900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 338000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 338100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 338200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 338300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 338400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 338500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 338600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 338700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 338800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 338900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7701, + "step": 339000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 339100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 339200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 339300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 339400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 339500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 339600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 339700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 339800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 339900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 340000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7148870229721069, + "eval_runtime": 206.1839, + "eval_samples_per_second": 242.502, + "eval_steps_per_second": 1.896, + "step": 340000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 340100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 340200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7645, + "step": 340300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 340400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 340500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 340600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 340700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 340800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 340900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 341000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 341100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 341200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 341300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 341400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 341500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 341600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 341700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 341800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 341900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 342000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 342100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 342200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 342300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 342400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 342500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 342600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 342700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 342800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 342900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 343000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 343100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 343200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 343300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 343400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 343500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 343600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 343700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 343800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 343900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 344000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 344100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 344200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 344300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 344400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 344500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 344600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 344700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 344800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 344900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 345000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7139725685119629, + "eval_runtime": 207.3764, + "eval_samples_per_second": 241.107, + "eval_steps_per_second": 1.885, + "step": 345000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 345100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 345200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 345300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 345400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 345500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 345600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 345700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 345800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 345900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 346000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 346100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7667, + "step": 346200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 346300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 346400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 346500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 346600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 346700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 346800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 346900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 347000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 347100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 347200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 347300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 347400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 347500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 347600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 347700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 347800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 347900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 348000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 348100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 348200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 348300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 348400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 348500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 348600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 348700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 348800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 348900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 349000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 349100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 349200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 349300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 349400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 349500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 349600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 349700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 349800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7704, + "step": 349900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 350000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7151290774345398, + "eval_runtime": 202.07, + "eval_samples_per_second": 247.439, + "eval_steps_per_second": 1.935, + "step": 350000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 350100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 350200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 350300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 350400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 350500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 350600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 350700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.774, + "step": 350800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 350900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 351000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 351100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 351200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7696, + "step": 351300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 351400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 351500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 351600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 351700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 351800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 351900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 352000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 352100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 352200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 352300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 352400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 352500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 352600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 352700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 352800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 352900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 353000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 353100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 353200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 353300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 353400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 353500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 353600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 353700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 353800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 353900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 354000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 354100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 354200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 354300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 354400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 354500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 354600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 354700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 354800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 354900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 355000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7139496207237244, + "eval_runtime": 207.6971, + "eval_samples_per_second": 240.735, + "eval_steps_per_second": 1.883, + "step": 355000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 355100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 355200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 355300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 355400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 355500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 355600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 355700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 355800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 355900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 356000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 356100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 356200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 356300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 356400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 356500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 356600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 356700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 356800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 356900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 357000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 357100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 357200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 357300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 357400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 357500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 357600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 357700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7677, + "step": 357800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 357900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 358000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 358100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 358200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 358300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 358400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 358500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 358600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 358700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 358800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 358900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 359000 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 359100 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 359200 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 359300 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7697, + "step": 359400 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 359500 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 359600 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 359700 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 359800 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 359900 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 360000 + }, + { + "epoch": 0.04, + "eval_loss": 0.7152824401855469, + "eval_runtime": 208.4686, + "eval_samples_per_second": 239.844, + "eval_steps_per_second": 1.876, + "step": 360000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 360100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 360200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 360300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 360400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 360500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 360600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 360700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 360800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 360900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 361000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 361100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 361200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 361300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 361400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 361500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 361600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 361700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 361800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 361900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 362000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 362100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 362200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 362300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 362400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 362500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 362600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 362700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 362800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 362900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 363000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 363100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 363200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 363300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 363400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 363500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 363600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 363700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 363800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 363900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 364000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 364100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 364200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 364300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 364400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 364500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 364600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 364700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 364800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 364900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 365000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7134825587272644, + "eval_runtime": 207.3356, + "eval_samples_per_second": 241.155, + "eval_steps_per_second": 1.886, + "step": 365000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 365100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 365200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 365300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 365400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 365500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 365600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 365700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 365800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 365900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 366000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 366100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 366200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 366300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 366400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7673, + "step": 366500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 366600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 366700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 366800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 366900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 367000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 367100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 367200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 367300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 367400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 367500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 367600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 367700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 367800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 367900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 368000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 368100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 368200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 368300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 368400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 368500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 368600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 368700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 368800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 368900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 369000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 369100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 369200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 369300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 369400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 369500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 369600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 369700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 369800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 369900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 370000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7130681276321411, + "eval_runtime": 206.0455, + "eval_samples_per_second": 242.665, + "eval_steps_per_second": 1.898, + "step": 370000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 370100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 370200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7684, + "step": 370300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 370400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 370500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 370600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 370700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 370800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 370900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 371000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 371100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7689, + "step": 371200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 371300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 371400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 371500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 371600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 371700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 371800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 371900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 372000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 372100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 372200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 372300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 372400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 372500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 372600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 372700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 372800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 372900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 373000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 373100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 373200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 373300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 373400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 373500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 373600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 373700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 373800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 373900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 374000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 374100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 374200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 374300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 374400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 374500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 374600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 374700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 374800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 374900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 375000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7135566473007202, + "eval_runtime": 204.9801, + "eval_samples_per_second": 243.926, + "eval_steps_per_second": 1.908, + "step": 375000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 375100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 375200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 375300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 375400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 375500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 375600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 375700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 375800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 375900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 376000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 376100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 376200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 376300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7692, + "step": 376400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7672, + "step": 376500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 376600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 376700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 376800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 376900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 377000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 377100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 377200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7654, + "step": 377300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 377400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 377500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 377600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 377700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 377800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7646, + "step": 377900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 378000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 378100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 378200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 378300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 378400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 378500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 378600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 378700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 378800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7676, + "step": 378900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 379000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 379100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 379200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 379300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 379400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 379500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 379600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 379700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 379800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 379900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7669, + "step": 380000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7119324207305908, + "eval_runtime": 208.8545, + "eval_samples_per_second": 239.401, + "eval_steps_per_second": 1.872, + "step": 380000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 380100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 380200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 380300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 380400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 380500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 380600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 380700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 380800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 380900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 381000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 381100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 381200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 381300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 381400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 381500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 381600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 381700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7649, + "step": 381800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 381900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 382000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 382100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 382200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 382300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 382400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 382500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 382600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 382700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 382800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 382900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 383000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 383100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 383200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 383300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 383400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 383500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 383600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 383700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 383800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 383900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 384000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 384100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 384200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 384300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 384400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 384500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7675, + "step": 384600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 384700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 384800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.767, + "step": 384900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 385000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7137901782989502, + "eval_runtime": 207.8203, + "eval_samples_per_second": 240.593, + "eval_steps_per_second": 1.881, + "step": 385000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 385100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 385200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 385300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 385400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 385500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 385600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 385700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 385800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 385900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 386000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 386100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 386200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 386300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 386400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 386500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 386600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 386700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 386800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 386900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7658, + "step": 387000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 387100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 387200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 387300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 387400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 387500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 387600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 387700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 387800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 387900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 388000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 388100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 388200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 388300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 388400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 388500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 388600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 388700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7647, + "step": 388800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 388900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 389000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 389100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 389200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 389300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 389400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 389500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 389600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 389700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 389800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 389900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 390000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7126809358596802, + "eval_runtime": 205.3357, + "eval_samples_per_second": 243.504, + "eval_steps_per_second": 1.904, + "step": 390000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 390100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7695, + "step": 390200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 390300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 390400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 390500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 390600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 390700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 390800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 390900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7691, + "step": 391000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 391100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 391200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 391300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 391400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 391500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 391600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 391700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 391800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 391900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 392000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 392100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 392200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 392300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 392400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 392500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 392600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 392700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 392800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 392900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 393000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 393100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 393200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 393300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 393400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 393500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 393600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 393700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 393800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7666, + "step": 393900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 394000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 394100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 394200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 394300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 394400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 394500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 394600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 394700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 394800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 394900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 395000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7117016911506653, + "eval_runtime": 209.4826, + "eval_samples_per_second": 238.683, + "eval_steps_per_second": 1.867, + "step": 395000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 395100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 395200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 395300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 395400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 395500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 395600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 395700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.766, + "step": 395800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 395900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 396000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 396100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 396200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 396300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 396400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 396500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 396600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 396700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 396800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 396900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 397000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 397100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 397200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 397300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 397400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 397500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 397600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 397700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 397800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 397900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 398000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 398100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 398200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 398300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 398400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 398500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 398600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 398700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 398800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 398900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 399000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 399100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 399200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 399300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 399400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 399500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 399600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 399700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 399800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 399900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 400000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7114148139953613, + "eval_runtime": 211.9423, + "eval_samples_per_second": 235.913, + "eval_steps_per_second": 1.845, + "step": 400000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 400100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 400200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 400300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 400400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 400500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 400600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 400700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 400800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 400900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 401000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 401100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 401200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 401300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 401400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 401500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 401600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 401700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 401800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 401900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 402000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 402100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 402200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 402300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 402400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 402500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 402600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 402700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 402800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 402900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 403000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 403100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 403200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 403300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 403400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 403500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 403600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 403700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 403800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 403900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 404000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 404100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 404200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 404300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 404400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 404500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 404600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 404700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 404800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7687, + "step": 404900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 405000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7116851806640625, + "eval_runtime": 206.952, + "eval_samples_per_second": 241.602, + "eval_steps_per_second": 1.889, + "step": 405000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 405100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 405200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 405300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 405400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7686, + "step": 405500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 405600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 405700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 405800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 405900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 406000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 406100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 406200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 406300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 406400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 406500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 406600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 406700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 406800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 406900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 407000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 407100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 407200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 407300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 407400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 407500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 407600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 407700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 407800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 407900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 408000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 408100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 408200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 408300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 408400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 408500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 408600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 408700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 408800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 408900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 409000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 409100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 409200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 409300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 409400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 409500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 409600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 409700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 409800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 409900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 410000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7122207283973694, + "eval_runtime": 208.3668, + "eval_samples_per_second": 239.961, + "eval_steps_per_second": 1.876, + "step": 410000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 410100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 410200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 410300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 410400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 410500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 410600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 410700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 410800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 410900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 411000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 411100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 411200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 411300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 411400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 411500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 411600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 411700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 411800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 411900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 412000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 412100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 412200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 412300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 412400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 412500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 412600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 412700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 412800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7657, + "step": 412900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 413000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 413100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 413200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 413300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 413400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 413500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 413600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7652, + "step": 413700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 413800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 413900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 414000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7729, + "step": 414100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 414200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7638, + "step": 414300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 414400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 414500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 414600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 414700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 414800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 414900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 415000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7115651369094849, + "eval_runtime": 208.0252, + "eval_samples_per_second": 240.355, + "eval_steps_per_second": 1.88, + "step": 415000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 415100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 415200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 415300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 415400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 415500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 415600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 415700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 415800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 415900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 416000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 416100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 416200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 416300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 416400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 416500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 416600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 416700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 416800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 416900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7661, + "step": 417000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 417100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 417200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 417300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 417400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 417500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 417600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 417700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 417800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 417900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 418000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 418100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 418200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 418300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 418400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 418500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 418600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 418700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 418800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7688, + "step": 418900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 419000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 419100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 419200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 419300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 419400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 419500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 419600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.768, + "step": 419700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 419800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 419900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 420000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7127427458763123, + "eval_runtime": 205.6068, + "eval_samples_per_second": 243.183, + "eval_steps_per_second": 1.902, + "step": 420000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 420100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 420200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 420300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 420400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 420500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 420600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 420700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 420800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 420900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 421000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 421100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 421200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 421300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7683, + "step": 421400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 421500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 421600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7694, + "step": 421700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 421800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 421900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 422000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 422100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 422200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 422300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 422400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 422500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 422600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 422700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 422800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 422900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 423000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 423100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 423200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 423300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 423400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 423500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 423600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 423700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 423800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 423900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 424000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 424100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 424200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 424300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 424400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 424500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 424600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 424700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 424800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 424900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 425000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7107843160629272, + "eval_runtime": 208.9656, + "eval_samples_per_second": 239.274, + "eval_steps_per_second": 1.871, + "step": 425000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 425100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 425200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 425300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 425400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 425500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 425600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 425700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 425800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 425900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 426000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 426100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 426200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 426300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 426400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 426500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 426600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 426700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 426800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 426900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 427000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 427100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 427200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 427300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 427400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 427500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 427600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 427700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.77, + "step": 427800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 427900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 428000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 428100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 428200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 428300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 428400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 428500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 428600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 428700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 428800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 428900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 429000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 429100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 429200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 429300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 429400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 429500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 429600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 429700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 429800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 429900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 430000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7116101384162903, + "eval_runtime": 204.4491, + "eval_samples_per_second": 244.56, + "eval_steps_per_second": 1.912, + "step": 430000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 430100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 430200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 430300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 430400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 430500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 430600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 430700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 430800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7642, + "step": 430900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 431000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 431100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 431200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 431300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 431400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 431500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 431600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 431700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 431800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 431900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 432000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 432100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 432200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 432300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 432400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 432500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 432600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 432700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 432800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 432900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 433000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 433100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 433200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 433300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 433400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 433500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 433600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 433700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 433800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 433900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 434000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 434100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7635, + "step": 434200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 434300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7651, + "step": 434400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 434500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7663, + "step": 434600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 434700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 434800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 434900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 435000 + }, + { + "epoch": 0.05, + "eval_loss": 0.7120763063430786, + "eval_runtime": 207.7863, + "eval_samples_per_second": 240.632, + "eval_steps_per_second": 1.882, + "step": 435000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 435100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 435200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 435300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 435400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 435500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 435600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 435700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 435800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 435900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 436000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 436100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 436200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 436300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 436400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 436500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 436600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 436700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 436800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 436900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 437000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 437100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7625, + "step": 437200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 437300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 437400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 437500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 437600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 437700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 437800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 437900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 438000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 438100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 438200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 438300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 438400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 438500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 438600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 438700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 438800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 438900 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 439000 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 439100 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 439200 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7636, + "step": 439300 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 439400 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 439500 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 439600 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 439700 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 439800 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 439900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 440000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7116917967796326, + "eval_runtime": 201.9048, + "eval_samples_per_second": 247.642, + "eval_steps_per_second": 1.937, + "step": 440000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 440100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 440200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 440300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 440400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 440500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7644, + "step": 440600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 440700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 440800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 440900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 441000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 441100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 441200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 441300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 441400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 441500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 441600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 441700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 441800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 441900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 442000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 442100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 442200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 442300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 442400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 442500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 442600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 442700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 442800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 442900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 443000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 443100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 443200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 443300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 443400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 443500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 443600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 443700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 443800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 443900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 444000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 444100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 444200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 444300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 444400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 444500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 444600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 444700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 444800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 444900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 445000 + }, + { + "epoch": 0.06, + "eval_loss": 0.709259033203125, + "eval_runtime": 200.7785, + "eval_samples_per_second": 249.031, + "eval_steps_per_second": 1.947, + "step": 445000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 445100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 445200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 445300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 445400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 445500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 445600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 445700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 445800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 445900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 446000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 446100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 446200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 446300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 446400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 446500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 446600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7617, + "step": 446700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 446800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7656, + "step": 446900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 447000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 447100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 447200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 447300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7653, + "step": 447400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 447500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 447600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 447700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 447800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 447900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 448000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 448100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 448200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 448300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 448400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 448500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 448600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 448700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 448800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7618, + "step": 448900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 449000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 449100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 449200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 449300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 449400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 449500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 449600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 449700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 449800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 449900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 450000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7122431993484497, + "eval_runtime": 203.9361, + "eval_samples_per_second": 245.175, + "eval_steps_per_second": 1.917, + "step": 450000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 450100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 450200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 450300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 450400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 450500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 450600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 450700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 450800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 450900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 451000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 451100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.771, + "step": 451200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 451300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 451400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 451500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 451600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 451700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 451800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 451900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 452000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 452100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 452200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 452300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7637, + "step": 452400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 452500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 452600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 452700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 452800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 452900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 453000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 453100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 453200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 453300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 453400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 453500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 453600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 453700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 453800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 453900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 454000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 454100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 454200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 454300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 454400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 454500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 454600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 454700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 454800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 454900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 455000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7123976349830627, + "eval_runtime": 206.7053, + "eval_samples_per_second": 241.89, + "eval_steps_per_second": 1.892, + "step": 455000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 455100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 455200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 455300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 455400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 455500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 455600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 455700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 455800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 455900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 456000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 456100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 456200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 456300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 456400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 456500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 456600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 456700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 456800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 456900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 457000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 457100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 457200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 457300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 457400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 457500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 457600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 457700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 457800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 457900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 458000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 458100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 458200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 458300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 458400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 458500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 458600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 458700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 458800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 458900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 459000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 459100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 459200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 459300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 459400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 459500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 459600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 459700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 459800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 459900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 460000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7095310688018799, + "eval_runtime": 207.1686, + "eval_samples_per_second": 241.349, + "eval_steps_per_second": 1.887, + "step": 460000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 460100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 460200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 460300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 460400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 460500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 460600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 460700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7679, + "step": 460800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 460900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 461000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 461100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 461200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 461300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 461400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 461500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 461600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 461700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 461800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 461900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 462000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 462100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 462200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 462300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 462400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 462500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 462600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 462700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 462800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 462900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 463000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 463100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 463200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 463300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7681, + "step": 463400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 463500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 463600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 463700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 463800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7665, + "step": 463900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 464000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 464100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 464200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 464300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 464400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 464500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 464600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 464700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 464800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 464900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 465000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7111442685127258, + "eval_runtime": 204.3031, + "eval_samples_per_second": 244.734, + "eval_steps_per_second": 1.914, + "step": 465000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 465100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 465200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 465300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 465400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 465500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 465600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 465700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 465800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 465900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 466000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 466100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 466200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 466300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 466400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 466500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 466600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 466700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 466800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 466900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 467000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 467100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 467200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 467300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 467400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 467500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 467600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 467700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 467800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 467900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 468000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 468100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 468200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 468300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 468400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 468500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 468600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 468700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 468800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 468900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 469000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 469100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 469200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 469300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 469400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 469500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 469600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 469700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 469800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 469900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 470000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7078810334205627, + "eval_runtime": 207.1667, + "eval_samples_per_second": 241.352, + "eval_steps_per_second": 1.887, + "step": 470000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 470100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 470200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 470300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 470400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 470500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 470600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 470700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 470800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 470900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 471000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 471100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 471200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 471300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 471400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 471500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 471600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 471700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 471800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 471900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 472000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 472100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 472200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 472300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 472400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 472500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 472600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7639, + "step": 472700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 472800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 472900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 473000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 473100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 473200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 473300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 473400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 473500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 473600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 473700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 473800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 473900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 474000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 474100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 474200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 474300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 474400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 474500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 474600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7633, + "step": 474700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 474800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 474900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 475000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7096142768859863, + "eval_runtime": 204.9206, + "eval_samples_per_second": 243.997, + "eval_steps_per_second": 1.908, + "step": 475000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 475100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 475200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 475300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 475400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 475500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 475600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 475700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 475800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 475900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 476000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 476100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 476200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 476300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 476400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 476500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 476600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7648, + "step": 476700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 476800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 476900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 477000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 477100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 477200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 477300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 477400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 477500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 477600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 477700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 477800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 477900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 478000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 478100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 478200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 478300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 478400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 478500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 478600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 478700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 478800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 478900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 479000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 479100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 479200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 479300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 479400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 479500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 479600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 479700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 479800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 479900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 480000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7097885012626648, + "eval_runtime": 206.2654, + "eval_samples_per_second": 242.406, + "eval_steps_per_second": 1.896, + "step": 480000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 480100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 480200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 480300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 480400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 480500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 480600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 480700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 480800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 480900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 481000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 481100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 481200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 481300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 481400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 481500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 481600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 481700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 481800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 481900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 482000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 482100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 482200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 482300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 482400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 482500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 482600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 482700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 482800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 482900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 483000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7671, + "step": 483100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 483200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 483300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 483400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 483500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7655, + "step": 483600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 483700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 483800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 483900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 484000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 484100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 484200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 484300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 484400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 484500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 484600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 484700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 484800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 484900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 485000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7099955677986145, + "eval_runtime": 205.5411, + "eval_samples_per_second": 243.26, + "eval_steps_per_second": 1.902, + "step": 485000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 485100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 485200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 485300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 485400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 485500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 485600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 485700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 485800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 485900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 486000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 486100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 486200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 486300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 486400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 486500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 486600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 486700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 486800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 486900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 487000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 487100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 487200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 487300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 487400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 487500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 487600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 487700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 487800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 487900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.763, + "step": 488000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 488100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 488200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 488300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 488400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 488500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 488600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 488700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 488800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 488900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 489000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 489100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 489200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 489300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 489400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 489500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 489600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 489700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 489800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 489900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 490000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7070909142494202, + "eval_runtime": 202.7407, + "eval_samples_per_second": 246.62, + "eval_steps_per_second": 1.929, + "step": 490000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 490100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 490200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 490300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 490400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 490500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 490600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 490700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 490800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 490900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 491000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 491100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 491200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 491300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 491400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 491500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 491600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 491700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 491800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 491900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 492000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 492100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 492200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 492300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 492400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 492500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 492600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 492700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 492800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 492900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 493000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 493100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 493200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 493300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 493400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 493500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 493600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 493700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 493800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7659, + "step": 493900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7634, + "step": 494000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 494100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 494200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 494300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 494400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 494500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 494600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 494700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7641, + "step": 494800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 494900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 495000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7100406885147095, + "eval_runtime": 203.2704, + "eval_samples_per_second": 245.978, + "eval_steps_per_second": 1.924, + "step": 495000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7614, + "step": 495100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 495200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 495300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 495400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 495500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.764, + "step": 495600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7613, + "step": 495700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 495800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 495900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 496000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 496100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 496200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 496300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 496400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 496500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 496600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 496700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 496800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 496900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 497000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 497100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 497200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 497300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 497400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 497500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 497600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 497700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 497800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 497900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7628, + "step": 498000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 498100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 498200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 498300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 498400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 498500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 498600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 498700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 498800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 498900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 499000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 499100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 499200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 499300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 499400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 499500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 499600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 499700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 499800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 499900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7668, + "step": 500000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7097185850143433, + "eval_runtime": 205.5908, + "eval_samples_per_second": 243.202, + "eval_steps_per_second": 1.902, + "step": 500000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 500100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 500200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 500300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 500400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 500500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 500600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 500700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 500800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 500900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 501000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 501100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 501200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 501300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 501400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 501500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 501600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 501700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 501800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 501900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 502000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 502100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 502200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 502300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 502400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 502500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 502600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 502700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 502800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 502900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 503000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 503100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 503200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 503300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 503400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 503500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 503600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 503700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 503800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 503900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.762, + "step": 504000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 504100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 504200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 504300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 504400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 504500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 504600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 504700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 504800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 504900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 505000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7079646587371826, + "eval_runtime": 203.5169, + "eval_samples_per_second": 245.68, + "eval_steps_per_second": 1.921, + "step": 505000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 505100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 505200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 505300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 505400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 505500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 505600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 505700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 505800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 505900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 506000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 506100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 506200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 506300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 506400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 506500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 506600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 506700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 506800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7622, + "step": 506900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 507000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 507100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 507200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 507300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 507400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 507500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 507600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 507700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 507800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 507900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 508000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 508100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 508200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 508300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 508400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 508500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 508600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 508700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 508800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 508900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 509000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 509100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 509200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 509300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 509400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 509500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 509600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 509700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 509800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 509900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 510000 + }, + { + "epoch": 0.06, + "eval_loss": 0.707108199596405, + "eval_runtime": 206.3893, + "eval_samples_per_second": 242.261, + "eval_steps_per_second": 1.894, + "step": 510000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 510100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 510200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 510300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 510400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 510500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 510600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 510700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 510800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 510900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 511000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 511100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 511200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 511300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 511400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 511500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.765, + "step": 511600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 511700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 511800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 511900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 512000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 512100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7611, + "step": 512200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 512300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 512400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 512500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 512600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 512700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 512800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 512900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 513000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 513100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 513200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 513300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 513400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7662, + "step": 513500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 513600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 513700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 513800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 513900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 514000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 514100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 514200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 514300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 514400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 514500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 514600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 514700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 514800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 514900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 515000 + }, + { + "epoch": 0.06, + "eval_loss": 0.7088050842285156, + "eval_runtime": 207.7635, + "eval_samples_per_second": 240.658, + "eval_steps_per_second": 1.882, + "step": 515000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 515100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 515200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 515300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 515400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 515500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 515600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 515700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 515800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 515900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 516000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 516100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 516200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 516300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 516400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 516500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 516600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 516700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 516800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 516900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 517000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 517100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 517200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 517300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 517400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 517500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 517600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7592, + "step": 517700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 517800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 517900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 518000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 518100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 518200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7597, + "step": 518300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7606, + "step": 518400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 518500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 518600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 518700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 518800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 518900 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 519000 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 519100 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 519200 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 519300 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 519400 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 519500 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 519600 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 519700 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 519800 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 519900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 520000 + }, + { + "epoch": 0.07, + "eval_loss": 0.7072643637657166, + "eval_runtime": 206.4566, + "eval_samples_per_second": 242.182, + "eval_steps_per_second": 1.894, + "step": 520000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 520100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7629, + "step": 520200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 520300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 520400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 520500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 520600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 520700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 520800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 520900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 521000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 521100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 521200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 521300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 521400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 521500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 521600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 521700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 521800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 521900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 522000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 522100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 522200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 522300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 522400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 522500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 522600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 522700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 522800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 522900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 523000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 523100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 523200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 523300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 523400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 523500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 523600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 523700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 523800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 523900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 524000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 524100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 524200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7619, + "step": 524300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 524400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 524500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 524600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 524700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 524800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 524900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 525000 + }, + { + "epoch": 0.07, + "eval_loss": 0.706498384475708, + "eval_runtime": 208.8156, + "eval_samples_per_second": 239.446, + "eval_steps_per_second": 1.872, + "step": 525000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 525100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 525200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 525300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 525400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 525500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 525600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 525700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 525800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 525900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 526000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 526100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 526200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 526300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 526400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 526500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 526600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 526700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 526800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 526900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 527000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 527100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 527200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 527300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 527400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 527500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 527600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 527700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 527800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 527900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 528000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 528100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 528200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 528300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 528400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 528500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 528600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 528700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 528800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 528900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 529000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 529100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 529200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 529300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 529400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7607, + "step": 529500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 529600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 529700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 529800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 529900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 530000 + }, + { + "epoch": 0.07, + "eval_loss": 0.7084597945213318, + "eval_runtime": 205.2857, + "eval_samples_per_second": 243.563, + "eval_steps_per_second": 1.905, + "step": 530000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 530100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 530200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 530300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 530400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 530500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 530600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 530700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 530800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 530900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 531000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 531100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 531200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 531300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 531400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 531500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 531600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 531700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 531800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 531900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 532000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 532100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 532200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 532300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 532400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 532500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 532600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7602, + "step": 532700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 532800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7615, + "step": 532900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 533000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.761, + "step": 533100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 533200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 533300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 533400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 533500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 533600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 533700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 533800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 533900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 534000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 534100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 534200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 534300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 534400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 534500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 534600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 534700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 534800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 534900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 535000 + }, + { + "epoch": 0.07, + "eval_loss": 0.7075052857398987, + "eval_runtime": 208.0268, + "eval_samples_per_second": 240.354, + "eval_steps_per_second": 1.88, + "step": 535000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 535100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 535200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 535300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 535400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 535500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 535600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 535700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 535800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 535900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 536000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 536100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 536200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 536300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 536400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 536500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 536600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 536700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 536800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 536900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 537000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 537100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 537200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 537300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 537400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 537500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 537600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 537700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 537800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 537900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 538000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 538100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 538200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 538300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 538400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 538500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 538600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 538700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 538800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 538900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 539000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 539100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 539200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 539300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 539400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 539500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 539600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 539700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 539800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 539900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 540000 + }, + { + "epoch": 0.07, + "eval_loss": 0.7067614197731018, + "eval_runtime": 204.8198, + "eval_samples_per_second": 244.117, + "eval_steps_per_second": 1.909, + "step": 540000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 540100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 540200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 540300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7584, + "step": 540400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 540500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 540600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 540700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 540800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 540900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 541000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 541100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 541200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 541300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 541400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 541500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 541600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 541700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 541800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 541900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 542000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 542100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 542200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 542300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 542400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 542500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 542600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 542700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 542800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 542900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 543000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 543100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 543200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7593, + "step": 543300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 543400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 543500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 543600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 543700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 543800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 543900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 544000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 544100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 544200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 544300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 544400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 544500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 544600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 544700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 544800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 544900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 545000 + }, + { + "epoch": 0.07, + "eval_loss": 0.7069408297538757, + "eval_runtime": 210.1496, + "eval_samples_per_second": 237.926, + "eval_steps_per_second": 1.861, + "step": 545000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 545100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 545200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 545300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 545400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 545500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 545600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 545700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 545800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 545900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 546000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 546100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 546200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 546300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 546400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 546500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 546600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 546700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 546800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 546900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 547000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 547100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 547200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 547300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 547400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 547500 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 547600 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 547700 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7616, + "step": 547800 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 547900 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 548000 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 548100 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 548200 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 548300 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 548400 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 548500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 548600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 548700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 548800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 548900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 549000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 549100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 549200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 549300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 549400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 549500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 549600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 549700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 549800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 549900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 550000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7065879106521606, + "eval_runtime": 206.6566, + "eval_samples_per_second": 241.947, + "eval_steps_per_second": 1.892, + "step": 550000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 550100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 550200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 550300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 550400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 550500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 550600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 550700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 550800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 550900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 551000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 551100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 551200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 551300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 551400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 551500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 551600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 551700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 551800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 551900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 552000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 552100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 552200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 552300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 552400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 552500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 552600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 552700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 552800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 552900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 553000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 553100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 553200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 553300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 553400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 553500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 553600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 553700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 553800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 553900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 554000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 554100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 554200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 554300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 554400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 554500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 554600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 554700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 554800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 554900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 555000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7056695222854614, + "eval_runtime": 207.0507, + "eval_samples_per_second": 241.487, + "eval_steps_per_second": 1.888, + "step": 555000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 555100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 555200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 555300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 555400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 555500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 555600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 555700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 555800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7598, + "step": 555900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 556000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 556100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 556200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 556300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 556400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 556500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 556600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 556700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 556800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 556900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 557000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 557100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 557200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 557300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 557400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 557500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 557600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 557700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 557800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 557900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7564, + "step": 558000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 558100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 558200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 558300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 558400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 558500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 558600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 558700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 558800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 558900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 559000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 559100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 559200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 559300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 559400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 559500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 559600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 559700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 559800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 559900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 560000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7081322073936462, + "eval_runtime": 206.8909, + "eval_samples_per_second": 241.673, + "eval_steps_per_second": 1.89, + "step": 560000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 560100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 560200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 560300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 560400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 560500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 560600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 560700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 560800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 560900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 561000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 561100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 561200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 561300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 561400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 561500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 561600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 561700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 561800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 561900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 562000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 562100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 562200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 562300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 562400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 562500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 562600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 562700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 562800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 562900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 563000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 563100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 563200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 563300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7605, + "step": 563400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 563500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 563600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 563700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 563800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 563900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 564000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 564100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 564200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 564300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 564400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 564500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 564600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 564700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 564800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 564900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 565000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7076017260551453, + "eval_runtime": 205.7703, + "eval_samples_per_second": 242.989, + "eval_steps_per_second": 1.9, + "step": 565000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 565100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 565200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 565300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 565400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 565500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 565600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 565700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 565800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 565900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 566000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 566100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 566200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 566300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 566400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 566500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 566600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 566700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 566800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 566900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 567000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 567100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 567200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 567300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 567400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7573, + "step": 567500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 567600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 567700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 567800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 567900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 568000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 568100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 568200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 568300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 568400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 568500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 568600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.758, + "step": 568700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 568800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 568900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 569000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 569100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 569200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 569300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 569400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 569500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 569600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 569700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 569800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 569900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 570000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7060913443565369, + "eval_runtime": 203.6261, + "eval_samples_per_second": 245.548, + "eval_steps_per_second": 1.92, + "step": 570000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 570100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 570200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 570300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 570400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 570500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 570600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 570700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 570800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 570900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 571000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 571100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 571200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 571300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 571400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 571500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 571600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7623, + "step": 571700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 571800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 571900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 572000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 572100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 572200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 572300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 572400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 572500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 572600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 572700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 572800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 572900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 573000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 573100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 573200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 573300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 573400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 573500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 573600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 573700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7624, + "step": 573800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 573900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 574000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 574100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 574200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 574300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 574400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 574500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 574600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 574700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 574800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 574900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 575000 + }, + { + "epoch": 1.0, + "eval_loss": 0.706642746925354, + "eval_runtime": 208.8939, + "eval_samples_per_second": 239.356, + "eval_steps_per_second": 1.872, + "step": 575000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 575100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 575200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 575300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 575400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 575500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 575600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 575700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 575800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 575900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 576000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 576100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 576200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 576300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 576400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 576500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 576600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 576700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 576800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 576900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 577000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 577100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 577200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 577300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 577400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 577500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 577600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 577700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 577800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 577900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 578000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 578100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 578200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 578300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 578400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 578500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 578600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 578700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 578800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 578900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 579000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 579100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 579200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 579300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 579400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 579500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 579600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 579700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 579800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 579900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 580000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7070211172103882, + "eval_runtime": 209.8407, + "eval_samples_per_second": 238.276, + "eval_steps_per_second": 1.863, + "step": 580000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 580100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 580200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 580300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 580400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 580500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 580600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 580700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 580800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 580900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 581000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 581100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 581200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 581300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 581400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 581500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 581600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 581700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 581800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 581900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 582000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 582100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 582200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 582300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 582400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 582500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 582600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 582700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 582800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 582900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 583000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 583100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 583200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 583300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 583400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 583500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7595, + "step": 583600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 583700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 583800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 583900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 584000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 584100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 584200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 584300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 584400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 584500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 584600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 584700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 584800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 584900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 585000 + }, + { + "epoch": 1.0, + "eval_loss": 0.7080877423286438, + "eval_runtime": 206.6514, + "eval_samples_per_second": 241.953, + "eval_steps_per_second": 1.892, + "step": 585000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 585100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 585200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 585300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 585400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 585500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 585600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 585700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 585800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 585900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 586000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 586100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 586200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 586300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 586400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 586500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 586600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 586700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 586800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 586900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 587000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 587100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 587200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 587300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 587400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 587500 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 587600 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 587700 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7604, + "step": 587800 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 587900 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 588000 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 588100 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 588200 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 588300 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 588400 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 588500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 588600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 588700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 588800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 588900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 589000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 589100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 589200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 589300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 589400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 589500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 589600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 589700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 589800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 589900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 590000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7071018218994141, + "eval_runtime": 206.2108, + "eval_samples_per_second": 242.47, + "eval_steps_per_second": 1.896, + "step": 590000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 590100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 590200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 590300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 590400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 590500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 590600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 590700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 590800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 590900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 591000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 591100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 591200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 591300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 591400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 591500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 591600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 591700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 591800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 591900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 592000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 592100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 592200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 592300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 592400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 592500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 592600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 592700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 592800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 592900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 593000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 593100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 593200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7609, + "step": 593300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 593400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 593500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 593600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 593700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 593800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 593900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 594000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 594100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 594200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 594300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 594400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 594500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 594600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 594700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 594800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 594900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 595000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7045419812202454, + "eval_runtime": 208.0159, + "eval_samples_per_second": 240.366, + "eval_steps_per_second": 1.88, + "step": 595000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 595100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 595200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 595300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 595400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 595500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 595600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 595700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 595800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 595900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 596000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 596100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 596200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 596300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 596400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 596500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 596600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7599, + "step": 596700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 596800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 596900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 597000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 597100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 597200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 597300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 597400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 597500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 597600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 597700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 597800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 597900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 598000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 598100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 598200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 598300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 598400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 598500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 598600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 598700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 598800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 598900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 599000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 599100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 599200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 599300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 599400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 599500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 599600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 599700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 599800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 599900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 600000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7058716416358948, + "eval_runtime": 208.0063, + "eval_samples_per_second": 240.377, + "eval_steps_per_second": 1.88, + "step": 600000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 600100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 600200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 600300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 600400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 600500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 600600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 600700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 600800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 600900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 601000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 601100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 601200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 601300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 601400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 601500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 601600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 601700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 601800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 601900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 602000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 602100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 602200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 602300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 602400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7612, + "step": 602500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 602600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 602700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 602800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 602900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 603000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 603100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 603200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 603300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 603400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 603500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 603600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 603700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 603800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 603900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 604000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 604100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 604200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 604300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 604400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 604500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 604600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 604700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 604800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 604900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 605000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7062010765075684, + "eval_runtime": 208.4287, + "eval_samples_per_second": 239.89, + "eval_steps_per_second": 1.876, + "step": 605000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 605100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 605200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 605300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 605400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 605500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 605600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 605700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 605800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 605900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 606000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 606100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 606200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 606300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 606400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 606500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 606600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 606700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 606800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 606900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 607000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 607100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 607200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 607300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 607400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 607500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 607600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 607700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 607800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 607900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 608000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 608100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 608200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 608300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 608400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 608500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 608600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 608700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 608800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 608900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 609000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 609100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 609200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 609300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 609400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 609500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 609600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 609700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 609800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 609900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 610000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7046141624450684, + "eval_runtime": 373.2611, + "eval_samples_per_second": 133.954, + "eval_steps_per_second": 1.048, + "step": 610000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 610100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 610200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 610300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 610400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 610500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 610600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 610700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 610800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 610900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 611000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 611100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 611200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 611300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 611400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 611500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 611600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 611700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 611800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 611900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 612000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 612100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 612200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 612300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 612400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 612500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 612600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 612700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 612800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7586, + "step": 612900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 613000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 613100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 613200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 613300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 613400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 613500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 613600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 613700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 613800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 613900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 614000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 614100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 614200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 614300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 614400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 614500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 614600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 614700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 614800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 614900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 615000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7065125107765198, + "eval_runtime": 214.9751, + "eval_samples_per_second": 232.585, + "eval_steps_per_second": 1.819, + "step": 615000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 615100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 615200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 615300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 615400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 615500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 615600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 615700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 615800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 615900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 616000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 616100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 616200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 616300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.757, + "step": 616400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 616500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 616600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 616700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 616800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 616900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 617000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 617100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 617200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 617300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 617400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 617500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 617600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 617700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 617800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 617900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 618000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 618100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 618200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 618300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 618400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 618500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 618600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 618700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 618800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 618900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 619000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 619100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 619200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 619300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 619400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 619500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 619600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 619700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 619800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 619900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 620000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7037102580070496, + "eval_runtime": 209.2906, + "eval_samples_per_second": 238.902, + "eval_steps_per_second": 1.868, + "step": 620000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 620100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 620200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 620300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 620400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 620500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 620600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 620700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 620800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 620900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 621000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 621100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 621200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 621300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 621400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 621500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 621600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 621700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 621800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 621900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7568, + "step": 622000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 622100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 622200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 622300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 622400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 622500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 622600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 622700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 622800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 622900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 623000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 623100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 623200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 623300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 623400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 623500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 623600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 623700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 623800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 623900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 624000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 624100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 624200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 624300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 624400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 624500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 624600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 624700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 624800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 624900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 625000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7073270082473755, + "eval_runtime": 206.4574, + "eval_samples_per_second": 242.181, + "eval_steps_per_second": 1.894, + "step": 625000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 625100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 625200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 625300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 625400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 625500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 625600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 625700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 625800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 625900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 626000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 626100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 626200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 626300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 626400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 626500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 626600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 626700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 626800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 626900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 627000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 627100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 627200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 627300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 627400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 627500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 627600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 627700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 627800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 627900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 628000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 628100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 628200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 628300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 628400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 628500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 628600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 628700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 628800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 628900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 629000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 629100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 629200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 629300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 629400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 629500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 629600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 629700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 629800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 629900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 630000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7038618922233582, + "eval_runtime": 209.1827, + "eval_samples_per_second": 239.025, + "eval_steps_per_second": 1.869, + "step": 630000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 630100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 630200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 630300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 630400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 630500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 630600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 630700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 630800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 630900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 631000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 631100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 631200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 631300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 631400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 631500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 631600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 631700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 631800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 631900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 632000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 632100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 632200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 632300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 632400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 632500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 632600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 632700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 632800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 632900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 633000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 633100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 633200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 633300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 633400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 633500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 633600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7581, + "step": 633700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 633800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 633900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 634000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 634100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 634200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 634300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 634400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 634500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 634600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 634700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 634800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 634900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 635000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7031733393669128, + "eval_runtime": 207.7004, + "eval_samples_per_second": 240.731, + "eval_steps_per_second": 1.883, + "step": 635000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 635100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 635200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 635300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 635400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 635500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 635600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 635700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 635800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 635900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 636000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 636100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 636200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 636300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 636400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 636500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 636600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 636700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 636800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 636900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 637000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 637100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 637200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 637300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 637400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 637500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 637600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 637700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 637800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 637900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 638000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7587, + "step": 638100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 638200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 638300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 638400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 638500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 638600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 638700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7627, + "step": 638800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 638900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 639000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 639100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 639200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 639300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 639400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 639500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 639600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 639700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 639800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 639900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 640000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7051756381988525, + "eval_runtime": 209.0555, + "eval_samples_per_second": 239.171, + "eval_steps_per_second": 1.87, + "step": 640000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 640100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 640200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 640300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 640400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 640500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 640600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 640700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 640800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 640900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 641000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 641100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 641200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 641300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 641400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 641500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 641600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 641700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 641800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 641900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 642000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 642100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 642200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 642300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 642400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 642500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 642600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 642700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 642800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 642900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 643000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 643100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 643200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 643300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 643400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 643500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 643600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 643700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 643800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 643900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 644000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 644100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 644200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7594, + "step": 644300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 644400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 644500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 644600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 644700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 644800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 644900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 645000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7038088440895081, + "eval_runtime": 209.5065, + "eval_samples_per_second": 238.656, + "eval_steps_per_second": 1.866, + "step": 645000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 645100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 645200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 645300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 645400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 645500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 645600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 645700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 645800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 645900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 646000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 646100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 646200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 646300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 646400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 646500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 646600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 646700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 646800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 646900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 647000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 647100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 647200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 647300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 647400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 647500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 647600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 647700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 647800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 647900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 648000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 648100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 648200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 648300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 648400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 648500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 648600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 648700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 648800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 648900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 649000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 649100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 649200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 649300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 649400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 649500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 649600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 649700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 649800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 649900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 650000 + }, + { + "epoch": 1.01, + "eval_loss": 0.706175684928894, + "eval_runtime": 205.9298, + "eval_samples_per_second": 242.801, + "eval_steps_per_second": 1.899, + "step": 650000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 650100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 650200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 650300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 650400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 650500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 650600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 650700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 650800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 650900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7596, + "step": 651000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 651100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 651200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 651300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 651400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 651500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 651600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 651700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 651800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 651900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 652000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 652100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 652200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 652300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 652400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 652500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 652600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 652700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 652800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 652900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 653000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 653100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 653200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 653300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 653400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 653500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 653600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 653700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 653800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 653900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 654000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 654100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 654200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 654300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 654400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 654500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 654600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 654700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 654800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 654900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 655000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7067192196846008, + "eval_runtime": 206.9501, + "eval_samples_per_second": 241.604, + "eval_steps_per_second": 1.889, + "step": 655000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 655100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 655200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 655300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 655400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 655500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 655600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 655700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 655800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 655900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 656000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 656100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 656200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 656300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 656400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 656500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 656600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 656700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 656800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 656900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 657000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 657100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 657200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 657300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 657400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 657500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 657600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 657700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 657800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 657900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 658000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 658100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 658200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 658300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 658400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 658500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 658600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 658700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 658800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 658900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 659000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 659100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 659200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 659300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 659400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 659500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 659600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 659700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 659800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 659900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 660000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7032054662704468, + "eval_runtime": 206.5049, + "eval_samples_per_second": 242.125, + "eval_steps_per_second": 1.893, + "step": 660000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 660100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 660200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 660300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 660400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 660500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7556, + "step": 660600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 660700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 660800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 660900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 661000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 661100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 661200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 661300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 661400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 661500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 661600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 661700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 661800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 661900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 662000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 662100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 662200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 662300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 662400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 662500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 662600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 662700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 662800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 662900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 663000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 663100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 663200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 663300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7591, + "step": 663400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 663500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 663600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 663700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 663800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 663900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 664000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 664100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 664200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 664300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 664400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 664500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 664600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 664700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 664800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 664900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 665000 + }, + { + "epoch": 1.01, + "eval_loss": 0.7017516493797302, + "eval_runtime": 208.408, + "eval_samples_per_second": 239.914, + "eval_steps_per_second": 1.876, + "step": 665000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 665100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 665200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 665300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 665400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 665500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 665600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 665700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 665800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 665900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 666000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 666100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 666200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 666300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 666400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 666500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 666600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 666700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 666800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 666900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 667000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 667100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 667200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7589, + "step": 667300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 667400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 667500 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 667600 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 667700 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 667800 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 667900 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 668000 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 668100 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 668200 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 668300 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 668400 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 668500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 668600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 668700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 668800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 668900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 669000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 669100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 669200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 669300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 669400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 669500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 669600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 669700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 669800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 669900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 670000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7028328776359558, + "eval_runtime": 203.5887, + "eval_samples_per_second": 245.593, + "eval_steps_per_second": 1.921, + "step": 670000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 670100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 670200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 670300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 670400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 670500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 670600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 670700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 670800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 670900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 671000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 671100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 671200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 671300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 671400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 671500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 671600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 671700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 671800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 671900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 672000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 672100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 672200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 672300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 672400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 672500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 672600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 672700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 672800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 672900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 673000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 673100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 673200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 673300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 673400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 673500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 673600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 673700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 673800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 673900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 674000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 674100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 674200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 674300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 674400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 674500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 674600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 674700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 674800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 674900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 675000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7030300498008728, + "eval_runtime": 207.0756, + "eval_samples_per_second": 241.458, + "eval_steps_per_second": 1.888, + "step": 675000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 675100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 675200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 675300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 675400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 675500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 675600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 675700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 675800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 675900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 676000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 676100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 676200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 676300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 676400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 676500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 676600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 676700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 676800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 676900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 677000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 677100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 677200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 677300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 677400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 677500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 677600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 677700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 677800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 677900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 678000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 678100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 678200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 678300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 678400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 678500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 678600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 678700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 678800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 678900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 679000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 679100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 679200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 679300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 679400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 679500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 679600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 679700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 679800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 679900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 680000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7032363414764404, + "eval_runtime": 207.9683, + "eval_samples_per_second": 240.421, + "eval_steps_per_second": 1.88, + "step": 680000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 680100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 680200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 680300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 680400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 680500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 680600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 680700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 680800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 680900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 681000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 681100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 681200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 681300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 681400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 681500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 681600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 681700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 681800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 681900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 682000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7559, + "step": 682100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 682200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 682300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7582, + "step": 682400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 682500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 682600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7577, + "step": 682700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 682800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 682900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7565, + "step": 683000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 683100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 683200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 683300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 683400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 683500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 683600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 683700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 683800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 683900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 684000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 684100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 684200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 684300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 684400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 684500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 684600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 684700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 684800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 684900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 685000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7048628330230713, + "eval_runtime": 203.5531, + "eval_samples_per_second": 245.636, + "eval_steps_per_second": 1.921, + "step": 685000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 685100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 685200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 685300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 685400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 685500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 685600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 685700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 685800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 685900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 686000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 686100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 686200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 686300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 686400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 686500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 686600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 686700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 686800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 686900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 687000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 687100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 687200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 687300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 687400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 687500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 687600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 687700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 687800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 687900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 688000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 688100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 688200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 688300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 688400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 688500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 688600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 688700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 688800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 688900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7578, + "step": 689000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 689100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 689200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 689300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 689400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 689500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 689600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 689700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 689800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 689900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 690000 + }, + { + "epoch": 1.02, + "eval_loss": 0.702966570854187, + "eval_runtime": 202.3143, + "eval_samples_per_second": 247.14, + "eval_steps_per_second": 1.933, + "step": 690000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 690100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 690200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 690300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 690400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 690500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 690600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 690700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 690800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 690900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 691000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 691100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 691200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 691300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 691400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 691500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 691600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 691700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 691800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 691900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 692000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 692100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 692200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 692300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 692400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 692500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 692600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 692700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 692800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 692900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 693000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 693100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 693200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 693300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 693400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 693500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 693600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 693700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 693800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 693900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 694000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 694100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 694200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 694300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 694400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 694500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 694600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 694700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 694800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 694900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 695000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7024686932563782, + "eval_runtime": 202.3612, + "eval_samples_per_second": 247.083, + "eval_steps_per_second": 1.932, + "step": 695000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 695100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 695200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7533, + "step": 695300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 695400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 695500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 695600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 695700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 695800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 695900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 696000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 696100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 696200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 696300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 696400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 696500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 696600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 696700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 696800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 696900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 697000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 697100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 697200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 697300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 697400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 697500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 697600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 697700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 697800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 697900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 698000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 698100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 698200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 698300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 698400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 698500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 698600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 698700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 698800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 698900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 699000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 699100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 699200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 699300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 699400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 699500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 699600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 699700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 699800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 699900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 700000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7030143737792969, + "eval_runtime": 195.8231, + "eval_samples_per_second": 255.332, + "eval_steps_per_second": 1.997, + "step": 700000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 700100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 700200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 700300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 700400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 700500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 700600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 700700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 700800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 700900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 701000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 701100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 701200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 701300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 701400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 701500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 701600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 701700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 701800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 701900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 702000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 702100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 702200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 702300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 702400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 702500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 702600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 702700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 702800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 702900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 703000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7585, + "step": 703100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 703200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 703300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 703400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 703500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 703600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 703700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 703800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 703900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 704000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 704100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 704200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 704300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 704400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 704500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 704600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 704700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 704800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 704900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 705000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7021443247795105, + "eval_runtime": 201.7932, + "eval_samples_per_second": 247.778, + "eval_steps_per_second": 1.938, + "step": 705000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 705100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 705200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 705300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 705400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 705500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7562, + "step": 705600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 705700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 705800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 705900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 706000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 706100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 706200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7558, + "step": 706300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7539, + "step": 706400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 706500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 706600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 706700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 706800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7527, + "step": 706900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 707000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 707100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 707200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 707300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 707400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 707500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 707600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 707700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 707800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 707900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 708000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 708100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 708200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 708300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 708400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 708500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 708600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 708700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 708800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 708900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 709000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 709100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 709200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 709300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 709400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 709500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 709600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 709700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 709800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 709900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 710000 + }, + { + "epoch": 1.02, + "eval_loss": 0.703992486000061, + "eval_runtime": 194.4138, + "eval_samples_per_second": 257.183, + "eval_steps_per_second": 2.011, + "step": 710000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 710100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 710200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 710300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 710400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 710500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 710600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7541, + "step": 710700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 710800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 710900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 711000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 711100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 711200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 711300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 711400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 711500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 711600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 711700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 711800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 711900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 712000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7571, + "step": 712100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 712200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 712300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 712400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 712500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 712600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 712700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 712800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 712900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 713000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 713100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 713200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 713300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 713400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7588, + "step": 713500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 713600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 713700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 713800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 713900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 714000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 714100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 714200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 714300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 714400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 714500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 714600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 714700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 714800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 714900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 715000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7032497525215149, + "eval_runtime": 194.5615, + "eval_samples_per_second": 256.988, + "eval_steps_per_second": 2.01, + "step": 715000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 715100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 715200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 715300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 715400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 715500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 715600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 715700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 715800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 715900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 716000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 716100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 716200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 716300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 716400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 716500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 716600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 716700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 716800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 716900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 717000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 717100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 717200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 717300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 717400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 717500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 717600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 717700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 717800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 717900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 718000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 718100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 718200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 718300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 718400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 718500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 718600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 718700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 718800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 718900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 719000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 719100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 719200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 719300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 719400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 719500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 719600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 719700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 719800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 719900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 720000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7018641233444214, + "eval_runtime": 194.6657, + "eval_samples_per_second": 256.851, + "eval_steps_per_second": 2.009, + "step": 720000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 720100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 720200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 720300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 720400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 720500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 720600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 720700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 720800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 720900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 721000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 721100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 721200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 721300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 721400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 721500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 721600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 721700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 721800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 721900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 722000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7566, + "step": 722100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 722200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 722300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 722400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 722500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 722600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 722700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 722800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 722900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 723000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 723100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 723200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 723300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 723400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 723500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 723600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 723700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 723800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 723900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 724000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 724100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 724200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 724300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 724400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 724500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 724600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 724700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 724800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 724900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 725000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7017471790313721, + "eval_runtime": 194.844, + "eval_samples_per_second": 256.616, + "eval_steps_per_second": 2.007, + "step": 725000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 725100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 725200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 725300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 725400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 725500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 725600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 725700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 725800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 725900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 726000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 726100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 726200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 726300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 726400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 726500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 726600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 726700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 726800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 726900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 727000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 727100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 727200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 727300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 727400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 727500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 727600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 727700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 727800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 727900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 728000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 728100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 728200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 728300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 728400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 728500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 728600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 728700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 728800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 728900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 729000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7575, + "step": 729100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 729200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 729300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 729400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 729500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 729600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 729700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 729800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 729900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.759, + "step": 730000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7034239768981934, + "eval_runtime": 194.8342, + "eval_samples_per_second": 256.628, + "eval_steps_per_second": 2.007, + "step": 730000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 730100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 730200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 730300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 730400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 730500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 730600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 730700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 730800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 730900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 731000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 731100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 731200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 731300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 731400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 731500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 731600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 731700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 731800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 731900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 732000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 732100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 732200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 732300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 732400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 732500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 732600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 732700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 732800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 732900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 733000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 733100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 733200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 733300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 733400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 733500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 733600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 733700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 733800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 733900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 734000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 734100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 734200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 734300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 734400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 734500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 734600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 734700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 734800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 734900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 735000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7015842199325562, + "eval_runtime": 194.9341, + "eval_samples_per_second": 256.497, + "eval_steps_per_second": 2.006, + "step": 735000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 735100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 735200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 735300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 735400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 735500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 735600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 735700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 735800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 735900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 736000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 736100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 736200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 736300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 736400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 736500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 736600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 736700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 736800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 736900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 737000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 737100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 737200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 737300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 737400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 737500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 737600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 737700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 737800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 737900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 738000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 738100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 738200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 738300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 738400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 738500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 738600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 738700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 738800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 738900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 739000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 739100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 739200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 739300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7553, + "step": 739400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 739500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 739600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 739700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 739800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 739900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 740000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7015743255615234, + "eval_runtime": 194.8311, + "eval_samples_per_second": 256.633, + "eval_steps_per_second": 2.007, + "step": 740000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 740100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 740200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 740300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 740400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 740500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 740600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 740700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 740800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 740900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 741000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 741100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 741200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 741300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 741400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 741500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 741600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 741700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 741800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 741900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 742000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7557, + "step": 742100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 742200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 742300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 742400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 742500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 742600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 742700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 742800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 742900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 743000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 743100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 743200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 743300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 743400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 743500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 743600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 743700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 743800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 743900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 744000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 744100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 744200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 744300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 744400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 744500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 744600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 744700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 744800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 744900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 745000 + }, + { + "epoch": 1.02, + "eval_loss": 0.7021865844726562, + "eval_runtime": 194.4109, + "eval_samples_per_second": 257.187, + "eval_steps_per_second": 2.011, + "step": 745000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 745100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 745200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 745300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 745400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.756, + "step": 745500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 745600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 745700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 745800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 745900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 746000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 746100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 746200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 746300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 746400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 746500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 746600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 746700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 746800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 746900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 747000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 747100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 747200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 747300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 747400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 747500 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 747600 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 747700 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 747800 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 747900 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 748000 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 748100 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 748200 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 748300 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 748400 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 748500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 748600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 748700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 748800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 748900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 749000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 749100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 749200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 749300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 749400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 749500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 749600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 749700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 749800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 749900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 750000 + }, + { + "epoch": 1.03, + "eval_loss": 0.6999218463897705, + "eval_runtime": 203.751, + "eval_samples_per_second": 245.398, + "eval_steps_per_second": 1.919, + "step": 750000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 750100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 750200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 750300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 750400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 750500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 750600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 750700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 750800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 750900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 751000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 751100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 751200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 751300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 751400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7574, + "step": 751500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 751600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 751700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 751800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 751900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 752000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 752100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 752200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 752300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 752400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 752500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 752600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 752700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 752800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 752900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 753000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 753100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 753200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 753300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 753400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 753500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 753600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 753700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 753800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 753900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 754000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 754100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 754200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 754300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 754400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 754500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 754600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 754700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 754800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 754900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 755000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7005174160003662, + "eval_runtime": 195.8763, + "eval_samples_per_second": 255.263, + "eval_steps_per_second": 1.996, + "step": 755000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 755100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 755200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 755300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 755400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 755500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 755600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7572, + "step": 755700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 755800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 755900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 756000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 756100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 756200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 756300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7626, + "step": 756400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 756500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 756600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 756700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 756800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 756900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 757000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 757100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 757200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 757300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 757400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 757500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 757600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 757700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 757800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 757900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 758000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 758100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 758200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 758300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 758400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 758500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 758600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 758700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 758800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 758900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 759000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 759100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 759200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 759300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 759400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 759500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 759600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 759700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 759800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 759900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 760000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7038360834121704, + "eval_runtime": 195.7056, + "eval_samples_per_second": 255.486, + "eval_steps_per_second": 1.998, + "step": 760000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 760100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 760200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 760300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 760400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 760500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 760600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 760700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 760800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 760900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 761000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 761100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 761200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 761300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 761400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 761500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 761600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 761700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 761800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 761900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 762000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 762100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 762200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 762300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 762400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 762500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 762600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7567, + "step": 762700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 762800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 762900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 763000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 763100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7535, + "step": 763200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 763300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 763400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 763500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 763600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 763700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 763800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 763900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 764000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7554, + "step": 764100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 764200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 764300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 764400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 764500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 764600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 764700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 764800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 764900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 765000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7031464576721191, + "eval_runtime": 194.5711, + "eval_samples_per_second": 256.975, + "eval_steps_per_second": 2.01, + "step": 765000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 765100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 765200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 765300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 765400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 765500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 765600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 765700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 765800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 765900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 766000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 766100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 766200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 766300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 766400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 766500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 766600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 766700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 766800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 766900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 767000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 767100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 767200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 767300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 767400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 767500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 767600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 767700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 767800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7546, + "step": 767900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 768000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 768100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 768200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 768300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 768400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 768500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 768600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 768700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 768800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 768900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 769000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 769100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 769200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 769300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 769400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 769500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 769600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 769700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 769800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7529, + "step": 769900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 770000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7009237408638, + "eval_runtime": 194.6679, + "eval_samples_per_second": 256.848, + "eval_steps_per_second": 2.009, + "step": 770000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 770100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 770200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 770300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 770400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 770500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 770600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 770700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 770800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 770900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 771000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7583, + "step": 771100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 771200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 771300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 771400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7536, + "step": 771500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 771600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 771700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 771800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 771900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 772000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 772100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 772200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 772300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 772400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 772500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 772600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 772700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 772800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 772900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 773000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 773100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 773200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7544, + "step": 773300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 773400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 773500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 773600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 773700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 773800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 773900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 774000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 774100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 774200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 774300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 774400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 774500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 774600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 774700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 774800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 774900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 775000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7024406790733337, + "eval_runtime": 205.3494, + "eval_samples_per_second": 243.487, + "eval_steps_per_second": 1.904, + "step": 775000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 775100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 775200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 775300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 775400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 775500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 775600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 775700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 775800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 775900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 776000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 776100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 776200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7569, + "step": 776300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 776400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 776500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 776600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 776700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 776800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 776900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 777000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 777100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 777200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 777300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 777400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 777500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 777600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 777700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 777800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 777900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 778000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 778100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 778200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 778300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 778400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 778500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 778600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 778700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 778800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 778900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 779000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 779100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 779200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 779300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 779400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 779500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 779600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 779700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 779800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 779900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 780000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7008019685745239, + "eval_runtime": 200.2027, + "eval_samples_per_second": 249.747, + "eval_steps_per_second": 1.953, + "step": 780000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 780100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 780200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 780300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 780400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 780500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 780600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 780700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 780800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 780900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 781000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 781100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 781200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 781300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 781400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 781500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 781600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 781700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 781800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 781900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 782000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 782100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 782200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 782300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 782400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 782500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 782600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 782700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 782800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 782900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 783000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 783100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 783200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 783300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 783400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 783500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 783600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 783700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 783800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 783900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 784000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 784100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 784200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 784300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 784400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 784500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 784600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 784700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 784800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 784900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 785000 + }, + { + "epoch": 1.03, + "eval_loss": 0.700626015663147, + "eval_runtime": 204.4778, + "eval_samples_per_second": 244.525, + "eval_steps_per_second": 1.912, + "step": 785000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 785100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 785200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 785300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 785400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 785500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 785600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 785700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 785800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 785900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 786000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 786100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 786200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 786300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 786400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 786500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 786600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 786700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 786800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 786900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 787000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 787100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 787200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 787300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 787400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 787500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 787600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 787700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 787800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 787900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 788000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 788100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7538, + "step": 788200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 788300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 788400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 788500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 788600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 788700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 788800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 788900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 789000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 789100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 789200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 789300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 789400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 789500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 789600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 789700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 789800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 789900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 790000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7005957961082458, + "eval_runtime": 811.9429, + "eval_samples_per_second": 61.581, + "eval_steps_per_second": 0.482, + "step": 790000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 790100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 790200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 790300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 790400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 790500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7543, + "step": 790600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 790700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 790800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 790900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 791000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 791100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 791200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 791300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 791400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 791500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 791600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 791700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 791800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 791900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 792000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 792100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 792200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 792300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 792400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 792500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 792600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 792700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 792800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 792900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 793000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 793100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 793200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 793300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 793400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 793500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 793600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 793700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 793800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 793900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 794000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 794100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 794200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 794300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 794400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 794500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 794600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 794700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 794800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 794900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 795000 + }, + { + "epoch": 1.03, + "eval_loss": 0.6998762488365173, + "eval_runtime": 199.9792, + "eval_samples_per_second": 250.026, + "eval_steps_per_second": 1.955, + "step": 795000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 795100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 795200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 795300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 795400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 795500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 795600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 795700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 795800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 795900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 796000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 796100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 796200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 796300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 796400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 796500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 796600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 796700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 796800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 796900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 797000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 797100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 797200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 797300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 797400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 797500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 797600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 797700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 797800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 797900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 798000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 798100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 798200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 798300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 798400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 798500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 798600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 798700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 798800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 798900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 799000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 799100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 799200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 799300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 799400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 799500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 799600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 799700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 799800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 799900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 800000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7010047435760498, + "eval_runtime": 203.5185, + "eval_samples_per_second": 245.678, + "eval_steps_per_second": 1.921, + "step": 800000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 800100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 800200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 800300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 800400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 800500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 800600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 800700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 800800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 800900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 801000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 801100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 801200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 801300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 801400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 801500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 801600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 801700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 801800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 801900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 802000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 802100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 802200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 802300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 802400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 802500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 802600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 802700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7295, + "step": 802800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 802900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 803000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 803100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 803200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 803300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 803400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 803500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 803600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 803700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 803800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 803900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 804000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 804100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 804200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 804300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 804400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 804500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 804600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 804700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 804800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 804900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 805000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7028763890266418, + "eval_runtime": 203.1272, + "eval_samples_per_second": 246.151, + "eval_steps_per_second": 1.925, + "step": 805000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 805100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 805200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 805300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 805400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 805500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 805600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 805700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 805800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 805900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 806000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 806100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 806200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 806300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 806400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 806500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 806600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 806700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 806800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7548, + "step": 806900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 807000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 807100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 807200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 807300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 807400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 807500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 807600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 807700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 807800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 807900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 808000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 808100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 808200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 808300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 808400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 808500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 808600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 808700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 808800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 808900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 809000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 809100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 809200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 809300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 809400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 809500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 809600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 809700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 809800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 809900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 810000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7009583711624146, + "eval_runtime": 204.6869, + "eval_samples_per_second": 244.276, + "eval_steps_per_second": 1.91, + "step": 810000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 810100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 810200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 810300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 810400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 810500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 810600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 810700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 810800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 810900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 811000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 811100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 811200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 811300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 811400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 811500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 811600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 811700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7545, + "step": 811800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 811900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 812000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 812100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 812200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 812300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 812400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 812500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 812600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 812700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 812800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 812900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 813000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 813100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 813200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 813300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 813400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 813500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 813600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 813700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 813800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 813900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 814000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 814100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 814200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 814300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 814400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 814500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 814600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 814700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 814800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 814900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 815000 + }, + { + "epoch": 1.03, + "eval_loss": 0.7012917399406433, + "eval_runtime": 200.7367, + "eval_samples_per_second": 249.083, + "eval_steps_per_second": 1.948, + "step": 815000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 815100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 815200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 815300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 815400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 815500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 815600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 815700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 815800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 815900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 816000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 816100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 816200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 816300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 816400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 816500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 816600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 816700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 816800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 816900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 817000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 817100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 817200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 817300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 817400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 817500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 817600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 817700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 817800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 817900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 818000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7549, + "step": 818100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 818200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 818300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 818400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 818500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 818600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 818700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 818800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 818900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 819000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 819100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 819200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 819300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 819400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 819500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 819600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 819700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 819800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 819900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 820000 + }, + { + "epoch": 1.03, + "eval_loss": 0.6996245980262756, + "eval_runtime": 1267.1182, + "eval_samples_per_second": 39.46, + "eval_steps_per_second": 0.309, + "step": 820000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 820100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 820200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 820300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 820400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 820500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 820600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 820700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 820800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 820900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 821000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 821100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 821200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 821300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 821400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 821500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 821600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 821700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 821800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 821900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 822000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 822100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 822200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 822300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 822400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 822500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 822600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 822700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 822800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 822900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 823000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 823100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 823200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 823300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 823400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 823500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 823600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 823700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 823800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 823900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 824000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 824100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 824200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 824300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 824400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 824500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 824600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 824700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 824800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 824900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 825000 + }, + { + "epoch": 1.03, + "eval_loss": 0.6995617747306824, + "eval_runtime": 200.2123, + "eval_samples_per_second": 249.735, + "eval_steps_per_second": 1.953, + "step": 825000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 825100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 825200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 825300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 825400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 825500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 825600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 825700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 825800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 825900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 826000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 826100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7555, + "step": 826200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 826300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 826400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 826500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 826600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7516, + "step": 826700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 826800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 826900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 827000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 827100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 827200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 827300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 827400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 827500 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 827600 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 827700 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 827800 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 827900 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 828000 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 828100 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 828200 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 828300 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 828400 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 828500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 828600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 828700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 828800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 828900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 829000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 829100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 829200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 829300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 829400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 829500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 829600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 829700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 829800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 829900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 830000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6999587416648865, + "eval_runtime": 202.6692, + "eval_samples_per_second": 246.707, + "eval_steps_per_second": 1.929, + "step": 830000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 830100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 830200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 830300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 830400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 830500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 830600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 830700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 830800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 830900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 831000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 831100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 831200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 831300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 831400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 831500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 831600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 831700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 831800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 831900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 832000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 832100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 832200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 832300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 832400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 832500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 832600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 832700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 832800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 832900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 833000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 833100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 833200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 833300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 833400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 833500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 833600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 833700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 833800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 833900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 834000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 834100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 834200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 834300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 834400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 834500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 834600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 834700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 834800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 834900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 835000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6982614994049072, + "eval_runtime": 206.0848, + "eval_samples_per_second": 242.619, + "eval_steps_per_second": 1.897, + "step": 835000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 835100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 835200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 835300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 835400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 835500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 835600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 835700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 835800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 835900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 836000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 836100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 836200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 836300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 836400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 836500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 836600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 836700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 836800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 836900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 837000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 837100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 837200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 837300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 837400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 837500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 837600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 837700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 837800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 837900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 838000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 838100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 838200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 838300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 838400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 838500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 838600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 838700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 838800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 838900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 839000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 839100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 839200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 839300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 839400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 839500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 839600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 839700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 839800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 839900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 840000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6995810866355896, + "eval_runtime": 246.5346, + "eval_samples_per_second": 202.811, + "eval_steps_per_second": 1.586, + "step": 840000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 840100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 840200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 840300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 840400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 840500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 840600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 840700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 840800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 840900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 841000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 841100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 841200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 841300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 841400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 841500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 841600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 841700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 841800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 841900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 842000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 842100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 842200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 842300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7534, + "step": 842400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7515, + "step": 842500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 842600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 842700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 842800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 842900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 843000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 843100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 843200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 843300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 843400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 843500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 843600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 843700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7547, + "step": 843800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 843900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 844000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 844100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 844200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 844300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 844400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 844500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 844600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 844700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 844800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 844900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 845000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6997884511947632, + "eval_runtime": 199.6827, + "eval_samples_per_second": 250.397, + "eval_steps_per_second": 1.958, + "step": 845000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 845100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 845200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 845300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 845400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 845500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 845600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 845700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 845800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 845900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 846000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 846100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 846200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 846300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 846400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 846500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 846600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 846700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 846800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 846900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 847000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 847100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 847200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 847300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 847400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 847500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 847600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 847700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 847800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 847900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 848000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 848100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 848200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 848300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 848400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 848500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 848600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 848700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 848800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 848900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 849000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 849100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 849200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 849300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 849400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 849500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 849600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 849700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 849800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 849900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 850000 + }, + { + "epoch": 1.04, + "eval_loss": 0.7002251744270325, + "eval_runtime": 203.5229, + "eval_samples_per_second": 245.673, + "eval_steps_per_second": 1.921, + "step": 850000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 850100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 850200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 850300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 850400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 850500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 850600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 850700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 850800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 850900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 851000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 851100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 851200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 851300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 851400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 851500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 851600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 851700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 851800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 851900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 852000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 852100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 852200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 852300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 852400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 852500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 852600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 852700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 852800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 852900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 853000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 853100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 853200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 853300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 853400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 853500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 853600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 853700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 853800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 853900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 854000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 854100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 854200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 854300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 854400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 854500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 854600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 854700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 854800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 854900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 855000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6967592835426331, + "eval_runtime": 200.9972, + "eval_samples_per_second": 248.76, + "eval_steps_per_second": 1.945, + "step": 855000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 855100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 855200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 855300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 855400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 855500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 855600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 855700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 855800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 855900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 856000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 856100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 856200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 856300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 856400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 856500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 856600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 856700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 856800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 856900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 857000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 857100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 857200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 857300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 857400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.754, + "step": 857500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 857600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 857700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 857800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 857900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 858000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 858100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 858200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 858300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 858400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 858500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 858600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 858700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 858800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 858900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 859000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 859100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 859200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 859300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 859400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 859500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 859600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 859700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 859800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 859900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 860000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6985325813293457, + "eval_runtime": 202.728, + "eval_samples_per_second": 246.636, + "eval_steps_per_second": 1.929, + "step": 860000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 860100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 860200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 860300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 860400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 860500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 860600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 860700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 860800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 860900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 861000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 861100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 861200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 861300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 861400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 861500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 861600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 861700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 861800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 861900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 862000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 862100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 862200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 862300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 862400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 862500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 862600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 862700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 862800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 862900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 863000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 863100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 863200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 863300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 863400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 863500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 863600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 863700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 863800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 863900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 864000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 864100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 864200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 864300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 864400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 864500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 864600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 864700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 864800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 864900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 865000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6987593173980713, + "eval_runtime": 201.2811, + "eval_samples_per_second": 248.409, + "eval_steps_per_second": 1.943, + "step": 865000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 865100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 865200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 865300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 865400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 865500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 865600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 865700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 865800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 865900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 866000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 866100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 866200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 866300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 866400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 866500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 866600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 866700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 866800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 866900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 867000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 867100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 867200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 867300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 867400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 867500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 867600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 867700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 867800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 867900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 868000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 868100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 868200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 868300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 868400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 868500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 868600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 868700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 868800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 868900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 869000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 869100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 869200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 869300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 869400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 869500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 869600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 869700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 869800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 869900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 870000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6997235417366028, + "eval_runtime": 202.6711, + "eval_samples_per_second": 246.705, + "eval_steps_per_second": 1.929, + "step": 870000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 870100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 870200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 870300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 870400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 870500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 870600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 870700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 870800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 870900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 871000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 871100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 871200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 871300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 871400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 871500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 871600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 871700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 871800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 871900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 872000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 872100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 872200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 872300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 872400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 872500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 872600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 872700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 872800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 872900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 873000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 873100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 873200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 873300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 873400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 873500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 873600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 873700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 873800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 873900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 874000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 874100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 874200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 874300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 874400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 874500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 874600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 874700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 874800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 874900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 875000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6992688775062561, + "eval_runtime": 203.061, + "eval_samples_per_second": 246.231, + "eval_steps_per_second": 1.926, + "step": 875000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 875100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 875200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 875300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 875400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 875500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 875600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 875700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 875800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 875900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 876000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 876100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 876200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 876300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 876400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 876500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 876600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 876700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 876800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 876900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 877000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 877100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 877200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 877300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7512, + "step": 877400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 877500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 877600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 877700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 877800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 877900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 878000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 878100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 878200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 878300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 878400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 878500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 878600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 878700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 878800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 878900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 879000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 879100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 879200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 879300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 879400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 879500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 879600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 879700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 879800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 879900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 880000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6993662714958191, + "eval_runtime": 203.8774, + "eval_samples_per_second": 245.245, + "eval_steps_per_second": 1.918, + "step": 880000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 880100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 880200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 880300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 880400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 880500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 880600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 880700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 880800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 880900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 881000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 881100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 881200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 881300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 881400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 881500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 881600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 881700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 881800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 881900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 882000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 882100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 882200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 882300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 882400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 882500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 882600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 882700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 882800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 882900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 883000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 883100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 883200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 883300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 883400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 883500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 883600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 883700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7528, + "step": 883800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 883900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 884000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 884100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 884200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 884300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 884400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 884500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 884600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 884700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 884800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 884900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 885000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6965381503105164, + "eval_runtime": 239.7641, + "eval_samples_per_second": 208.538, + "eval_steps_per_second": 1.631, + "step": 885000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 885100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 885200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 885300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 885400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 885500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 885600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 885700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 885800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 885900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 886000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 886100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 886200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 886300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 886400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 886500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 886600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 886700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 886800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 886900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 887000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 887100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 887200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 887300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 887400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7551, + "step": 887500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 887600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 887700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 887800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 887900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 888000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 888100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 888200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 888300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 888400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 888500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 888600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 888700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 888800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 888900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 889000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 889100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 889200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 889300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 889400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 889500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 889600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 889700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 889800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 889900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 890000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6979761123657227, + "eval_runtime": 233.9852, + "eval_samples_per_second": 213.689, + "eval_steps_per_second": 1.671, + "step": 890000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 890100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 890200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 890300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7524, + "step": 890400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 890500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 890600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 890700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 890800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 890900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 891000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 891100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 891200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 891300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 891400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 891500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 891600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 891700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 891800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 891900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 892000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 892100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 892200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 892300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 892400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 892500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 892600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 892700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 892800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 892900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 893000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 893100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 893200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 893300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 893400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 893500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 893600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 893700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 893800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 893900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 894000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 894100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 894200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 894300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 894400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 894500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 894600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 894700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 894800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 894900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 895000 + }, + { + "epoch": 1.04, + "eval_loss": 0.700946033000946, + "eval_runtime": 205.605, + "eval_samples_per_second": 243.185, + "eval_steps_per_second": 1.902, + "step": 895000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 895100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 895200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 895300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 895400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 895500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 895600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 895700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 895800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 895900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 896000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 896100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 896200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 896300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 896400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 896500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 896600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 896700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 896800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 896900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 897000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 897100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 897200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 897300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 897400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 897500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 897600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 897700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 897800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 897900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 898000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 898100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 898200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 898300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 898400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 898500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 898600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 898700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 898800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 898900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 899000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 899100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 899200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7579, + "step": 899300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 899400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 899500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 899600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 899700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 899800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 899900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 900000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6999132633209229, + "eval_runtime": 205.565, + "eval_samples_per_second": 243.232, + "eval_steps_per_second": 1.902, + "step": 900000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 900100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 900200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 900300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 900400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 900500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 900600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 900700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 900800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 900900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 901000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 901100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 901200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 901300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 901400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 901500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 901600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 901700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 901800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 901900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 902000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 902100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 902200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 902300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 902400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 902500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 902600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 902700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 902800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 902900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 903000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 903100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 903200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 903300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 903400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 903500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 903600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 903700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 903800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 903900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 904000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 904100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 904200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 904300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 904400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 904500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 904600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 904700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 904800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 904900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 905000 + }, + { + "epoch": 1.04, + "eval_loss": 0.6989642381668091, + "eval_runtime": 206.2345, + "eval_samples_per_second": 242.442, + "eval_steps_per_second": 1.896, + "step": 905000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 905100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 905200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 905300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 905400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 905500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 905600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 905700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 905800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 905900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 906000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 906100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 906200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 906300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 906400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 906500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 906600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 906700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 906800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 906900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 907000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 907100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7542, + "step": 907200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 907300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 907400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 907500 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 907600 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 907700 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 907800 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 907900 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 908000 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 908100 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 908200 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 908300 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 908400 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 908500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 908600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 908700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 908800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 908900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 909000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 909100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 909200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 909300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 909400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 909500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 909600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 909700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 909800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 909900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 910000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6977934241294861, + "eval_runtime": 205.0361, + "eval_samples_per_second": 243.859, + "eval_steps_per_second": 1.907, + "step": 910000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 910100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 910200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 910300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 910400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 910500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 910600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 910700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.752, + "step": 910800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 910900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 911000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 911100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 911200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 911300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 911400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 911500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 911600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 911700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7523, + "step": 911800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 911900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 912000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 912100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 912200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 912300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 912400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 912500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 912600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7519, + "step": 912700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 912800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 912900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 913000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 913100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 913200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 913300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 913400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 913500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 913600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 913700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 913800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 913900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7496, + "step": 914000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 914100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 914200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 914300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 914400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 914500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 914600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 914700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 914800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 914900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 915000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6987533569335938, + "eval_runtime": 204.9728, + "eval_samples_per_second": 243.935, + "eval_steps_per_second": 1.908, + "step": 915000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 915100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 915200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 915300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 915400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 915500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 915600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 915700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 915800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 915900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 916000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 916100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 916200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 916300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 916400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 916500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 916600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 916700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 916800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 916900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 917000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 917100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 917200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 917300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 917400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 917500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 917600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 917700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 917800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 917900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 918000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 918100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 918200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 918300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 918400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 918500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 918600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 918700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 918800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 918900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 919000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 919100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 919200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 919300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 919400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 919500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 919600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 919700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 919800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 919900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 920000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6991868019104004, + "eval_runtime": 204.4782, + "eval_samples_per_second": 244.525, + "eval_steps_per_second": 1.912, + "step": 920000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 920100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 920200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7284, + "step": 920300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 920400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 920500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 920600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 920700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 920800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 920900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 921000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 921100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 921200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 921300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 921400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 921500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 921600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 921700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 921800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 921900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 922000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 922100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 922200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 922300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 922400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 922500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 922600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 922700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 922800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 922900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 923000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 923100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 923200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 923300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 923400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 923500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 923600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 923700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 923800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 923900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 924000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 924100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 924200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 924300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 924400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 924500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 924600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 924700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 924800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 924900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7532, + "step": 925000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6977989673614502, + "eval_runtime": 208.233, + "eval_samples_per_second": 240.116, + "eval_steps_per_second": 1.878, + "step": 925000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 925100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 925200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 925300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 925400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 925500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 925600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 925700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 925800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 925900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 926000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 926100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 926200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 926300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 926400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 926500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 926600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 926700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 926800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 926900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 927000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 927100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 927200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 927300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 927400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 927500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 927600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 927700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 927800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 927900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 928000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 928100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 928200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 928300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 928400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 928500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 928600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 928700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 928800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 928900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 929000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 929100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 929200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 929300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 929400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 929500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 929600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 929700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 929800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 929900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 930000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6975210905075073, + "eval_runtime": 216.9106, + "eval_samples_per_second": 230.51, + "eval_steps_per_second": 1.803, + "step": 930000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 930100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 930200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 930300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 930400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 930500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 930600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 930700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 930800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 930900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 931000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 931100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 931200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 931300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 931400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 931500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 931600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 931700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 931800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 931900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 932000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 932100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 932200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 932300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 932400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 932500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 932600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 932700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 932800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 932900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 933000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 933100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 933200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 933300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 933400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 933500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 933600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 933700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 933800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 933900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 934000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 934100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 934200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 934300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 934400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 934500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 934600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 934700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 934800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 934900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 935000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6974779963493347, + "eval_runtime": 204.3626, + "eval_samples_per_second": 244.663, + "eval_steps_per_second": 1.913, + "step": 935000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 935100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 935200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 935300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 935400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 935500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 935600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 935700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 935800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 935900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 936000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 936100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 936200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 936300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 936400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 936500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 936600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 936700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 936800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 936900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 937000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 937100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 937200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 937300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 937400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 937500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 937600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 937700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 937800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 937900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 938000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 938100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 938200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 938300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 938400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 938500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 938600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 938700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 938800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 938900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 939000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 939100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 939200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 939300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 939400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 939500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 939600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 939700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 939800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 939900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 940000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6990136504173279, + "eval_runtime": 204.6451, + "eval_samples_per_second": 244.325, + "eval_steps_per_second": 1.911, + "step": 940000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 940100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 940200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 940300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 940400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 940500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 940600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 940700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 940800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 940900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 941000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 941100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 941200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 941300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 941400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 941500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 941600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 941700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 941800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 941900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 942000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 942100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 942200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 942300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 942400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 942500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 942600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 942700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 942800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 942900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 943000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 943100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 943200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 943300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 943400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 943500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 943600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 943700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 943800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 943900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 944000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 944100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 944200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 944300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 944400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 944500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 944600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 944700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 944800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 944900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 945000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6982185244560242, + "eval_runtime": 204.5633, + "eval_samples_per_second": 244.423, + "eval_steps_per_second": 1.911, + "step": 945000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 945100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 945200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 945300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 945400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 945500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 945600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 945700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 945800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 945900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 946000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 946100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 946200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 946300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 946400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 946500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 946600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 946700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 946800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 946900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 947000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 947100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 947200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 947300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 947400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 947500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 947600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 947700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 947800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 947900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 948000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 948100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 948200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 948300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 948400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 948500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 948600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 948700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 948800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 948900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 949000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 949100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 949200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 949300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 949400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 949500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 949600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 949700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 949800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 949900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 950000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6980754137039185, + "eval_runtime": 204.3491, + "eval_samples_per_second": 244.679, + "eval_steps_per_second": 1.913, + "step": 950000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 950100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 950200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 950300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 950400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 950500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 950600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 950700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 950800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 950900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 951000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 951100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 951200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 951300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 951400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 951500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 951600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 951700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 951800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 951900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 952000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 952100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 952200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 952300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 952400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 952500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 952600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 952700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 952800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 952900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 953000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 953100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 953200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 953300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 953400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 953500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 953600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 953700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 953800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 953900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7576, + "step": 954000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 954100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 954200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 954300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 954400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 954500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 954600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 954700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 954800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 954900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 955000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6982489824295044, + "eval_runtime": 203.9889, + "eval_samples_per_second": 245.111, + "eval_steps_per_second": 1.917, + "step": 955000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 955100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 955200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 955300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 955400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 955500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 955600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 955700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 955800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 955900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 956000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 956100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 956200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 956300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 956400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 956500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 956600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 956700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 956800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 956900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 957000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 957100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 957200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 957300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 957400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 957500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 957600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 957700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 957800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 957900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 958000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 958100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 958200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 958300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 958400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 958500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 958600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 958700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 958800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 958900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 959000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 959100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 959200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 959300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 959400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 959500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 959600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 959700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 959800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 959900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 960000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6997035145759583, + "eval_runtime": 205.1198, + "eval_samples_per_second": 243.76, + "eval_steps_per_second": 1.906, + "step": 960000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 960100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 960200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 960300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 960400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 960500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 960600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 960700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 960800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 960900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 961000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 961100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 961200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 961300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 961400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 961500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 961600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 961700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 961800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 961900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 962000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 962100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 962200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 962300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 962400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 962500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7537, + "step": 962600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 962700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 962800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 962900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 963000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 963100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 963200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 963300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 963400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 963500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 963600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 963700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 963800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 963900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 964000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 964100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 964200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 964300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 964400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 964500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 964600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 964700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 964800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 964900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 965000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6977582573890686, + "eval_runtime": 205.3704, + "eval_samples_per_second": 243.463, + "eval_steps_per_second": 1.904, + "step": 965000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 965100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 965200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 965300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 965400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 965500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 965600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 965700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 965800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 965900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 966000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 966100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 966200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 966300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 966400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 966500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 966600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 966700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 966800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 966900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 967000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 967100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 967200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 967300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 967400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 967500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 967600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 967700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 967800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 967900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 968000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 968100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 968200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7487, + "step": 968300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 968400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 968500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 968600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 968700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 968800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 968900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 969000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 969100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 969200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 969300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 969400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 969500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 969600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 969700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 969800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7513, + "step": 969900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 970000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6963879466056824, + "eval_runtime": 209.1727, + "eval_samples_per_second": 239.037, + "eval_steps_per_second": 1.869, + "step": 970000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 970100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 970200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 970300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 970400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 970500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 970600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 970700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 970800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 970900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 971000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 971100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 971200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 971300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 971400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 971500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 971600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 971700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 971800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 971900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 972000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 972100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 972200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.75, + "step": 972300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 972400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 972500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 972600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 972700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 972800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 972900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 973000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 973100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 973200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 973300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 973400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 973500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 973600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 973700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 973800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 973900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 974000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 974100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 974200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 974300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 974400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 974500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 974600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 974700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 974800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 974900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 975000 + }, + { + "epoch": 1.05, + "eval_loss": 0.695773184299469, + "eval_runtime": 1508.1794, + "eval_samples_per_second": 33.153, + "eval_steps_per_second": 0.259, + "step": 975000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 975100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 975200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 975300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 975400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 975500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 975600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 975700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 975800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 975900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 976000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 976100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 976200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 976300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 976400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 976500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 976600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 976700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 976800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 976900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 977000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 977100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 977200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 977300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 977400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 977500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 977600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7264, + "step": 977700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 977800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 977900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 978000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 978100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 978200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 978300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 978400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 978500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 978600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 978700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 978800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 978900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 979000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 979100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 979200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 979300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 979400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 979500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 979600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 979700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 979800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 979900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 980000 + }, + { + "epoch": 1.05, + "eval_loss": 0.696016252040863, + "eval_runtime": 207.2945, + "eval_samples_per_second": 241.203, + "eval_steps_per_second": 1.886, + "step": 980000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 980100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 980200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 980300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 980400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 980500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 980600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 980700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 980800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 980900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 981000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 981100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 981200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 981300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 981400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 981500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 981600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 981700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 981800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 981900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 982000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 982100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 982200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 982300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 982400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 982500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 982600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 982700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 982800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 982900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 983000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 983100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 983200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 983300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 983400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 983500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 983600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 983700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 983800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 983900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 984000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 984100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 984200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 984300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 984400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 984500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 984600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 984700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 984800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 984900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 985000 + }, + { + "epoch": 1.05, + "eval_loss": 0.6967244744300842, + "eval_runtime": 207.5707, + "eval_samples_per_second": 240.882, + "eval_steps_per_second": 1.884, + "step": 985000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 985100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 985200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 985300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 985400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 985500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 985600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 985700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 985800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 985900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 986000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 986100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 986200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 986300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 986400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 986500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 986600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 986700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 986800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 986900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 987000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 987100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 987200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 987300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 987400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 987500 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 987600 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 987700 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 987800 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 987900 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 988000 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 988100 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 988200 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 988300 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 988400 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 988500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 988600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 988700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 988800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 988900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 989000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 989100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 989200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 989300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 989400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 989500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 989600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 989700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 989800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 989900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 990000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6987284421920776, + "eval_runtime": 268.5781, + "eval_samples_per_second": 186.166, + "eval_steps_per_second": 1.456, + "step": 990000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 990100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 990200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 990300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 990400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 990500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 990600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 990700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 990800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 990900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 991000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 991100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 991200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 991300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 991400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 991500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 991600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 991700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 991800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 991900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 992000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 992100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 992200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 992300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 992400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 992500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 992600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 992700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 992800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 992900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 993000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 993100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 993200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 993300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 993400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 993500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 993600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7281, + "step": 993700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 993800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 993900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 994000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 994100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 994200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 994300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 994400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 994500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 994600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 994700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 994800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 994900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 995000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6964648365974426, + "eval_runtime": 207.6129, + "eval_samples_per_second": 240.833, + "eval_steps_per_second": 1.883, + "step": 995000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 995100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 995200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 995300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 995400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 995500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 995600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 995700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 995800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 995900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 996000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 996100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 996200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 996300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 996400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 996500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 996600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 996700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 996800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 996900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 997000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 997100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 997200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 997300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 997400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 997500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 997600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 997700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 997800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 997900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 998000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 998100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 998200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 998300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 998400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 998500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 998600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 998700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 998800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 998900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 999000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 999100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 999200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 999300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 999400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 999500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 999600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 999700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 999800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 999900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1000000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6989625692367554, + "eval_runtime": 213.1026, + "eval_samples_per_second": 234.629, + "eval_steps_per_second": 1.835, + "step": 1000000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1000100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1000200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1000300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1000400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 1000500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1000600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1000700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1000800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 1000900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1001000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1001100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1001200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1001300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1001400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1001500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1001600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1001700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 1001800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1001900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1002000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1002100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1002200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1002300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1002400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1002500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1002600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1002700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1002800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1002900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1003000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 1003100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1003200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1003300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1003400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1003500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1003600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1003700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1003800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1003900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1004000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 1004100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1004200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1004300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1004400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1004500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1004600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1004700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1004800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 1004900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1005000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6978911757469177, + "eval_runtime": 207.4706, + "eval_samples_per_second": 240.998, + "eval_steps_per_second": 1.885, + "step": 1005000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1005100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1005200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1005300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1005400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1005500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1005600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1005700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1005800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1005900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1006000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1006100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1006200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1006300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1006400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1006500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1006600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1006700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1006800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1006900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1007000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1007100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1007200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1007300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1007400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1007500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1007600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1007700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 1007800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1007900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1008000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1008100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1008200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1008300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1008400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1008500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1008600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 1008700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1008800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1008900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1009000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1009100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1009200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7517, + "step": 1009300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 1009400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1009500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 1009600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1009700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1009800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1009900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 1010000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6958174705505371, + "eval_runtime": 207.0775, + "eval_samples_per_second": 241.456, + "eval_steps_per_second": 1.888, + "step": 1010000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1010100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1010200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1010300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1010400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1010500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 1010600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1010700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1010800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1010900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1011000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1011100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1011200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1011300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1011400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1011500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1011600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1011700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1011800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7531, + "step": 1011900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1012000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1012100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 1012200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1012300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.751, + "step": 1012400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1012500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1012600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1012700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1012800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1012900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1013000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1013100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1013200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1013300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1013400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1013500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1013600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1013700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1013800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1013900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1014000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 1014100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1014200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 1014300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 1014400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1014500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1014600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1014700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1014800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1014900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1015000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6987661719322205, + "eval_runtime": 209.5042, + "eval_samples_per_second": 238.659, + "eval_steps_per_second": 1.866, + "step": 1015000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1015100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1015200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1015300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1015400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1015500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 1015600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1015700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1015800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1015900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1016000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1016100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1016200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1016300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1016400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1016500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 1016600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1016700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1016800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1016900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1017000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1017100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1017200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1017300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1017400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1017500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1017600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 1017700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1017800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1017900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1018000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1018100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1018200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 1018300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1018400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1018500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1018600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1018700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1018800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1018900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1019000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1019100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 1019200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1019300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1019400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1019500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1019600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1019700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1019800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1019900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1020000 + }, + { + "epoch": 1.06, + "eval_loss": 0.696831226348877, + "eval_runtime": 208.2143, + "eval_samples_per_second": 240.137, + "eval_steps_per_second": 1.878, + "step": 1020000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1020100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1020200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1020300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 1020400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1020500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1020600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1020700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 1020800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1020900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1021000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1021100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1021200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7511, + "step": 1021300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1021400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1021500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1021600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1021700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 1021800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1021900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1022000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 1022100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1022200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1022300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1022400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1022500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1022600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1022700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1022800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1022900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1023000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1023100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1023200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1023300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1023400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1023500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1023600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1023700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1023800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1023900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1024000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1024100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1024200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 1024300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1024400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1024500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1024600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1024700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1024800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1024900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1025000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6981194019317627, + "eval_runtime": 207.532, + "eval_samples_per_second": 240.927, + "eval_steps_per_second": 1.884, + "step": 1025000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1025100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1025200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1025300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1025400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1025500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7563, + "step": 1025600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1025700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1025800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1025900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1026000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1026100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1026200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1026300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1026400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1026500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1026600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1026700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 1026800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1026900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1027000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1027100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1027200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1027300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1027400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1027500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1027600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1027700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1027800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1027900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1028000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1028100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1028200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1028300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1028400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 1028500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1028600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7502, + "step": 1028700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1028800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1028900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1029000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1029100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1029200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1029300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 1029400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1029500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 1029600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1029700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1029800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1029900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1030000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6968112587928772, + "eval_runtime": 207.3926, + "eval_samples_per_second": 241.089, + "eval_steps_per_second": 1.885, + "step": 1030000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 1030100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1030200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1030300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1030400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1030500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1030600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1030700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1030800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1030900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 1031000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1031100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1031200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1031300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1031400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1031500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7522, + "step": 1031600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1031700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1031800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1031900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1032000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1032100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1032200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1032300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1032400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1032500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1032600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1032700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1032800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1032900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1033000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1033100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1033200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1033300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1033400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1033500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1033600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1033700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 1033800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1033900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1034000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1034100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1034200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1034300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1034400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1034500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1034600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1034700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1034800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1034900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1035000 + }, + { + "epoch": 1.06, + "eval_loss": 0.696743905544281, + "eval_runtime": 206.4144, + "eval_samples_per_second": 242.231, + "eval_steps_per_second": 1.894, + "step": 1035000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1035100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1035200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1035300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 1035400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1035500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1035600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1035700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 1035800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1035900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1036000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1036100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1036200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1036300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1036400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 1036500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1036600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1036700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1036800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1036900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1037000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1037100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1037200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1037300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1037400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 1037500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1037600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7518, + "step": 1037700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1037800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1037900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1038000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1038100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1038200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1038300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1038400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1038500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1038600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1038700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1038800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1038900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1039000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1039100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1039200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1039300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1039400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1039500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1039600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1039700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1039800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1039900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1040000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6981737613677979, + "eval_runtime": 207.5374, + "eval_samples_per_second": 240.92, + "eval_steps_per_second": 1.884, + "step": 1040000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1040100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1040200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1040300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1040400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1040500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1040600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1040700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1040800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1040900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1041000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 1041100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1041200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1041300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1041400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1041500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1041600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1041700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1041800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1041900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1042000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1042100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1042200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1042300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1042400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1042500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 1042600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1042700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 1042800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1042900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1043000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1043100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1043200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 1043300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1043400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1043500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7477, + "step": 1043600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1043700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1043800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1043900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1044000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 1044100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1044200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1044300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1044400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1044500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1044600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1044700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1044800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1044900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1045000 + }, + { + "epoch": 1.06, + "eval_loss": 0.69672030210495, + "eval_runtime": 207.4695, + "eval_samples_per_second": 240.999, + "eval_steps_per_second": 1.885, + "step": 1045000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1045100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1045200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1045300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1045400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1045500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1045600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1045700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1045800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1045900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1046000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1046100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1046200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1046300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1046400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1046500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1046600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1046700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1046800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1046900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1047000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1047100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1047200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1047300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1047400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1047500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1047600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1047700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1047800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1047900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1048000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1048100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1048200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1048300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1048400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 1048500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1048600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1048700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1048800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1048900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1049000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1049100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1049200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1049300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1049400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1049500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1049600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1049700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1049800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1049900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1050000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6958229541778564, + "eval_runtime": 207.4587, + "eval_samples_per_second": 241.012, + "eval_steps_per_second": 1.885, + "step": 1050000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1050100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1050200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1050300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1050400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1050500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1050600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1050700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1050800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1050900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1051000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1051100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1051200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1051300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1051400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1051500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7498, + "step": 1051600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1051700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1051800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1051900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1052000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1052100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1052200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1052300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1052400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 1052500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1052600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1052700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 1052800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1052900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1053000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1053100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1053200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1053300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1053400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1053500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1053600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1053700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1053800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1053900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1054000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1054100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1054200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1054300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1054400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1054500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1054600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1054700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1054800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1054900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1055000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6936817765235901, + "eval_runtime": 206.5774, + "eval_samples_per_second": 242.04, + "eval_steps_per_second": 1.893, + "step": 1055000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1055100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 1055200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1055300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1055400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1055500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1055600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1055700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1055800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1055900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1056000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1056100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1056200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1056300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1056400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1056500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1056600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1056700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1056800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1056900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1057000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 1057100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1057200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1057300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1057400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1057500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1057600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1057700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1057800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1057900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1058000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1058100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1058200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1058300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 1058400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1058500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1058600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1058700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7293, + "step": 1058800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1058900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 1059000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1059100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1059200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1059300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1059400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1059500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1059600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1059700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1059800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1059900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1060000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6966774463653564, + "eval_runtime": 205.9379, + "eval_samples_per_second": 242.792, + "eval_steps_per_second": 1.899, + "step": 1060000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 1060100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1060200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1060300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1060400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 1060500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 1060600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 1060700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1060800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1060900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1061000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1061100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1061200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1061300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1061400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1061500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1061600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1061700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1061800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1061900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 1062000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1062100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1062200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1062300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1062400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1062500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1062600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1062700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1062800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1062900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1063000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1063100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1063200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1063300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1063400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1063500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1063600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1063700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1063800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1063900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1064000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1064100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1064200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1064300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1064400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1064500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1064600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1064700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1064800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1064900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1065000 + }, + { + "epoch": 1.06, + "eval_loss": 0.6973162293434143, + "eval_runtime": 207.8802, + "eval_samples_per_second": 240.523, + "eval_steps_per_second": 1.881, + "step": 1065000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1065100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1065200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1065300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1065400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1065500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1065600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1065700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1065800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1065900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1066000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1066100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 1066200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1066300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1066400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1066500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1066600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1066700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1066800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 1066900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1067000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1067100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1067200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1067300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1067400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1067500 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1067600 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1067700 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1067800 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1067900 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1068000 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1068100 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1068200 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1068300 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1068400 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1068500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1068600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1068700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 1068800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1068900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1069000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1069100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1069200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1069300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1069400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1069500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1069600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1069700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1069800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1069900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1070000 + }, + { + "epoch": 1.07, + "eval_loss": 0.6998035311698914, + "eval_runtime": 205.4459, + "eval_samples_per_second": 243.373, + "eval_steps_per_second": 1.903, + "step": 1070000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1070100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1070200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1070300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1070400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1070500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1070600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1070700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1070800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1070900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1071000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1071100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1071200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1071300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1071400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1071500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1071600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1071700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1071800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1071900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1072000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1072100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1072200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 1072300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1072400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1072500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1072600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1072700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1072800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1072900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1073000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1073100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1073200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1073300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1073400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1073500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1073600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1073700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1073800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1073900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1074000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1074100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1074200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1074300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1074400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1074500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1074600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1074700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1074800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1074900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1075000 + }, + { + "epoch": 1.07, + "eval_loss": 0.697563886642456, + "eval_runtime": 211.4095, + "eval_samples_per_second": 236.508, + "eval_steps_per_second": 1.849, + "step": 1075000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1075100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1075200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1075300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1075400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1075500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1075600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1075700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1075800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1075900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1076000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1076100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1076200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1076300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1076400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1076500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1076600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1076700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1076800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1076900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1077000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1077100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1077200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1077300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1077400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1077500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1077600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1077700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1077800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1077900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7462, + "step": 1078000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1078100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1078200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1078300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1078400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1078500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1078600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1078700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1078800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1078900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1079000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1079100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 1079200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1079300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1079400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1079500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1079600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 1079700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1079800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1079900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1080000 + }, + { + "epoch": 1.07, + "eval_loss": 0.6988189816474915, + "eval_runtime": 207.2542, + "eval_samples_per_second": 241.25, + "eval_steps_per_second": 1.887, + "step": 1080000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1080100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1080200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1080300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1080400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1080500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1080600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1080700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1080800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1080900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1081000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1081100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1081200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1081300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 1081400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1081500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1081600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1081700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1081800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1081900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1082000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1082100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1082200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1082300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1082400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1082500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1082600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1082700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1082800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1082900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1083000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1083100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1083200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1083300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1083400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1083500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1083600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1083700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1083800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1083900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1084000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1084100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1084200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1084300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1084400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1084500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1084600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1084700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1084800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1084900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1085000 + }, + { + "epoch": 1.07, + "eval_loss": 0.6958624124526978, + "eval_runtime": 206.4977, + "eval_samples_per_second": 242.133, + "eval_steps_per_second": 1.893, + "step": 1085000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1085100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 1085200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1085300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1085400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1085500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1085600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1085700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1085800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1085900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1086000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1086100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1086200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1086300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1086400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1086500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1086600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1086700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1086800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1086900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1087000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1087100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1087200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1087300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1087400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1087500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1087600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1087700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1087800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1087900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1088000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1088100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1088200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1088300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1088400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1088500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1088600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1088700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1088800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1088900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1089000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1089100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1089200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1089300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1089400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1089500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1089600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1089700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1089800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1089900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1090000 + }, + { + "epoch": 1.07, + "eval_loss": 0.6945422887802124, + "eval_runtime": 205.0592, + "eval_samples_per_second": 243.832, + "eval_steps_per_second": 1.907, + "step": 1090000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1090100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1090200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 1090300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1090400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1090500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1090600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1090700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1090800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1090900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1091000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1091100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1091200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1091300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1091400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1091500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1091600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1091700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1091800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1091900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1092000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1092100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1092200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1092300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1092400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1092500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1092600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1092700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1092800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1092900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1093000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1093100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1093200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1093300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1093400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1093500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1093600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1093700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1093800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1093900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1094000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1094100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1094200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1094300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1094400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7504, + "step": 1094500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1094600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1094700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1094800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1094900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1095000 + }, + { + "epoch": 1.07, + "eval_loss": 0.6956625580787659, + "eval_runtime": 205.1057, + "eval_samples_per_second": 243.777, + "eval_steps_per_second": 1.906, + "step": 1095000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1095100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1095200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1095300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1095400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1095500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1095600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1095700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1095800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1095900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1096000 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1096100 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1096200 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1096300 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1096400 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1096500 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1096600 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1096700 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1096800 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1096900 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1097000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1097100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1097200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1097300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1097400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1097500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1097600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1097700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1097800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1097900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1098000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1098100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1098200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1098300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1098400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1098500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1098600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1098700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1098800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1098900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1099000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1099100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1099200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1099300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1099400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1099500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1099600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1099700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1099800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1099900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1100000 + }, + { + "epoch": 2.0, + "eval_loss": 0.6958077549934387, + "eval_runtime": 205.2026, + "eval_samples_per_second": 243.662, + "eval_steps_per_second": 1.905, + "step": 1100000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1100100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1100200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1100300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1100400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1100500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1100600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1100700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1100800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7506, + "step": 1100900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1101000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1101100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1101200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1101300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1101400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1101500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1101600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1101700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1101800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1101900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1102000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1102100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1102200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1102300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1102400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1102500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1102600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1102700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1102800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1102900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1103000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1103100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1103200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1103300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1103400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1103500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1103600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1103700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1103800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1103900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1104000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1104100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1104200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1104300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1104400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1104500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1104600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1104700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1104800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1104900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1105000 + }, + { + "epoch": 2.0, + "eval_loss": 0.6972557902336121, + "eval_runtime": 204.8484, + "eval_samples_per_second": 244.083, + "eval_steps_per_second": 1.909, + "step": 1105000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 1105100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 1105200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1105300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1105400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1105500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1105600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1105700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1105800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1105900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1106000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1106100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1106200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1106300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1106400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1106500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1106600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1106700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1106800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 1106900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1107000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1107100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1107200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1107300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1107400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1107500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1107600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1107700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1107800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1107900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1108000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1108100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1108200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1108300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1108400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1108500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1108600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1108700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1108800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1108900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1109000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1109100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1109200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1109300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1109400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1109500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1109600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1109700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1109800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1109900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1110000 + }, + { + "epoch": 2.0, + "eval_loss": 0.6954222917556763, + "eval_runtime": 204.2756, + "eval_samples_per_second": 244.767, + "eval_steps_per_second": 1.914, + "step": 1110000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1110100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1110200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1110300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1110400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1110500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 1110600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1110700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1110800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1110900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1111000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1111100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1111200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1111300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1111400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1111500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1111600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1111700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1111800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 1111900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1112000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1112100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1112200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1112300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1112400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1112500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1112600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1112700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1112800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1112900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1113000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1113100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 1113200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1113300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1113400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1113500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1113600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1113700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1113800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1113900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1114000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1114100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1114200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1114300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1114400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1114500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1114600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1114700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1114800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1114900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1115000 + }, + { + "epoch": 2.0, + "eval_loss": 0.6944382786750793, + "eval_runtime": 204.8261, + "eval_samples_per_second": 244.109, + "eval_steps_per_second": 1.909, + "step": 1115000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1115100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1115200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1115300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1115400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1115500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1115600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1115700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1115800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1115900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1116000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1116100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1116200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1116300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1116400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1116500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1116600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1116700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1116800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1116900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1117000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1117100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 1117200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1117300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1117400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 1117500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1117600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1117700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1117800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1117900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1118000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1118100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1118200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1118300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7293, + "step": 1118400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1118500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1118600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1118700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1118800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1118900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1119000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1119100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1119200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1119300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1119400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1119500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1119600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1119700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1119800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1119900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1120000 + }, + { + "epoch": 2.0, + "eval_loss": 0.6974844336509705, + "eval_runtime": 206.0821, + "eval_samples_per_second": 242.622, + "eval_steps_per_second": 1.897, + "step": 1120000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1120100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1120200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1120300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1120400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1120500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1120600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1120700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1120800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1120900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1121000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1121100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1121200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1121300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1121400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1121500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1121600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1121700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1121800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1121900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 1122000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1122100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1122200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 1122300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1122400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1122500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1122600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7275, + "step": 1122700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1122800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1122900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1123000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1123100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1123200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1123300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1123400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1123500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1123600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1123700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1123800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1123900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1124000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1124100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1124200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1124300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1124400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1124500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1124600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1124700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 1124800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1124900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1125000 + }, + { + "epoch": 2.0, + "eval_loss": 0.6972522139549255, + "eval_runtime": 204.9765, + "eval_samples_per_second": 243.93, + "eval_steps_per_second": 1.908, + "step": 1125000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1125100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1125200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1125300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1125400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1125500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1125600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1125700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1125800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1125900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1126000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1126100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1126200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1126300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1126400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1126500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1126600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1126700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1126800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1126900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1127000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1127100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1127200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1127300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1127400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1127500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1127600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1127700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7491, + "step": 1127800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1127900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1128000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1128100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1128200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1128300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1128400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1128500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1128600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1128700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1128800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1128900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1129000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1129100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1129200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7295, + "step": 1129300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1129400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1129500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1129600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1129700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1129800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1129900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1130000 + }, + { + "epoch": 2.0, + "eval_loss": 0.6970519423484802, + "eval_runtime": 208.5639, + "eval_samples_per_second": 239.735, + "eval_steps_per_second": 1.875, + "step": 1130000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1130100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1130200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1130300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7478, + "step": 1130400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1130500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 1130600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1130700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1130800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1130900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1131000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1131100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1131200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1131300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1131400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1131500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1131600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1131700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1131800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1131900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1132000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1132100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1132200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1132300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1132400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1132500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1132600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1132700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1132800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1132900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1133000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1133100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1133200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1133300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1133400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1133500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1133600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1133700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1133800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1133900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1134000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1134100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1134200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1134300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7488, + "step": 1134400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1134500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1134600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1134700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1134800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1134900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1135000 + }, + { + "epoch": 2.0, + "eval_loss": 0.697513222694397, + "eval_runtime": 203.9175, + "eval_samples_per_second": 245.197, + "eval_steps_per_second": 1.917, + "step": 1135000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1135100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1135200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1135300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1135400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1135500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1135600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1135700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1135800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1135900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1136000 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1136100 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 1136200 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1136300 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1136400 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1136500 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1136600 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1136700 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1136800 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1136900 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1137000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1137100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1137200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1137300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1137400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1137500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1137600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1137700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1137800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1137900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1138000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1138100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1138200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1138300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1138400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1138500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1138600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1138700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1138800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1138900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1139000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1139100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1139200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1139300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1139400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1139500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1139600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1139700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1139800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1139900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1140000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6973596215248108, + "eval_runtime": 206.0928, + "eval_samples_per_second": 242.609, + "eval_steps_per_second": 1.897, + "step": 1140000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1140100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1140200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1140300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1140400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1140500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1140600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1140700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 1140800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1140900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1141000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1141100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1141200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1141300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1141400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1141500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1141600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1141700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1141800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1141900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1142000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1142100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1142200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1142300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1142400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1142500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1142600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1142700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1142800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1142900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1143000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1143100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1143200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1143300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1143400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1143500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1143600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1143700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1143800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1143900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1144000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1144100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1144200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1144300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1144400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1144500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1144600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1144700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1144800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1144900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1145000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6971275210380554, + "eval_runtime": 204.9142, + "eval_samples_per_second": 244.005, + "eval_steps_per_second": 1.908, + "step": 1145000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1145100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1145200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1145300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1145400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1145500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1145600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1145700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1145800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1145900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1146000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1146100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1146200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1146300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1146400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1146500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1146600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1146700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7489, + "step": 1146800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1146900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1147000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 1147100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1147200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1147300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1147400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1147500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1147600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1147700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1147800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1147900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1148000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1148100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1148200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1148300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 1148400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1148500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1148600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1148700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1148800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1148900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1149000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1149100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1149200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1149300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1149400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1149500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1149600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1149700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1149800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1149900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1150000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6946325898170471, + "eval_runtime": 203.4327, + "eval_samples_per_second": 245.782, + "eval_steps_per_second": 1.922, + "step": 1150000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1150100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1150200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1150300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1150400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1150500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1150600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1150700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1150800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1150900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7521, + "step": 1151000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1151100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1151200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1151300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1151400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1151500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1151600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1151700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1151800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1151900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1152000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1152100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1152200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1152300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1152400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1152500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1152600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1152700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1152800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1152900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1153000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1153100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7525, + "step": 1153200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1153300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1153400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1153500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1153600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1153700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1153800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1153900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1154000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1154100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1154200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1154300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1154400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1154500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1154600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1154700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1154800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1154900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1155000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6950593590736389, + "eval_runtime": 204.9854, + "eval_samples_per_second": 243.92, + "eval_steps_per_second": 1.907, + "step": 1155000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1155100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1155200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1155300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1155400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1155500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1155600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1155700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1155800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1155900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1156000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1156100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1156200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1156300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1156400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1156500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1156600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1156700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1156800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7483, + "step": 1156900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1157000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1157100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1157200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1157300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1157400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1157500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7234, + "step": 1157600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1157700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1157800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1157900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1158000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1158100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1158200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1158300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1158400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1158500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1158600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1158700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1158800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1158900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1159000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1159100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1159200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1159300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1159400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1159500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1159600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1159700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1159800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1159900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1160000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6963341236114502, + "eval_runtime": 207.6431, + "eval_samples_per_second": 240.798, + "eval_steps_per_second": 1.883, + "step": 1160000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1160100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1160200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1160300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1160400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1160500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1160600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1160700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1160800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1160900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1161000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1161100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1161200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1161300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1161400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1161500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1161600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1161700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1161800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1161900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1162000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1162100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1162200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1162300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1162400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1162500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1162600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1162700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 1162800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1162900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1163000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1163100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 1163200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1163300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1163400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1163500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1163600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1163700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1163800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1163900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1164000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1164100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1164200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1164300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1164400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1164500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1164600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1164700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1164800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1164900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1165000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6949604153633118, + "eval_runtime": 204.0116, + "eval_samples_per_second": 245.084, + "eval_steps_per_second": 1.917, + "step": 1165000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1165100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1165200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1165300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1165400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1165500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1165600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1165700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1165800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1165900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1166000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1166100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1166200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1166300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1166400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1166500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1166600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1166700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1166800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1166900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7501, + "step": 1167000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1167100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1167200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1167300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1167400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1167500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1167600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1167700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1167800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1167900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1168000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1168100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1168200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1168300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1168400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1168500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1168600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1168700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1168800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1168900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1169000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1169100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1169200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1169300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1169400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1169500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1169600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1169700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1169800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1169900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1170000 + }, + { + "epoch": 2.01, + "eval_loss": 0.694342315196991, + "eval_runtime": 205.9534, + "eval_samples_per_second": 242.773, + "eval_steps_per_second": 1.898, + "step": 1170000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1170100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1170200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1170300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1170400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1170500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1170600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1170700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1170800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1170900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1171000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1171100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1171200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1171300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1171400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1171500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1171600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1171700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7526, + "step": 1171800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1171900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1172000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1172100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1172200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1172300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1172400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1172500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1172600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1172700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1172800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1172900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1173000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1173100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1173200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1173300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1173400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1173500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1173600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1173700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1173800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1173900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1174000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1174100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1174200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7284, + "step": 1174300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1174400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1174500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1174600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1174700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1174800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1174900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1175000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6968173980712891, + "eval_runtime": 204.8877, + "eval_samples_per_second": 244.036, + "eval_steps_per_second": 1.908, + "step": 1175000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1175100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1175200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1175300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1175400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1175500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1175600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1175700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1175800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 1175900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1176000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1176100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1176200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1176300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1176400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1176500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1176600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1176700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1176800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1176900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1177000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1177100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1177200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1177300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1177400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1177500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1177600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1177700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1177800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1177900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1178000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1178100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1178200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1178300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1178400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1178500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1178600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1178700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1178800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1178900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1179000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1179100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1179200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1179300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1179400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1179500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1179600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1179700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1179800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1179900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1180000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6931548714637756, + "eval_runtime": 204.556, + "eval_samples_per_second": 244.432, + "eval_steps_per_second": 1.911, + "step": 1180000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1180100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1180200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1180300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1180400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1180500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1180600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1180700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1180800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1180900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1181000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1181100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1181200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1181300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1181400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1181500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1181600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1181700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1181800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1181900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1182000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1182100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1182200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1182300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1182400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1182500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1182600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1182700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1182800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1182900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1183000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1183100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1183200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1183300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1183400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1183500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1183600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1183700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1183800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1183900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1184000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1184100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1184200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1184300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1184400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1184500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1184600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1184700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1184800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1184900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1185000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6951209306716919, + "eval_runtime": 201.2242, + "eval_samples_per_second": 248.479, + "eval_steps_per_second": 1.943, + "step": 1185000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1185100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1185200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1185300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1185400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1185500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1185600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1185700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1185800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1185900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1186000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1186100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1186200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1186300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1186400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1186500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1186600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1186700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1186800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1186900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1187000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1187100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1187200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7493, + "step": 1187300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1187400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1187500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1187600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1187700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1187800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1187900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1188000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1188100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1188200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7508, + "step": 1188300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1188400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1188500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1188600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1188700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1188800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1188900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1189000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1189100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1189200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1189300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1189400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1189500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1189600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1189700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1189800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1189900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1190000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6967108845710754, + "eval_runtime": 205.9224, + "eval_samples_per_second": 242.81, + "eval_steps_per_second": 1.899, + "step": 1190000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1190100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1190200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1190300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1190400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1190500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1190600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1190700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1190800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1190900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1191000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1191100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1191200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1191300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1191400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1191500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1191600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1191700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1191800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1191900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1192000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1192100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1192200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1192300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1192400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1192500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1192600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1192700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1192800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1192900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1193000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1193100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 1193200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1193300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1193400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7507, + "step": 1193500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1193600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7476, + "step": 1193700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1193800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1193900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1194000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1194100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1194200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1194300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1194400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1194500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1194600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1194700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1194800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1194900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1195000 + }, + { + "epoch": 2.01, + "eval_loss": 0.695705771446228, + "eval_runtime": 203.2356, + "eval_samples_per_second": 246.02, + "eval_steps_per_second": 1.924, + "step": 1195000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1195100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1195200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1195300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1195400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1195500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1195600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1195700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1195800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1195900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1196000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1196100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1196200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1196300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1196400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1196500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1196600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1196700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1196800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1196900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1197000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1197100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1197200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1197300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1197400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1197500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1197600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1197700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1197800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1197900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1198000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1198100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1198200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1198300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1198400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1198500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1198600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1198700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1198800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1198900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1199000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1199100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1199200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7195, + "step": 1199300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1199400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1199500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1199600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7251, + "step": 1199700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1199800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1199900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1200000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6952381134033203, + "eval_runtime": 204.566, + "eval_samples_per_second": 244.42, + "eval_steps_per_second": 1.911, + "step": 1200000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1200100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1200200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1200300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1200400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1200500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1200600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1200700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1200800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1200900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1201000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1201100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1201200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1201300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1201400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1201500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1201600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1201700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1201800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1201900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1202000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1202100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1202200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1202300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 1202400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1202500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1202600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1202700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1202800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1202900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1203000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1203100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1203200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1203300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1203400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1203500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1203600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1203700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1203800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1203900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1204000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1204100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7271, + "step": 1204200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1204300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1204400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1204500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1204600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1204700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1204800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1204900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1205000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6934911012649536, + "eval_runtime": 204.7959, + "eval_samples_per_second": 244.145, + "eval_steps_per_second": 1.909, + "step": 1205000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1205100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1205200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1205300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1205400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1205500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1205600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1205700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1205800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1205900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1206000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1206100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1206200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1206300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1206400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 1206500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1206600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1206700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1206800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1206900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1207000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1207100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1207200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1207300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1207400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1207500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1207600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1207700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1207800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1207900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1208000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1208100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1208200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1208300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1208400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1208500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1208600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1208700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1208800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1208900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1209000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1209100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1209200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1209300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1209400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1209500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1209600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1209700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1209800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1209900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1210000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6941564679145813, + "eval_runtime": 208.372, + "eval_samples_per_second": 239.955, + "eval_steps_per_second": 1.876, + "step": 1210000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1210100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1210200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1210300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1210400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1210500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1210600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1210700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1210800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1210900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1211000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1211100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1211200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1211300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1211400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1211500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1211600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1211700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1211800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1211900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 1212000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 1212100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1212200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1212300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1212400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1212500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1212600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1212700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1212800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1212900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1213000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1213100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7275, + "step": 1213200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1213300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1213400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1213500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1213600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1213700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1213800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1213900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1214000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1214100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1214200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1214300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1214400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1214500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1214600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1214700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1214800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1214900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1215000 + }, + { + "epoch": 2.01, + "eval_loss": 0.6927644610404968, + "eval_runtime": 209.8566, + "eval_samples_per_second": 238.258, + "eval_steps_per_second": 1.863, + "step": 1215000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1215100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1215200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1215300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1215400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1215500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1215600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1215700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 1215800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1215900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1216000 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1216100 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1216200 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1216300 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1216400 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1216500 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1216600 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1216700 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1216800 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1216900 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1217000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1217100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1217200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1217300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1217400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1217500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1217600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1217700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1217800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1217900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1218000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1218100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1218200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1218300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1218400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1218500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1218600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1218700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1218800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1218900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1219000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1219100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1219200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1219300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1219400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1219500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1219600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1219700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1219800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1219900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1220000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6960386633872986, + "eval_runtime": 206.1862, + "eval_samples_per_second": 242.499, + "eval_steps_per_second": 1.896, + "step": 1220000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1220100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1220200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1220300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1220400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1220500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 1220600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1220700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1220800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1220900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1221000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7474, + "step": 1221100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 1221200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1221300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1221400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1221500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1221600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1221700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1221800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1221900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1222000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1222100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1222200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1222300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1222400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1222500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1222600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1222700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1222800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1222900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1223000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1223100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1223200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1223300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1223400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1223500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1223600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1223700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1223800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1223900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1224000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1224100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1224200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1224300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1224400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1224500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1224600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1224700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1224800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1224900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1225000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6961241960525513, + "eval_runtime": 205.2014, + "eval_samples_per_second": 243.663, + "eval_steps_per_second": 1.905, + "step": 1225000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1225100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1225200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1225300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1225400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1225500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1225600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1225700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1225800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1225900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 1226000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1226100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1226200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1226300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1226400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1226500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1226600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1226700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1226800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1226900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1227000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1227100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1227200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1227300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1227400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1227500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1227600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1227700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1227800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1227900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1228000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1228100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1228200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1228300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1228400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1228500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1228600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1228700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1228800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1228900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1229000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1229100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1229200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1229300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1229400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1229500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1229600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1229700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1229800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1229900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 1230000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6950798630714417, + "eval_runtime": 207.3877, + "eval_samples_per_second": 241.094, + "eval_steps_per_second": 1.885, + "step": 1230000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1230100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1230200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 1230300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1230400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7274, + "step": 1230500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1230600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1230700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1230800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1230900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1231000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1231100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7465, + "step": 1231200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1231300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1231400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1231500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1231600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1231700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1231800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1231900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1232000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1232100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1232200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1232300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1232400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1232500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1232600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1232700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1232800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1232900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1233000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1233100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1233200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1233300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1233400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1233500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1233600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1233700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1233800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1233900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1234000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1234100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1234200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1234300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1234400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1234500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7274, + "step": 1234600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1234700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1234800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1234900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1235000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6963469386100769, + "eval_runtime": 206.1765, + "eval_samples_per_second": 242.511, + "eval_steps_per_second": 1.896, + "step": 1235000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1235100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1235200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1235300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1235400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 1235500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1235600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1235700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1235800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1235900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1236000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1236100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1236200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1236300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1236400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1236500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1236600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1236700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1236800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1236900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1237000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1237100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1237200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1237300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1237400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 1237500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1237600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1237700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1237800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1237900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1238000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1238100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1238200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1238300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1238400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1238500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1238600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1238700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1238800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1238900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1239000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1239100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1239200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1239300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1239400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7486, + "step": 1239500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1239600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1239700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1239800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1239900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1240000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6954966187477112, + "eval_runtime": 203.811, + "eval_samples_per_second": 245.325, + "eval_steps_per_second": 1.918, + "step": 1240000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1240100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1240200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1240300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1240400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1240500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1240600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1240700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1240800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1240900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1241000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1241100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1241200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1241300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7495, + "step": 1241400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1241500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1241600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1241700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1241800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1241900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1242000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1242100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1242200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1242300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1242400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1242500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1242600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1242700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1242800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1242900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1243000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1243100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1243200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1243300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1243400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1243500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1243600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1243700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1243800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1243900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1244000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1244100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1244200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1244300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1244400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1244500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1244600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1244700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1244800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1244900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1245000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6947163939476013, + "eval_runtime": 204.6375, + "eval_samples_per_second": 244.335, + "eval_steps_per_second": 1.911, + "step": 1245000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1245100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1245200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1245300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1245400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1245500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1245600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1245700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1245800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1245900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1246000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1246100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 1246200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1246300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1246400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1246500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1246600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1246700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1246800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1246900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1247000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1247100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1247200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1247300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7494, + "step": 1247400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1247500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1247600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1247700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1247800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1247900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1248000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1248100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1248200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1248300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1248400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1248500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1248600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1248700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1248800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1248900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1249000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1249100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1249200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1249300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1249400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1249500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1249600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1249700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1249800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1249900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1250000 + }, + { + "epoch": 2.02, + "eval_loss": 0.695235550403595, + "eval_runtime": 202.6416, + "eval_samples_per_second": 246.741, + "eval_steps_per_second": 1.93, + "step": 1250000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1250100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1250200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1250300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1250400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1250500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1250600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1250700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1250800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1250900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7497, + "step": 1251000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1251100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1251200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1251300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1251400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1251500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1251600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1251700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1251800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1251900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1252000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1252100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1252200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1252300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1252400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1252500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1252600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1252700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1252800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1252900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1253000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1253100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1253200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1253300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1253400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1253500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1253600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1253700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1253800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1253900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1254000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1254100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1254200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1254300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1254400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7269, + "step": 1254500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1254600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1254700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1254800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1254900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1255000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6938926577568054, + "eval_runtime": 203.8632, + "eval_samples_per_second": 245.263, + "eval_steps_per_second": 1.918, + "step": 1255000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1255100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1255200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1255300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1255400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1255500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1255600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1255700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1255800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1255900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1256000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1256100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1256200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1256300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1256400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1256500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1256600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1256700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1256800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1256900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1257000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1257100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1257200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1257300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1257400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1257500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1257600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1257700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1257800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1257900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1258000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 1258100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1258200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1258300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1258400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1258500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1258600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 1258700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1258800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1258900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1259000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1259100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1259200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1259300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1259400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1259500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1259600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1259700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1259800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1259900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1260000 + }, + { + "epoch": 2.02, + "eval_loss": 0.694320023059845, + "eval_runtime": 204.6582, + "eval_samples_per_second": 244.31, + "eval_steps_per_second": 1.911, + "step": 1260000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1260100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1260200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1260300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1260400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1260500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7482, + "step": 1260600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1260700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1260800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1260900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1261000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1261100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1261200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1261300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1261400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1261500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1261600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1261700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1261800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1261900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1262000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1262100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1262200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1262300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1262400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1262500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1262600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1262700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1262800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1262900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7257, + "step": 1263000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1263100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1263200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1263300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1263400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1263500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1263600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1263700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1263800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1263900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1264000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1264100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1264200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1264300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1264400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1264500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1264600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1264700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1264800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1264900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1265000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6940274238586426, + "eval_runtime": 201.3695, + "eval_samples_per_second": 248.3, + "eval_steps_per_second": 1.942, + "step": 1265000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1265100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1265200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1265300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1265400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1265500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1265600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1265700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1265800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1265900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1266000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1266100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1266200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1266300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1266400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1266500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1266600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1266700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1266800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1266900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1267000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1267100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1267200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1267300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1267400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1267500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1267600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1267700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1267800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1267900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1268000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1268100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1268200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1268300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1268400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1268500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1268600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1268700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1268800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1268900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1269000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1269100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1269200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1269300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 1269400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1269500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1269600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1269700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1269800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1269900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1270000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6949335932731628, + "eval_runtime": 194.6116, + "eval_samples_per_second": 256.922, + "eval_steps_per_second": 2.009, + "step": 1270000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1270100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1270200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1270300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1270400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1270500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1270600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 1270700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1270800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1270900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1271000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7509, + "step": 1271100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1271200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1271300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1271400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1271500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1271600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1271700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1271800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1271900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1272000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1272100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1272200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1272300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1272400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1272500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1272600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1272700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1272800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1272900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1273000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1273100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1273200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1273300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1273400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1273500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1273600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1273700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1273800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1273900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1274000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1274100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1274200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1274300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1274400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1274500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1274600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1274700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.747, + "step": 1274800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1274900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1275000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6966413259506226, + "eval_runtime": 194.7482, + "eval_samples_per_second": 256.742, + "eval_steps_per_second": 2.008, + "step": 1275000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1275100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1275200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1275300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1275400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1275500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1275600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1275700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1275800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1275900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1276000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1276100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1276200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1276300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1276400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1276500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1276600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1276700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1276800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1276900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1277000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1277100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1277200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1277300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1277400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7282, + "step": 1277500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1277600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1277700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1277800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1277900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1278000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1278100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1278200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1278300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1278400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1278500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1278600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1278700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1278800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1278900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1279000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1279100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1279200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1279300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1279400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1279500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1279600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1279700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1279800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1279900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1280000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6938179135322571, + "eval_runtime": 194.8718, + "eval_samples_per_second": 256.579, + "eval_steps_per_second": 2.006, + "step": 1280000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1280100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1280200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1280300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1280400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1280500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1280600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1280700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1280800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1280900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1281000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1281100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1281200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1281300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1281400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 1281500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1281600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1281700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1281800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1281900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1282000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1282100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 1282200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7446, + "step": 1282300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1282400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1282500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1282600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1282700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1282800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1282900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1283000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1283100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1283200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1283300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1283400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1283500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1283600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7455, + "step": 1283700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1283800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1283900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1284000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1284100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1284200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1284300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1284400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1284500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1284600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1284700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1284800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1284900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1285000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6942003965377808, + "eval_runtime": 194.811, + "eval_samples_per_second": 256.659, + "eval_steps_per_second": 2.007, + "step": 1285000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.746, + "step": 1285100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1285200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1285300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1285400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1285500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1285600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1285700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1285800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1285900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1286000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1286100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1286200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1286300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1286400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1286500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1286600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1286700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1286800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1286900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1287000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1287100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1287200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7281, + "step": 1287300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1287400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1287500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7427, + "step": 1287600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1287700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1287800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1287900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1288000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1288100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1288200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1288300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1288400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1288500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1288600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1288700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1288800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1288900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1289000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1289100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1289200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1289300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1289400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1289500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1289600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7461, + "step": 1289700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1289800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1289900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1290000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6959330439567566, + "eval_runtime": 194.4213, + "eval_samples_per_second": 257.173, + "eval_steps_per_second": 2.011, + "step": 1290000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1290100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1290200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1290300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1290400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1290500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 1290600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1290700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1290800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1290900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1291000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1291100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1291200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1291300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1291400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1291500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1291600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1291700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1291800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1291900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1292000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1292100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1292200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7269, + "step": 1292300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1292400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1292500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7253, + "step": 1292600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1292700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1292800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1292900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1293000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1293100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1293200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1293300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1293400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 1293500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1293600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1293700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1293800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1293900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1294000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1294100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7295, + "step": 1294200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1294300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1294400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1294500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7279, + "step": 1294600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7464, + "step": 1294700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1294800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1294900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1295000 + }, + { + "epoch": 2.02, + "eval_loss": 0.6947426795959473, + "eval_runtime": 194.7398, + "eval_samples_per_second": 256.753, + "eval_steps_per_second": 2.008, + "step": 1295000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1295100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1295200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1295300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1295400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1295500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1295600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1295700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1295800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1295900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1296000 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1296100 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1296200 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1296300 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1296400 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1296500 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7481, + "step": 1296600 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1296700 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1296800 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1296900 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1297000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1297100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1297200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1297300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1297400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1297500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1297600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1297700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1297800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1297900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1298000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1298100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1298200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1298300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1298400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1298500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1298600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1298700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1298800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1298900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1299000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1299100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1299200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1299300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1299400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1299500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1299600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1299700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1299800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1299900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1300000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6939392685890198, + "eval_runtime": 194.6648, + "eval_samples_per_second": 256.852, + "eval_steps_per_second": 2.009, + "step": 1300000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1300100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1300200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1300300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1300400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1300500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1300600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1300700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1300800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1300900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1301000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1301100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1301200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1301300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1301400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1301500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1301600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1301700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1301800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1301900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1302000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1302100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1302200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1302300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1302400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1302500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1302600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1302700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1302800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1302900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1303000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1303100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1303200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1303300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1303400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1303500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1303600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1303700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1303800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1303900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1304000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1304100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1304200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1304300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1304400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1304500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1304600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1304700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1304800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 1304900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1305000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6939355731010437, + "eval_runtime": 194.437, + "eval_samples_per_second": 257.153, + "eval_steps_per_second": 2.011, + "step": 1305000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1305100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1305200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1305300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1305400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1305500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1305600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1305700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1305800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1305900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1306000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1306100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1306200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1306300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1306400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1306500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1306600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1306700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1306800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1306900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1307000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1307100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1307200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1307300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1307400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1307500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1307600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1307700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1307800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1307900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1308000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1308100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1308200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1308300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1308400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1308500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7284, + "step": 1308600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1308700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1308800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1308900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1309000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1309100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1309200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1309300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1309400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1309500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7295, + "step": 1309600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1309700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1309800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1309900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1310000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6949591636657715, + "eval_runtime": 194.8114, + "eval_samples_per_second": 256.659, + "eval_steps_per_second": 2.007, + "step": 1310000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1310100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1310200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1310300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1310400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1310500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1310600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1310700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1310800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1310900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1311000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1311100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1311200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1311300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1311400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1311500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1311600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1311700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1311800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.753, + "step": 1311900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1312000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1312100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1312200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1312300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1312400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1312500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1312600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1312700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1312800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1312900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1313000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1313100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1313200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1313300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1313400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1313500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1313600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1313700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1313800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1313900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1314000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1314100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1314200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1314300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1314400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1314500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1314600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1314700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1314800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1314900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1315000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6930851340293884, + "eval_runtime": 194.6929, + "eval_samples_per_second": 256.815, + "eval_steps_per_second": 2.008, + "step": 1315000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1315100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7514, + "step": 1315200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7276, + "step": 1315300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1315400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1315500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1315600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1315700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7484, + "step": 1315800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1315900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1316000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1316100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1316200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1316300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1316400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1316500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1316600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1316700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1316800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1316900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1317000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1317100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1317200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1317300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7279, + "step": 1317400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1317500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7262, + "step": 1317600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1317700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1317800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1317900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1318000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1318100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1318200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1318300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1318400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1318500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1318600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1318700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1318800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1318900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1319000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1319100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1319200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1319300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1319400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1319500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1319600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1319700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1319800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1319900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1320000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6944131851196289, + "eval_runtime": 194.7661, + "eval_samples_per_second": 256.718, + "eval_steps_per_second": 2.008, + "step": 1320000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1320100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1320200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1320300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1320400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1320500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1320600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1320700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7258, + "step": 1320800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1320900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7279, + "step": 1321000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1321100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1321200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1321300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1321400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1321500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1321600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1321700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1321800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1321900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1322000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1322100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1322200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1322300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1322400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1322500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1322600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1322700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1322800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1322900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1323000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1323100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1323200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1323300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1323400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1323500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1323600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1323700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1323800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1323900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1324000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1324100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1324200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1324300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1324400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1324500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1324600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1324700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1324800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1324900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1325000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6931760311126709, + "eval_runtime": 194.305, + "eval_samples_per_second": 257.327, + "eval_steps_per_second": 2.012, + "step": 1325000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1325100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1325200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1325300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1325400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7257, + "step": 1325500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1325600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1325700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1325800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1325900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1326000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1326100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1326200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1326300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1326400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1326500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1326600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1326700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1326800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1326900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1327000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1327100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1327200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1327300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1327400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1327500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1327600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1327700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1327800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1327900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1328000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7285, + "step": 1328100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1328200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1328300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1328400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1328500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1328600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1328700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1328800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1328900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1329000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1329100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1329200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1329300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1329400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1329500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1329600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1329700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1329800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7269, + "step": 1329900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1330000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6926876306533813, + "eval_runtime": 194.46, + "eval_samples_per_second": 257.122, + "eval_steps_per_second": 2.011, + "step": 1330000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1330100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1330200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1330300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1330400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1330500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1330600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1330700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1330800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1330900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1331000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1331100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1331200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1331300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1331400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1331500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1331600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1331700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7268, + "step": 1331800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1331900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1332000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1332100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1332200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1332300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1332400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1332500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1332600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1332700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1332800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1332900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1333000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1333100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1333200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1333300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1333400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1333500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1333600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1333700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1333800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1333900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1334000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1334100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1334200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7262, + "step": 1334300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1334400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1334500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7284, + "step": 1334600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1334700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1334800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1334900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1335000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6940681338310242, + "eval_runtime": 194.3072, + "eval_samples_per_second": 257.324, + "eval_steps_per_second": 2.012, + "step": 1335000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1335100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1335200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1335300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1335400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1335500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1335600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1335700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1335800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1335900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1336000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1336100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1336200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7293, + "step": 1336300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1336400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1336500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1336600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1336700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1336800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1336900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1337000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1337100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1337200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1337300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1337400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1337500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1337600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1337700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1337800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1337900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1338000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1338100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1338200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1338300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1338400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1338500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1338600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1338700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1338800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1338900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1339000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1339100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1339200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1339300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1339400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1339500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1339600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1339700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1339800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1339900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1340000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6945679783821106, + "eval_runtime": 194.6901, + "eval_samples_per_second": 256.818, + "eval_steps_per_second": 2.008, + "step": 1340000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1340100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1340200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1340300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1340400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1340500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1340600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1340700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1340800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1340900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1341000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1341100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1341200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1341300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1341400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1341500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1341600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1341700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1341800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1341900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1342000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1342100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1342200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1342300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1342400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1342500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1342600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1342700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7459, + "step": 1342800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1342900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1343000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1343100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1343200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1343300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1343400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1343500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1343600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1343700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1343800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1343900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1344000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1344100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1344200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1344300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7442, + "step": 1344400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1344500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1344600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1344700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1344800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1344900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1345000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6937873363494873, + "eval_runtime": 194.4871, + "eval_samples_per_second": 257.086, + "eval_steps_per_second": 2.01, + "step": 1345000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1345100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1345200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1345300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1345400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1345500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1345600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1345700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1345800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1345900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1346000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7469, + "step": 1346100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1346200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1346300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1346400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1346500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1346600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1346700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1346800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1346900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7456, + "step": 1347000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1347100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1347200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1347300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1347400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1347500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1347600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1347700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1347800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1347900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1348000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1348100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1348200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1348300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1348400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1348500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1348600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1348700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1348800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1348900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1349000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1349100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1349200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1349300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1349400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1349500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1349600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1349700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1349800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.745, + "step": 1349900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1350000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6945255398750305, + "eval_runtime": 195.1366, + "eval_samples_per_second": 256.231, + "eval_steps_per_second": 2.004, + "step": 1350000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1350100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1350200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1350300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1350400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1350500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1350600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1350700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1350800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1350900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1351000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1351100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1351200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1351300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1351400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1351500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1351600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1351700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1351800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1351900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1352000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1352100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1352200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1352300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1352400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1352500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1352600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1352700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1352800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1352900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1353000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1353100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1353200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1353300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1353400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1353500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1353600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1353700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1353800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1353900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1354000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1354100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1354200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1354300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1354400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1354500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1354600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7499, + "step": 1354700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1354800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1354900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1355000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6933331489562988, + "eval_runtime": 194.6131, + "eval_samples_per_second": 256.92, + "eval_steps_per_second": 2.009, + "step": 1355000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1355100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1355200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1355300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1355400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7471, + "step": 1355500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1355600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7235, + "step": 1355700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1355800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1355900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7276, + "step": 1356000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1356100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1356200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1356300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1356400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1356500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1356600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1356700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1356800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1356900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1357000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7432, + "step": 1357100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1357200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1357300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1357400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1357500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1357600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1357700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1357800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1357900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1358000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1358100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1358200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1358300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1358400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1358500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1358600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1358700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1358800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1358900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1359000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1359100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1359200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1359300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1359400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1359500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1359600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1359700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1359800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1359900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1360000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6937475204467773, + "eval_runtime": 194.729, + "eval_samples_per_second": 256.767, + "eval_steps_per_second": 2.008, + "step": 1360000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1360100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1360200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1360300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1360400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1360500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1360600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1360700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1360800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1360900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1361000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1361100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1361200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1361300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1361400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1361500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1361600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1361700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1361800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7264, + "step": 1361900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1362000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1362100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1362200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1362300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1362400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1362500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1362600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7206, + "step": 1362700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1362800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1362900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1363000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1363100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1363200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7443, + "step": 1363300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1363400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1363500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1363600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1363700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1363800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1363900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1364000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1364100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1364200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1364300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1364400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1364500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1364600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1364700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1364800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1364900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1365000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6959472894668579, + "eval_runtime": 194.6466, + "eval_samples_per_second": 256.876, + "eval_steps_per_second": 2.009, + "step": 1365000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1365100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1365200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1365300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1365400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1365500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1365600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1365700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1365800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1365900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7275, + "step": 1366000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1366100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1366200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1366300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1366400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1366500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7468, + "step": 1366600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1366700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1366800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1366900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1367000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1367100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1367200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1367300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7251, + "step": 1367400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1367500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1367600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1367700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1367800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1367900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1368000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1368100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1368200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1368300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1368400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1368500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1368600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1368700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1368800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1368900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1369000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1369100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7256, + "step": 1369200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1369300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1369400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1369500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1369600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1369700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1369800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1369900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1370000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6928281188011169, + "eval_runtime": 194.8294, + "eval_samples_per_second": 256.635, + "eval_steps_per_second": 2.007, + "step": 1370000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1370100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1370200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1370300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1370400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1370500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1370600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1370700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1370800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1370900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1371000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1371100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1371200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7281, + "step": 1371300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1371400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1371500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1371600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1371700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1371800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1371900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1372000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1372100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1372200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1372300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1372400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1372500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1372600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1372700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1372800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1372900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1373000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1373100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1373200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1373300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1373400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1373500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1373600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1373700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1373800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7249, + "step": 1373900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7256, + "step": 1374000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1374100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1374200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1374300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1374400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1374500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1374600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1374700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1374800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1374900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1375000 + }, + { + "epoch": 2.03, + "eval_loss": 0.6932492852210999, + "eval_runtime": 194.6784, + "eval_samples_per_second": 256.834, + "eval_steps_per_second": 2.008, + "step": 1375000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1375100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1375200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1375300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1375400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1375500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1375600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1375700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1375800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1375900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1376000 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1376100 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1376200 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1376300 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1376400 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1376500 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1376600 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1376700 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1376800 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1376900 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1377000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1377100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1377200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1377300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1377400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1377500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1377600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.748, + "step": 1377700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1377800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.744, + "step": 1377900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7452, + "step": 1378000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1378100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1378200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1378300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1378400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1378500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1378600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1378700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1378800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1378900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1379000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1379100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7259, + "step": 1379200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1379300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1379400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1379500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1379600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1379700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1379800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1379900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1380000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6935710310935974, + "eval_runtime": 194.6566, + "eval_samples_per_second": 256.863, + "eval_steps_per_second": 2.009, + "step": 1380000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1380100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7259, + "step": 1380200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1380300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1380400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1380500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1380600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1380700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1380800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1380900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1381000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1381100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1381200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1381300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1381400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1381500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1381600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1381700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1381800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1381900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1382000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1382100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1382200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1382300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1382400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1382500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1382600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1382700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1382800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1382900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7266, + "step": 1383000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1383100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1383200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1383300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1383400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7475, + "step": 1383500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1383600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1383700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7269, + "step": 1383800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1383900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 1384000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1384100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1384200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1384300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1384400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1384500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1384600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1384700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1384800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7451, + "step": 1384900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1385000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6926178336143494, + "eval_runtime": 194.8471, + "eval_samples_per_second": 256.611, + "eval_steps_per_second": 2.007, + "step": 1385000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1385100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1385200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1385300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1385400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1385500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1385600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1385700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7492, + "step": 1385800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1385900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1386000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7479, + "step": 1386100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1386200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1386300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1386400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1386500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1386600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1386700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1386800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1386900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1387000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1387100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1387200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1387300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1387400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1387500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1387600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1387700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1387800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1387900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1388000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1388100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1388200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1388300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1388400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1388500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1388600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1388700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1388800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1388900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1389000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1389100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1389200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1389300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1389400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1389500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1389600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1389700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1389800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1389900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1390000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6942101716995239, + "eval_runtime": 194.9343, + "eval_samples_per_second": 256.497, + "eval_steps_per_second": 2.006, + "step": 1390000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1390100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1390200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1390300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1390400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1390500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1390600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1390700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1390800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1390900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1391000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7419, + "step": 1391100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1391200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1391300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1391400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1391500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1391600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1391700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1391800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1391900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1392000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1392100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1392200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1392300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1392400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1392500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1392600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1392700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1392800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1392900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1393000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7255, + "step": 1393100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1393200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1393300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1393400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1393500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1393600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7448, + "step": 1393700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1393800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1393900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1394000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1394100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1394200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1394300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1394400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1394500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1394600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1394700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1394800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1394900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1395000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6926989555358887, + "eval_runtime": 194.6281, + "eval_samples_per_second": 256.9, + "eval_steps_per_second": 2.009, + "step": 1395000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1395100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1395200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1395300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1395400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1395500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1395600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1395700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1395800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1395900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1396000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1396100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1396200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7288, + "step": 1396300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1396400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1396500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1396600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1396700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1396800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1396900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1397000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1397100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1397200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1397300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1397400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1397500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7279, + "step": 1397600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.749, + "step": 1397700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1397800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1397900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1398000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1398100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1398200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1398300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7454, + "step": 1398400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1398500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1398600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1398700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1398800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1398900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1399000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1399100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1399200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1399300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1399400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7284, + "step": 1399500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1399600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1399700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1399800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1399900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1400000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6916325688362122, + "eval_runtime": 194.7342, + "eval_samples_per_second": 256.76, + "eval_steps_per_second": 2.008, + "step": 1400000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1400100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7267, + "step": 1400200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1400300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1400400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1400500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1400600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7431, + "step": 1400700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1400800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1400900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1401000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1401100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1401200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1401300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1401400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1401500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1401600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1401700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1401800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1401900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1402000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1402100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1402200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1402300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1402400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1402500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1402600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1402700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1402800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1402900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1403000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1403100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1403200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1403300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1403400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1403500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7434, + "step": 1403600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1403700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1403800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1403900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1404000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1404100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7473, + "step": 1404200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1404300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1404400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1404500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1404600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1404700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1404800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1404900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1405000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6937705278396606, + "eval_runtime": 194.7497, + "eval_samples_per_second": 256.74, + "eval_steps_per_second": 2.008, + "step": 1405000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1405100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1405200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1405300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1405400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1405500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1405600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1405700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1405800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1405900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1406000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1406100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1406200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1406300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1406400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1406500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1406600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1406700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1406800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1406900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1407000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1407100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1407200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1407300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1407400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1407500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7275, + "step": 1407600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1407700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1407800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1407900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1408000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1408100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1408200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1408300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1408400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1408500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1408600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1408700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1408800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1408900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1409000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7458, + "step": 1409100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1409200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1409300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1409400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1409500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1409600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1409700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1409800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1409900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7485, + "step": 1410000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6946880221366882, + "eval_runtime": 203.4477, + "eval_samples_per_second": 245.763, + "eval_steps_per_second": 1.922, + "step": 1410000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7414, + "step": 1410100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1410200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1410300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1410400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1410500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1410600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1410700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1410800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1410900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1411000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7271, + "step": 1411100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1411200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1411300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1411400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1411500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1411600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1411700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1411800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1411900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1412000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1412100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1412200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1412300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1412400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1412500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1412600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1412700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1412800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1412900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1413000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7279, + "step": 1413100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1413200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1413300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1413400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1413500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1413600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1413700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1413800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1413900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1414000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1414100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1414200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1414300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1414400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1414500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1414600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1414700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1414800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1414900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7447, + "step": 1415000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6938619017601013, + "eval_runtime": 199.9529, + "eval_samples_per_second": 250.059, + "eval_steps_per_second": 1.955, + "step": 1415000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1415100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1415200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1415300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7433, + "step": 1415400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1415500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1415600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1415700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1415800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1415900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7249, + "step": 1416000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1416100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1416200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1416300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1416400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1416500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1416600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1416700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1416800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1416900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1417000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1417100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1417200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 1417300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1417400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1417500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1417600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1417700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7281, + "step": 1417800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1417900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1418000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1418100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1418200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1418300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7268, + "step": 1418400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1418500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1418600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1418700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1418800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1418900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.724, + "step": 1419000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1419100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1419200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1419300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1419400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1419500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1419600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1419700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1419800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1419900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1420000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6929402947425842, + "eval_runtime": 194.6859, + "eval_samples_per_second": 256.824, + "eval_steps_per_second": 2.008, + "step": 1420000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1420100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1420200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1420300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1420400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1420500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1420600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1420700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1420800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1420900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1421000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7268, + "step": 1421100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1421200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1421300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1421400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1421500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1421600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1421700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1421800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7241, + "step": 1421900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1422000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1422100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1422200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1422300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1422400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1422500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1422600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1422700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1422800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1422900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1423000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1423100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1423200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1423300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1423400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7233, + "step": 1423500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1423600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1423700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1423800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1423900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1424000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1424100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1424200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1424300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1424400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1424500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1424600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1424700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1424800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1424900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1425000 + }, + { + "epoch": 2.04, + "eval_loss": 0.693824052810669, + "eval_runtime": 194.8767, + "eval_samples_per_second": 256.572, + "eval_steps_per_second": 2.006, + "step": 1425000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1425100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1425200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1425300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1425400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1425500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1425600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1425700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7428, + "step": 1425800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1425900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1426000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1426100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1426200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1426300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1426400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1426500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1426600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7293, + "step": 1426700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1426800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1426900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1427000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1427100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1427200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1427300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7429, + "step": 1427400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1427500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1427600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1427700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1427800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1427900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1428000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1428100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1428200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1428300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1428400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1428500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1428600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1428700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1428800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1428900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1429000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1429100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1429200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7261, + "step": 1429300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1429400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1429500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1429600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1429700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1429800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1429900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1430000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6921608448028564, + "eval_runtime": 194.7627, + "eval_samples_per_second": 256.723, + "eval_steps_per_second": 2.008, + "step": 1430000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1430100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1430200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7263, + "step": 1430300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1430400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1430500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1430600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1430700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7267, + "step": 1430800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7282, + "step": 1430900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1431000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1431100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1431200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1431300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1431400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1431500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1431600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1431700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1431800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1431900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7334, + "step": 1432000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1432100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7284, + "step": 1432200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7457, + "step": 1432300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1432400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7276, + "step": 1432500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1432600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1432700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1432800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1432900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1433000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1433100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1433200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.725, + "step": 1433300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1433400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1433500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1433600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1433700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1433800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1433900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1434000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1434100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1434200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1434300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1434400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1434500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1434600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1434700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1434800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1434900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1435000 + }, + { + "epoch": 2.04, + "eval_loss": 0.692685067653656, + "eval_runtime": 194.5633, + "eval_samples_per_second": 256.986, + "eval_steps_per_second": 2.01, + "step": 1435000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1435100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1435200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1435300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1435400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1435500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7219, + "step": 1435600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1435700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1435800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1435900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1436000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1436100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1436200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7268, + "step": 1436300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1436400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1436500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1436600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7264, + "step": 1436700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1436800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1436900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1437000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7264, + "step": 1437100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1437200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1437300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1437400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1437500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1437600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1437700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1437800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1437900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1438000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1438100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1438200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1438300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1438400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1438500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1438600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7271, + "step": 1438700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1438800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1438900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7285, + "step": 1439000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1439100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1439200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1439300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7402, + "step": 1439400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7285, + "step": 1439500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1439600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1439700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1439800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1439900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1440000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6933913826942444, + "eval_runtime": 194.9496, + "eval_samples_per_second": 256.477, + "eval_steps_per_second": 2.006, + "step": 1440000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1440100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1440200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1440300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1440400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1440500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1440600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1440700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1440800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1440900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7401, + "step": 1441000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.743, + "step": 1441100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7271, + "step": 1441200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7265, + "step": 1441300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7308, + "step": 1441400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1441500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1441600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1441700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1441800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1441900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1442000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 1442100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1442200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.741, + "step": 1442300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1442400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1442500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1442600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1442700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1442800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1442900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1443000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1443100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1443200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1443300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1443400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1443500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1443600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1443700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1443800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1443900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1444000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1444100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1444200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1444300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1444400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1444500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1444600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1444700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1444800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1444900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1445000 + }, + { + "epoch": 2.04, + "eval_loss": 0.693956196308136, + "eval_runtime": 194.9936, + "eval_samples_per_second": 256.419, + "eval_steps_per_second": 2.005, + "step": 1445000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1445100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1445200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1445300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7441, + "step": 1445400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1445500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1445600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1445700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1445800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1445900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1446000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1446100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1446200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1446300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1446400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1446500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1446600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1446700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7281, + "step": 1446800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1446900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1447000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7279, + "step": 1447100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1447200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.727, + "step": 1447300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1447400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1447500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1447600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7288, + "step": 1447700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1447800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 1447900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1448000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1448100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1448200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1448300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7463, + "step": 1448400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1448500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7472, + "step": 1448600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1448700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1448800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1448900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1449000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1449100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1449200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1449300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1449400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1449500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1449600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7438, + "step": 1449700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7285, + "step": 1449800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7377, + "step": 1449900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1450000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6916955709457397, + "eval_runtime": 193.9614, + "eval_samples_per_second": 257.783, + "eval_steps_per_second": 2.016, + "step": 1450000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7356, + "step": 1450100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7262, + "step": 1450200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1450300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1450400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1450500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1450600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1450700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7439, + "step": 1450800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1450900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7453, + "step": 1451000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1451100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7435, + "step": 1451200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1451300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1451400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 1451500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1451600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7266, + "step": 1451700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7335, + "step": 1451800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1451900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1452000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1452100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7449, + "step": 1452200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1452300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1452400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7267, + "step": 1452500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1452600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7277, + "step": 1452700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1452800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1452900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1453000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1453100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1453200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1453300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1453400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1453500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7255, + "step": 1453600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1453700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1453800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1453900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1454000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1454100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1454200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1454300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7252, + "step": 1454400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1454500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1454600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1454700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1454800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1454900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1455000 + }, + { + "epoch": 2.04, + "eval_loss": 0.6905053853988647, + "eval_runtime": 194.6428, + "eval_samples_per_second": 256.881, + "eval_steps_per_second": 2.009, + "step": 1455000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1455100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1455200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1455300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7466, + "step": 1455400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1455500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1455600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1455700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1455800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7293, + "step": 1455900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1456000 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1456100 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1456200 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7408, + "step": 1456300 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1456400 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1456500 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1456600 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1456700 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1456800 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7405, + "step": 1456900 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1457000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1457100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1457200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7236, + "step": 1457300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1457400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1457500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1457600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1457700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7304, + "step": 1457800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1457900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1458000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1458100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1458200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1458300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1458400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 1458500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7263, + "step": 1458600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1458700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1458800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1458900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7293, + "step": 1459000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1459100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1459200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1459300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1459400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1459500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1459600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1459700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1459800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1459900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1460000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6919227838516235, + "eval_runtime": 194.6141, + "eval_samples_per_second": 256.919, + "eval_steps_per_second": 2.009, + "step": 1460000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1460100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1460200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1460300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1460400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1460500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.725, + "step": 1460600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1460700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1460800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1460900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1461000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1461100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7423, + "step": 1461200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1461300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1461400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1461500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1461600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1461700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1461800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1461900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1462000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7279, + "step": 1462100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1462200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1462300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1462400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1462500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1462600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1462700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1462800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1462900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1463000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1463100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1463200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1463300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1463400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1463500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7467, + "step": 1463600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1463700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1463800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7282, + "step": 1463900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1464000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1464100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1464200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7341, + "step": 1464300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1464400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1464500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1464600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1464700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1464800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7357, + "step": 1464900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1465000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6930609941482544, + "eval_runtime": 194.3886, + "eval_samples_per_second": 257.217, + "eval_steps_per_second": 2.011, + "step": 1465000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1465100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1465200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1465300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1465400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1465500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1465600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1465700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7436, + "step": 1465800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1465900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1466000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1466100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1466200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1466300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1466400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1466500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1466600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1466700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7392, + "step": 1466800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7284, + "step": 1466900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1467000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1467100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1467200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7426, + "step": 1467300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1467400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1467500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1467600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1467700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7373, + "step": 1467800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7409, + "step": 1467900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1468000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 1468100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7403, + "step": 1468200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1468300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7282, + "step": 1468400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1468500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1468600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1468700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7258, + "step": 1468800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7422, + "step": 1468900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1469000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1469100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1469200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1469300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1469400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7273, + "step": 1469500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7288, + "step": 1469600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7412, + "step": 1469700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1469800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1469900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1470000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6922557950019836, + "eval_runtime": 194.485, + "eval_samples_per_second": 257.089, + "eval_steps_per_second": 2.01, + "step": 1470000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1470100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1470200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1470300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7239, + "step": 1470400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.724, + "step": 1470500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1470600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7298, + "step": 1470700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1470800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1470900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1471000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7288, + "step": 1471100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1471200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1471300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7293, + "step": 1471400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1471500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1471600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7445, + "step": 1471700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7366, + "step": 1471800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 1471900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7287, + "step": 1472000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1472100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7256, + "step": 1472200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 1472300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7285, + "step": 1472400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1472500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7258, + "step": 1472600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1472700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7281, + "step": 1472800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.729, + "step": 1472900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1473000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1473100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1473200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7271, + "step": 1473300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7425, + "step": 1473400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7406, + "step": 1473500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1473600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7319, + "step": 1473700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1473800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.736, + "step": 1473900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1474000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1474100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1474200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1474300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7417, + "step": 1474400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7262, + "step": 1474500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1474600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1474700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1474800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.74, + "step": 1474900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1475000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6930740475654602, + "eval_runtime": 194.68, + "eval_samples_per_second": 256.832, + "eval_steps_per_second": 2.008, + "step": 1475000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1475100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7288, + "step": 1475200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1475300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1475400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1475500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1475600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1475700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1475800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1475900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1476000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1476100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1476200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7268, + "step": 1476300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7396, + "step": 1476400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1476500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7362, + "step": 1476600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1476700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7275, + "step": 1476800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1476900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7389, + "step": 1477000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1477100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1477200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1477300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1477400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1477500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7322, + "step": 1477600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1477700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1477800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1477900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7276, + "step": 1478000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1478100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1478200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7233, + "step": 1478300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7421, + "step": 1478400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1478500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1478600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7265, + "step": 1478700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1478800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1478900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7363, + "step": 1479000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1479100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1479200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1479300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1479400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1479500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1479600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1479700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1479800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7262, + "step": 1479900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1480000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6922203898429871, + "eval_runtime": 194.9048, + "eval_samples_per_second": 256.535, + "eval_steps_per_second": 2.006, + "step": 1480000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1480100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1480200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1480300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1480400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7397, + "step": 1480500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7312, + "step": 1480600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1480700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7374, + "step": 1480800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1480900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1481000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1481100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1481200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1481300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1481400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1481500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1481600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1481700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1481800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 1481900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7371, + "step": 1482000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1482100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7259, + "step": 1482200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1482300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7368, + "step": 1482400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 1482500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1482600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1482700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1482800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1482900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7348, + "step": 1483000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1483100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1483200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1483300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1483400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7413, + "step": 1483500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7399, + "step": 1483600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 1483700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7385, + "step": 1483800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7388, + "step": 1483900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1484000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7339, + "step": 1484100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1484200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1484300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1484400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1484500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1484600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1484700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7343, + "step": 1484800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7328, + "step": 1484900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1485000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6922410130500793, + "eval_runtime": 194.5142, + "eval_samples_per_second": 257.051, + "eval_steps_per_second": 2.01, + "step": 1485000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1485100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1485200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1485300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7344, + "step": 1485400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1485500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1485600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7299, + "step": 1485700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1485800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.742, + "step": 1485900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1486000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 1486100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 1486200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7324, + "step": 1486300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.737, + "step": 1486400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7325, + "step": 1486500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7386, + "step": 1486600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1486700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1486800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7258, + "step": 1486900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7415, + "step": 1487000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7301, + "step": 1487100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7379, + "step": 1487200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1487300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.728, + "step": 1487400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1487500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1487600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1487700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1487800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1487900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1488000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7352, + "step": 1488100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7313, + "step": 1488200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7416, + "step": 1488300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1488400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7305, + "step": 1488500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1488600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1488700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7336, + "step": 1488800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1488900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1489000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1489100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7378, + "step": 1489200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7267, + "step": 1489300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1489400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7323, + "step": 1489500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1489600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7296, + "step": 1489700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7382, + "step": 1489800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1489900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1490000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6925473213195801, + "eval_runtime": 194.8607, + "eval_samples_per_second": 256.594, + "eval_steps_per_second": 2.007, + "step": 1490000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7311, + "step": 1490100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7235, + "step": 1490200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1490300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.739, + "step": 1490400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7326, + "step": 1490500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1490600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7254, + "step": 1490700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1490800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7387, + "step": 1490900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7395, + "step": 1491000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7263, + "step": 1491100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7259, + "step": 1491200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7282, + "step": 1491300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7289, + "step": 1491400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7264, + "step": 1491500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7398, + "step": 1491600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1491700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7306, + "step": 1491800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7277, + "step": 1491900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1492000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.734, + "step": 1492100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1492200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7315, + "step": 1492300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.732, + "step": 1492400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7314, + "step": 1492500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1492600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7444, + "step": 1492700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1492800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7361, + "step": 1492900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7283, + "step": 1493000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7317, + "step": 1493100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7351, + "step": 1493200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7364, + "step": 1493300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7355, + "step": 1493400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7263, + "step": 1493500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7345, + "step": 1493600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1493700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1493800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7353, + "step": 1493900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 1494000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1494100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1494200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1494300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7375, + "step": 1494400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7303, + "step": 1494500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1494600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.73, + "step": 1494700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7243, + "step": 1494800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7278, + "step": 1494900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1495000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6898379921913147, + "eval_runtime": 194.3102, + "eval_samples_per_second": 257.32, + "eval_steps_per_second": 2.012, + "step": 1495000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7337, + "step": 1495100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7318, + "step": 1495200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1495300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7424, + "step": 1495400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7321, + "step": 1495500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7309, + "step": 1495600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7297, + "step": 1495700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7338, + "step": 1495800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1495900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7393, + "step": 1496000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7294, + "step": 1496100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1496200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7295, + "step": 1496300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7332, + "step": 1496400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7329, + "step": 1496500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7347, + "step": 1496600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7358, + "step": 1496700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1496800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7209, + "step": 1496900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1497000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7307, + "step": 1497100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 1497200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7288, + "step": 1497300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1497400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7359, + "step": 1497500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1497600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7346, + "step": 1497700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7383, + "step": 1497800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 1497900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7391, + "step": 1498000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7316, + "step": 1498100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7404, + "step": 1498200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.731, + "step": 1498300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7365, + "step": 1498400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7291, + "step": 1498500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1498600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7327, + "step": 1498700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7302, + "step": 1498800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7187, + "step": 1498900 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.727, + "step": 1499000 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.735, + "step": 1499100 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7292, + "step": 1499200 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7285, + "step": 1499300 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.738, + "step": 1499400 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7349, + "step": 1499500 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7342, + "step": 1499600 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 1499700 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7381, + "step": 1499800 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 0.7354, + "step": 1499900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7359, + "step": 1500000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6918967366218567, + "eval_runtime": 194.9655, + "eval_samples_per_second": 256.456, + "eval_steps_per_second": 2.005, + "step": 1500000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7272, + "step": 1500100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7268, + "step": 1500200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7313, + "step": 1500300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7333, + "step": 1500400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7255, + "step": 1500500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7278, + "step": 1500600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7271, + "step": 1500700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7308, + "step": 1500800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7289, + "step": 1500900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7241, + "step": 1501000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7249, + "step": 1501100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7293, + "step": 1501200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7308, + "step": 1501300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7314, + "step": 1501400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7278, + "step": 1501500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7297, + "step": 1501600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7234, + "step": 1501700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7254, + "step": 1501800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7289, + "step": 1501900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7346, + "step": 1502000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7304, + "step": 1502100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7259, + "step": 1502200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7283, + "step": 1502300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7215, + "step": 1502400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.728, + "step": 1502500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7302, + "step": 1502600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.727, + "step": 1502700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7227, + "step": 1502800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7242, + "step": 1502900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7206, + "step": 1503000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7254, + "step": 1503100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7315, + "step": 1503200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7187, + "step": 1503300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7234, + "step": 1503400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1503500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7196, + "step": 1503600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7236, + "step": 1503700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7209, + "step": 1503800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7206, + "step": 1503900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7183, + "step": 1504000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7168, + "step": 1504100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7197, + "step": 1504200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.721, + "step": 1504300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7281, + "step": 1504400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7265, + "step": 1504500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7203, + "step": 1504600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7206, + "step": 1504700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7174, + "step": 1504800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1504900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7209, + "step": 1505000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6809056997299194, + "eval_runtime": 194.8193, + "eval_samples_per_second": 256.648, + "eval_steps_per_second": 2.007, + "step": 1505000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7196, + "step": 1505100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7243, + "step": 1505200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7185, + "step": 1505300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7202, + "step": 1505400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7297, + "step": 1505500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7218, + "step": 1505600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7225, + "step": 1505700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7212, + "step": 1505800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7239, + "step": 1505900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.724, + "step": 1506000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7203, + "step": 1506100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7301, + "step": 1506200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7199, + "step": 1506300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7164, + "step": 1506400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7187, + "step": 1506500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7284, + "step": 1506600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7187, + "step": 1506700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7226, + "step": 1506800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7237, + "step": 1506900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7169, + "step": 1507000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7187, + "step": 1507100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7168, + "step": 1507200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.726, + "step": 1507300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1507400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1507500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7185, + "step": 1507600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7306, + "step": 1507700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.72, + "step": 1507800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7236, + "step": 1507900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7196, + "step": 1508000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7257, + "step": 1508100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7205, + "step": 1508200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7147, + "step": 1508300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7227, + "step": 1508400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7126, + "step": 1508500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.719, + "step": 1508600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7176, + "step": 1508700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7249, + "step": 1508800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7215, + "step": 1508900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7186, + "step": 1509000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7171, + "step": 1509100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7173, + "step": 1509200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7211, + "step": 1509300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7211, + "step": 1509400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1509500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7198, + "step": 1509600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7269, + "step": 1509700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1509800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7303, + "step": 1509900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7231, + "step": 1510000 + }, + { + "epoch": 2.05, + "eval_loss": 0.679633617401123, + "eval_runtime": 194.6144, + "eval_samples_per_second": 256.918, + "eval_steps_per_second": 2.009, + "step": 1510000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7249, + "step": 1510100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7174, + "step": 1510200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7204, + "step": 1510300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7169, + "step": 1510400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7209, + "step": 1510500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7202, + "step": 1510600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7302, + "step": 1510700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7199, + "step": 1510800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7227, + "step": 1510900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7146, + "step": 1511000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7259, + "step": 1511100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7283, + "step": 1511200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7187, + "step": 1511300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7211, + "step": 1511400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7227, + "step": 1511500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1511600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7179, + "step": 1511700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7218, + "step": 1511800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7221, + "step": 1511900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1512000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7198, + "step": 1512100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1512200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7201, + "step": 1512300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7207, + "step": 1512400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7284, + "step": 1512500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7146, + "step": 1512600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1512700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.72, + "step": 1512800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7134, + "step": 1512900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1513000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7212, + "step": 1513100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7159, + "step": 1513200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1513300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7248, + "step": 1513400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7174, + "step": 1513500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7186, + "step": 1513600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1513700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7157, + "step": 1513800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7176, + "step": 1513900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7268, + "step": 1514000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7254, + "step": 1514100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1514200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7225, + "step": 1514300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.716, + "step": 1514400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7202, + "step": 1514500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1514600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7122, + "step": 1514700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7224, + "step": 1514800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7192, + "step": 1514900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1515000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6776483654975891, + "eval_runtime": 194.7764, + "eval_samples_per_second": 256.705, + "eval_steps_per_second": 2.007, + "step": 1515000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7174, + "step": 1515100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.715, + "step": 1515200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1515300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7224, + "step": 1515400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7182, + "step": 1515500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1515600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7151, + "step": 1515700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1515800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7182, + "step": 1515900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7246, + "step": 1516000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7205, + "step": 1516100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7223, + "step": 1516200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7211, + "step": 1516300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7189, + "step": 1516400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7173, + "step": 1516500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7178, + "step": 1516600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7181, + "step": 1516700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7281, + "step": 1516800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7157, + "step": 1516900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7126, + "step": 1517000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7168, + "step": 1517100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1517200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1517300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.722, + "step": 1517400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.713, + "step": 1517500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7171, + "step": 1517600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7238, + "step": 1517700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7186, + "step": 1517800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7158, + "step": 1517900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7194, + "step": 1518000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1518100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7194, + "step": 1518200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7205, + "step": 1518300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7234, + "step": 1518400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7258, + "step": 1518500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7138, + "step": 1518600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7202, + "step": 1518700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7233, + "step": 1518800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7181, + "step": 1518900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7192, + "step": 1519000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7175, + "step": 1519100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1519200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1519300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7148, + "step": 1519400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7142, + "step": 1519500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7199, + "step": 1519600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.72, + "step": 1519700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7173, + "step": 1519800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7161, + "step": 1519900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1520000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6772561073303223, + "eval_runtime": 195.0811, + "eval_samples_per_second": 256.304, + "eval_steps_per_second": 2.004, + "step": 1520000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7194, + "step": 1520100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1520200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7179, + "step": 1520300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7187, + "step": 1520400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.717, + "step": 1520500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7147, + "step": 1520600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7178, + "step": 1520700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7238, + "step": 1520800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7135, + "step": 1520900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1521000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1521100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7213, + "step": 1521200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1521300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1521400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7205, + "step": 1521500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7158, + "step": 1521600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7196, + "step": 1521700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7122, + "step": 1521800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.716, + "step": 1521900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7145, + "step": 1522000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7206, + "step": 1522100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7156, + "step": 1522200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7158, + "step": 1522300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1522400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7148, + "step": 1522500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1522600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1522700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1522800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1522900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.715, + "step": 1523000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.713, + "step": 1523100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7211, + "step": 1523200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7151, + "step": 1523300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1523400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1523500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7198, + "step": 1523600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7207, + "step": 1523700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1523800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7195, + "step": 1523900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7127, + "step": 1524000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7197, + "step": 1524100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7214, + "step": 1524200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1524300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1524400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7209, + "step": 1524500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1524600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7147, + "step": 1524700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7195, + "step": 1524800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7216, + "step": 1524900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7173, + "step": 1525000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6770954132080078, + "eval_runtime": 194.5241, + "eval_samples_per_second": 257.038, + "eval_steps_per_second": 2.01, + "step": 1525000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1525100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7154, + "step": 1525200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7136, + "step": 1525300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1525400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7161, + "step": 1525500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1525600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1525700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1525800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7141, + "step": 1525900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7136, + "step": 1526000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1526100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1526200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1526300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7171, + "step": 1526400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1526500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7226, + "step": 1526600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7177, + "step": 1526700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1526800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1526900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7112, + "step": 1527000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7156, + "step": 1527100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1527200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1527300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1527400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1527500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1527600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1527700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1527800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7199, + "step": 1527900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1528000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7256, + "step": 1528100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1528200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7231, + "step": 1528300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1528400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1528500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7213, + "step": 1528600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7143, + "step": 1528700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7206, + "step": 1528800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1528900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7113, + "step": 1529000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1529100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7189, + "step": 1529200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1529300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1529400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7185, + "step": 1529500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7176, + "step": 1529600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1529700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7145, + "step": 1529800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7129, + "step": 1529900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.718, + "step": 1530000 + }, + { + "epoch": 2.05, + "eval_loss": 0.6751891374588013, + "eval_runtime": 194.7816, + "eval_samples_per_second": 256.698, + "eval_steps_per_second": 2.007, + "step": 1530000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7129, + "step": 1530100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7177, + "step": 1530200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7136, + "step": 1530300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1530400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7152, + "step": 1530500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1530600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7188, + "step": 1530700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7175, + "step": 1530800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7179, + "step": 1530900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7207, + "step": 1531000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1531100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7213, + "step": 1531200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1531300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7164, + "step": 1531400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7171, + "step": 1531500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7207, + "step": 1531600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7166, + "step": 1531700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7168, + "step": 1531800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7177, + "step": 1531900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1532000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1532100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7186, + "step": 1532200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1532300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7172, + "step": 1532400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7164, + "step": 1532500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1532600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7186, + "step": 1532700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7041, + "step": 1532800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7252, + "step": 1532900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1533000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1533100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1533200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1533300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1533400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1533500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7166, + "step": 1533600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7165, + "step": 1533700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1533800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1533900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7198, + "step": 1534000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7146, + "step": 1534100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1534200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7189, + "step": 1534300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7147, + "step": 1534400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1534500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7198, + "step": 1534600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1534700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7143, + "step": 1534800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1534900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1535000 + }, + { + "epoch": 2.05, + "eval_loss": 0.67606520652771, + "eval_runtime": 195.8147, + "eval_samples_per_second": 255.343, + "eval_steps_per_second": 1.997, + "step": 1535000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.716, + "step": 1535100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7181, + "step": 1535200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1535300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7206, + "step": 1535400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.714, + "step": 1535500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7142, + "step": 1535600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1535700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.722, + "step": 1535800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7174, + "step": 1535900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1536000 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1536100 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1536200 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1536300 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7185, + "step": 1536400 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1536500 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1536600 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7223, + "step": 1536700 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.725, + "step": 1536800 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.716, + "step": 1536900 + }, + { + "epoch": 2.05, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1537000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.707, + "step": 1537100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1537200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1537300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1537400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7156, + "step": 1537500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7159, + "step": 1537600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1537700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7123, + "step": 1537800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1537900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1538000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1538100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1538200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7249, + "step": 1538300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1538400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1538500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1538600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1538700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1538800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1538900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1539000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7161, + "step": 1539100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1539200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7193, + "step": 1539300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7151, + "step": 1539400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1539500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1539600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7096, + "step": 1539700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1539800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1539900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1540000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6728585362434387, + "eval_runtime": 194.5375, + "eval_samples_per_second": 257.02, + "eval_steps_per_second": 2.01, + "step": 1540000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1540100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1540200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7127, + "step": 1540300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7181, + "step": 1540400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1540500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7181, + "step": 1540600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7152, + "step": 1540700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7143, + "step": 1540800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7134, + "step": 1540900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7148, + "step": 1541000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1541100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1541200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1541300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1541400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7142, + "step": 1541500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1541600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1541700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7152, + "step": 1541800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1541900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1542000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1542100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1542200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1542300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7143, + "step": 1542400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7156, + "step": 1542500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1542600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1542700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.714, + "step": 1542800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1542900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1543000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1543100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1543200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1543300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1543400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1543500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7143, + "step": 1543600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1543700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7166, + "step": 1543800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1543900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7212, + "step": 1544000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1544100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7132, + "step": 1544200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1544300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7172, + "step": 1544400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7238, + "step": 1544500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7136, + "step": 1544600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1544700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1544800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1544900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7148, + "step": 1545000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6715091466903687, + "eval_runtime": 194.575, + "eval_samples_per_second": 256.97, + "eval_steps_per_second": 2.01, + "step": 1545000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1545100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1545200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1545300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1545400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1545500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1545600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1545700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1545800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7152, + "step": 1545900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1546000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7148, + "step": 1546100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1546200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7173, + "step": 1546300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7147, + "step": 1546400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1546500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1546600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7164, + "step": 1546700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1546800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1546900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1547000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7134, + "step": 1547100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7135, + "step": 1547200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7185, + "step": 1547300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7145, + "step": 1547400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1547500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1547600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1547700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7138, + "step": 1547800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1547900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7192, + "step": 1548000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1548100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.717, + "step": 1548200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7244, + "step": 1548300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1548400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1548500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7157, + "step": 1548600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7177, + "step": 1548700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1548800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1548900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7191, + "step": 1549000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.717, + "step": 1549100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1549200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1549300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7179, + "step": 1549400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7164, + "step": 1549500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1549600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1549700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1549800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1549900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1550000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6696457862854004, + "eval_runtime": 194.8005, + "eval_samples_per_second": 256.673, + "eval_steps_per_second": 2.007, + "step": 1550000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1550100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1550200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7117, + "step": 1550300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7129, + "step": 1550400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1550500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1550600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1550700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1550800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1550900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1551000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1551100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1551200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7132, + "step": 1551300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1551400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1551500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7174, + "step": 1551600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1551700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1551800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1551900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1552000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.707, + "step": 1552100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7122, + "step": 1552200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7161, + "step": 1552300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1552400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1552500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1552600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7182, + "step": 1552700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1552800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1552900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1553000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1553100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7172, + "step": 1553200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1553300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.715, + "step": 1553400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1553500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1553600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1553700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7141, + "step": 1553800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1553900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1554000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1554100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1554200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7001, + "step": 1554300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1554400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1554500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1554600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1554700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1554800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1554900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1555000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6720991134643555, + "eval_runtime": 194.5489, + "eval_samples_per_second": 257.005, + "eval_steps_per_second": 2.01, + "step": 1555000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1555100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7181, + "step": 1555200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1555300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1555400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1555500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1555600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1555700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1555800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7135, + "step": 1555900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7179, + "step": 1556000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1556100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1556200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7143, + "step": 1556300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7109, + "step": 1556400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1556500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1556600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1556700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1556800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1556900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1557000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1557100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7182, + "step": 1557200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1557300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1557400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7109, + "step": 1557500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1557600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1557700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7158, + "step": 1557800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7183, + "step": 1557900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1558000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1558100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7159, + "step": 1558200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1558300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1558400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1558500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1558600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7096, + "step": 1558700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1558800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1558900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7156, + "step": 1559000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7141, + "step": 1559100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1559200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1559300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1559400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1559500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7168, + "step": 1559600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1559700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1559800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1559900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1560000 + }, + { + "epoch": 2.06, + "eval_loss": 0.67218416929245, + "eval_runtime": 194.3337, + "eval_samples_per_second": 257.289, + "eval_steps_per_second": 2.012, + "step": 1560000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1560100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1560200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1560300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7171, + "step": 1560400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1560500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7173, + "step": 1560600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1560700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7188, + "step": 1560800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.718, + "step": 1560900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7176, + "step": 1561000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1561100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1561200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1561300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1561400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.706, + "step": 1561500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1561600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1561700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7122, + "step": 1561800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1561900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1562000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1562100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1562200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7136, + "step": 1562300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1562400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1562500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1562600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1562700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7159, + "step": 1562800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1562900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1563000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1563100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1563200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7157, + "step": 1563300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1563400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1563500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7127, + "step": 1563600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1563700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7012, + "step": 1563800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1563900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1564000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1564100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1564200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1564300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1564400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.715, + "step": 1564500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1564600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7196, + "step": 1564700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1564800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7138, + "step": 1564900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7109, + "step": 1565000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6707226037979126, + "eval_runtime": 194.5176, + "eval_samples_per_second": 257.046, + "eval_steps_per_second": 2.01, + "step": 1565000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1565100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1565200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1565300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7113, + "step": 1565400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1565500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1565600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1565700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1565800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1565900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1566000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1566100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1566200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7148, + "step": 1566300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1566400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1566500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1566600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1566700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7156, + "step": 1566800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7127, + "step": 1566900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1567000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1567100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7126, + "step": 1567200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1567300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1567400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1567500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1567600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1567700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1567800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1567900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7179, + "step": 1568000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1568100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7117, + "step": 1568200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1568300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1568400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1568500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1568600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1568700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1568800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7091, + "step": 1568900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1569000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1569100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1569200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7165, + "step": 1569300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1569400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1569500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7109, + "step": 1569600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1569700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.715, + "step": 1569800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1569900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1570000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6680857539176941, + "eval_runtime": 198.2262, + "eval_samples_per_second": 252.237, + "eval_steps_per_second": 1.972, + "step": 1570000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1570100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1570200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7099, + "step": 1570300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7124, + "step": 1570400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1570500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1570600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1570700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1570800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1570900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1571000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1571100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1571200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7142, + "step": 1571300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6975, + "step": 1571400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1571500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1571600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1571700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7135, + "step": 1571800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1571900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1572000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1572100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1572200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1572300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1572400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1572500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1572600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1572700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7156, + "step": 1572800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1572900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7099, + "step": 1573000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1573100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1573200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1573300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1573400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1573500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6964, + "step": 1573600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7113, + "step": 1573700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7154, + "step": 1573800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1573900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1574000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7195, + "step": 1574100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7161, + "step": 1574200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1574300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1574400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1574500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1574600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1574700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1574800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1574900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1575000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6703412532806396, + "eval_runtime": 194.9337, + "eval_samples_per_second": 256.497, + "eval_steps_per_second": 2.006, + "step": 1575000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1575100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1575200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1575300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7176, + "step": 1575400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1575500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1575600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1575700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1575800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1575900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1576000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1576100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7127, + "step": 1576200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1576300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1576400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1576500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1576600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1576700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1576800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1576900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1577000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1577100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7147, + "step": 1577200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1577300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1577400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1577500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1577600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1577700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1577800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7091, + "step": 1577900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7132, + "step": 1578000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7215, + "step": 1578100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1578200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1578300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1578400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1578500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7146, + "step": 1578600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7091, + "step": 1578700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1578800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1578900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1579000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1579100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1579200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1579300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1579400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1579500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1579600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1579700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1579800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1579900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1580000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6690846085548401, + "eval_runtime": 194.5913, + "eval_samples_per_second": 256.949, + "eval_steps_per_second": 2.009, + "step": 1580000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7124, + "step": 1580100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7173, + "step": 1580200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7122, + "step": 1580300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1580400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1580500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1580600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7146, + "step": 1580700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1580800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1580900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1581000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7124, + "step": 1581100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1581200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1581300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1581400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7127, + "step": 1581500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1581600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1581700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1581800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7154, + "step": 1581900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1582000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1582100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1582200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7112, + "step": 1582300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1582400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1582500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1582600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1582700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1582800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1582900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1583000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1583100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7006, + "step": 1583200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1583300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1583400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7041, + "step": 1583500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1583600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1583700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1583800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1583900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1584000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1584100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1584200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1584300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1584400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1584500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1584600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1584700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1584800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1584900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1585000 + }, + { + "epoch": 2.06, + "eval_loss": 0.669644296169281, + "eval_runtime": 194.4885, + "eval_samples_per_second": 257.085, + "eval_steps_per_second": 2.01, + "step": 1585000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1585100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1585200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7096, + "step": 1585300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7171, + "step": 1585400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1585500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1585600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1585700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1585800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1585900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1586000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6987, + "step": 1586100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7126, + "step": 1586200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1586300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1586400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1586500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1586600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1586700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1586800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1586900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.717, + "step": 1587000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1587100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.707, + "step": 1587200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1587300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1587400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1587500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1587600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1587700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1587800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1587900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1588000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1588100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.714, + "step": 1588200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1588300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1588400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1588500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1588600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.707, + "step": 1588700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1588800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1588900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1589000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1589100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1589200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1589300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6999, + "step": 1589400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7112, + "step": 1589500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7141, + "step": 1589600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7123, + "step": 1589700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1589800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7099, + "step": 1589900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1590000 + }, + { + "epoch": 2.06, + "eval_loss": 0.670275092124939, + "eval_runtime": 194.6148, + "eval_samples_per_second": 256.918, + "eval_steps_per_second": 2.009, + "step": 1590000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1590100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1590200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6964, + "step": 1590300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7016, + "step": 1590400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1590500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1590600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7091, + "step": 1590700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1590800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1590900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1591000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1591100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1591200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1591300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7102, + "step": 1591400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1591500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1591600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7099, + "step": 1591700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.72, + "step": 1591800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1591900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1592000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.72, + "step": 1592100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1592200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1592300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.707, + "step": 1592400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7148, + "step": 1592500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7162, + "step": 1592600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1592700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1592800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1592900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1593000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1593100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1593200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1593300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1593400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1593500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7102, + "step": 1593600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7132, + "step": 1593700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7103, + "step": 1593800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1593900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7102, + "step": 1594000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7005, + "step": 1594100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1594200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1594300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1594400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7167, + "step": 1594500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1594600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1594700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1594800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1594900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1595000 + }, + { + "epoch": 2.06, + "eval_loss": 0.669308602809906, + "eval_runtime": 194.3701, + "eval_samples_per_second": 257.241, + "eval_steps_per_second": 2.012, + "step": 1595000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7145, + "step": 1595100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1595200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1595300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1595400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1595500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1595600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7137, + "step": 1595700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1595800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6988, + "step": 1595900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1596000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1596100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1596200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1596300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1596400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1596500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6979, + "step": 1596600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7135, + "step": 1596700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7129, + "step": 1596800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1596900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1597000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1597100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.713, + "step": 1597200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1597300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1597400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1597500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7031, + "step": 1597600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1597700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7152, + "step": 1597800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7151, + "step": 1597900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1598000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1598100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7012, + "step": 1598200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1598300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1598400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1598500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1598600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7067, + "step": 1598700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1598800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1598900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1599000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1599100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6976, + "step": 1599200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1599300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1599400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1599500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1599600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.699, + "step": 1599700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6994, + "step": 1599800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7131, + "step": 1599900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7123, + "step": 1600000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6678040623664856, + "eval_runtime": 194.6108, + "eval_samples_per_second": 256.923, + "eval_steps_per_second": 2.009, + "step": 1600000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7179, + "step": 1600100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1600200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6944, + "step": 1600300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1600400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1600500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1600600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1600700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7124, + "step": 1600800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1600900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7185, + "step": 1601000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1601100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1601200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1601300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1601400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1601500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7041, + "step": 1601600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1601700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1601800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1601900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1602000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1602100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1602200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1602300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6998, + "step": 1602400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7157, + "step": 1602500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7127, + "step": 1602600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1602700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1602800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7067, + "step": 1602900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7145, + "step": 1603000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1603100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6994, + "step": 1603200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1603300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1603400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1603500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1603600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7134, + "step": 1603700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1603800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1603900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7067, + "step": 1604000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1604100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1604200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1604300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7109, + "step": 1604400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1604500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1604600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1604700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7135, + "step": 1604800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6999, + "step": 1604900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1605000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6686995029449463, + "eval_runtime": 194.3388, + "eval_samples_per_second": 257.283, + "eval_steps_per_second": 2.012, + "step": 1605000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1605100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1605200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7001, + "step": 1605300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1605400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1605500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1605600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1605700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1605800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1605900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1606000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1606100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1606200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1606300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1606400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1606500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1606600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6964, + "step": 1606700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1606800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7158, + "step": 1606900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1607000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1607100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1607200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6945, + "step": 1607300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1607400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7177, + "step": 1607500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7112, + "step": 1607600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1607700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1607800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1607900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1608000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7126, + "step": 1608100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1608200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7112, + "step": 1608300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1608400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6993, + "step": 1608500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.713, + "step": 1608600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1608700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1608800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1608900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7176, + "step": 1609000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7128, + "step": 1609100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1609200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1609300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1609400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1609500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1609600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1609700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1609800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7118, + "step": 1609900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1610000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6666762232780457, + "eval_runtime": 194.4528, + "eval_samples_per_second": 257.132, + "eval_steps_per_second": 2.011, + "step": 1610000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1610100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7067, + "step": 1610200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1610300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1610400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7163, + "step": 1610500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1610600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7096, + "step": 1610700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1610800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1610900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7046, + "step": 1611000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1611100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7123, + "step": 1611200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1611300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1611400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7109, + "step": 1611500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7132, + "step": 1611600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1611700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1611800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1611900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1612000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1612100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.707, + "step": 1612200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1612300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1612400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7124, + "step": 1612500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1612600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1612700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1612800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1612900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7155, + "step": 1613000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1613100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1613200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1613300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1613400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1613500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1613600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1613700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1613800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1613900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1614000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7132, + "step": 1614100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1614200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1614300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1614400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7165, + "step": 1614500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1614600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7139, + "step": 1614700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7003, + "step": 1614800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6966, + "step": 1614900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1615000 + }, + { + "epoch": 2.06, + "eval_loss": 0.6679654717445374, + "eval_runtime": 194.502, + "eval_samples_per_second": 257.067, + "eval_steps_per_second": 2.01, + "step": 1615000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1615100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1615200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1615300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7129, + "step": 1615400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1615500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7006, + "step": 1615600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7099, + "step": 1615700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1615800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1615900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1616000 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1616100 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1616200 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1616300 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6978, + "step": 1616400 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1616500 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1616600 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1616700 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1616800 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1616900 + }, + { + "epoch": 2.06, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1617000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7005, + "step": 1617100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1617200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.713, + "step": 1617300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1617400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1617500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1617600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1617700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1617800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1617900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1618000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1618100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7016, + "step": 1618200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1618300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1618400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7067, + "step": 1618500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1618600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1618700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1618800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6907, + "step": 1618900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1619000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.706, + "step": 1619100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1619200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1619300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1619400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1619500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1619600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1619700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7134, + "step": 1619800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1619900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6993, + "step": 1620000 + }, + { + "epoch": 2.07, + "eval_loss": 0.66707444190979, + "eval_runtime": 205.7066, + "eval_samples_per_second": 243.065, + "eval_steps_per_second": 1.901, + "step": 1620000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1620100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1620200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1620300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1620400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1620500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1620600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1620700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1620800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1620900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1621000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1621100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1621200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1621300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7132, + "step": 1621400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1621500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7011, + "step": 1621600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1621700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1621800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1621900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1622000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1622100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1622200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6997, + "step": 1622300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7005, + "step": 1622400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1622500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6995, + "step": 1622600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7019, + "step": 1622700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7011, + "step": 1622800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1622900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1623000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1623100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1623200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1623300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7091, + "step": 1623400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7154, + "step": 1623500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1623600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1623700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1623800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1623900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7146, + "step": 1624000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7046, + "step": 1624100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1624200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1624300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1624400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1624500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1624600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1624700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1624800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1624900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.706, + "step": 1625000 + }, + { + "epoch": 2.07, + "eval_loss": 0.6661863327026367, + "eval_runtime": 200.4343, + "eval_samples_per_second": 249.458, + "eval_steps_per_second": 1.951, + "step": 1625000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7141, + "step": 1625100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.701, + "step": 1625200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7099, + "step": 1625300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1625400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7113, + "step": 1625500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6976, + "step": 1625600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1625700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6983, + "step": 1625800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1625900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1626000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1626100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1626200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1626300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1626400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1626500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1626600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1626700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1626800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1626900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1627000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1627100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1627200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1627300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1627400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1627500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7046, + "step": 1627600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.706, + "step": 1627700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1627800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1627900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1628000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1628100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1628200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1628300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1628400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1628500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6988, + "step": 1628600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6993, + "step": 1628700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1628800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1628900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1629000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1629100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1629200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1629300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6993, + "step": 1629400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1629500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1629600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.714, + "step": 1629700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1629800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7114, + "step": 1629900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1630000 + }, + { + "epoch": 2.07, + "eval_loss": 0.6691371202468872, + "eval_runtime": 208.3903, + "eval_samples_per_second": 239.934, + "eval_steps_per_second": 1.876, + "step": 1630000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1630100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1630200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1630300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7138, + "step": 1630400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1630500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1630600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1630700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1630800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1630900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1631000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1631100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1631200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1631300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1631400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1631500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1631600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1631700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6986, + "step": 1631800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1631900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1632000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1632100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1632200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7149, + "step": 1632300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1632400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1632500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1632600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7002, + "step": 1632700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1632800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1632900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1633000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1633100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6985, + "step": 1633200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1633300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.715, + "step": 1633400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6986, + "step": 1633500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1633600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1633700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1633800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7121, + "step": 1633900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1634000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7003, + "step": 1634100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1634200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1634300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1634400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1634500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1634600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1634700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1634800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1634900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1635000 + }, + { + "epoch": 2.07, + "eval_loss": 0.6663634181022644, + "eval_runtime": 203.2205, + "eval_samples_per_second": 246.038, + "eval_steps_per_second": 1.924, + "step": 1635000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1635100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1635200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1635300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6992, + "step": 1635400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1635500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1635600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1635700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1635800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1635900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1636000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1636100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1636200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1636300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7119, + "step": 1636400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7016, + "step": 1636500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1636600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1636700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1636800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1636900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1637000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1637100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1637200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1637300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1637400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1637500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7007, + "step": 1637600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1637700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1637800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7046, + "step": 1637900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1638000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1638100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7046, + "step": 1638200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1638300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6945, + "step": 1638400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1638500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7009, + "step": 1638600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6992, + "step": 1638700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1638800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1638900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1639000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1639100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1639200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1639300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1639400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1639500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1639600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1639700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1639800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1639900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1640000 + }, + { + "epoch": 2.07, + "eval_loss": 0.6667556166648865, + "eval_runtime": 205.6474, + "eval_samples_per_second": 243.135, + "eval_steps_per_second": 1.901, + "step": 1640000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1640100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7113, + "step": 1640200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7125, + "step": 1640300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1640400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1640500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1640600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7006, + "step": 1640700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1640800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1640900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1641000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1641100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1641200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1641300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1641400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1641500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7005, + "step": 1641600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1641700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1641800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1641900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1642000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1642100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1642200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1642300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1642400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1642500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1642600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1642700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7067, + "step": 1642800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6989, + "step": 1642900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1643000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1643100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1643200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1643300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1643400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1643500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1643600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7019, + "step": 1643700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1643800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1643900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7102, + "step": 1644000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1644100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1644200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1644300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1644400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1644500 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1644600 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1644700 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1644800 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1644900 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1645000 + }, + { + "epoch": 2.07, + "eval_loss": 0.6672505736351013, + "eval_runtime": 206.6883, + "eval_samples_per_second": 241.91, + "eval_steps_per_second": 1.892, + "step": 1645000 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6968, + "step": 1645100 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7011, + "step": 1645200 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.711, + "step": 1645300 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1645400 + }, + { + "epoch": 2.07, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1645500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1645600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7107, + "step": 1645700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1645800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1645900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1646000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1646100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7005, + "step": 1646200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6984, + "step": 1646300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1646400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1646500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1646600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6956, + "step": 1646700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1646800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7016, + "step": 1646900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6973, + "step": 1647000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1647100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1647200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1647300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7001, + "step": 1647400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1647500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7129, + "step": 1647600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1647700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1647800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6992, + "step": 1647900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1648000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1648100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1648200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1648300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7144, + "step": 1648400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1648500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7031, + "step": 1648600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1648700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1648800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1648900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1649000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1649100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1649200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1649300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1649400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1649500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1649600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1649700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.698, + "step": 1649800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1649900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1650000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6658620238304138, + "eval_runtime": 206.7594, + "eval_samples_per_second": 241.827, + "eval_steps_per_second": 1.891, + "step": 1650000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1650100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1650200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7096, + "step": 1650300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7123, + "step": 1650400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1650500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1650600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1650700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1650800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6974, + "step": 1650900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1651000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1651100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1651200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7, + "step": 1651300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7002, + "step": 1651400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6995, + "step": 1651500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1651600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1651700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.699, + "step": 1651800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.696, + "step": 1651900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1652000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1652100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7009, + "step": 1652200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1652300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1652400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1652500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1652600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1652700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1652800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1652900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1653000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6984, + "step": 1653100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7102, + "step": 1653200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1653300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7093, + "step": 1653400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7002, + "step": 1653500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1653600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.714, + "step": 1653700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.701, + "step": 1653800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1653900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1654000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1654100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1654200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1654300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1654400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6968, + "step": 1654500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6998, + "step": 1654600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1654700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1654800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7067, + "step": 1654900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1655000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6663699746131897, + "eval_runtime": 205.5062, + "eval_samples_per_second": 243.302, + "eval_steps_per_second": 1.903, + "step": 1655000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1655100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1655200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1655300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1655400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1655500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1655600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1655700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7122, + "step": 1655800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1655900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1656000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1656100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7102, + "step": 1656200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7141, + "step": 1656300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1656400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1656500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1656600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6975, + "step": 1656700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1656800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1656900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7002, + "step": 1657000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1657100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1657200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1657300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1657400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1657500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1657600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6981, + "step": 1657700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6989, + "step": 1657800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1657900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1658000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7012, + "step": 1658100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6978, + "step": 1658200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1658300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.698, + "step": 1658400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1658500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6945, + "step": 1658600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1658700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1658800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1658900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.701, + "step": 1659000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1659100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1659200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1659300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7019, + "step": 1659400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1659500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1659600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1659700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1659800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1659900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1660000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6648625135421753, + "eval_runtime": 206.1877, + "eval_samples_per_second": 242.497, + "eval_steps_per_second": 1.896, + "step": 1660000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1660100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1660200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7133, + "step": 1660300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1660400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7046, + "step": 1660500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1660600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1660700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1660800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1660900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7005, + "step": 1661000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1661100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.697, + "step": 1661200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1661300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.702, + "step": 1661400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7111, + "step": 1661500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1661600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6963, + "step": 1661700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1661800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6975, + "step": 1661900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1662000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1662100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1662200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1662300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1662400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6974, + "step": 1662500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1662600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1662700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1662800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6995, + "step": 1662900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1663000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1663100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1663200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6958, + "step": 1663300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1663400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1663500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6956, + "step": 1663600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1663700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1663800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7016, + "step": 1663900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6949, + "step": 1664000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1664100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1664200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6976, + "step": 1664300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1664400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1664500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7147, + "step": 1664600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1664700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1664800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7104, + "step": 1664900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1665000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6649179458618164, + "eval_runtime": 206.4995, + "eval_samples_per_second": 242.131, + "eval_steps_per_second": 1.893, + "step": 1665000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1665100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1665200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1665300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6971, + "step": 1665400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6959, + "step": 1665500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7007, + "step": 1665600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1665700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1665800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7087, + "step": 1665900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1666000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1666100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6982, + "step": 1666200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1666300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1666400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6917, + "step": 1666500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1666600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1666700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1666800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6959, + "step": 1666900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6998, + "step": 1667000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1667100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.706, + "step": 1667200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1667300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1667400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7031, + "step": 1667500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1667600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7005, + "step": 1667700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1667800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1667900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1668000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7153, + "step": 1668100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1668200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1668300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1668400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6955, + "step": 1668500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7002, + "step": 1668600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1668700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.719, + "step": 1668800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1668900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7041, + "step": 1669000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6965, + "step": 1669100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1669200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1669300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1669400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1669500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1669600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1669700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1669800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7001, + "step": 1669900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7012, + "step": 1670000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6655164957046509, + "eval_runtime": 204.3764, + "eval_samples_per_second": 244.647, + "eval_steps_per_second": 1.913, + "step": 1670000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1670100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1670200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1670300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1670400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1670500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1670600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1670700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7062, + "step": 1670800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7106, + "step": 1670900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7092, + "step": 1671000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1671100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1671200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6923, + "step": 1671300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7, + "step": 1671400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7041, + "step": 1671500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1671600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.702, + "step": 1671700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6981, + "step": 1671800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6994, + "step": 1671900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.696, + "step": 1672000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1672100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6954, + "step": 1672200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1672300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1672400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.701, + "step": 1672500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7001, + "step": 1672600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1672700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1672800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.698, + "step": 1672900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7129, + "step": 1673000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.704, + "step": 1673100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6988, + "step": 1673200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7054, + "step": 1673300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1673400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1673500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.697, + "step": 1673600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1673700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1673800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7046, + "step": 1673900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.705, + "step": 1674000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1674100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1674200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1674300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6991, + "step": 1674400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6933, + "step": 1674500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1674600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1674700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1674800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1674900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7007, + "step": 1675000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6647133827209473, + "eval_runtime": 195.5023, + "eval_samples_per_second": 255.752, + "eval_steps_per_second": 2.0, + "step": 1675000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.707, + "step": 1675100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1675200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7003, + "step": 1675300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1675400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7081, + "step": 1675500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7001, + "step": 1675600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1675700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1675800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1675900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7068, + "step": 1676000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7077, + "step": 1676100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1676200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7108, + "step": 1676300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1676400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1676500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1676600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6977, + "step": 1676700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1676800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1676900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1677000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1677100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1677200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1677300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1677400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1677500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7089, + "step": 1677600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1677700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1677800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1677900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7057, + "step": 1678000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1678100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1678200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1678300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7011, + "step": 1678400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7, + "step": 1678500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7051, + "step": 1678600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7095, + "step": 1678700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6923, + "step": 1678800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1678900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6942, + "step": 1679000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.71, + "step": 1679100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.702, + "step": 1679200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6912, + "step": 1679300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6986, + "step": 1679400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7072, + "step": 1679500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7091, + "step": 1679600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7015, + "step": 1679700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7019, + "step": 1679800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1679900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1680000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6660146117210388, + "eval_runtime": 195.8738, + "eval_samples_per_second": 255.266, + "eval_steps_per_second": 1.996, + "step": 1680000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1680100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1680200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1680300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7006, + "step": 1680400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1680500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1680600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1680700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1680800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6991, + "step": 1680900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1681000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7009, + "step": 1681100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1681200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7094, + "step": 1681300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1681400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6968, + "step": 1681500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1681600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1681700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.712, + "step": 1681800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1681900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1682000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1682100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1682200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1682300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1682400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7003, + "step": 1682500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6953, + "step": 1682600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6995, + "step": 1682700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1682800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1682900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7041, + "step": 1683000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6985, + "step": 1683100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7031, + "step": 1683200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7039, + "step": 1683300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1683400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1683500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7116, + "step": 1683600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6977, + "step": 1683700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7025, + "step": 1683800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7034, + "step": 1683900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6962, + "step": 1684000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7013, + "step": 1684100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1684200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6932, + "step": 1684300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7047, + "step": 1684400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6963, + "step": 1684500 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7023, + "step": 1684600 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7079, + "step": 1684700 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7078, + "step": 1684800 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7073, + "step": 1684900 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6997, + "step": 1685000 + }, + { + "epoch": 3.0, + "eval_loss": 0.6658166646957397, + "eval_runtime": 195.353, + "eval_samples_per_second": 255.947, + "eval_steps_per_second": 2.002, + "step": 1685000 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7012, + "step": 1685100 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1685200 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7019, + "step": 1685300 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7, + "step": 1685400 + }, + { + "epoch": 3.0, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7006, + "step": 1685500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1685600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1685700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6957, + "step": 1685800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7085, + "step": 1685900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6997, + "step": 1686000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1686100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1686200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6983, + "step": 1686300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7055, + "step": 1686400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7112, + "step": 1686500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7045, + "step": 1686600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1686700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1686800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7098, + "step": 1686900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.702, + "step": 1687000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7002, + "step": 1687100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6957, + "step": 1687200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7052, + "step": 1687300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7014, + "step": 1687400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.708, + "step": 1687500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7007, + "step": 1687600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7069, + "step": 1687700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1687800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7041, + "step": 1687900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6993, + "step": 1688000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7105, + "step": 1688100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7037, + "step": 1688200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7076, + "step": 1688300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6944, + "step": 1688400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1688500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1688600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7, + "step": 1688700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6936, + "step": 1688800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1688900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6999, + "step": 1689000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6998, + "step": 1689100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7115, + "step": 1689200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7084, + "step": 1689300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6963, + "step": 1689400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1689500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1689600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6991, + "step": 1689700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6922, + "step": 1689800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1689900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6981, + "step": 1690000 + }, + { + "epoch": 3.01, + "eval_loss": 0.6643096208572388, + "eval_runtime": 195.7682, + "eval_samples_per_second": 255.404, + "eval_steps_per_second": 1.997, + "step": 1690000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1690100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7082, + "step": 1690200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7097, + "step": 1690300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7059, + "step": 1690400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7053, + "step": 1690500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1690600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7012, + "step": 1690700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7003, + "step": 1690800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7007, + "step": 1690900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1691000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1691100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6951, + "step": 1691200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1691300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7066, + "step": 1691400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7056, + "step": 1691500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6971, + "step": 1691600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6965, + "step": 1691700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7027, + "step": 1691800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1691900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1692000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1692100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7024, + "step": 1692200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6984, + "step": 1692300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6962, + "step": 1692400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1692500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6964, + "step": 1692600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7016, + "step": 1692700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7006, + "step": 1692800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1692900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6994, + "step": 1693000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7004, + "step": 1693100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1693200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7086, + "step": 1693300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7088, + "step": 1693400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1693500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7042, + "step": 1693600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.703, + "step": 1693700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7033, + "step": 1693800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1693900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7071, + "step": 1694000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6977, + "step": 1694100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1694200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7075, + "step": 1694300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6946, + "step": 1694400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1694500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1694600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1694700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6998, + "step": 1694800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6943, + "step": 1694900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7021, + "step": 1695000 + }, + { + "epoch": 3.01, + "eval_loss": 0.6644836068153381, + "eval_runtime": 195.522, + "eval_samples_per_second": 255.726, + "eval_steps_per_second": 2.0, + "step": 1695000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.698, + "step": 1695100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6978, + "step": 1695200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7063, + "step": 1695300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6977, + "step": 1695400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7017, + "step": 1695500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7083, + "step": 1695600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.709, + "step": 1695700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7035, + "step": 1695800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6989, + "step": 1695900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7018, + "step": 1696000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7028, + "step": 1696100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6993, + "step": 1696200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6983, + "step": 1696300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7113, + "step": 1696400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7036, + "step": 1696500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.702, + "step": 1696600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7022, + "step": 1696700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.695, + "step": 1696800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7136, + "step": 1696900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1697000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7049, + "step": 1697100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6999, + "step": 1697200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7058, + "step": 1697300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6991, + "step": 1697400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7048, + "step": 1697500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7091, + "step": 1697600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6994, + "step": 1697700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1697800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7026, + "step": 1697900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7074, + "step": 1698000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6996, + "step": 1698100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7016, + "step": 1698200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7043, + "step": 1698300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7008, + "step": 1698400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6998, + "step": 1698500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.701, + "step": 1698600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7065, + "step": 1698700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7032, + "step": 1698800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7101, + "step": 1698900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.698, + "step": 1699000 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7029, + "step": 1699100 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7002, + "step": 1699200 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1699300 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7064, + "step": 1699400 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7044, + "step": 1699500 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7012, + "step": 1699600 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7038, + "step": 1699700 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7061, + "step": 1699800 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7, + "step": 1699900 + }, + { + "epoch": 3.01, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.698, + "step": 1700000 + }, + { + "epoch": 3.01, + "eval_loss": 0.6644004583358765, + "eval_runtime": 195.2166, + "eval_samples_per_second": 256.126, + "eval_steps_per_second": 2.003, + "step": 1700000 + } + ], + "max_steps": 8000000.0, + "num_train_epochs": 9223372036854775807, + "total_flos": 2.0282185909429862e+20, + "trial_name": null, + "trial_params": null +}