diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" deleted file mode 100644--- "a/last-checkpoint/trainer_state.json" +++ /dev/null @@ -1,15216 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.1, - "global_step": 2000000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "learning_rate": 7.499999999999999e-07, - "loss": 1.0632, - "step": 1000 - }, - { - "epoch": 0.0, - "learning_rate": 1.4999999999999998e-06, - "loss": 0.9867, - "step": 2000 - }, - { - "epoch": 0.0, - "learning_rate": 2.2499999999999996e-06, - "loss": 0.9811, - "step": 3000 - }, - { - "epoch": 0.0, - "learning_rate": 2.9999999999999997e-06, - "loss": 0.9787, - "step": 4000 - }, - { - "epoch": 0.0, - "learning_rate": 3.7499999999999997e-06, - "loss": 0.9729, - "step": 5000 - }, - { - "epoch": 0.0, - "eval_loss": 0.9619815349578857, - "eval_runtime": 82.8842, - "eval_samples_per_second": 77.216, - "eval_steps_per_second": 0.603, - "step": 5000 - }, - { - "epoch": 0.0, - "learning_rate": 4.499999999999999e-06, - "loss": 0.9491, - "step": 6000 - }, - { - "epoch": 0.0, - "learning_rate": 5.25e-06, - "loss": 0.8618, - "step": 7000 - }, - { - "epoch": 0.0, - "learning_rate": 5.999999999999999e-06, - "loss": 0.8256, - "step": 8000 - }, - { - "epoch": 0.0, - "learning_rate": 6.749999999999999e-06, - "loss": 0.8148, - "step": 9000 - }, - { - "epoch": 0.01, - "learning_rate": 7.499999999999999e-06, - "loss": 0.8019, - "step": 10000 - }, - { - "epoch": 0.01, - "eval_loss": 0.8153918385505676, - "eval_runtime": 78.5433, - "eval_samples_per_second": 81.484, - "eval_steps_per_second": 0.637, - "step": 10000 - }, - { - "epoch": 0.01, - "learning_rate": 8.249999999999999e-06, - "loss": 0.7961, - "step": 11000 - }, - { - "epoch": 0.01, - "learning_rate": 8.999999999999999e-06, - "loss": 0.7914, - "step": 12000 - }, - { - "epoch": 0.01, - "learning_rate": 9.75e-06, - "loss": 0.7849, - "step": 13000 - }, - { - "epoch": 0.01, - "learning_rate": 1.05e-05, - "loss": 0.7817, - "step": 14000 - }, - { - "epoch": 0.01, - "learning_rate": 1.1249999999999999e-05, - "loss": 0.7777, - "step": 15000 - }, - { - "epoch": 0.01, - "eval_loss": 0.7774372696876526, - "eval_runtime": 79.7934, - "eval_samples_per_second": 80.207, - "eval_steps_per_second": 0.627, - "step": 15000 - }, - { - "epoch": 0.01, - "learning_rate": 1.1999999999999999e-05, - "loss": 0.7751, - "step": 16000 - }, - { - "epoch": 0.01, - "learning_rate": 1.2749249999999998e-05, - "loss": 0.7714, - "step": 17000 - }, - { - "epoch": 0.01, - "learning_rate": 1.34985e-05, - "loss": 0.7673, - "step": 18000 - }, - { - "epoch": 0.01, - "learning_rate": 1.42485e-05, - "loss": 0.7649, - "step": 19000 - }, - { - "epoch": 0.01, - "learning_rate": 1.4998499999999999e-05, - "loss": 0.7629, - "step": 20000 - }, - { - "epoch": 0.01, - "eval_loss": 0.7626805305480957, - "eval_runtime": 78.8878, - "eval_samples_per_second": 81.128, - "eval_steps_per_second": 0.634, - "step": 20000 - }, - { - "epoch": 0.01, - "learning_rate": 1.57485e-05, - "loss": 0.7616, - "step": 21000 - }, - { - "epoch": 0.01, - "learning_rate": 1.649775e-05, - "loss": 0.7592, - "step": 22000 - }, - { - "epoch": 0.01, - "learning_rate": 1.7247749999999998e-05, - "loss": 0.7577, - "step": 23000 - }, - { - "epoch": 0.01, - "learning_rate": 1.799775e-05, - "loss": 0.7557, - "step": 24000 - }, - { - "epoch": 0.01, - "learning_rate": 1.8747749999999997e-05, - "loss": 0.7542, - "step": 25000 - }, - { - "epoch": 0.01, - "eval_loss": 0.7557830810546875, - "eval_runtime": 80.9075, - "eval_samples_per_second": 79.103, - "eval_steps_per_second": 0.618, - "step": 25000 - }, - { - "epoch": 0.01, - "learning_rate": 1.9497749999999998e-05, - "loss": 0.7529, - "step": 26000 - }, - { - "epoch": 0.01, - "learning_rate": 2.0246999999999998e-05, - "loss": 0.7526, - "step": 27000 - }, - { - "epoch": 0.01, - "learning_rate": 2.0996999999999996e-05, - "loss": 0.7518, - "step": 28000 - }, - { - "epoch": 0.01, - "learning_rate": 2.1746999999999997e-05, - "loss": 0.7508, - "step": 29000 - }, - { - "epoch": 0.01, - "learning_rate": 2.24955e-05, - "loss": 0.7496, - "step": 30000 - }, - { - "epoch": 0.01, - "eval_loss": 0.7457370162010193, - "eval_runtime": 76.4686, - "eval_samples_per_second": 83.694, - "eval_steps_per_second": 0.654, - "step": 30000 - }, - { - "epoch": 0.02, - "learning_rate": 2.3245499999999996e-05, - "loss": 0.748, - "step": 31000 - }, - { - "epoch": 0.02, - "learning_rate": 2.3995499999999998e-05, - "loss": 0.7472, - "step": 32000 - }, - { - "epoch": 0.02, - "learning_rate": 2.4744749999999997e-05, - "loss": 0.7467, - "step": 33000 - }, - { - "epoch": 0.02, - "learning_rate": 2.549475e-05, - "loss": 0.7465, - "step": 34000 - }, - { - "epoch": 0.02, - "learning_rate": 2.6243999999999998e-05, - "loss": 0.7447, - "step": 35000 - }, - { - "epoch": 0.02, - "eval_loss": 0.7436805963516235, - "eval_runtime": 80.6483, - "eval_samples_per_second": 79.357, - "eval_steps_per_second": 0.62, - "step": 35000 - }, - { - "epoch": 0.02, - "learning_rate": 2.6994e-05, - "loss": 0.7446, - "step": 36000 - }, - { - "epoch": 0.02, - "learning_rate": 2.774325e-05, - "loss": 0.7432, - "step": 37000 - }, - { - "epoch": 0.02, - "learning_rate": 2.849325e-05, - "loss": 0.7435, - "step": 38000 - }, - { - "epoch": 0.02, - "learning_rate": 2.9243249999999995e-05, - "loss": 0.7398, - "step": 39000 - }, - { - "epoch": 0.02, - "learning_rate": 2.9992499999999994e-05, - "loss": 0.7365, - "step": 40000 - }, - { - "epoch": 0.02, - "eval_loss": 0.7301322817802429, - "eval_runtime": 81.961, - "eval_samples_per_second": 78.086, - "eval_steps_per_second": 0.61, - "step": 40000 - }, - { - "epoch": 0.02, - "learning_rate": 3.07425e-05, - "loss": 0.7345, - "step": 41000 - }, - { - "epoch": 0.02, - "learning_rate": 3.149175e-05, - "loss": 0.7318, - "step": 42000 - }, - { - "epoch": 0.02, - "learning_rate": 3.224175e-05, - "loss": 0.7293, - "step": 43000 - }, - { - "epoch": 0.02, - "learning_rate": 3.2990999999999996e-05, - "loss": 0.7263, - "step": 44000 - }, - { - "epoch": 0.02, - "learning_rate": 3.3740999999999994e-05, - "loss": 0.719, - "step": 45000 - }, - { - "epoch": 0.02, - "eval_loss": 0.70879727602005, - "eval_runtime": 80.7536, - "eval_samples_per_second": 79.253, - "eval_steps_per_second": 0.619, - "step": 45000 - }, - { - "epoch": 0.02, - "learning_rate": 3.4489499999999996e-05, - "loss": 0.7129, - "step": 46000 - }, - { - "epoch": 0.02, - "learning_rate": 3.5239499999999994e-05, - "loss": 0.7101, - "step": 47000 - }, - { - "epoch": 0.02, - "learning_rate": 3.59895e-05, - "loss": 0.7054, - "step": 48000 - }, - { - "epoch": 0.02, - "learning_rate": 3.6739499999999996e-05, - "loss": 0.7032, - "step": 49000 - }, - { - "epoch": 0.03, - "learning_rate": 3.74895e-05, - "loss": 0.6997, - "step": 50000 - }, - { - "epoch": 0.03, - "eval_loss": 0.6915566921234131, - "eval_runtime": 80.6444, - "eval_samples_per_second": 79.361, - "eval_steps_per_second": 0.62, - "step": 50000 - }, - { - "epoch": 0.03, - "learning_rate": 3.823875e-05, - "loss": 0.6956, - "step": 51000 - }, - { - "epoch": 0.03, - "learning_rate": 3.898875e-05, - "loss": 0.6912, - "step": 52000 - }, - { - "epoch": 0.03, - "learning_rate": 3.9738e-05, - "loss": 0.6889, - "step": 53000 - }, - { - "epoch": 0.03, - "learning_rate": 4.0487999999999996e-05, - "loss": 0.6873, - "step": 54000 - }, - { - "epoch": 0.03, - "learning_rate": 4.1237999999999994e-05, - "loss": 0.6834, - "step": 55000 - }, - { - "epoch": 0.03, - "eval_loss": 0.6724392771720886, - "eval_runtime": 82.1061, - "eval_samples_per_second": 77.948, - "eval_steps_per_second": 0.609, - "step": 55000 - }, - { - "epoch": 0.03, - "learning_rate": 4.198725e-05, - "loss": 0.6817, - "step": 56000 - }, - { - "epoch": 0.03, - "learning_rate": 4.273724999999999e-05, - "loss": 0.6811, - "step": 57000 - }, - { - "epoch": 0.03, - "learning_rate": 4.34865e-05, - "loss": 0.677, - "step": 58000 - }, - { - "epoch": 0.03, - "learning_rate": 4.4236499999999996e-05, - "loss": 0.6756, - "step": 59000 - }, - { - "epoch": 0.03, - "learning_rate": 4.49865e-05, - "loss": 0.6735, - "step": 60000 - }, - { - "epoch": 0.03, - "eval_loss": 0.6614792346954346, - "eval_runtime": 80.4754, - "eval_samples_per_second": 79.527, - "eval_steps_per_second": 0.621, - "step": 60000 - }, - { - "epoch": 0.03, - "learning_rate": 4.573574999999999e-05, - "loss": 0.6712, - "step": 61000 - }, - { - "epoch": 0.03, - "learning_rate": 4.648574999999999e-05, - "loss": 0.6687, - "step": 62000 - }, - { - "epoch": 0.03, - "learning_rate": 4.7235e-05, - "loss": 0.6685, - "step": 63000 - }, - { - "epoch": 0.03, - "learning_rate": 4.7984999999999995e-05, - "loss": 0.6658, - "step": 64000 - }, - { - "epoch": 0.03, - "learning_rate": 4.8735e-05, - "loss": 0.6629, - "step": 65000 - }, - { - "epoch": 0.03, - "eval_loss": 0.6503811478614807, - "eval_runtime": 79.7102, - "eval_samples_per_second": 80.291, - "eval_steps_per_second": 0.627, - "step": 65000 - }, - { - "epoch": 0.03, - "learning_rate": 4.948424999999999e-05, - "loss": 0.661, - "step": 66000 - }, - { - "epoch": 0.03, - "learning_rate": 5.023425e-05, - "loss": 0.6599, - "step": 67000 - }, - { - "epoch": 0.03, - "learning_rate": 5.09835e-05, - "loss": 0.657, - "step": 68000 - }, - { - "epoch": 0.03, - "learning_rate": 5.173349999999999e-05, - "loss": 0.6563, - "step": 69000 - }, - { - "epoch": 0.04, - "learning_rate": 5.248349999999999e-05, - "loss": 0.6541, - "step": 70000 - }, - { - "epoch": 0.04, - "eval_loss": 0.6402961015701294, - "eval_runtime": 81.1292, - "eval_samples_per_second": 78.887, - "eval_steps_per_second": 0.616, - "step": 70000 - }, - { - "epoch": 0.04, - "learning_rate": 5.323275e-05, - "loss": 0.6524, - "step": 71000 - }, - { - "epoch": 0.04, - "learning_rate": 5.398275e-05, - "loss": 0.6514, - "step": 72000 - }, - { - "epoch": 0.04, - "learning_rate": 5.4732749999999995e-05, - "loss": 0.6488, - "step": 73000 - }, - { - "epoch": 0.04, - "learning_rate": 5.5481999999999995e-05, - "loss": 0.6476, - "step": 74000 - }, - { - "epoch": 0.04, - "learning_rate": 5.623199999999999e-05, - "loss": 0.6474, - "step": 75000 - }, - { - "epoch": 0.04, - "eval_loss": 0.6383292078971863, - "eval_runtime": 80.3652, - "eval_samples_per_second": 79.636, - "eval_steps_per_second": 0.622, - "step": 75000 - }, - { - "epoch": 0.04, - "learning_rate": 5.698125e-05, - "loss": 0.6443, - "step": 76000 - }, - { - "epoch": 0.04, - "learning_rate": 5.773125e-05, - "loss": 0.6436, - "step": 77000 - }, - { - "epoch": 0.04, - "learning_rate": 5.84805e-05, - "loss": 0.6408, - "step": 78000 - }, - { - "epoch": 0.04, - "learning_rate": 5.9230499999999995e-05, - "loss": 0.6409, - "step": 79000 - }, - { - "epoch": 0.04, - "learning_rate": 5.998049999999999e-05, - "loss": 0.6399, - "step": 80000 - }, - { - "epoch": 0.04, - "eval_loss": 0.630017876625061, - "eval_runtime": 81.3873, - "eval_samples_per_second": 78.636, - "eval_steps_per_second": 0.614, - "step": 80000 - }, - { - "epoch": 0.04, - "learning_rate": 6.072974999999999e-05, - "loss": 0.6372, - "step": 81000 - }, - { - "epoch": 0.04, - "learning_rate": 6.147975e-05, - "loss": 0.6375, - "step": 82000 - }, - { - "epoch": 0.04, - "learning_rate": 6.222974999999999e-05, - "loss": 0.6351, - "step": 83000 - }, - { - "epoch": 0.04, - "learning_rate": 6.297974999999999e-05, - "loss": 0.6345, - "step": 84000 - }, - { - "epoch": 0.04, - "learning_rate": 6.372975e-05, - "loss": 0.6331, - "step": 85000 - }, - { - "epoch": 0.04, - "eval_loss": 0.6252214908599854, - "eval_runtime": 81.0014, - "eval_samples_per_second": 79.011, - "eval_steps_per_second": 0.617, - "step": 85000 - }, - { - "epoch": 0.04, - "learning_rate": 6.4479e-05, - "loss": 0.6321, - "step": 86000 - }, - { - "epoch": 0.04, - "learning_rate": 6.5229e-05, - "loss": 0.6313, - "step": 87000 - }, - { - "epoch": 0.04, - "learning_rate": 6.597899999999999e-05, - "loss": 0.6299, - "step": 88000 - }, - { - "epoch": 0.04, - "learning_rate": 6.672824999999999e-05, - "loss": 0.6284, - "step": 89000 - }, - { - "epoch": 0.04, - "learning_rate": 6.74775e-05, - "loss": 0.6264, - "step": 90000 - }, - { - "epoch": 0.04, - "eval_loss": 0.6188452839851379, - "eval_runtime": 79.754, - "eval_samples_per_second": 80.247, - "eval_steps_per_second": 0.627, - "step": 90000 - }, - { - "epoch": 0.05, - "learning_rate": 6.822749999999999e-05, - "loss": 0.6273, - "step": 91000 - }, - { - "epoch": 0.05, - "learning_rate": 6.89775e-05, - "loss": 0.6241, - "step": 92000 - }, - { - "epoch": 0.05, - "learning_rate": 6.972675e-05, - "loss": 0.625, - "step": 93000 - }, - { - "epoch": 0.05, - "learning_rate": 7.047674999999999e-05, - "loss": 0.6227, - "step": 94000 - }, - { - "epoch": 0.05, - "learning_rate": 7.122675e-05, - "loss": 0.6223, - "step": 95000 - }, - { - "epoch": 0.05, - "eval_loss": 0.6109737753868103, - "eval_runtime": 80.7059, - "eval_samples_per_second": 79.3, - "eval_steps_per_second": 0.62, - "step": 95000 - }, - { - "epoch": 0.05, - "learning_rate": 7.197599999999999e-05, - "loss": 0.6206, - "step": 96000 - }, - { - "epoch": 0.05, - "learning_rate": 7.2726e-05, - "loss": 0.6193, - "step": 97000 - }, - { - "epoch": 0.05, - "learning_rate": 7.3476e-05, - "loss": 0.6178, - "step": 98000 - }, - { - "epoch": 0.05, - "learning_rate": 7.422524999999999e-05, - "loss": 0.6179, - "step": 99000 - }, - { - "epoch": 0.05, - "learning_rate": 7.497524999999998e-05, - "loss": 0.6168, - "step": 100000 - }, - { - "epoch": 0.05, - "eval_loss": 0.6086182594299316, - "eval_runtime": 81.3282, - "eval_samples_per_second": 78.693, - "eval_steps_per_second": 0.615, - "step": 100000 - }, - { - "epoch": 0.05, - "learning_rate": 7.572449999999999e-05, - "loss": 0.6163, - "step": 101000 - }, - { - "epoch": 0.05, - "learning_rate": 7.64745e-05, - "loss": 0.6152, - "step": 102000 - }, - { - "epoch": 0.05, - "learning_rate": 7.72245e-05, - "loss": 0.6134, - "step": 103000 - }, - { - "epoch": 0.05, - "learning_rate": 7.797374999999999e-05, - "loss": 0.612, - "step": 104000 - }, - { - "epoch": 0.05, - "learning_rate": 7.872375e-05, - "loss": 0.6118, - "step": 105000 - }, - { - "epoch": 0.05, - "eval_loss": 0.6021054983139038, - "eval_runtime": 80.8303, - "eval_samples_per_second": 79.178, - "eval_steps_per_second": 0.619, - "step": 105000 - }, - { - "epoch": 0.05, - "learning_rate": 7.947225e-05, - "loss": 0.6106, - "step": 106000 - }, - { - "epoch": 0.05, - "learning_rate": 8.022225e-05, - "loss": 0.6099, - "step": 107000 - }, - { - "epoch": 0.05, - "learning_rate": 8.097225e-05, - "loss": 0.609, - "step": 108000 - }, - { - "epoch": 0.05, - "learning_rate": 8.172225e-05, - "loss": 0.6088, - "step": 109000 - }, - { - "epoch": 0.06, - "learning_rate": 8.247225e-05, - "loss": 0.6063, - "step": 110000 - }, - { - "epoch": 0.06, - "eval_loss": 0.5952876806259155, - "eval_runtime": 80.8626, - "eval_samples_per_second": 79.147, - "eval_steps_per_second": 0.618, - "step": 110000 - }, - { - "epoch": 0.06, - "learning_rate": 8.322224999999999e-05, - "loss": 0.6055, - "step": 111000 - }, - { - "epoch": 0.06, - "learning_rate": 8.39715e-05, - "loss": 0.6059, - "step": 112000 - }, - { - "epoch": 0.06, - "learning_rate": 8.471999999999999e-05, - "loss": 0.6036, - "step": 113000 - }, - { - "epoch": 0.06, - "learning_rate": 8.546999999999999e-05, - "loss": 0.6035, - "step": 114000 - }, - { - "epoch": 0.06, - "learning_rate": 8.621999999999998e-05, - "loss": 0.6017, - "step": 115000 - }, - { - "epoch": 0.06, - "eval_loss": 0.5951609015464783, - "eval_runtime": 80.5699, - "eval_samples_per_second": 79.434, - "eval_steps_per_second": 0.621, - "step": 115000 - }, - { - "epoch": 0.06, - "learning_rate": 8.696999999999999e-05, - "loss": 0.5997, - "step": 116000 - }, - { - "epoch": 0.06, - "learning_rate": 8.771924999999998e-05, - "loss": 0.6007, - "step": 117000 - }, - { - "epoch": 0.06, - "learning_rate": 8.846924999999999e-05, - "loss": 0.5999, - "step": 118000 - }, - { - "epoch": 0.06, - "learning_rate": 8.921924999999999e-05, - "loss": 0.5991, - "step": 119000 - }, - { - "epoch": 0.06, - "learning_rate": 8.996924999999998e-05, - "loss": 0.5988, - "step": 120000 - }, - { - "epoch": 0.06, - "eval_loss": 0.5855764746665955, - "eval_runtime": 79.6751, - "eval_samples_per_second": 80.326, - "eval_steps_per_second": 0.628, - "step": 120000 - }, - { - "epoch": 0.06, - "learning_rate": 9.07185e-05, - "loss": 0.5964, - "step": 121000 - }, - { - "epoch": 0.06, - "learning_rate": 9.14685e-05, - "loss": 0.5964, - "step": 122000 - }, - { - "epoch": 0.06, - "learning_rate": 9.221775e-05, - "loss": 0.596, - "step": 123000 - }, - { - "epoch": 0.06, - "learning_rate": 9.296774999999999e-05, - "loss": 0.5956, - "step": 124000 - }, - { - "epoch": 0.06, - "learning_rate": 9.3717e-05, - "loss": 0.5941, - "step": 125000 - }, - { - "epoch": 0.06, - "eval_loss": 0.5847644209861755, - "eval_runtime": 79.7033, - "eval_samples_per_second": 80.298, - "eval_steps_per_second": 0.627, - "step": 125000 - }, - { - "epoch": 0.06, - "learning_rate": 9.446624999999999e-05, - "loss": 0.5936, - "step": 126000 - }, - { - "epoch": 0.06, - "learning_rate": 9.521625e-05, - "loss": 0.5935, - "step": 127000 - }, - { - "epoch": 0.06, - "learning_rate": 9.596624999999999e-05, - "loss": 0.5927, - "step": 128000 - }, - { - "epoch": 0.06, - "learning_rate": 9.671624999999999e-05, - "loss": 0.5925, - "step": 129000 - }, - { - "epoch": 0.07, - "learning_rate": 9.746549999999998e-05, - "loss": 0.5914, - "step": 130000 - }, - { - "epoch": 0.07, - "eval_loss": 0.5818326473236084, - "eval_runtime": 80.9583, - "eval_samples_per_second": 79.053, - "eval_steps_per_second": 0.618, - "step": 130000 - }, - { - "epoch": 0.07, - "learning_rate": 9.821549999999999e-05, - "loss": 0.5887, - "step": 131000 - }, - { - "epoch": 0.07, - "learning_rate": 9.896549999999999e-05, - "loss": 0.5892, - "step": 132000 - }, - { - "epoch": 0.07, - "learning_rate": 9.971475e-05, - "loss": 0.5884, - "step": 133000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010046475, - "loss": 0.5882, - "step": 134000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010121249999999999, - "loss": 0.5879, - "step": 135000 - }, - { - "epoch": 0.07, - "eval_loss": 0.5765495300292969, - "eval_runtime": 78.6913, - "eval_samples_per_second": 81.33, - "eval_steps_per_second": 0.635, - "step": 135000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010196249999999999, - "loss": 0.5872, - "step": 136000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010271249999999998, - "loss": 0.5854, - "step": 137000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010346249999999999, - "loss": 0.5846, - "step": 138000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010421249999999999, - "loss": 0.5845, - "step": 139000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010496174999999998, - "loss": 0.5847, - "step": 140000 - }, - { - "epoch": 0.07, - "eval_loss": 0.5738531351089478, - "eval_runtime": 79.0795, - "eval_samples_per_second": 80.931, - "eval_steps_per_second": 0.632, - "step": 140000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010571174999999999, - "loss": 0.5829, - "step": 141000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010646174999999998, - "loss": 0.5832, - "step": 142000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010721174999999999, - "loss": 0.5826, - "step": 143000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010796099999999999, - "loss": 0.581, - "step": 144000 - }, - { - "epoch": 0.07, - "learning_rate": 0.000108711, - "loss": 0.5812, - "step": 145000 - }, - { - "epoch": 0.07, - "eval_loss": 0.570769190788269, - "eval_runtime": 78.7666, - "eval_samples_per_second": 81.253, - "eval_steps_per_second": 0.635, - "step": 145000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010946024999999999, - "loss": 0.5804, - "step": 146000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00011021025, - "loss": 0.5807, - "step": 147000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00011096025, - "loss": 0.5797, - "step": 148000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00011170949999999999, - "loss": 0.5804, - "step": 149000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001124595, - "loss": 0.5785, - "step": 150000 - }, - { - "epoch": 0.07, - "eval_loss": 0.5669087767601013, - "eval_runtime": 78.7624, - "eval_samples_per_second": 81.257, - "eval_steps_per_second": 0.635, - "step": 150000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011320874999999999, - "loss": 0.5777, - "step": 151000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011395875, - "loss": 0.5778, - "step": 152000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011470799999999999, - "loss": 0.5772, - "step": 153000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011545799999999998, - "loss": 0.5776, - "step": 154000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011620799999999998, - "loss": 0.5745, - "step": 155000 - }, - { - "epoch": 0.08, - "eval_loss": 0.5640676021575928, - "eval_runtime": 79.5789, - "eval_samples_per_second": 80.423, - "eval_steps_per_second": 0.628, - "step": 155000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011695799999999999, - "loss": 0.5758, - "step": 156000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011770725, - "loss": 0.5747, - "step": 157000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011845724999999998, - "loss": 0.5743, - "step": 158000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00011920649999999999, - "loss": 0.5736, - "step": 159000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001199565, - "loss": 0.5738, - "step": 160000 - }, - { - "epoch": 0.08, - "eval_loss": 0.5617207884788513, - "eval_runtime": 79.215, - "eval_samples_per_second": 80.793, - "eval_steps_per_second": 0.631, - "step": 160000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00012070574999999999, - "loss": 0.5723, - "step": 161000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00012145575, - "loss": 0.5704, - "step": 162000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00012220575, - "loss": 0.5703, - "step": 163000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00012295499999999998, - "loss": 0.5708, - "step": 164000 - }, - { - "epoch": 0.08, - "learning_rate": 0.000123705, - "loss": 0.5712, - "step": 165000 - }, - { - "epoch": 0.08, - "eval_loss": 0.5598272085189819, - "eval_runtime": 78.3832, - "eval_samples_per_second": 81.65, - "eval_steps_per_second": 0.638, - "step": 165000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00012445424999999997, - "loss": 0.57, - "step": 166000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00012520425, - "loss": 0.5686, - "step": 167000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001259535, - "loss": 0.5678, - "step": 168000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001267035, - "loss": 0.5695, - "step": 169000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00012745349999999998, - "loss": 0.5689, - "step": 170000 - }, - { - "epoch": 0.09, - "eval_loss": 0.5588927865028381, - "eval_runtime": 79.0237, - "eval_samples_per_second": 80.988, - "eval_steps_per_second": 0.633, - "step": 170000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00012820275, - "loss": 0.5689, - "step": 171000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00012895275, - "loss": 0.5668, - "step": 172000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00012970199999999999, - "loss": 0.5663, - "step": 173000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013045125, - "loss": 0.5655, - "step": 174000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013120124999999998, - "loss": 0.566, - "step": 175000 - }, - { - "epoch": 0.09, - "eval_loss": 0.5537316203117371, - "eval_runtime": 78.8358, - "eval_samples_per_second": 81.181, - "eval_steps_per_second": 0.634, - "step": 175000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013195124999999997, - "loss": 0.5653, - "step": 176000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013270049999999998, - "loss": 0.5651, - "step": 177000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013345049999999997, - "loss": 0.5649, - "step": 178000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001342005, - "loss": 0.5624, - "step": 179000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013495049999999998, - "loss": 0.5645, - "step": 180000 - }, - { - "epoch": 0.09, - "eval_loss": 0.5552637577056885, - "eval_runtime": 77.1932, - "eval_samples_per_second": 82.909, - "eval_steps_per_second": 0.648, - "step": 180000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013569975, - "loss": 0.5634, - "step": 181000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013644974999999998, - "loss": 0.5635, - "step": 182000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013719974999999997, - "loss": 0.5625, - "step": 183000 - }, - { - "epoch": 0.09, - "learning_rate": 0.000137949, - "loss": 0.5614, - "step": 184000 - }, - { - "epoch": 0.09, - "learning_rate": 0.000138699, - "loss": 0.5606, - "step": 185000 - }, - { - "epoch": 0.09, - "eval_loss": 0.5475028157234192, - "eval_runtime": 79.3213, - "eval_samples_per_second": 80.684, - "eval_steps_per_second": 0.63, - "step": 185000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013944899999999999, - "loss": 0.561, - "step": 186000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00014019749999999998, - "loss": 0.5607, - "step": 187000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001409475, - "loss": 0.5589, - "step": 188000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001416975, - "loss": 0.5586, - "step": 189000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014244749999999998, - "loss": 0.5586, - "step": 190000 - }, - { - "epoch": 0.1, - "eval_loss": 0.5459640026092529, - "eval_runtime": 79.4666, - "eval_samples_per_second": 80.537, - "eval_steps_per_second": 0.629, - "step": 190000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014319675, - "loss": 0.5576, - "step": 191000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014394674999999998, - "loss": 0.5587, - "step": 192000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014469599999999998, - "loss": 0.5584, - "step": 193000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014544599999999998, - "loss": 0.5583, - "step": 194000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014619524999999998, - "loss": 0.5585, - "step": 195000 - }, - { - "epoch": 0.1, - "eval_loss": 0.5435135960578918, - "eval_runtime": 80.1194, - "eval_samples_per_second": 79.881, - "eval_steps_per_second": 0.624, - "step": 195000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001469445, - "loss": 0.5572, - "step": 196000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014769449999999998, - "loss": 0.5556, - "step": 197000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014844374999999999, - "loss": 0.5552, - "step": 198000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014919374999999998, - "loss": 0.5557, - "step": 199000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014994375, - "loss": 0.554, - "step": 200000 - }, - { - "epoch": 0.1, - "eval_loss": 0.5443071126937866, - "eval_runtime": 78.383, - "eval_samples_per_second": 81.65, - "eval_steps_per_second": 0.638, - "step": 200000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014999990877662188, - "loss": 0.5531, - "step": 201000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001499996057416134, - "loss": 0.5546, - "step": 202000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014999908908161323, - "loss": 0.5529, - "step": 203000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001499983591922482, - "loss": 0.5527, - "step": 204000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001499974171253642, - "loss": 0.5518, - "step": 205000 - }, - { - "epoch": 0.1, - "eval_loss": 0.5403967499732971, - "eval_runtime": 79.4655, - "eval_samples_per_second": 80.538, - "eval_steps_per_second": 0.629, - "step": 205000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014999626099781158, - "loss": 0.5512, - "step": 206000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014999489312535964, - "loss": 0.5513, - "step": 207000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014999331077369244, - "loss": 0.5519, - "step": 208000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014999151521026205, - "loss": 0.5513, - "step": 209000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00014998950644053808, - "loss": 0.5499, - "step": 210000 - }, - { - "epoch": 0.1, - "eval_loss": 0.5364459753036499, - "eval_runtime": 78.5073, - "eval_samples_per_second": 81.521, - "eval_steps_per_second": 0.637, - "step": 210000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001499872844706396, - "loss": 0.5499, - "step": 211000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001499848493073351, - "loss": 0.548, - "step": 212000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001499822037128756, - "loss": 0.5483, - "step": 213000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014997934536662948, - "loss": 0.5483, - "step": 214000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014997627427728645, - "loss": 0.5483, - "step": 215000 - }, - { - "epoch": 0.11, - "eval_loss": 0.5351453423500061, - "eval_runtime": 79.4482, - "eval_samples_per_second": 80.556, - "eval_steps_per_second": 0.629, - "step": 215000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014997298706049593, - "loss": 0.5467, - "step": 216000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014996948669385182, - "loss": 0.5479, - "step": 217000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014996577318801676, - "loss": 0.5476, - "step": 218000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014996185058739, - "loss": 0.5451, - "step": 219000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014995771105086807, - "loss": 0.5458, - "step": 220000 - }, - { - "epoch": 0.11, - "eval_loss": 0.5317646861076355, - "eval_runtime": 79.21, - "eval_samples_per_second": 80.798, - "eval_steps_per_second": 0.631, - "step": 220000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014995335841102588, - "loss": 0.5443, - "step": 221000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014994879268112236, - "loss": 0.5443, - "step": 222000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014994401876029827, - "loss": 0.5445, - "step": 223000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014993902710569922, - "loss": 0.5446, - "step": 224000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014993382240469446, - "loss": 0.5442, - "step": 225000 - }, - { - "epoch": 0.11, - "eval_loss": 0.5309335589408875, - "eval_runtime": 78.9413, - "eval_samples_per_second": 81.073, - "eval_steps_per_second": 0.633, - "step": 225000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014992841019727328, - "loss": 0.5452, - "step": 226000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001499227796646749, - "loss": 0.5426, - "step": 227000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001499169361351633, - "loss": 0.5419, - "step": 228000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00014991087962653893, - "loss": 0.542, - "step": 229000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014990461653308776, - "loss": 0.5432, - "step": 230000 - }, - { - "epoch": 0.12, - "eval_loss": 0.5244734883308411, - "eval_runtime": 78.2913, - "eval_samples_per_second": 81.746, - "eval_steps_per_second": 0.639, - "step": 230000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014989813433516588, - "loss": 0.5417, - "step": 231000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014989144601687223, - "loss": 0.5414, - "step": 232000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014988454522270304, - "loss": 0.5408, - "step": 233000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014987742474683482, - "loss": 0.5409, - "step": 234000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014987009141221238, - "loss": 0.5391, - "step": 235000 - }, - { - "epoch": 0.12, - "eval_loss": 0.5243217349052429, - "eval_runtime": 78.6423, - "eval_samples_per_second": 81.381, - "eval_steps_per_second": 0.636, - "step": 235000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001498625528936495, - "loss": 0.5386, - "step": 236000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014985479412198444, - "loss": 0.5398, - "step": 237000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014984682256050195, - "loss": 0.5385, - "step": 238000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001498386382334848, - "loss": 0.5394, - "step": 239000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001498302496691869, - "loss": 0.5377, - "step": 240000 - }, - { - "epoch": 0.12, - "eval_loss": 0.5230529308319092, - "eval_runtime": 77.6998, - "eval_samples_per_second": 82.368, - "eval_steps_per_second": 0.644, - "step": 240000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014982164881505554, - "loss": 0.5377, - "step": 241000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001498128267689627, - "loss": 0.5373, - "step": 242000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014980379206089227, - "loss": 0.5354, - "step": 243000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014979455407190963, - "loss": 0.5354, - "step": 244000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001497850943356876, - "loss": 0.5348, - "step": 245000 - }, - { - "epoch": 0.12, - "eval_loss": 0.5176389217376709, - "eval_runtime": 78.4635, - "eval_samples_per_second": 81.567, - "eval_steps_per_second": 0.637, - "step": 245000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014977544157872605, - "loss": 0.5361, - "step": 246000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014976555714203422, - "loss": 0.5355, - "step": 247000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00014975546018735622, - "loss": 0.5346, - "step": 248000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001497451507454492, - "loss": 0.5336, - "step": 249000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001497346288477175, - "loss": 0.5338, - "step": 250000 - }, - { - "epoch": 0.12, - "eval_loss": 0.5214279294013977, - "eval_runtime": 79.6078, - "eval_samples_per_second": 80.394, - "eval_steps_per_second": 0.628, - "step": 250000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014972390536662896, - "loss": 0.5344, - "step": 251000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014971296991900254, - "loss": 0.5342, - "step": 252000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014970181127337587, - "loss": 0.5347, - "step": 253000 - }, - { - "epoch": 0.13, - "learning_rate": 0.000149690440303944, - "loss": 0.5323, - "step": 254000 - }, - { - "epoch": 0.13, - "learning_rate": 0.000149678857045345, - "loss": 0.5328, - "step": 255000 - }, - { - "epoch": 0.13, - "eval_loss": 0.5172922611236572, - "eval_runtime": 79.325, - "eval_samples_per_second": 80.681, - "eval_steps_per_second": 0.63, - "step": 255000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014966706153286336, - "loss": 0.5313, - "step": 256000 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001496550659161515, - "loss": 0.5312, - "step": 257000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014964284621650758, - "loss": 0.5321, - "step": 258000 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001496304143726763, - "loss": 0.531, - "step": 259000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014961777042252718, - "loss": 0.5295, - "step": 260000 - }, - { - "epoch": 0.13, - "eval_loss": 0.5135677456855774, - "eval_runtime": 77.9166, - "eval_samples_per_second": 82.139, - "eval_steps_per_second": 0.642, - "step": 260000 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001496049403282296, - "loss": 0.5307, - "step": 261000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014959187270565565, - "loss": 0.53, - "step": 262000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014957860647965505, - "loss": 0.5301, - "step": 263000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014956511513163472, - "loss": 0.5297, - "step": 264000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014955143949417623, - "loss": 0.5295, - "step": 265000 - }, - { - "epoch": 0.13, - "eval_loss": 0.513587236404419, - "eval_runtime": 79.2411, - "eval_samples_per_second": 80.766, - "eval_steps_per_second": 0.631, - "step": 265000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014953752479677766, - "loss": 0.5289, - "step": 266000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014952339827601903, - "loss": 0.5306, - "step": 267000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014950905997493217, - "loss": 0.5292, - "step": 268000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00014949450993719408, - "loss": 0.5277, - "step": 269000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014947976307458214, - "loss": 0.5274, - "step": 270000 - }, - { - "epoch": 0.14, - "eval_loss": 0.5130758881568909, - "eval_runtime": 82.5777, - "eval_samples_per_second": 77.503, - "eval_steps_per_second": 0.605, - "step": 270000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014946478990877685, - "loss": 0.5284, - "step": 271000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014944960514117462, - "loss": 0.5273, - "step": 272000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014943420881803084, - "loss": 0.5271, - "step": 273000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014941861669970997, - "loss": 0.526, - "step": 274000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014940281354295447, - "loss": 0.5265, - "step": 275000 - }, - { - "epoch": 0.14, - "eval_loss": 0.510474681854248, - "eval_runtime": 79.0154, - "eval_samples_per_second": 80.997, - "eval_steps_per_second": 0.633, - "step": 275000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014938678325993965, - "loss": 0.5256, - "step": 276000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014937054161274965, - "loss": 0.5268, - "step": 277000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014935408865085934, - "loss": 0.5247, - "step": 278000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014933742442438727, - "loss": 0.5247, - "step": 279000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014932056596502014, - "loss": 0.5239, - "step": 280000 - }, - { - "epoch": 0.14, - "eval_loss": 0.5078325867652893, - "eval_runtime": 79.4774, - "eval_samples_per_second": 80.526, - "eval_steps_per_second": 0.629, - "step": 280000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014930347957345093, - "loss": 0.5242, - "step": 281000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014928619947439815, - "loss": 0.5243, - "step": 282000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014926869112571586, - "loss": 0.5242, - "step": 283000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014925098959731945, - "loss": 0.524, - "step": 284000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001492330595046446, - "loss": 0.525, - "step": 285000 - }, - { - "epoch": 0.14, - "eval_loss": 0.508979320526123, - "eval_runtime": 78.5336, - "eval_samples_per_second": 81.494, - "eval_steps_per_second": 0.637, - "step": 285000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014921491851606374, - "loss": 0.5229, - "step": 286000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014919658514396308, - "loss": 0.5233, - "step": 287000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014917802274075096, - "loss": 0.5229, - "step": 288000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014915926848765564, - "loss": 0.5233, - "step": 289000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00014914028489576315, - "loss": 0.5225, - "step": 290000 - }, - { - "epoch": 0.14, - "eval_loss": 0.5040385723114014, - "eval_runtime": 82.1473, - "eval_samples_per_second": 77.909, - "eval_steps_per_second": 0.609, - "step": 290000 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001491210906905722, - "loss": 0.5212, - "step": 291000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00014910168593055176, - "loss": 0.5221, - "step": 292000 - }, - { - "epoch": 0.15, - "learning_rate": 0.000149082070674812, - "loss": 0.522, - "step": 293000 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001490622649138889, - "loss": 0.521, - "step": 294000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00014904224919787054, - "loss": 0.5206, - "step": 295000 - }, - { - "epoch": 0.15, - "eval_loss": 0.5053508877754211, - "eval_runtime": 82.4285, - "eval_samples_per_second": 77.643, - "eval_steps_per_second": 0.607, - "step": 295000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00014902200323661346, - "loss": 0.5199, - "step": 296000 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001490015470223644, - "loss": 0.5213, - "step": 297000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00014898090138881081, - "loss": 0.5202, - "step": 298000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00014896004604751396, - "loss": 0.5201, - "step": 299000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00014893895987079335, - "loss": 0.5199, - "step": 300000 - }, - { - "epoch": 0.15, - "eval_loss": 0.5044429898262024, - "eval_runtime": 81.8484, - "eval_samples_per_second": 78.193, - "eval_steps_per_second": 0.611, - "step": 300000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014891766369404524, - "loss": 0.519, - "step": 301000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001488961575821415, - "loss": 0.519, - "step": 302000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014887444160059348, - "loss": 0.5188, - "step": 303000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014885251581555183, - "loss": 0.5185, - "step": 304000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014883040253406912, - "loss": 0.5184, - "step": 305000 - }, - { - "epoch": 0.0, - "eval_loss": 0.5007572174072266, - "eval_runtime": 81.9944, - "eval_samples_per_second": 78.054, - "eval_steps_per_second": 0.61, - "step": 305000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014880805755268368, - "loss": 0.5178, - "step": 306000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014878550297002193, - "loss": 0.518, - "step": 307000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014876276172354279, - "loss": 0.5176, - "step": 308000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014873978835451058, - "loss": 0.5193, - "step": 309000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014871662887949272, - "loss": 0.5163, - "step": 310000 - }, - { - "epoch": 0.01, - "eval_loss": 0.5015575289726257, - "eval_runtime": 79.7243, - "eval_samples_per_second": 80.277, - "eval_steps_per_second": 0.627, - "step": 310000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014869323700373343, - "loss": 0.518, - "step": 311000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014866965958208565, - "loss": 0.5179, - "step": 312000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001486458733988959, - "loss": 0.5176, - "step": 313000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014862185440305553, - "loss": 0.5176, - "step": 314000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014859762637314596, - "loss": 0.516, - "step": 315000 - }, - { - "epoch": 0.01, - "eval_loss": 0.5005845427513123, - "eval_runtime": 78.6119, - "eval_samples_per_second": 81.413, - "eval_steps_per_second": 0.636, - "step": 315000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001485731893829699, - "loss": 0.5173, - "step": 316000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014854856825715626, - "loss": 0.5169, - "step": 317000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001485237137791746, - "loss": 0.5151, - "step": 318000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014849867573352797, - "loss": 0.5154, - "step": 319000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014847340407028189, - "loss": 0.516, - "step": 320000 - }, - { - "epoch": 0.01, - "eval_loss": 0.500928521156311, - "eval_runtime": 79.795, - "eval_samples_per_second": 80.206, - "eval_steps_per_second": 0.627, - "step": 320000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014844792382517202, - "loss": 0.5163, - "step": 321000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014842223507581563, - "loss": 0.5155, - "step": 322000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014839633790046512, - "loss": 0.515, - "step": 323000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001483702847969303, - "loss": 0.514, - "step": 324000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001483439714233434, - "loss": 0.5138, - "step": 325000 - }, - { - "epoch": 0.01, - "eval_loss": 0.5003533363342285, - "eval_runtime": 78.4102, - "eval_samples_per_second": 81.622, - "eval_steps_per_second": 0.638, - "step": 325000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014831744986216702, - "loss": 0.5143, - "step": 326000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014829072019419042, - "loss": 0.5139, - "step": 327000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001482638095424116, - "loss": 0.5143, - "step": 328000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014823666411364003, - "loss": 0.513, - "step": 329000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014820931082415564, - "loss": 0.5136, - "step": 330000 - }, - { - "epoch": 0.01, - "eval_loss": 0.49581122398376465, - "eval_runtime": 78.2187, - "eval_samples_per_second": 81.822, - "eval_steps_per_second": 0.639, - "step": 330000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001481817497572813, - "loss": 0.5129, - "step": 331000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014815398099697283, - "loss": 0.5129, - "step": 332000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014812603270786007, - "loss": 0.5123, - "step": 333000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014809784902256195, - "loss": 0.512, - "step": 334000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014806948639411544, - "loss": 0.5112, - "step": 335000 - }, - { - "epoch": 0.02, - "eval_loss": 0.4955292046070099, - "eval_runtime": 79.8311, - "eval_samples_per_second": 80.169, - "eval_steps_per_second": 0.626, - "step": 335000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001480408881268936, - "loss": 0.5122, - "step": 336000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014801211150435752, - "loss": 0.5114, - "step": 337000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014798312811928091, - "loss": 0.5126, - "step": 338000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014795390873698235, - "loss": 0.5112, - "step": 339000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014792451188504096, - "loss": 0.5114, - "step": 340000 - }, - { - "epoch": 0.02, - "eval_loss": 0.4929465353488922, - "eval_runtime": 78.1965, - "eval_samples_per_second": 81.845, - "eval_steps_per_second": 0.639, - "step": 340000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014789487880046323, - "loss": 0.5116, - "step": 341000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014786503889631, - "loss": 0.5108, - "step": 342000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014783499226347888, - "loss": 0.5108, - "step": 343000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014780473899349703, - "loss": 0.5101, - "step": 344000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001477743403042215, - "loss": 0.5103, - "step": 345000 - }, - { - "epoch": 0.02, - "eval_loss": 0.49353981018066406, - "eval_runtime": 79.8609, - "eval_samples_per_second": 80.139, - "eval_steps_per_second": 0.626, - "step": 345000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014774367444984895, - "loss": 0.51, - "step": 346000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014771280223649564, - "loss": 0.5104, - "step": 347000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014768172375820364, - "loss": 0.5083, - "step": 348000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014765047049724233, - "loss": 0.5092, - "step": 349000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001476190115731596, - "loss": 0.5097, - "step": 350000 - }, - { - "epoch": 0.03, - "eval_loss": 0.49557891488075256, - "eval_runtime": 80.7409, - "eval_samples_per_second": 79.266, - "eval_steps_per_second": 0.619, - "step": 350000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001475873152824479, - "loss": 0.5086, - "step": 351000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001475554131090526, - "loss": 0.508, - "step": 352000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014752330515015318, - "loss": 0.509, - "step": 353000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014749102391991098, - "loss": 0.5094, - "step": 354000 - }, - { - "epoch": 0.03, - "learning_rate": 0.000147458504889589, - "loss": 0.5079, - "step": 355000 - }, - { - "epoch": 0.03, - "eval_loss": 0.49167051911354065, - "eval_runtime": 79.1142, - "eval_samples_per_second": 80.896, - "eval_steps_per_second": 0.632, - "step": 355000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014742578036896223, - "loss": 0.5081, - "step": 356000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001473929165224509, - "loss": 0.51, - "step": 357000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014735978173137384, - "loss": 0.507, - "step": 358000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014732644175071978, - "loss": 0.5079, - "step": 359000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014729289668204807, - "loss": 0.5079, - "step": 360000 - }, - { - "epoch": 0.03, - "eval_loss": 0.48893874883651733, - "eval_runtime": 79.0429, - "eval_samples_per_second": 80.969, - "eval_steps_per_second": 0.633, - "step": 360000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014725914662754285, - "loss": 0.5076, - "step": 361000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014722522574725483, - "loss": 0.5066, - "step": 362000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001471910662348599, - "loss": 0.5085, - "step": 363000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014715670204682463, - "loss": 0.5072, - "step": 364000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014712220262943823, - "loss": 0.5059, - "step": 365000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4867703914642334, - "eval_runtime": 74.6947, - "eval_samples_per_second": 85.682, - "eval_steps_per_second": 0.669, - "step": 365000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001470874298136091, - "loss": 0.506, - "step": 366000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014705245263783414, - "loss": 0.5073, - "step": 367000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014701727120865993, - "loss": 0.5052, - "step": 368000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014698188563325515, - "loss": 0.5066, - "step": 369000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014694636740219654, - "loss": 0.5055, - "step": 370000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4855530560016632, - "eval_runtime": 79.7083, - "eval_samples_per_second": 80.293, - "eval_steps_per_second": 0.627, - "step": 370000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014691057426607544, - "loss": 0.5063, - "step": 371000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001468745773087412, - "loss": 0.5064, - "step": 372000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014683841294223294, - "loss": 0.5051, - "step": 373000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014680200887559787, - "loss": 0.5059, - "step": 374000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001467654380276222, - "loss": 0.5066, - "step": 375000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4906100034713745, - "eval_runtime": 80.9132, - "eval_samples_per_second": 79.097, - "eval_steps_per_second": 0.618, - "step": 375000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014672862729481488, - "loss": 0.5051, - "step": 376000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014669161329503643, - "loss": 0.505, - "step": 377000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001466543961410379, - "loss": 0.5034, - "step": 378000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001466170134677651, - "loss": 0.5048, - "step": 379000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001465793905489244, - "loss": 0.5054, - "step": 380000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4870378375053406, - "eval_runtime": 80.5642, - "eval_samples_per_second": 79.44, - "eval_steps_per_second": 0.621, - "step": 380000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014654160274471124, - "loss": 0.5036, - "step": 381000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014650357451899344, - "loss": 0.5046, - "step": 382000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014646538204381214, - "loss": 0.5035, - "step": 383000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014642694897411588, - "loss": 0.504, - "step": 384000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001463883522928644, - "loss": 0.504, - "step": 385000 - }, - { - "epoch": 0.04, - "eval_loss": 0.48908066749572754, - "eval_runtime": 80.3298, - "eval_samples_per_second": 79.672, - "eval_steps_per_second": 0.622, - "step": 385000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014634951484701638, - "loss": 0.5041, - "step": 386000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014631047528903952, - "loss": 0.504, - "step": 387000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014627123373785504, - "loss": 0.5036, - "step": 388000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00014623182985722033, - "loss": 0.5033, - "step": 389000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001461922246262511, - "loss": 0.502, - "step": 390000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4866912066936493, - "eval_runtime": 80.948, - "eval_samples_per_second": 79.063, - "eval_steps_per_second": 0.618, - "step": 390000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001461523782182672, - "loss": 0.5041, - "step": 391000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014611233029866684, - "loss": 0.5011, - "step": 392000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014607208098944285, - "loss": 0.5036, - "step": 393000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014603167096426982, - "loss": 0.5021, - "step": 394000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014599101944531292, - "loss": 0.5024, - "step": 395000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4846630394458771, - "eval_runtime": 80.3433, - "eval_samples_per_second": 79.658, - "eval_steps_per_second": 0.622, - "step": 395000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014595020785917355, - "loss": 0.5013, - "step": 396000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014590915462531445, - "loss": 0.5013, - "step": 397000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001458679006207401, - "loss": 0.5011, - "step": 398000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001458264875259472, - "loss": 0.5018, - "step": 399000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001457848325580096, - "loss": 0.5021, - "step": 400000 - }, - { - "epoch": 0.05, - "eval_loss": 0.486747145652771, - "eval_runtime": 75.8258, - "eval_samples_per_second": 84.404, - "eval_steps_per_second": 0.659, - "step": 400000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001457429771980633, - "loss": 0.5019, - "step": 401000 - }, - { - "epoch": 0.05, - "learning_rate": 0.000145700963729221, - "loss": 0.502, - "step": 402000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014565870816843566, - "loss": 0.5006, - "step": 403000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001456162951552679, - "loss": 0.5002, - "step": 404000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014557363990799942, - "loss": 0.5011, - "step": 405000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4839228689670563, - "eval_runtime": 81.4745, - "eval_samples_per_second": 78.552, - "eval_steps_per_second": 0.614, - "step": 405000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014553078491205172, - "loss": 0.5007, - "step": 406000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00014548777345224824, - "loss": 0.5004, - "step": 407000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001454445629041043, - "loss": 0.5006, - "step": 408000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001454011098463918, - "loss": 0.5018, - "step": 409000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014535745756555653, - "loss": 0.4996, - "step": 410000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4806668162345886, - "eval_runtime": 80.4842, - "eval_samples_per_second": 79.519, - "eval_steps_per_second": 0.621, - "step": 410000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014531360619457094, - "loss": 0.4996, - "step": 411000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014526955586701388, - "loss": 0.5014, - "step": 412000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014522530671707033, - "loss": 0.4991, - "step": 413000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001451809479734062, - "loss": 0.4997, - "step": 414000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014513630198063546, - "loss": 0.4985, - "step": 415000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4842430055141449, - "eval_runtime": 79.3255, - "eval_samples_per_second": 80.68, - "eval_steps_per_second": 0.63, - "step": 415000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001450915025148655, - "loss": 0.4976, - "step": 416000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001450464600239659, - "loss": 0.4988, - "step": 417000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014500126472982865, - "loss": 0.4992, - "step": 418000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014495587182637829, - "loss": 0.4989, - "step": 419000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014491023571671524, - "loss": 0.5, - "step": 420000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4820484519004822, - "eval_runtime": 78.241, - "eval_samples_per_second": 81.799, - "eval_steps_per_second": 0.639, - "step": 420000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014486440187921716, - "loss": 0.498, - "step": 421000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001448183704535019, - "loss": 0.4985, - "step": 422000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014477214157978911, - "loss": 0.4985, - "step": 423000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014472576192358876, - "loss": 0.4983, - "step": 424000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014467913877404042, - "loss": 0.4979, - "step": 425000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4826773405075073, - "eval_runtime": 78.9952, - "eval_samples_per_second": 81.018, - "eval_steps_per_second": 0.633, - "step": 425000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014463236551915786, - "loss": 0.4984, - "step": 426000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014458534866129436, - "loss": 0.4991, - "step": 427000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014453813506525818, - "loss": 0.4982, - "step": 428000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00014449072487487006, - "loss": 0.4987, - "step": 429000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001444431182345497, - "loss": 0.4986, - "step": 430000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4816123843193054, - "eval_runtime": 79.868, - "eval_samples_per_second": 80.132, - "eval_steps_per_second": 0.626, - "step": 430000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014439536319026632, - "loss": 0.4959, - "step": 431000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014434736428182006, - "loss": 0.4972, - "step": 432000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001442991693601428, - "loss": 0.4971, - "step": 433000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014425082706061895, - "loss": 0.4973, - "step": 434000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001442022894331757, - "loss": 0.4971, - "step": 435000 - }, - { - "epoch": 0.07, - "eval_loss": 0.47952431440353394, - "eval_runtime": 79.6918, - "eval_samples_per_second": 80.309, - "eval_steps_per_second": 0.627, - "step": 435000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014415350774604595, - "loss": 0.4975, - "step": 436000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001441045306362058, - "loss": 0.4959, - "step": 437000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014405535825284804, - "loss": 0.4956, - "step": 438000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014400604021068167, - "loss": 0.4963, - "step": 439000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001439564779251439, - "loss": 0.4968, - "step": 440000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4796030819416046, - "eval_runtime": 79.6283, - "eval_samples_per_second": 80.373, - "eval_steps_per_second": 0.628, - "step": 440000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014390672081708274, - "loss": 0.4956, - "step": 441000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014385681908703354, - "loss": 0.4963, - "step": 442000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014380667298366756, - "loss": 0.496, - "step": 443000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014375633251411, - "loss": 0.4955, - "step": 444000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001437057978317067, - "loss": 0.4955, - "step": 445000 - }, - { - "epoch": 0.07, - "eval_loss": 0.47917410731315613, - "eval_runtime": 79.9877, - "eval_samples_per_second": 80.012, - "eval_steps_per_second": 0.625, - "step": 445000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001436551199160174, - "loss": 0.4956, - "step": 446000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014360419746415294, - "loss": 0.4965, - "step": 447000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00014355308126287247, - "loss": 0.4953, - "step": 448000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001435018228743279, - "loss": 0.4965, - "step": 449000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001434503198352907, - "loss": 0.4951, - "step": 450000 - }, - { - "epoch": 0.07, - "eval_loss": 0.47704780101776123, - "eval_runtime": 78.6754, - "eval_samples_per_second": 81.347, - "eval_steps_per_second": 0.636, - "step": 450000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014339862351557547, - "loss": 0.4934, - "step": 451000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001433467860585135, - "loss": 0.4934, - "step": 452000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014329470384334429, - "loss": 0.4932, - "step": 453000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014324242882153025, - "loss": 0.4935, - "step": 454000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014318996115231028, - "loss": 0.4931, - "step": 455000 - }, - { - "epoch": 0.0, - "eval_loss": 0.4748893678188324, - "eval_runtime": 82.5143, - "eval_samples_per_second": 77.562, - "eval_steps_per_second": 0.606, - "step": 455000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014313730099550992, - "loss": 0.4927, - "step": 456000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014308455440834473, - "loss": 0.4931, - "step": 457000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001430315101423767, - "loss": 0.4928, - "step": 458000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00014297827387149757, - "loss": 0.4945, - "step": 459000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014292484575787425, - "loss": 0.4917, - "step": 460000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4765481948852539, - "eval_runtime": 80.7233, - "eval_samples_per_second": 79.283, - "eval_steps_per_second": 0.619, - "step": 460000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014287133339503343, - "loss": 0.4935, - "step": 461000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001428175224676294, - "loss": 0.4942, - "step": 462000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014276352018715785, - "loss": 0.4933, - "step": 463000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001427093267181191, - "loss": 0.4933, - "step": 464000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014265494222559578, - "loss": 0.4927, - "step": 465000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4773631989955902, - "eval_runtime": 80.561, - "eval_samples_per_second": 79.443, - "eval_steps_per_second": 0.621, - "step": 465000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014260036687525253, - "loss": 0.4932, - "step": 466000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001425457105556186, - "loss": 0.4931, - "step": 467000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014249075436983697, - "loss": 0.4911, - "step": 468000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014243560782638008, - "loss": 0.4917, - "step": 469000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014238027109323388, - "loss": 0.4925, - "step": 470000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4760352373123169, - "eval_runtime": 79.1679, - "eval_samples_per_second": 80.841, - "eval_steps_per_second": 0.632, - "step": 470000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014232479996057723, - "loss": 0.4928, - "step": 471000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001422690835440944, - "loss": 0.4921, - "step": 472000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014221317744518412, - "loss": 0.4919, - "step": 473000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014215708183414603, - "loss": 0.4908, - "step": 474000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014210085326132836, - "loss": 0.4908, - "step": 475000 - }, - { - "epoch": 0.01, - "eval_loss": 0.47689467668533325, - "eval_runtime": 79.0304, - "eval_samples_per_second": 80.981, - "eval_steps_per_second": 0.633, - "step": 475000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001420443793283263, - "loss": 0.4908, - "step": 476000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001419877731546628, - "loss": 0.4906, - "step": 477000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014193092158712586, - "loss": 0.4914, - "step": 478000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00014187393850165534, - "loss": 0.4906, - "step": 479000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001418167099916094, - "loss": 0.4912, - "step": 480000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4743531346321106, - "eval_runtime": 80.9996, - "eval_samples_per_second": 79.013, - "eval_steps_per_second": 0.617, - "step": 480000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001417592931771443, - "loss": 0.49, - "step": 481000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001417016882331616, - "loss": 0.4902, - "step": 482000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014164395322185843, - "loss": 0.4899, - "step": 483000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001415859727335269, - "loss": 0.4897, - "step": 484000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014152786290537982, - "loss": 0.4893, - "step": 485000 - }, - { - "epoch": 0.02, - "eval_loss": 0.473435640335083, - "eval_runtime": 81.6676, - "eval_samples_per_second": 78.366, - "eval_steps_per_second": 0.612, - "step": 485000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014146950757846707, - "loss": 0.49, - "step": 486000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001414110236408181, - "loss": 0.4894, - "step": 487000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014135229418565457, - "loss": 0.4909, - "step": 488000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014129337784076295, - "loss": 0.4892, - "step": 489000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014123427478561265, - "loss": 0.4899, - "step": 490000 - }, - { - "epoch": 0.02, - "eval_loss": 0.47095221281051636, - "eval_runtime": 81.6054, - "eval_samples_per_second": 78.426, - "eval_steps_per_second": 0.613, - "step": 490000 - }, - { - "epoch": 0.02, - "learning_rate": 0.000141175044582939, - "loss": 0.49, - "step": 491000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001411155688342132, - "loss": 0.4894, - "step": 492000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001410559069168658, - "loss": 0.4898, - "step": 493000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014099605901263734, - "loss": 0.4885, - "step": 494000 - }, - { - "epoch": 0.02, - "learning_rate": 0.0001409360854302923, - "loss": 0.4894, - "step": 495000 - }, - { - "epoch": 0.02, - "eval_loss": 0.47246918082237244, - "eval_runtime": 79.94, - "eval_samples_per_second": 80.06, - "eval_steps_per_second": 0.625, - "step": 495000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014087592659712124, - "loss": 0.4882, - "step": 496000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014081552219904862, - "loss": 0.4895, - "step": 497000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014075493254634853, - "loss": 0.4874, - "step": 498000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00014069421869069117, - "loss": 0.4883, - "step": 499000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014063325926779179, - "loss": 0.4889, - "step": 500000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4746360778808594, - "eval_runtime": 80.6889, - "eval_samples_per_second": 79.317, - "eval_steps_per_second": 0.62, - "step": 500000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014057211514546938, - "loss": 0.4878, - "step": 501000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014051084793071725, - "loss": 0.4872, - "step": 502000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001404493351531105, - "loss": 0.4881, - "step": 503000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014038763823634546, - "loss": 0.4891, - "step": 504000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014032575736836167, - "loss": 0.4875, - "step": 505000 - }, - { - "epoch": 0.03, - "eval_loss": 0.47102564573287964, - "eval_runtime": 79.1525, - "eval_samples_per_second": 80.857, - "eval_steps_per_second": 0.632, - "step": 505000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014026381705018997, - "loss": 0.4878, - "step": 506000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014020156921278623, - "loss": 0.4892, - "step": 507000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014013913799096225, - "loss": 0.4868, - "step": 508000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001400765862807525, - "loss": 0.4881, - "step": 509000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00014001378904408309, - "loss": 0.4885, - "step": 510000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4696248769760132, - "eval_runtime": 80.7581, - "eval_samples_per_second": 79.249, - "eval_steps_per_second": 0.619, - "step": 510000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00013995093513742188, - "loss": 0.4875, - "step": 511000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001398877728328313, - "loss": 0.4867, - "step": 512000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00013982442809969868, - "loss": 0.4888, - "step": 513000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00013976090113098301, - "loss": 0.4873, - "step": 514000 - }, - { - "epoch": 0.03, - "learning_rate": 0.0001396971921201985, - "loss": 0.4863, - "step": 515000 - }, - { - "epoch": 0.03, - "eval_loss": 0.46735358238220215, - "eval_runtime": 82.0128, - "eval_samples_per_second": 78.037, - "eval_steps_per_second": 0.61, - "step": 515000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00013963336524304095, - "loss": 0.4864, - "step": 516000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00013956929291243476, - "loss": 0.4878, - "step": 517000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00013950510346779393, - "loss": 0.4858, - "step": 518000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00013944066859728662, - "loss": 0.4871, - "step": 519000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001393760526601943, - "loss": 0.4861, - "step": 520000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4665950834751129, - "eval_runtime": 80.8998, - "eval_samples_per_second": 79.11, - "eval_steps_per_second": 0.618, - "step": 520000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013931132074043395, - "loss": 0.487, - "step": 521000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001392463434417902, - "loss": 0.4869, - "step": 522000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013918118566850956, - "loss": 0.4861, - "step": 523000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013911591304710506, - "loss": 0.4867, - "step": 524000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013905039510052293, - "loss": 0.4878, - "step": 525000 - }, - { - "epoch": 0.04, - "eval_loss": 0.47149112820625305, - "eval_runtime": 79.7289, - "eval_samples_per_second": 80.272, - "eval_steps_per_second": 0.627, - "step": 525000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013898469727619654, - "loss": 0.4862, - "step": 526000 - }, - { - "epoch": 0.04, - "learning_rate": 0.000138918885741437, - "loss": 0.4859, - "step": 527000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001388528289419268, - "loss": 0.4846, - "step": 528000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013878659286649316, - "loss": 0.4859, - "step": 529000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001387202442214324, - "loss": 0.4864, - "step": 530000 - }, - { - "epoch": 0.04, - "eval_loss": 0.46929115056991577, - "eval_runtime": 81.2258, - "eval_samples_per_second": 78.793, - "eval_steps_per_second": 0.616, - "step": 530000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013865365037876758, - "loss": 0.485, - "step": 531000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013858694472860118, - "loss": 0.4861, - "step": 532000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013851999392931896, - "loss": 0.4848, - "step": 533000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001384529320860216, - "loss": 0.4852, - "step": 534000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013838562514508073, - "loss": 0.4854, - "step": 535000 - }, - { - "epoch": 0.04, - "eval_loss": 0.47047823667526245, - "eval_runtime": 80.5052, - "eval_samples_per_second": 79.498, - "eval_steps_per_second": 0.621, - "step": 535000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013831814035140017, - "loss": 0.4855, - "step": 536000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001382504779105503, - "loss": 0.4856, - "step": 537000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013818277388535536, - "loss": 0.4855, - "step": 538000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001381148252285261, - "loss": 0.4849, - "step": 539000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00013804663161577017, - "loss": 0.4838, - "step": 540000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4682275354862213, - "eval_runtime": 81.8185, - "eval_samples_per_second": 78.222, - "eval_steps_per_second": 0.611, - "step": 540000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013797826118290763, - "loss": 0.4858, - "step": 541000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013790971413820663, - "loss": 0.4831, - "step": 542000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013784099069047342, - "loss": 0.4858, - "step": 543000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013777216003663188, - "loss": 0.4845, - "step": 544000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013770308458728033, - "loss": 0.4846, - "step": 545000 - }, - { - "epoch": 0.05, - "eval_loss": 0.46753159165382385, - "eval_runtime": 78.6329, - "eval_samples_per_second": 81.391, - "eval_steps_per_second": 0.636, - "step": 545000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013763390270327768, - "loss": 0.4835, - "step": 546000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001375644760931289, - "loss": 0.4832, - "step": 547000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013749494382108183, - "loss": 0.4834, - "step": 548000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013742516689523143, - "loss": 0.4843, - "step": 549000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013735521504236372, - "loss": 0.4847, - "step": 550000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4699910283088684, - "eval_runtime": 77.0998, - "eval_samples_per_second": 83.009, - "eval_steps_per_second": 0.649, - "step": 550000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013728515868932954, - "loss": 0.4843, - "step": 551000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013721485779660927, - "loss": 0.485, - "step": 552000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001371443826175097, - "loss": 0.4833, - "step": 553000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013707380410283847, - "loss": 0.4829, - "step": 554000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001370029811692982, - "loss": 0.4838, - "step": 555000 - }, - { - "epoch": 0.05, - "eval_loss": 0.46684029698371887, - "eval_runtime": 76.4066, - "eval_samples_per_second": 83.762, - "eval_steps_per_second": 0.654, - "step": 555000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001369319845947916, - "loss": 0.4838, - "step": 556000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013686081459558647, - "loss": 0.4836, - "step": 557000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013678961424761038, - "loss": 0.4837, - "step": 558000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00013671809839568835, - "loss": 0.4844, - "step": 559000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013664640977060228, - "loss": 0.4825, - "step": 560000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4646618962287903, - "eval_runtime": 77.1272, - "eval_samples_per_second": 82.98, - "eval_steps_per_second": 0.648, - "step": 560000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013657462053802633, - "loss": 0.4829, - "step": 561000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013650258719449216, - "loss": 0.4844, - "step": 562000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013643045402563777, - "loss": 0.4823, - "step": 563000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013635807684048157, - "loss": 0.4832, - "step": 564000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013628560061338996, - "loss": 0.4822, - "step": 565000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4679878354072571, - "eval_runtime": 77.1624, - "eval_samples_per_second": 82.942, - "eval_steps_per_second": 0.648, - "step": 565000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001362128804676232, - "loss": 0.4811, - "step": 566000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013614006206451144, - "loss": 0.4821, - "step": 567000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013606699984332037, - "loss": 0.4824, - "step": 568000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013599376683232436, - "loss": 0.4827, - "step": 569000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013592036325460387, - "loss": 0.4828, - "step": 570000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4641859531402588, - "eval_runtime": 77.5968, - "eval_samples_per_second": 82.478, - "eval_steps_per_second": 0.644, - "step": 570000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013584686299269162, - "loss": 0.4812, - "step": 571000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013577319295161774, - "loss": 0.4817, - "step": 572000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013569927935696313, - "loss": 0.4822, - "step": 573000 - }, - { - "epoch": 0.06, - "learning_rate": 0.000135625196092644, - "loss": 0.4822, - "step": 574000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013555094338433074, - "loss": 0.4816, - "step": 575000 - }, - { - "epoch": 0.06, - "eval_loss": 0.46690383553504944, - "eval_runtime": 77.9328, - "eval_samples_per_second": 82.122, - "eval_steps_per_second": 0.642, - "step": 575000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013547652145821007, - "loss": 0.4822, - "step": 576000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00013540200521623672, - "loss": 0.4829, - "step": 577000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001353272457037724, - "loss": 0.4822, - "step": 578000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001352523176549226, - "loss": 0.4825, - "step": 579000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013517729647828186, - "loss": 0.4824, - "step": 580000 - }, - { - "epoch": 0.07, - "eval_loss": 0.46679091453552246, - "eval_runtime": 78.4477, - "eval_samples_per_second": 81.583, - "eval_steps_per_second": 0.637, - "step": 580000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013510203220987017, - "loss": 0.4801, - "step": 581000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013502667560699584, - "loss": 0.4814, - "step": 582000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013495107603517676, - "loss": 0.4813, - "step": 583000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013487530907285017, - "loss": 0.4814, - "step": 584000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001347994509683568, - "loss": 0.4812, - "step": 585000 - }, - { - "epoch": 0.07, - "eval_loss": 0.465150386095047, - "eval_runtime": 77.7438, - "eval_samples_per_second": 82.322, - "eval_steps_per_second": 0.643, - "step": 585000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001347233500847351, - "loss": 0.4818, - "step": 586000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013464708250429864, - "loss": 0.4804, - "step": 587000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013457064845937174, - "loss": 0.4798, - "step": 588000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013449404818278575, - "loss": 0.4807, - "step": 589000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001344174356059392, - "loss": 0.4811, - "step": 590000 - }, - { - "epoch": 0.07, - "eval_loss": 0.46463653445243835, - "eval_runtime": 77.8511, - "eval_samples_per_second": 82.208, - "eval_steps_per_second": 0.642, - "step": 590000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013434050389784906, - "loss": 0.4798, - "step": 591000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001342634066591601, - "loss": 0.4808, - "step": 592000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001341862214697452, - "loss": 0.4804, - "step": 593000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013410887154965859, - "loss": 0.48, - "step": 594000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013403127945970912, - "loss": 0.4801, - "step": 595000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4652685523033142, - "eval_runtime": 81.8053, - "eval_samples_per_second": 78.235, - "eval_steps_per_second": 0.611, - "step": 595000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001339535227811121, - "loss": 0.4801, - "step": 596000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013387560175072777, - "loss": 0.4811, - "step": 597000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001337975166059172, - "loss": 0.4802, - "step": 598000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00013371934591533964, - "loss": 0.4815, - "step": 599000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001336409334192773, - "loss": 0.4802, - "step": 600000 - }, - { - "epoch": 0.07, - "eval_loss": 0.46285489201545715, - "eval_runtime": 79.5065, - "eval_samples_per_second": 80.497, - "eval_steps_per_second": 0.629, - "step": 600000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013356235752362913, - "loss": 0.4814, - "step": 601000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013348369728822555, - "loss": 0.482, - "step": 602000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013340479547477053, - "loss": 0.4818, - "step": 603000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013332573098104674, - "loss": 0.4825, - "step": 604000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013324650404789842, - "loss": 0.4801, - "step": 605000 - }, - { - "epoch": 0.08, - "eval_loss": 0.4614011347293854, - "eval_runtime": 79.077, - "eval_samples_per_second": 80.934, - "eval_steps_per_second": 0.632, - "step": 605000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013316719438673325, - "loss": 0.482, - "step": 606000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013308764346108213, - "loss": 0.4808, - "step": 607000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013300801061459758, - "loss": 0.4817, - "step": 608000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001329281366647775, - "loss": 0.4809, - "step": 609000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001328481014866741, - "loss": 0.4822, - "step": 610000 - }, - { - "epoch": 0.08, - "eval_loss": 0.4625004827976227, - "eval_runtime": 82.853, - "eval_samples_per_second": 77.245, - "eval_steps_per_second": 0.603, - "step": 610000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013276798560058145, - "loss": 0.4807, - "step": 611000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013268770929537345, - "loss": 0.4793, - "step": 612000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001326071922179333, - "loss": 0.4794, - "step": 613000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013252651488986224, - "loss": 0.4804, - "step": 614000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013244567755691747, - "loss": 0.4814, - "step": 615000 - }, - { - "epoch": 0.08, - "eval_loss": 0.46313124895095825, - "eval_runtime": 83.0415, - "eval_samples_per_second": 77.07, - "eval_steps_per_second": 0.602, - "step": 615000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013236468046534346, - "loss": 0.4802, - "step": 616000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013228352386187144, - "loss": 0.4791, - "step": 617000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013220228938905693, - "loss": 0.4795, - "step": 618000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00013212081466281838, - "loss": 0.481, - "step": 619000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001320393445928162, - "loss": 0.4812, - "step": 620000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4640228748321533, - "eval_runtime": 83.7482, - "eval_samples_per_second": 76.42, - "eval_steps_per_second": 0.597, - "step": 620000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013195755289395868, - "loss": 0.4815, - "step": 621000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013187560292338433, - "loss": 0.4794, - "step": 622000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013179357711756845, - "loss": 0.4798, - "step": 623000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013171131151059012, - "loss": 0.4789, - "step": 624000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013162888838198894, - "loss": 0.4796, - "step": 625000 - }, - { - "epoch": 0.09, - "eval_loss": 0.46150654554367065, - "eval_runtime": 77.9007, - "eval_samples_per_second": 82.156, - "eval_steps_per_second": 0.642, - "step": 625000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013154630798284004, - "loss": 0.4796, - "step": 626000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001314636533804629, - "loss": 0.4797, - "step": 627000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013138084232425004, - "loss": 0.48, - "step": 628000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001312977919374723, - "loss": 0.4779, - "step": 629000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013121458528872397, - "loss": 0.481, - "step": 630000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4608282148838043, - "eval_runtime": 78.9952, - "eval_samples_per_second": 81.018, - "eval_steps_per_second": 0.633, - "step": 630000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013113122263146702, - "loss": 0.4799, - "step": 631000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001310477878157651, - "loss": 0.4802, - "step": 632000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013096411405914954, - "loss": 0.4795, - "step": 633000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013088028505700412, - "loss": 0.4789, - "step": 634000 - }, - { - "epoch": 0.09, - "learning_rate": 0.0001307963010646865, - "loss": 0.478, - "step": 635000 - }, - { - "epoch": 0.09, - "eval_loss": 0.458748996257782, - "eval_runtime": 79.5488, - "eval_samples_per_second": 80.454, - "eval_steps_per_second": 0.629, - "step": 635000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013071224655395776, - "loss": 0.4792, - "step": 636000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013062803787373327, - "loss": 0.4787, - "step": 637000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013054359075594853, - "loss": 0.4779, - "step": 638000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00013045907435152275, - "loss": 0.4776, - "step": 639000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00013037440454726602, - "loss": 0.4786, - "step": 640000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4617770314216614, - "eval_runtime": 77.6784, - "eval_samples_per_second": 82.391, - "eval_steps_per_second": 0.644, - "step": 640000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00013028949661608873, - "loss": 0.4772, - "step": 641000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00013020443549438331, - "loss": 0.4787, - "step": 642000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001301192214412607, - "loss": 0.4789, - "step": 643000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001300339401591957, - "loss": 0.4792, - "step": 644000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012994842117471585, - "loss": 0.4802, - "step": 645000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4589656591415405, - "eval_runtime": 78.0666, - "eval_samples_per_second": 81.981, - "eval_steps_per_second": 0.64, - "step": 645000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001298627500386828, - "loss": 0.4785, - "step": 646000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001297770129108747, - "loss": 0.4776, - "step": 647000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012969112445675987, - "loss": 0.4777, - "step": 648000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001296049987362049, - "loss": 0.4782, - "step": 649000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012951872191022104, - "loss": 0.4767, - "step": 650000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4610587954521179, - "eval_runtime": 80.9253, - "eval_samples_per_second": 79.085, - "eval_steps_per_second": 0.618, - "step": 650000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012943229424162249, - "loss": 0.4763, - "step": 651000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012934588930028175, - "loss": 0.4782, - "step": 652000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012925916103710117, - "loss": 0.4765, - "step": 653000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012917228272197337, - "loss": 0.4771, - "step": 654000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012908525461954466, - "loss": 0.4766, - "step": 655000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4596790373325348, - "eval_runtime": 78.9864, - "eval_samples_per_second": 81.027, - "eval_steps_per_second": 0.633, - "step": 655000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012899816424714043, - "loss": 0.4765, - "step": 656000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012891083751499744, - "loss": 0.4771, - "step": 657000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012882336179196102, - "loss": 0.4784, - "step": 658000 - }, - { - "epoch": 0.1, - "learning_rate": 0.00012873582504314393, - "loss": 0.478, - "step": 659000 - }, - { - "epoch": 0.1, - "learning_rate": 0.0001286480522864964, - "loss": 0.477, - "step": 660000 - }, - { - "epoch": 0.1, - "eval_loss": 0.45967376232147217, - "eval_runtime": 76.8819, - "eval_samples_per_second": 83.245, - "eval_steps_per_second": 0.65, - "step": 660000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012856021933432393, - "loss": 0.4775, - "step": 661000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001284721506124795, - "loss": 0.4759, - "step": 662000 - }, - { - "epoch": 0.11, - "learning_rate": 0.000128383934236058, - "loss": 0.4762, - "step": 663000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001282956589110738, - "loss": 0.4768, - "step": 664000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012820723676322364, - "loss": 0.4772, - "step": 665000 - }, - { - "epoch": 0.11, - "eval_loss": 0.45860305428504944, - "eval_runtime": 78.1325, - "eval_samples_per_second": 81.912, - "eval_steps_per_second": 0.64, - "step": 665000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012811857933062512, - "loss": 0.4762, - "step": 666000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012802977532048903, - "loss": 0.4779, - "step": 667000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012794082500332778, - "loss": 0.4779, - "step": 668000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001278517286500994, - "loss": 0.4755, - "step": 669000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012776257584704384, - "loss": 0.4766, - "step": 670000 - }, - { - "epoch": 0.11, - "eval_loss": 0.45803070068359375, - "eval_runtime": 78.0714, - "eval_samples_per_second": 81.976, - "eval_steps_per_second": 0.64, - "step": 670000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012767327784173963, - "loss": 0.4757, - "step": 671000 - }, - { - "epoch": 0.11, - "learning_rate": 0.000127583745300672, - "loss": 0.4757, - "step": 672000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012749406781126398, - "loss": 0.4767, - "step": 673000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012740424564668877, - "loss": 0.4768, - "step": 674000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012731427908056026, - "loss": 0.4764, - "step": 675000 - }, - { - "epoch": 0.11, - "eval_loss": 0.4579757750034332, - "eval_runtime": 78.9147, - "eval_samples_per_second": 81.1, - "eval_steps_per_second": 0.634, - "step": 675000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012722416838693222, - "loss": 0.4778, - "step": 676000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012713400416660716, - "loss": 0.4759, - "step": 677000 - }, - { - "epoch": 0.11, - "learning_rate": 0.00012704360618533725, - "loss": 0.4755, - "step": 678000 - }, - { - "epoch": 0.11, - "learning_rate": 0.0001269531555138567, - "loss": 0.4759, - "step": 679000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012686247134531458, - "loss": 0.4775, - "step": 680000 - }, - { - "epoch": 0.12, - "eval_loss": 0.45441490411758423, - "eval_runtime": 78.5989, - "eval_samples_per_second": 81.426, - "eval_steps_per_second": 0.636, - "step": 680000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012677173532368958, - "loss": 0.4763, - "step": 681000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012668076607172854, - "loss": 0.4762, - "step": 682000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012658965462205847, - "loss": 0.4764, - "step": 683000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012649840125222064, - "loss": 0.4761, - "step": 684000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012640700624018863, - "loss": 0.4751, - "step": 685000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4569858908653259, - "eval_runtime": 78.7381, - "eval_samples_per_second": 81.282, - "eval_steps_per_second": 0.635, - "step": 685000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012631546986436747, - "loss": 0.4751, - "step": 686000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001262238841514327, - "loss": 0.4765, - "step": 687000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012613206602563615, - "loss": 0.4756, - "step": 688000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012604029143092928, - "loss": 0.4767, - "step": 689000 - }, - { - "epoch": 0.12, - "learning_rate": 0.000125948284981498, - "loss": 0.4752, - "step": 690000 - }, - { - "epoch": 0.12, - "eval_loss": 0.456228107213974, - "eval_runtime": 80.1461, - "eval_samples_per_second": 79.854, - "eval_steps_per_second": 0.624, - "step": 690000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012585604653715085, - "loss": 0.4758, - "step": 691000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001257636684073334, - "loss": 0.4757, - "step": 692000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012567124346051786, - "loss": 0.4738, - "step": 693000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012557858694336514, - "loss": 0.4739, - "step": 694000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012548579158593213, - "loss": 0.4736, - "step": 695000 - }, - { - "epoch": 0.12, - "eval_loss": 0.45279574394226074, - "eval_runtime": 78.8366, - "eval_samples_per_second": 81.181, - "eval_steps_per_second": 0.634, - "step": 695000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012539295067392003, - "loss": 0.4758, - "step": 696000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012529987862249377, - "loss": 0.475, - "step": 697000 - }, - { - "epoch": 0.12, - "learning_rate": 0.0001252066685797811, - "loss": 0.4745, - "step": 698000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012511332082971603, - "loss": 0.4738, - "step": 699000 - }, - { - "epoch": 0.12, - "learning_rate": 0.00012501983565665193, - "loss": 0.4737, - "step": 700000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4576612412929535, - "eval_runtime": 79.2297, - "eval_samples_per_second": 80.778, - "eval_steps_per_second": 0.631, - "step": 700000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012492621334536086, - "loss": 0.475, - "step": 701000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012483254800846005, - "loss": 0.4748, - "step": 702000 - }, - { - "epoch": 0.13, - "learning_rate": 0.000124738652413126, - "loss": 0.476, - "step": 703000 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001246447146359523, - "loss": 0.4738, - "step": 704000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012455054689951964, - "loss": 0.4741, - "step": 705000 - }, - { - "epoch": 0.13, - "eval_loss": 0.4546070992946625, - "eval_runtime": 79.5809, - "eval_samples_per_second": 80.421, - "eval_steps_per_second": 0.628, - "step": 705000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012445624345439494, - "loss": 0.4732, - "step": 706000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012436180458784285, - "loss": 0.4735, - "step": 707000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012426741987021326, - "loss": 0.4746, - "step": 708000 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001241727112936531, - "loss": 0.4737, - "step": 709000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012407786815935373, - "loss": 0.4727, - "step": 710000 - }, - { - "epoch": 0.13, - "eval_loss": 0.4537406861782074, - "eval_runtime": 78.7373, - "eval_samples_per_second": 81.283, - "eval_steps_per_second": 0.635, - "step": 710000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012398289075622377, - "loss": 0.4743, - "step": 711000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012388787455178986, - "loss": 0.4736, - "step": 712000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012379262961290468, - "loss": 0.4737, - "step": 713000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012369725127407496, - "loss": 0.4736, - "step": 714000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012360173982583968, - "loss": 0.4736, - "step": 715000 - }, - { - "epoch": 0.13, - "eval_loss": 0.4546468257904053, - "eval_runtime": 78.1145, - "eval_samples_per_second": 81.931, - "eval_steps_per_second": 0.64, - "step": 715000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012350619126965588, - "loss": 0.4734, - "step": 716000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012341051045099063, - "loss": 0.4748, - "step": 717000 - }, - { - "epoch": 0.13, - "learning_rate": 0.00012331460168600204, - "loss": 0.4745, - "step": 718000 - }, - { - "epoch": 0.13, - "learning_rate": 0.0001232185609772249, - "loss": 0.4729, - "step": 719000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012312238861721586, - "loss": 0.4736, - "step": 720000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4559849202632904, - "eval_runtime": 80.1667, - "eval_samples_per_second": 79.834, - "eval_steps_per_second": 0.624, - "step": 720000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001230260848989326, - "loss": 0.4745, - "step": 721000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012292984311590698, - "loss": 0.4735, - "step": 722000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012283327782279692, - "loss": 0.4731, - "step": 723000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012273658205209426, - "loss": 0.4728, - "step": 724000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012263985298923314, - "loss": 0.4735, - "step": 725000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4538413882255554, - "eval_runtime": 78.3414, - "eval_samples_per_second": 81.694, - "eval_steps_per_second": 0.638, - "step": 725000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012254289727713877, - "loss": 0.4721, - "step": 726000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001224458119720013, - "loss": 0.4738, - "step": 727000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001223486946486473, - "loss": 0.4716, - "step": 728000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001222513511738867, - "loss": 0.4722, - "step": 729000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012215387899418292, - "loss": 0.4714, - "step": 730000 - }, - { - "epoch": 0.14, - "eval_loss": 0.45341652631759644, - "eval_runtime": 81.7464, - "eval_samples_per_second": 78.291, - "eval_steps_per_second": 0.612, - "step": 730000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012205637607108188, - "loss": 0.4721, - "step": 731000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012195864750059665, - "loss": 0.4724, - "step": 732000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012186088903692187, - "loss": 0.4726, - "step": 733000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012176290526525648, - "loss": 0.4721, - "step": 734000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012166479427653891, - "loss": 0.4736, - "step": 735000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4538614749908447, - "eval_runtime": 76.948, - "eval_samples_per_second": 83.173, - "eval_steps_per_second": 0.65, - "step": 735000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001215665563696323, - "loss": 0.4717, - "step": 736000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012146819184378635, - "loss": 0.4721, - "step": 737000 - }, - { - "epoch": 0.14, - "learning_rate": 0.00012136989810619402, - "loss": 0.472, - "step": 738000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001212712814934992, - "loss": 0.4725, - "step": 739000 - }, - { - "epoch": 0.14, - "learning_rate": 0.0001211725391613235, - "loss": 0.4721, - "step": 740000 - }, - { - "epoch": 0.14, - "eval_loss": 0.45002052187919617, - "eval_runtime": 81.7464, - "eval_samples_per_second": 78.291, - "eval_steps_per_second": 0.612, - "step": 740000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012107367141045319, - "loss": 0.4713, - "step": 741000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012097477759732085, - "loss": 0.4721, - "step": 742000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012087566003761258, - "loss": 0.4723, - "step": 743000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012077641796355505, - "loss": 0.4713, - "step": 744000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012067715110568603, - "loss": 0.4708, - "step": 745000 - }, - { - "epoch": 0.15, - "eval_loss": 0.45279091596603394, - "eval_runtime": 81.252, - "eval_samples_per_second": 78.767, - "eval_steps_per_second": 0.615, - "step": 745000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012057766103399194, - "loss": 0.4705, - "step": 746000 - }, - { - "epoch": 0.15, - "learning_rate": 0.0001204780473557052, - "loss": 0.4721, - "step": 747000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012037831037426628, - "loss": 0.4716, - "step": 748000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012027865023600415, - "loss": 0.4714, - "step": 749000 - }, - { - "epoch": 0.15, - "learning_rate": 0.00012017866780517054, - "loss": 0.4716, - "step": 750000 - }, - { - "epoch": 0.15, - "eval_loss": 0.45302364230155945, - "eval_runtime": 80.5157, - "eval_samples_per_second": 79.488, - "eval_steps_per_second": 0.621, - "step": 750000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00012007856298314685, - "loss": 0.4718, - "step": 751000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011997833607486966, - "loss": 0.4709, - "step": 752000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011987808779506468, - "loss": 0.472, - "step": 753000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011977771828251805, - "loss": 0.472, - "step": 754000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011967712719084747, - "loss": 0.4715, - "step": 755000 - }, - { - "epoch": 0.0, - "eval_loss": 0.4534043073654175, - "eval_runtime": 81.188, - "eval_samples_per_second": 78.829, - "eval_steps_per_second": 0.616, - "step": 755000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011957641523576647, - "loss": 0.4711, - "step": 756000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011947558272406116, - "loss": 0.4698, - "step": 757000 - }, - { - "epoch": 0.0, - "learning_rate": 0.0001193746299628848, - "loss": 0.4694, - "step": 758000 - }, - { - "epoch": 0.0, - "learning_rate": 0.00011927365839226875, - "loss": 0.4721, - "step": 759000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011917246617455475, - "loss": 0.4685, - "step": 760000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4503335952758789, - "eval_runtime": 78.7744, - "eval_samples_per_second": 81.245, - "eval_steps_per_second": 0.635, - "step": 760000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011907125600175987, - "loss": 0.4697, - "step": 761000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011896992704905985, - "loss": 0.4715, - "step": 762000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011886837801627817, - "loss": 0.4708, - "step": 763000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001187667105836763, - "loss": 0.472, - "step": 764000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011866502690535649, - "loss": 0.4707, - "step": 765000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4522061049938202, - "eval_runtime": 81.4742, - "eval_samples_per_second": 78.552, - "eval_steps_per_second": 0.614, - "step": 765000 - }, - { - "epoch": 0.01, - "learning_rate": 0.000118563123720189, - "loss": 0.47, - "step": 766000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001184612051442868, - "loss": 0.4708, - "step": 767000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011835906744697045, - "loss": 0.4705, - "step": 768000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011825681290128225, - "loss": 0.4706, - "step": 769000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011815444181870728, - "loss": 0.4695, - "step": 770000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4502740204334259, - "eval_runtime": 80.2036, - "eval_samples_per_second": 79.797, - "eval_steps_per_second": 0.623, - "step": 770000 - }, - { - "epoch": 0.01, - "learning_rate": 0.000118052159601486, - "loss": 0.4714, - "step": 771000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011794955661252567, - "loss": 0.4696, - "step": 772000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011784683802263448, - "loss": 0.4709, - "step": 773000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001177440041447111, - "loss": 0.4702, - "step": 774000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011764105529200537, - "loss": 0.4718, - "step": 775000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4481835961341858, - "eval_runtime": 81.6948, - "eval_samples_per_second": 78.34, - "eval_steps_per_second": 0.612, - "step": 775000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011753809489880001, - "loss": 0.4715, - "step": 776000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011743502038662826, - "loss": 0.4699, - "step": 777000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011733172872032392, - "loss": 0.4693, - "step": 778000 - }, - { - "epoch": 0.01, - "learning_rate": 0.00011722842679718266, - "loss": 0.4696, - "step": 779000 - }, - { - "epoch": 0.01, - "learning_rate": 0.0001171249081212757, - "loss": 0.4686, - "step": 780000 - }, - { - "epoch": 0.01, - "eval_loss": 0.45043304562568665, - "eval_runtime": 83.0198, - "eval_samples_per_second": 77.09, - "eval_steps_per_second": 0.602, - "step": 780000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011702127635646021, - "loss": 0.4694, - "step": 781000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011691753181841654, - "loss": 0.4696, - "step": 782000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011681367482316854, - "loss": 0.4702, - "step": 783000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011670970568708263, - "loss": 0.4699, - "step": 784000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011660572886357773, - "loss": 0.471, - "step": 785000 - }, - { - "epoch": 0.02, - "eval_loss": 0.4487108588218689, - "eval_runtime": 81.1941, - "eval_samples_per_second": 78.823, - "eval_steps_per_second": 0.616, - "step": 785000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011650153650762927, - "loss": 0.4691, - "step": 786000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011639723296167041, - "loss": 0.469, - "step": 787000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011629292301312084, - "loss": 0.4703, - "step": 788000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011618839815105406, - "loss": 0.4698, - "step": 789000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011608376305285055, - "loss": 0.469, - "step": 790000 - }, - { - "epoch": 0.02, - "eval_loss": 0.45146042108535767, - "eval_runtime": 80.5783, - "eval_samples_per_second": 79.426, - "eval_steps_per_second": 0.621, - "step": 790000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011597901803724696, - "loss": 0.4701, - "step": 791000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011587426833256681, - "loss": 0.4702, - "step": 792000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011576930454883066, - "loss": 0.4692, - "step": 793000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011566423180558957, - "loss": 0.4688, - "step": 794000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011555905042291336, - "loss": 0.4701, - "step": 795000 - }, - { - "epoch": 0.02, - "eval_loss": 0.44838276505470276, - "eval_runtime": 79.5446, - "eval_samples_per_second": 80.458, - "eval_steps_per_second": 0.629, - "step": 795000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011545386606490308, - "loss": 0.4692, - "step": 796000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011534857392415738, - "loss": 0.4686, - "step": 797000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011524306876193287, - "loss": 0.4683, - "step": 798000 - }, - { - "epoch": 0.02, - "learning_rate": 0.00011513756190924513, - "loss": 0.4699, - "step": 799000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011503184246260971, - "loss": 0.4694, - "step": 800000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4521300494670868, - "eval_runtime": 81.6914, - "eval_samples_per_second": 78.344, - "eval_steps_per_second": 0.612, - "step": 800000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011492601630290677, - "loss": 0.4693, - "step": 801000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011482008375250114, - "loss": 0.4687, - "step": 802000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011471415122557365, - "loss": 0.4671, - "step": 803000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011460800696773604, - "loss": 0.4689, - "step": 804000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011450186359013775, - "loss": 0.467, - "step": 805000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4480891525745392, - "eval_runtime": 82.1197, - "eval_samples_per_second": 77.935, - "eval_steps_per_second": 0.609, - "step": 805000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011439550891690946, - "loss": 0.4671, - "step": 806000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011428904946899599, - "loss": 0.4684, - "step": 807000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011418248557069125, - "loss": 0.4687, - "step": 808000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011407592426653418, - "loss": 0.4681, - "step": 809000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011396915254523895, - "loss": 0.4671, - "step": 810000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4491584897041321, - "eval_runtime": 80.5568, - "eval_samples_per_second": 79.447, - "eval_steps_per_second": 0.621, - "step": 810000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011386227734801394, - "loss": 0.469, - "step": 811000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011375529900041961, - "loss": 0.4685, - "step": 812000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011364843209307678, - "loss": 0.4686, - "step": 813000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011354124862735163, - "loss": 0.466, - "step": 814000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011343407032573083, - "loss": 0.4682, - "step": 815000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4499417841434479, - "eval_runtime": 82.5085, - "eval_samples_per_second": 77.568, - "eval_steps_per_second": 0.606, - "step": 815000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011332668294357827, - "loss": 0.4678, - "step": 816000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011321919404256975, - "loss": 0.4682, - "step": 817000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011311160395013516, - "loss": 0.467, - "step": 818000 - }, - { - "epoch": 0.03, - "learning_rate": 0.00011300391299401257, - "loss": 0.4667, - "step": 819000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011289633718534724, - "loss": 0.467, - "step": 820000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4481469690799713, - "eval_runtime": 82.9256, - "eval_samples_per_second": 77.178, - "eval_steps_per_second": 0.603, - "step": 820000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011278844568637758, - "loss": 0.4666, - "step": 821000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011268045430811603, - "loss": 0.4677, - "step": 822000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011257236337952306, - "loss": 0.4687, - "step": 823000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011246428146946332, - "loss": 0.4687, - "step": 824000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011235610086525684, - "loss": 0.4675, - "step": 825000 - }, - { - "epoch": 0.04, - "eval_loss": 0.449359655380249, - "eval_runtime": 81.428, - "eval_samples_per_second": 78.597, - "eval_steps_per_second": 0.614, - "step": 825000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011224771345925375, - "loss": 0.467, - "step": 826000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001121392278211229, - "loss": 0.4688, - "step": 827000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011203064428133039, - "loss": 0.4676, - "step": 828000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001119220719003777, - "loss": 0.4664, - "step": 829000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001118132936469263, - "loss": 0.4676, - "step": 830000 - }, - { - "epoch": 0.04, - "eval_loss": 0.44925883412361145, - "eval_runtime": 83.2246, - "eval_samples_per_second": 76.9, - "eval_steps_per_second": 0.601, - "step": 830000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001117045274081231, - "loss": 0.4689, - "step": 831000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011159566478488941, - "loss": 0.4686, - "step": 832000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011148659699242007, - "loss": 0.4672, - "step": 833000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011137743328631572, - "loss": 0.4678, - "step": 834000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011126817399910799, - "loss": 0.4658, - "step": 835000 - }, - { - "epoch": 0.04, - "eval_loss": 0.450771689414978, - "eval_runtime": 83.2311, - "eval_samples_per_second": 76.894, - "eval_steps_per_second": 0.601, - "step": 835000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011115892886562073, - "loss": 0.4664, - "step": 836000 - }, - { - "epoch": 0.04, - "learning_rate": 0.0001110494795097132, - "loss": 0.4651, - "step": 837000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011094004516277705, - "loss": 0.4672, - "step": 838000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011083040707044233, - "loss": 0.4674, - "step": 839000 - }, - { - "epoch": 0.04, - "learning_rate": 0.00011072078484214722, - "loss": 0.4676, - "step": 840000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4472251832485199, - "eval_runtime": 80.5148, - "eval_samples_per_second": 79.488, - "eval_steps_per_second": 0.621, - "step": 840000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011061095934795527, - "loss": 0.4667, - "step": 841000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011050115057267808, - "loss": 0.4662, - "step": 842000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011039113901348006, - "loss": 0.4667, - "step": 843000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011028103487760205, - "loss": 0.4679, - "step": 844000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001101709487427803, - "loss": 0.4665, - "step": 845000 - }, - { - "epoch": 0.05, - "eval_loss": 0.44985419511795044, - "eval_runtime": 82.384, - "eval_samples_per_second": 77.685, - "eval_steps_per_second": 0.607, - "step": 845000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00011006066055174999, - "loss": 0.4655, - "step": 846000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001099503912161227, - "loss": 0.4666, - "step": 847000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001098399203124234, - "loss": 0.4657, - "step": 848000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010972946911824438, - "loss": 0.4672, - "step": 849000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010961881684658344, - "loss": 0.4675, - "step": 850000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4436103105545044, - "eval_runtime": 81.2881, - "eval_samples_per_second": 78.732, - "eval_steps_per_second": 0.615, - "step": 850000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010950818513832014, - "loss": 0.4659, - "step": 851000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001093973528456127, - "loss": 0.4661, - "step": 852000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010928643100347024, - "loss": 0.4664, - "step": 853000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010917541994977997, - "loss": 0.4647, - "step": 854000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010906443116690753, - "loss": 0.4666, - "step": 855000 - }, - { - "epoch": 0.05, - "eval_loss": 0.44435253739356995, - "eval_runtime": 82.1977, - "eval_samples_per_second": 77.861, - "eval_steps_per_second": 0.608, - "step": 855000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010895324279323492, - "loss": 0.467, - "step": 856000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010884196622296353, - "loss": 0.4652, - "step": 857000 - }, - { - "epoch": 0.05, - "learning_rate": 0.00010873071320326107, - "loss": 0.4658, - "step": 858000 - }, - { - "epoch": 0.05, - "learning_rate": 0.0001086192613443119, - "loss": 0.4657, - "step": 859000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010850772230612937, - "loss": 0.4656, - "step": 860000 - }, - { - "epoch": 0.06, - "eval_loss": 0.44788363575935364, - "eval_runtime": 81.6254, - "eval_samples_per_second": 78.407, - "eval_steps_per_second": 0.613, - "step": 860000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010839609642848077, - "loss": 0.4663, - "step": 861000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010828449580686817, - "loss": 0.4649, - "step": 862000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010817269735663531, - "loss": 0.4656, - "step": 863000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010806081308748046, - "loss": 0.4656, - "step": 864000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001079490673647975, - "loss": 0.4664, - "step": 865000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4458025395870209, - "eval_runtime": 83.4659, - "eval_samples_per_second": 76.678, - "eval_steps_per_second": 0.599, - "step": 865000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001078370126504492, - "loss": 0.4645, - "step": 866000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010772487313973276, - "loss": 0.4664, - "step": 867000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010761276144028143, - "loss": 0.4661, - "step": 868000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010750056579571238, - "loss": 0.4647, - "step": 869000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010738817411369786, - "loss": 0.4648, - "step": 870000 - }, - { - "epoch": 0.06, - "eval_loss": 0.44627639651298523, - "eval_runtime": 78.7772, - "eval_samples_per_second": 81.242, - "eval_steps_per_second": 0.635, - "step": 870000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010727569900255524, - "loss": 0.464, - "step": 871000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010716314080490315, - "loss": 0.4656, - "step": 872000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010705061254577097, - "loss": 0.4645, - "step": 873000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010693788928619646, - "loss": 0.467, - "step": 874000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010682508396913951, - "loss": 0.4644, - "step": 875000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4447041451931, - "eval_runtime": 84.2151, - "eval_samples_per_second": 75.996, - "eval_steps_per_second": 0.594, - "step": 875000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010671219693822467, - "loss": 0.4653, - "step": 876000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010659934154625607, - "loss": 0.4652, - "step": 877000 - }, - { - "epoch": 0.06, - "learning_rate": 0.0001064862922003476, - "loss": 0.4654, - "step": 878000 - }, - { - "epoch": 0.06, - "learning_rate": 0.00010637327534281335, - "loss": 0.4641, - "step": 879000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010626006505800474, - "loss": 0.4642, - "step": 880000 - }, - { - "epoch": 0.07, - "eval_loss": 0.44686806201934814, - "eval_runtime": 79.0811, - "eval_samples_per_second": 80.93, - "eval_steps_per_second": 0.632, - "step": 880000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010614688811060288, - "loss": 0.4646, - "step": 881000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001060336316744615, - "loss": 0.4652, - "step": 882000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010592018260532381, - "loss": 0.4655, - "step": 883000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010580665457881469, - "loss": 0.4649, - "step": 884000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001056930479407602, - "loss": 0.4663, - "step": 885000 - }, - { - "epoch": 0.07, - "eval_loss": 0.44526371359825134, - "eval_runtime": 78.5736, - "eval_samples_per_second": 81.452, - "eval_steps_per_second": 0.636, - "step": 885000 - }, - { - "epoch": 0.07, - "learning_rate": 0.0001055793630372259, - "loss": 0.4655, - "step": 886000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010546560021451571, - "loss": 0.4629, - "step": 887000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010535187369819837, - "loss": 0.4643, - "step": 888000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010523795615404908, - "loss": 0.4637, - "step": 889000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010512396173070846, - "loss": 0.4653, - "step": 890000 - }, - { - "epoch": 0.07, - "eval_loss": 0.44463416934013367, - "eval_runtime": 78.8452, - "eval_samples_per_second": 81.172, - "eval_steps_per_second": 0.634, - "step": 890000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010500989077542328, - "loss": 0.4651, - "step": 891000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010489597200575402, - "loss": 0.4635, - "step": 892000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010478174918057791, - "loss": 0.4635, - "step": 893000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010466745086589606, - "loss": 0.4639, - "step": 894000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010455319182075405, - "loss": 0.4637, - "step": 895000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4454698860645294, - "eval_runtime": 81.883, - "eval_samples_per_second": 78.16, - "eval_steps_per_second": 0.611, - "step": 895000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010443874364642582, - "loss": 0.4642, - "step": 896000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010432433558713417, - "loss": 0.4642, - "step": 897000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010420973894628034, - "loss": 0.4635, - "step": 898000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010409518326547141, - "loss": 0.4638, - "step": 899000 - }, - { - "epoch": 0.07, - "learning_rate": 0.00010398043955301988, - "loss": 0.4637, - "step": 900000 - }, - { - "epoch": 0.07, - "eval_loss": 0.44443508982658386, - "eval_runtime": 80.8881, - "eval_samples_per_second": 79.122, - "eval_steps_per_second": 0.618, - "step": 900000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010386562279199064, - "loss": 0.4619, - "step": 901000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010375084825779196, - "loss": 0.4642, - "step": 902000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001036358865212581, - "loss": 0.4644, - "step": 903000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010352096785529729, - "loss": 0.4642, - "step": 904000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010340586254263087, - "loss": 0.4625, - "step": 905000 - }, - { - "epoch": 0.08, - "eval_loss": 0.44563567638397217, - "eval_runtime": 78.8321, - "eval_samples_per_second": 81.185, - "eval_steps_per_second": 0.634, - "step": 905000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010329080114375571, - "loss": 0.4652, - "step": 906000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010317555365607644, - "loss": 0.4641, - "step": 907000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010306023557164153, - "loss": 0.4634, - "step": 908000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010294484724172987, - "loss": 0.4623, - "step": 909000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010282938901783426, - "loss": 0.4621, - "step": 910000 - }, - { - "epoch": 0.08, - "eval_loss": 0.4458223581314087, - "eval_runtime": 80.4596, - "eval_samples_per_second": 79.543, - "eval_steps_per_second": 0.621, - "step": 910000 - }, - { - "epoch": 0.08, - "learning_rate": 0.000102713976814046, - "loss": 0.4627, - "step": 911000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010259849555785737, - "loss": 0.4638, - "step": 912000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010248282990041591, - "loss": 0.4626, - "step": 913000 - }, - { - "epoch": 0.08, - "learning_rate": 0.0001023670957563755, - "loss": 0.4615, - "step": 914000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010225129347828241, - "loss": 0.4636, - "step": 915000 - }, - { - "epoch": 0.08, - "eval_loss": 0.44487082958221436, - "eval_runtime": 81.5786, - "eval_samples_per_second": 78.452, - "eval_steps_per_second": 0.613, - "step": 915000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010213542341889048, - "loss": 0.4642, - "step": 916000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010201960190221062, - "loss": 0.4627, - "step": 917000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010190359740620597, - "loss": 0.4622, - "step": 918000 - }, - { - "epoch": 0.08, - "learning_rate": 0.00010178752618804497, - "loss": 0.4633, - "step": 919000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010167150477192083, - "loss": 0.4605, - "step": 920000 - }, - { - "epoch": 0.09, - "eval_loss": 0.44223639369010925, - "eval_runtime": 78.8789, - "eval_samples_per_second": 81.137, - "eval_steps_per_second": 0.634, - "step": 920000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010155530123620656, - "loss": 0.4632, - "step": 921000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010143914834118332, - "loss": 0.4628, - "step": 922000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010132281390240366, - "loss": 0.4628, - "step": 923000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010120653094235274, - "loss": 0.4634, - "step": 924000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010109006701658896, - "loss": 0.4619, - "step": 925000 - }, - { - "epoch": 0.09, - "eval_loss": 0.44582226872444153, - "eval_runtime": 79.59, - "eval_samples_per_second": 80.412, - "eval_steps_per_second": 0.628, - "step": 925000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010097353884682351, - "loss": 0.4634, - "step": 926000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010085706341187459, - "loss": 0.4616, - "step": 927000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010074040788255187, - "loss": 0.4636, - "step": 928000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010062368917434986, - "loss": 0.4636, - "step": 929000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010050702445560737, - "loss": 0.4621, - "step": 930000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4443116784095764, - "eval_runtime": 81.0459, - "eval_samples_per_second": 78.968, - "eval_steps_per_second": 0.617, - "step": 930000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010039018051876424, - "loss": 0.4629, - "step": 931000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010027327446989505, - "loss": 0.4637, - "step": 932000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010015642366364926, - "loss": 0.4613, - "step": 933000 - }, - { - "epoch": 0.09, - "learning_rate": 0.00010003939452048603, - "loss": 0.4623, - "step": 934000 - }, - { - "epoch": 0.09, - "learning_rate": 9.992242145441094e-05, - "loss": 0.4611, - "step": 935000 - }, - { - "epoch": 0.09, - "eval_loss": 0.44253501296043396, - "eval_runtime": 80.3792, - "eval_samples_per_second": 79.623, - "eval_steps_per_second": 0.622, - "step": 935000 - }, - { - "epoch": 0.09, - "learning_rate": 9.980527064149124e-05, - "loss": 0.4616, - "step": 936000 - }, - { - "epoch": 0.09, - "learning_rate": 9.968817673944183e-05, - "loss": 0.4624, - "step": 937000 - }, - { - "epoch": 0.09, - "learning_rate": 9.957090568278425e-05, - "loss": 0.4624, - "step": 938000 - }, - { - "epoch": 0.09, - "learning_rate": 9.94535750097539e-05, - "loss": 0.4625, - "step": 939000 - }, - { - "epoch": 0.1, - "learning_rate": 9.933618507776033e-05, - "loss": 0.4631, - "step": 940000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4430947005748749, - "eval_runtime": 79.9849, - "eval_samples_per_second": 80.015, - "eval_steps_per_second": 0.625, - "step": 940000 - }, - { - "epoch": 0.1, - "learning_rate": 9.921885372252918e-05, - "loss": 0.4624, - "step": 941000 - }, - { - "epoch": 0.1, - "learning_rate": 9.910134640392372e-05, - "loss": 0.4617, - "step": 942000 - }, - { - "epoch": 0.1, - "learning_rate": 9.898401608814534e-05, - "loss": 0.4625, - "step": 943000 - }, - { - "epoch": 0.1, - "learning_rate": 9.886639287093549e-05, - "loss": 0.4625, - "step": 944000 - }, - { - "epoch": 0.1, - "learning_rate": 9.874871218342105e-05, - "loss": 0.4623, - "step": 945000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4420580267906189, - "eval_runtime": 78.2862, - "eval_samples_per_second": 81.751, - "eval_steps_per_second": 0.639, - "step": 945000 - }, - { - "epoch": 0.1, - "learning_rate": 9.863097438407781e-05, - "loss": 0.4612, - "step": 946000 - }, - { - "epoch": 0.1, - "learning_rate": 9.851317983155551e-05, - "loss": 0.4621, - "step": 947000 - }, - { - "epoch": 0.1, - "learning_rate": 9.839544676367323e-05, - "loss": 0.4615, - "step": 948000 - }, - { - "epoch": 0.1, - "learning_rate": 9.827753983728851e-05, - "loss": 0.4621, - "step": 949000 - }, - { - "epoch": 0.1, - "learning_rate": 9.815957723434772e-05, - "loss": 0.4625, - "step": 950000 - }, - { - "epoch": 0.1, - "eval_loss": 0.44188880920410156, - "eval_runtime": 79.7091, - "eval_samples_per_second": 80.292, - "eval_steps_per_second": 0.627, - "step": 950000 - }, - { - "epoch": 0.1, - "learning_rate": 9.804167735961679e-05, - "loss": 0.4612, - "step": 951000 - }, - { - "epoch": 0.1, - "learning_rate": 9.7923604536514e-05, - "loss": 0.4611, - "step": 952000 - }, - { - "epoch": 0.1, - "learning_rate": 9.780559526957703e-05, - "loss": 0.4621, - "step": 953000 - }, - { - "epoch": 0.1, - "learning_rate": 9.768741366355399e-05, - "loss": 0.4617, - "step": 954000 - }, - { - "epoch": 0.1, - "learning_rate": 9.756941470310317e-05, - "loss": 0.4623, - "step": 955000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4422100782394409, - "eval_runtime": 78.3534, - "eval_samples_per_second": 81.681, - "eval_steps_per_second": 0.638, - "step": 955000 - }, - { - "epoch": 0.1, - "learning_rate": 9.7451125806065e-05, - "loss": 0.4611, - "step": 956000 - }, - { - "epoch": 0.1, - "learning_rate": 9.733278374987115e-05, - "loss": 0.4614, - "step": 957000 - }, - { - "epoch": 0.1, - "learning_rate": 9.721438889501207e-05, - "loss": 0.4627, - "step": 958000 - }, - { - "epoch": 0.1, - "learning_rate": 9.709594160213901e-05, - "loss": 0.4614, - "step": 959000 - }, - { - "epoch": 0.1, - "learning_rate": 9.697756075732548e-05, - "loss": 0.4612, - "step": 960000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4385935962200165, - "eval_runtime": 79.7647, - "eval_samples_per_second": 80.236, - "eval_steps_per_second": 0.627, - "step": 960000 - }, - { - "epoch": 0.11, - "learning_rate": 9.685912829929902e-05, - "loss": 0.4614, - "step": 961000 - }, - { - "epoch": 0.11, - "learning_rate": 9.674052596023316e-05, - "loss": 0.4619, - "step": 962000 - }, - { - "epoch": 0.11, - "learning_rate": 9.662187262662203e-05, - "loss": 0.4627, - "step": 963000 - }, - { - "epoch": 0.11, - "learning_rate": 9.650316865990422e-05, - "loss": 0.4612, - "step": 964000 - }, - { - "epoch": 0.11, - "learning_rate": 9.638453320090109e-05, - "loss": 0.4601, - "step": 965000 - }, - { - "epoch": 0.11, - "eval_loss": 0.44188645482063293, - "eval_runtime": 78.5733, - "eval_samples_per_second": 81.453, - "eval_steps_per_second": 0.636, - "step": 965000 - }, - { - "epoch": 0.11, - "learning_rate": 9.626572910263071e-05, - "loss": 0.4598, - "step": 966000 - }, - { - "epoch": 0.11, - "learning_rate": 9.6146994334404e-05, - "loss": 0.4605, - "step": 967000 - }, - { - "epoch": 0.11, - "learning_rate": 9.60282104779561e-05, - "loss": 0.4619, - "step": 968000 - }, - { - "epoch": 0.11, - "learning_rate": 9.590925891857327e-05, - "loss": 0.4605, - "step": 969000 - }, - { - "epoch": 0.11, - "learning_rate": 9.579025889682872e-05, - "loss": 0.4614, - "step": 970000 - }, - { - "epoch": 0.11, - "eval_loss": 0.4392482042312622, - "eval_runtime": 78.2208, - "eval_samples_per_second": 81.82, - "eval_steps_per_second": 0.639, - "step": 970000 - }, - { - "epoch": 0.11, - "learning_rate": 9.56712107752171e-05, - "loss": 0.459, - "step": 971000 - }, - { - "epoch": 0.11, - "learning_rate": 9.555223403596256e-05, - "loss": 0.4609, - "step": 972000 - }, - { - "epoch": 0.11, - "learning_rate": 9.543309084987918e-05, - "loss": 0.4614, - "step": 973000 - }, - { - "epoch": 0.11, - "learning_rate": 9.531401986548419e-05, - "loss": 0.4608, - "step": 974000 - }, - { - "epoch": 0.11, - "learning_rate": 9.51947830651998e-05, - "loss": 0.46, - "step": 975000 - }, - { - "epoch": 0.11, - "eval_loss": 0.4403558373451233, - "eval_runtime": 78.7273, - "eval_samples_per_second": 81.293, - "eval_steps_per_second": 0.635, - "step": 975000 - }, - { - "epoch": 0.11, - "learning_rate": 9.507561928505619e-05, - "loss": 0.4601, - "step": 976000 - }, - { - "epoch": 0.11, - "learning_rate": 9.495629032198109e-05, - "loss": 0.4609, - "step": 977000 - }, - { - "epoch": 0.11, - "learning_rate": 9.48369157994519e-05, - "loss": 0.4606, - "step": 978000 - }, - { - "epoch": 0.11, - "learning_rate": 9.471749608110409e-05, - "loss": 0.4596, - "step": 979000 - }, - { - "epoch": 0.12, - "learning_rate": 9.459803153071084e-05, - "loss": 0.4597, - "step": 980000 - }, - { - "epoch": 0.12, - "eval_loss": 0.44013381004333496, - "eval_runtime": 77.6815, - "eval_samples_per_second": 82.388, - "eval_steps_per_second": 0.644, - "step": 980000 - }, - { - "epoch": 0.12, - "learning_rate": 9.447852251218189e-05, - "loss": 0.46, - "step": 981000 - }, - { - "epoch": 0.12, - "learning_rate": 9.435908896459387e-05, - "loss": 0.4592, - "step": 982000 - }, - { - "epoch": 0.12, - "learning_rate": 9.42394921456214e-05, - "loss": 0.4604, - "step": 983000 - }, - { - "epoch": 0.12, - "learning_rate": 9.411997161242634e-05, - "loss": 0.4601, - "step": 984000 - }, - { - "epoch": 0.12, - "learning_rate": 9.400028844880299e-05, - "loss": 0.4593, - "step": 985000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4422694146633148, - "eval_runtime": 78.8261, - "eval_samples_per_second": 81.191, - "eval_steps_per_second": 0.634, - "step": 985000 - }, - { - "epoch": 0.12, - "learning_rate": 9.388068238486491e-05, - "loss": 0.4605, - "step": 986000 - }, - { - "epoch": 0.12, - "learning_rate": 9.376091433343432e-05, - "loss": 0.4603, - "step": 987000 - }, - { - "epoch": 0.12, - "learning_rate": 9.364110436387217e-05, - "loss": 0.4614, - "step": 988000 - }, - { - "epoch": 0.12, - "learning_rate": 9.352137271329748e-05, - "loss": 0.46, - "step": 989000 - }, - { - "epoch": 0.12, - "learning_rate": 9.340148004349036e-05, - "loss": 0.4605, - "step": 990000 - }, - { - "epoch": 0.12, - "eval_loss": 0.44141891598701477, - "eval_runtime": 80.7223, - "eval_samples_per_second": 79.284, - "eval_steps_per_second": 0.619, - "step": 990000 - }, - { - "epoch": 0.12, - "learning_rate": 9.328166650421353e-05, - "loss": 0.4585, - "step": 991000 - }, - { - "epoch": 0.12, - "learning_rate": 9.316169259355529e-05, - "loss": 0.4598, - "step": 992000 - }, - { - "epoch": 0.12, - "learning_rate": 9.30416785900988e-05, - "loss": 0.4601, - "step": 993000 - }, - { - "epoch": 0.12, - "learning_rate": 9.292174493288034e-05, - "loss": 0.4596, - "step": 994000 - }, - { - "epoch": 0.12, - "learning_rate": 9.28016518798777e-05, - "loss": 0.4601, - "step": 995000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4383966028690338, - "eval_runtime": 82.6078, - "eval_samples_per_second": 77.475, - "eval_steps_per_second": 0.605, - "step": 995000 - }, - { - "epoch": 0.12, - "learning_rate": 9.26816399822291e-05, - "loss": 0.4592, - "step": 996000 - }, - { - "epoch": 0.12, - "learning_rate": 9.256146934151332e-05, - "loss": 0.4586, - "step": 997000 - }, - { - "epoch": 0.12, - "learning_rate": 9.244138066427978e-05, - "loss": 0.4607, - "step": 998000 - }, - { - "epoch": 0.12, - "learning_rate": 9.232113389862827e-05, - "loss": 0.4598, - "step": 999000 - }, - { - "epoch": 0.12, - "learning_rate": 9.220084960066614e-05, - "loss": 0.459, - "step": 1000000 - }, - { - "epoch": 0.12, - "eval_loss": 0.43742096424102783, - "eval_runtime": 80.287, - "eval_samples_per_second": 79.714, - "eval_steps_per_second": 0.623, - "step": 1000000 - }, - { - "epoch": 0.13, - "learning_rate": 9.208052813680018e-05, - "loss": 0.4597, - "step": 1001000 - }, - { - "epoch": 0.13, - "learning_rate": 9.196029025007298e-05, - "loss": 0.4589, - "step": 1002000 - }, - { - "epoch": 0.13, - "learning_rate": 9.183989559032122e-05, - "loss": 0.4593, - "step": 1003000 - }, - { - "epoch": 0.13, - "learning_rate": 9.171946486419417e-05, - "loss": 0.4588, - "step": 1004000 - }, - { - "epoch": 0.13, - "learning_rate": 9.159911892268011e-05, - "loss": 0.4592, - "step": 1005000 - }, - { - "epoch": 0.13, - "eval_loss": 0.44004783034324646, - "eval_runtime": 78.0539, - "eval_samples_per_second": 81.995, - "eval_steps_per_second": 0.641, - "step": 1005000 - }, - { - "epoch": 0.13, - "learning_rate": 9.14786171996191e-05, - "loss": 0.459, - "step": 1006000 - }, - { - "epoch": 0.13, - "learning_rate": 9.135808051069939e-05, - "loss": 0.4581, - "step": 1007000 - }, - { - "epoch": 0.13, - "learning_rate": 9.123762981154407e-05, - "loss": 0.4588, - "step": 1008000 - }, - { - "epoch": 0.13, - "learning_rate": 9.11170243265873e-05, - "loss": 0.4606, - "step": 1009000 - }, - { - "epoch": 0.13, - "learning_rate": 9.099650563338879e-05, - "loss": 0.4595, - "step": 1010000 - }, - { - "epoch": 0.13, - "eval_loss": 0.4407427906990051, - "eval_runtime": 80.6794, - "eval_samples_per_second": 79.326, - "eval_steps_per_second": 0.62, - "step": 1010000 - }, - { - "epoch": 0.13, - "learning_rate": 9.087595350988804e-05, - "loss": 0.46, - "step": 1011000 - }, - { - "epoch": 0.13, - "learning_rate": 9.075524760023795e-05, - "loss": 0.4585, - "step": 1012000 - }, - { - "epoch": 0.13, - "learning_rate": 9.063450892823849e-05, - "loss": 0.4594, - "step": 1013000 - }, - { - "epoch": 0.13, - "learning_rate": 9.051373786168048e-05, - "loss": 0.4591, - "step": 1014000 - }, - { - "epoch": 0.13, - "learning_rate": 9.039293476845361e-05, - "loss": 0.4586, - "step": 1015000 - }, - { - "epoch": 0.13, - "eval_loss": 0.438150554895401, - "eval_runtime": 80.3486, - "eval_samples_per_second": 79.653, - "eval_steps_per_second": 0.622, - "step": 1015000 - }, - { - "epoch": 0.13, - "learning_rate": 9.0272220866988e-05, - "loss": 0.4582, - "step": 1016000 - }, - { - "epoch": 0.13, - "learning_rate": 9.01513548555881e-05, - "loss": 0.4578, - "step": 1017000 - }, - { - "epoch": 0.13, - "learning_rate": 9.00304579214008e-05, - "loss": 0.4578, - "step": 1018000 - }, - { - "epoch": 0.13, - "learning_rate": 8.990965137532721e-05, - "loss": 0.4595, - "step": 1019000 - }, - { - "epoch": 0.14, - "learning_rate": 8.978869373047941e-05, - "loss": 0.4578, - "step": 1020000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4372638761997223, - "eval_runtime": 79.8699, - "eval_samples_per_second": 80.13, - "eval_steps_per_second": 0.626, - "step": 1020000 - }, - { - "epoch": 0.14, - "learning_rate": 8.966782726980726e-05, - "loss": 0.4595, - "step": 1021000 - }, - { - "epoch": 0.14, - "learning_rate": 8.954681038665626e-05, - "loss": 0.4581, - "step": 1022000 - }, - { - "epoch": 0.14, - "learning_rate": 8.942588548263289e-05, - "loss": 0.4566, - "step": 1023000 - }, - { - "epoch": 0.14, - "learning_rate": 8.930493192312473e-05, - "loss": 0.4596, - "step": 1024000 - }, - { - "epoch": 0.14, - "learning_rate": 8.918382895884115e-05, - "loss": 0.458, - "step": 1025000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4378396272659302, - "eval_runtime": 80.7357, - "eval_samples_per_second": 79.271, - "eval_steps_per_second": 0.619, - "step": 1025000 - }, - { - "epoch": 0.14, - "learning_rate": 8.906281916381266e-05, - "loss": 0.4577, - "step": 1026000 - }, - { - "epoch": 0.14, - "learning_rate": 8.894166064486576e-05, - "loss": 0.4586, - "step": 1027000 - }, - { - "epoch": 0.14, - "learning_rate": 8.882059608730904e-05, - "loss": 0.4586, - "step": 1028000 - }, - { - "epoch": 0.14, - "learning_rate": 8.869938348850115e-05, - "loss": 0.4591, - "step": 1029000 - }, - { - "epoch": 0.14, - "learning_rate": 8.85781443898625e-05, - "loss": 0.4592, - "step": 1030000 - }, - { - "epoch": 0.14, - "eval_loss": 0.43954479694366455, - "eval_runtime": 78.7138, - "eval_samples_per_second": 81.307, - "eval_steps_per_second": 0.635, - "step": 1030000 - }, - { - "epoch": 0.14, - "learning_rate": 8.845700043886677e-05, - "loss": 0.4591, - "step": 1031000 - }, - { - "epoch": 0.14, - "learning_rate": 8.833570947416866e-05, - "loss": 0.4569, - "step": 1032000 - }, - { - "epoch": 0.14, - "learning_rate": 8.82143931174539e-05, - "loss": 0.4581, - "step": 1033000 - }, - { - "epoch": 0.14, - "learning_rate": 8.809305173827309e-05, - "loss": 0.4579, - "step": 1034000 - }, - { - "epoch": 0.14, - "learning_rate": 8.797168570625316e-05, - "loss": 0.4576, - "step": 1035000 - }, - { - "epoch": 0.14, - "eval_loss": 0.44192150235176086, - "eval_runtime": 82.0458, - "eval_samples_per_second": 78.005, - "eval_steps_per_second": 0.609, - "step": 1035000 - }, - { - "epoch": 0.14, - "learning_rate": 8.785041679341762e-05, - "loss": 0.4594, - "step": 1036000 - }, - { - "epoch": 0.14, - "learning_rate": 8.772900258862799e-05, - "loss": 0.4585, - "step": 1037000 - }, - { - "epoch": 0.14, - "learning_rate": 8.760768628934177e-05, - "loss": 0.4576, - "step": 1038000 - }, - { - "epoch": 0.14, - "learning_rate": 8.748622538969719e-05, - "loss": 0.4581, - "step": 1039000 - }, - { - "epoch": 0.14, - "learning_rate": 8.736486318068334e-05, - "loss": 0.4595, - "step": 1040000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4380738139152527, - "eval_runtime": 79.7233, - "eval_samples_per_second": 80.278, - "eval_steps_per_second": 0.627, - "step": 1040000 - }, - { - "epoch": 0.15, - "learning_rate": 8.724347858186586e-05, - "loss": 0.4577, - "step": 1041000 - }, - { - "epoch": 0.15, - "learning_rate": 8.712195042320727e-05, - "loss": 0.4582, - "step": 1042000 - }, - { - "epoch": 0.15, - "learning_rate": 8.700040056985257e-05, - "loss": 0.4583, - "step": 1043000 - }, - { - "epoch": 0.15, - "learning_rate": 8.687882939206366e-05, - "loss": 0.4585, - "step": 1044000 - }, - { - "epoch": 0.15, - "learning_rate": 8.67573588626427e-05, - "loss": 0.4577, - "step": 1045000 - }, - { - "epoch": 0.15, - "eval_loss": 0.437910795211792, - "eval_runtime": 79.8686, - "eval_samples_per_second": 80.132, - "eval_steps_per_second": 0.626, - "step": 1045000 - }, - { - "epoch": 0.15, - "learning_rate": 8.663586779028231e-05, - "loss": 0.4578, - "step": 1046000 - }, - { - "epoch": 0.15, - "learning_rate": 8.651435654432562e-05, - "loss": 0.4583, - "step": 1047000 - }, - { - "epoch": 0.15, - "learning_rate": 8.639270383167568e-05, - "loss": 0.4583, - "step": 1048000 - }, - { - "epoch": 0.15, - "learning_rate": 8.62710316457412e-05, - "loss": 0.4579, - "step": 1049000 - }, - { - "epoch": 0.15, - "learning_rate": 8.614934035715678e-05, - "loss": 0.4578, - "step": 1050000 - }, - { - "epoch": 0.15, - "eval_loss": 0.4384676218032837, - "eval_runtime": 84.4358, - "eval_samples_per_second": 75.797, - "eval_steps_per_second": 0.592, - "step": 1050000 - }, - { - "epoch": 0.0, - "learning_rate": 8.602763033661519e-05, - "loss": 0.4575, - "step": 1051000 - }, - { - "epoch": 0.0, - "learning_rate": 8.590602369229602e-05, - "loss": 0.4576, - "step": 1052000 - }, - { - "epoch": 0.0, - "learning_rate": 8.578427733795066e-05, - "loss": 0.4565, - "step": 1053000 - }, - { - "epoch": 0.0, - "learning_rate": 8.566251336369341e-05, - "loss": 0.4565, - "step": 1054000 - }, - { - "epoch": 0.0, - "learning_rate": 8.554073214043843e-05, - "loss": 0.4574, - "step": 1055000 - }, - { - "epoch": 0.0, - "eval_loss": 0.4384410083293915, - "eval_runtime": 82.1023, - "eval_samples_per_second": 77.951, - "eval_steps_per_second": 0.609, - "step": 1055000 - }, - { - "epoch": 0.0, - "learning_rate": 8.541893403915243e-05, - "loss": 0.4573, - "step": 1056000 - }, - { - "epoch": 0.0, - "learning_rate": 8.529724125358366e-05, - "loss": 0.4578, - "step": 1057000 - }, - { - "epoch": 0.0, - "learning_rate": 8.517541052529095e-05, - "loss": 0.4594, - "step": 1058000 - }, - { - "epoch": 0.0, - "learning_rate": 8.505368588604491e-05, - "loss": 0.4575, - "step": 1059000 - }, - { - "epoch": 0.01, - "learning_rate": 8.493182401373038e-05, - "loss": 0.456, - "step": 1060000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4374844431877136, - "eval_runtime": 81.8521, - "eval_samples_per_second": 78.19, - "eval_steps_per_second": 0.611, - "step": 1060000 - }, - { - "epoch": 0.01, - "learning_rate": 8.480994711822458e-05, - "loss": 0.4569, - "step": 1061000 - }, - { - "epoch": 0.01, - "learning_rate": 8.468805557078571e-05, - "loss": 0.4569, - "step": 1062000 - }, - { - "epoch": 0.01, - "learning_rate": 8.456614974271656e-05, - "loss": 0.4574, - "step": 1063000 - }, - { - "epoch": 0.01, - "learning_rate": 8.444435193192483e-05, - "loss": 0.4561, - "step": 1064000 - }, - { - "epoch": 0.01, - "learning_rate": 8.432254060992934e-05, - "loss": 0.4574, - "step": 1065000 - }, - { - "epoch": 0.01, - "eval_loss": 0.43869927525520325, - "eval_runtime": 85.7182, - "eval_samples_per_second": 74.663, - "eval_steps_per_second": 0.583, - "step": 1065000 - }, - { - "epoch": 0.01, - "learning_rate": 8.420059419417763e-05, - "loss": 0.4574, - "step": 1066000 - }, - { - "epoch": 0.01, - "learning_rate": 8.407875694816683e-05, - "loss": 0.4565, - "step": 1067000 - }, - { - "epoch": 0.01, - "learning_rate": 8.395678532468588e-05, - "loss": 0.4557, - "step": 1068000 - }, - { - "epoch": 0.01, - "learning_rate": 8.383480164815062e-05, - "loss": 0.4571, - "step": 1069000 - }, - { - "epoch": 0.01, - "learning_rate": 8.371280629014447e-05, - "loss": 0.4583, - "step": 1070000 - }, - { - "epoch": 0.01, - "eval_loss": 0.43844690918922424, - "eval_runtime": 78.8816, - "eval_samples_per_second": 81.134, - "eval_steps_per_second": 0.634, - "step": 1070000 - }, - { - "epoch": 0.01, - "learning_rate": 8.359079962228652e-05, - "loss": 0.4568, - "step": 1071000 - }, - { - "epoch": 0.01, - "learning_rate": 8.346890403917613e-05, - "loss": 0.4551, - "step": 1072000 - }, - { - "epoch": 0.01, - "learning_rate": 8.334687587698898e-05, - "loss": 0.4569, - "step": 1073000 - }, - { - "epoch": 0.01, - "learning_rate": 8.322495956296366e-05, - "loss": 0.4567, - "step": 1074000 - }, - { - "epoch": 0.01, - "learning_rate": 8.310315549773483e-05, - "loss": 0.4558, - "step": 1075000 - }, - { - "epoch": 0.01, - "eval_loss": 0.43670228123664856, - "eval_runtime": 92.0013, - "eval_samples_per_second": 69.564, - "eval_steps_per_second": 0.543, - "step": 1075000 - }, - { - "epoch": 0.01, - "learning_rate": 8.298109789313348e-05, - "loss": 0.4578, - "step": 1076000 - }, - { - "epoch": 0.01, - "learning_rate": 8.285903120759091e-05, - "loss": 0.4559, - "step": 1077000 - }, - { - "epoch": 0.01, - "learning_rate": 8.273695581294347e-05, - "loss": 0.456, - "step": 1078000 - }, - { - "epoch": 0.01, - "learning_rate": 8.2614872081054e-05, - "loss": 0.4573, - "step": 1079000 - }, - { - "epoch": 0.01, - "learning_rate": 8.249290247936282e-05, - "loss": 0.457, - "step": 1080000 - }, - { - "epoch": 0.01, - "eval_loss": 0.43457353115081787, - "eval_runtime": 80.6194, - "eval_samples_per_second": 79.385, - "eval_steps_per_second": 0.62, - "step": 1080000 - }, - { - "epoch": 0.02, - "learning_rate": 8.237080319608577e-05, - "loss": 0.4561, - "step": 1081000 - }, - { - "epoch": 0.02, - "learning_rate": 8.224869669093106e-05, - "loss": 0.4572, - "step": 1082000 - }, - { - "epoch": 0.02, - "learning_rate": 8.212658333585626e-05, - "loss": 0.4564, - "step": 1083000 - }, - { - "epoch": 0.02, - "learning_rate": 8.200446350283978e-05, - "loss": 0.4561, - "step": 1084000 - }, - { - "epoch": 0.02, - "learning_rate": 8.188258182160425e-05, - "loss": 0.4564, - "step": 1085000 - }, - { - "epoch": 0.02, - "eval_loss": 0.4339691400527954, - "eval_runtime": 80.6306, - "eval_samples_per_second": 79.374, - "eval_steps_per_second": 0.62, - "step": 1085000 - }, - { - "epoch": 0.02, - "learning_rate": 8.176045015981425e-05, - "loss": 0.4559, - "step": 1086000 - }, - { - "epoch": 0.02, - "learning_rate": 8.163831313538776e-05, - "loss": 0.4571, - "step": 1087000 - }, - { - "epoch": 0.02, - "learning_rate": 8.151617112037531e-05, - "loss": 0.4568, - "step": 1088000 - }, - { - "epoch": 0.02, - "learning_rate": 8.139414663565934e-05, - "loss": 0.4562, - "step": 1089000 - }, - { - "epoch": 0.02, - "learning_rate": 8.127211791262032e-05, - "loss": 0.4564, - "step": 1090000 - }, - { - "epoch": 0.02, - "eval_loss": 0.435046911239624, - "eval_runtime": 80.4772, - "eval_samples_per_second": 79.526, - "eval_steps_per_second": 0.621, - "step": 1090000 - }, - { - "epoch": 0.02, - "learning_rate": 8.114996316567713e-05, - "loss": 0.4563, - "step": 1091000 - }, - { - "epoch": 0.02, - "learning_rate": 8.10278049157467e-05, - "loss": 0.4557, - "step": 1092000 - }, - { - "epoch": 0.02, - "learning_rate": 8.090564353494425e-05, - "loss": 0.4561, - "step": 1093000 - }, - { - "epoch": 0.02, - "learning_rate": 8.078347939539452e-05, - "loss": 0.4562, - "step": 1094000 - }, - { - "epoch": 0.02, - "learning_rate": 8.06615572044175e-05, - "loss": 0.4572, - "step": 1095000 - }, - { - "epoch": 0.02, - "eval_loss": 0.43538710474967957, - "eval_runtime": 79.2403, - "eval_samples_per_second": 80.767, - "eval_steps_per_second": 0.631, - "step": 1095000 - }, - { - "epoch": 0.02, - "learning_rate": 8.053938866743753e-05, - "loss": 0.4573, - "step": 1096000 - }, - { - "epoch": 0.02, - "learning_rate": 8.041721848738611e-05, - "loss": 0.4551, - "step": 1097000 - }, - { - "epoch": 0.02, - "learning_rate": 8.029504703641487e-05, - "loss": 0.4563, - "step": 1098000 - }, - { - "epoch": 0.02, - "learning_rate": 8.017287468667916e-05, - "loss": 0.4561, - "step": 1099000 - }, - { - "epoch": 0.03, - "learning_rate": 8.005082398335268e-05, - "loss": 0.4559, - "step": 1100000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4374575912952423, - "eval_runtime": 81.4835, - "eval_samples_per_second": 78.543, - "eval_steps_per_second": 0.614, - "step": 1100000 - }, - { - "epoch": 0.03, - "learning_rate": 7.992865095253273e-05, - "loss": 0.4551, - "step": 1101000 - }, - { - "epoch": 0.03, - "learning_rate": 7.980647813905439e-05, - "loss": 0.4569, - "step": 1102000 - }, - { - "epoch": 0.03, - "learning_rate": 7.968430591507717e-05, - "loss": 0.4554, - "step": 1103000 - }, - { - "epoch": 0.03, - "learning_rate": 7.95623789940764e-05, - "loss": 0.4542, - "step": 1104000 - }, - { - "epoch": 0.03, - "learning_rate": 7.94402090625329e-05, - "loss": 0.455, - "step": 1105000 - }, - { - "epoch": 0.03, - "eval_loss": 0.43497970700263977, - "eval_runtime": 80.3312, - "eval_samples_per_second": 79.67, - "eval_steps_per_second": 0.622, - "step": 1105000 - }, - { - "epoch": 0.03, - "learning_rate": 7.931804083620962e-05, - "loss": 0.4547, - "step": 1106000 - }, - { - "epoch": 0.03, - "learning_rate": 7.919599685223971e-05, - "loss": 0.4568, - "step": 1107000 - }, - { - "epoch": 0.03, - "learning_rate": 7.90738331501521e-05, - "loss": 0.4551, - "step": 1108000 - }, - { - "epoch": 0.03, - "learning_rate": 7.895179442867707e-05, - "loss": 0.4551, - "step": 1109000 - }, - { - "epoch": 0.03, - "learning_rate": 7.882975889383518e-05, - "loss": 0.4551, - "step": 1110000 - }, - { - "epoch": 0.03, - "eval_loss": 0.43510711193084717, - "eval_runtime": 80.3235, - "eval_samples_per_second": 79.678, - "eval_steps_per_second": 0.622, - "step": 1110000 - }, - { - "epoch": 0.03, - "learning_rate": 7.87076047643997e-05, - "loss": 0.4553, - "step": 1111000 - }, - { - "epoch": 0.03, - "learning_rate": 7.858545457182424e-05, - "loss": 0.4551, - "step": 1112000 - }, - { - "epoch": 0.03, - "learning_rate": 7.846330868819947e-05, - "loss": 0.4547, - "step": 1113000 - }, - { - "epoch": 0.03, - "learning_rate": 7.834128962434351e-05, - "loss": 0.4547, - "step": 1114000 - }, - { - "epoch": 0.03, - "learning_rate": 7.821915346959958e-05, - "loss": 0.4549, - "step": 1115000 - }, - { - "epoch": 0.03, - "eval_loss": 0.436400443315506, - "eval_runtime": 81.1367, - "eval_samples_per_second": 78.879, - "eval_steps_per_second": 0.616, - "step": 1115000 - }, - { - "epoch": 0.03, - "learning_rate": 7.809702273962302e-05, - "loss": 0.4564, - "step": 1116000 - }, - { - "epoch": 0.03, - "learning_rate": 7.797501992835905e-05, - "loss": 0.4543, - "step": 1117000 - }, - { - "epoch": 0.03, - "learning_rate": 7.78529011576391e-05, - "loss": 0.4551, - "step": 1118000 - }, - { - "epoch": 0.03, - "learning_rate": 7.773091103619413e-05, - "loss": 0.4555, - "step": 1119000 - }, - { - "epoch": 0.04, - "learning_rate": 7.760880571122187e-05, - "loss": 0.4542, - "step": 1120000 - }, - { - "epoch": 0.04, - "eval_loss": 0.43270257115364075, - "eval_runtime": 81.0478, - "eval_samples_per_second": 78.966, - "eval_steps_per_second": 0.617, - "step": 1120000 - }, - { - "epoch": 0.04, - "learning_rate": 7.74867076702421e-05, - "loss": 0.454, - "step": 1121000 - }, - { - "epoch": 0.04, - "learning_rate": 7.73646172851866e-05, - "loss": 0.4548, - "step": 1122000 - }, - { - "epoch": 0.04, - "learning_rate": 7.724265700618751e-05, - "loss": 0.4545, - "step": 1123000 - }, - { - "epoch": 0.04, - "learning_rate": 7.712070510974305e-05, - "loss": 0.4547, - "step": 1124000 - }, - { - "epoch": 0.04, - "learning_rate": 7.699863990589822e-05, - "loss": 0.4557, - "step": 1125000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4347490966320038, - "eval_runtime": 82.2635, - "eval_samples_per_second": 77.799, - "eval_steps_per_second": 0.608, - "step": 1125000 - }, - { - "epoch": 0.04, - "learning_rate": 7.687658384471673e-05, - "loss": 0.4557, - "step": 1126000 - }, - { - "epoch": 0.04, - "learning_rate": 7.675465933967303e-05, - "loss": 0.456, - "step": 1127000 - }, - { - "epoch": 0.04, - "learning_rate": 7.663262266912916e-05, - "loss": 0.4559, - "step": 1128000 - }, - { - "epoch": 0.04, - "learning_rate": 7.651071827736612e-05, - "loss": 0.455, - "step": 1129000 - }, - { - "epoch": 0.04, - "learning_rate": 7.638870248295153e-05, - "loss": 0.4552, - "step": 1130000 - }, - { - "epoch": 0.04, - "eval_loss": 0.43748027086257935, - "eval_runtime": 81.7683, - "eval_samples_per_second": 78.27, - "eval_steps_per_second": 0.611, - "step": 1130000 - }, - { - "epoch": 0.04, - "learning_rate": 7.626669768917545e-05, - "loss": 0.4546, - "step": 1131000 - }, - { - "epoch": 0.04, - "learning_rate": 7.614470426768568e-05, - "loss": 0.454, - "step": 1132000 - }, - { - "epoch": 0.04, - "learning_rate": 7.602284456578315e-05, - "loss": 0.4538, - "step": 1133000 - }, - { - "epoch": 0.04, - "learning_rate": 7.590087499136845e-05, - "loss": 0.4543, - "step": 1134000 - }, - { - "epoch": 0.04, - "learning_rate": 7.577903985432644e-05, - "loss": 0.4535, - "step": 1135000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4324733018875122, - "eval_runtime": 79.5142, - "eval_samples_per_second": 80.489, - "eval_steps_per_second": 0.629, - "step": 1135000 - }, - { - "epoch": 0.04, - "learning_rate": 7.565709561166183e-05, - "loss": 0.4542, - "step": 1136000 - }, - { - "epoch": 0.04, - "learning_rate": 7.553528652251894e-05, - "loss": 0.4539, - "step": 1137000 - }, - { - "epoch": 0.04, - "learning_rate": 7.541336909597102e-05, - "loss": 0.4544, - "step": 1138000 - }, - { - "epoch": 0.04, - "learning_rate": 7.529158753744867e-05, - "loss": 0.4546, - "step": 1139000 - }, - { - "epoch": 0.04, - "learning_rate": 7.516969841105768e-05, - "loss": 0.4552, - "step": 1140000 - }, - { - "epoch": 0.04, - "eval_loss": 0.43759095668792725, - "eval_runtime": 81.8138, - "eval_samples_per_second": 78.226, - "eval_steps_per_second": 0.611, - "step": 1140000 - }, - { - "epoch": 0.05, - "learning_rate": 7.504782399860302e-05, - "loss": 0.453, - "step": 1141000 - }, - { - "epoch": 0.05, - "learning_rate": 7.492596467133529e-05, - "loss": 0.454, - "step": 1142000 - }, - { - "epoch": 0.05, - "learning_rate": 7.480424263648597e-05, - "loss": 0.4547, - "step": 1143000 - }, - { - "epoch": 0.05, - "learning_rate": 7.468253639717616e-05, - "loss": 0.4541, - "step": 1144000 - }, - { - "epoch": 0.05, - "learning_rate": 7.456072451973997e-05, - "loss": 0.4548, - "step": 1145000 - }, - { - "epoch": 0.05, - "eval_loss": 0.435017853975296, - "eval_runtime": 81.3392, - "eval_samples_per_second": 78.683, - "eval_steps_per_second": 0.615, - "step": 1145000 - }, - { - "epoch": 0.05, - "learning_rate": 7.443905099818896e-05, - "loss": 0.4538, - "step": 1146000 - }, - { - "epoch": 0.05, - "learning_rate": 7.431727261258993e-05, - "loss": 0.4538, - "step": 1147000 - }, - { - "epoch": 0.05, - "learning_rate": 7.419551153756405e-05, - "loss": 0.4537, - "step": 1148000 - }, - { - "epoch": 0.05, - "learning_rate": 7.407376814401668e-05, - "loss": 0.4538, - "step": 1149000 - }, - { - "epoch": 0.05, - "learning_rate": 7.395204280279928e-05, - "loss": 0.4538, - "step": 1150000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4336395263671875, - "eval_runtime": 80.9118, - "eval_samples_per_second": 79.098, - "eval_steps_per_second": 0.618, - "step": 1150000 - }, - { - "epoch": 0.05, - "learning_rate": 7.383045758230073e-05, - "loss": 0.4541, - "step": 1151000 - }, - { - "epoch": 0.05, - "learning_rate": 7.370889111773011e-05, - "loss": 0.4531, - "step": 1152000 - }, - { - "epoch": 0.05, - "learning_rate": 7.358722211935709e-05, - "loss": 0.4531, - "step": 1153000 - }, - { - "epoch": 0.05, - "learning_rate": 7.346557265541761e-05, - "loss": 0.4528, - "step": 1154000 - }, - { - "epoch": 0.05, - "learning_rate": 7.334394309647706e-05, - "loss": 0.4528, - "step": 1155000 - }, - { - "epoch": 0.05, - "eval_loss": 0.43530333042144775, - "eval_runtime": 80.885, - "eval_samples_per_second": 79.125, - "eval_steps_per_second": 0.618, - "step": 1155000 - }, - { - "epoch": 0.05, - "learning_rate": 7.322233381304011e-05, - "loss": 0.4533, - "step": 1156000 - }, - { - "epoch": 0.05, - "learning_rate": 7.31008667537514e-05, - "loss": 0.4547, - "step": 1157000 - }, - { - "epoch": 0.05, - "learning_rate": 7.29792991113865e-05, - "loss": 0.4541, - "step": 1158000 - }, - { - "epoch": 0.05, - "learning_rate": 7.285775285529407e-05, - "loss": 0.4538, - "step": 1159000 - }, - { - "epoch": 0.06, - "learning_rate": 7.273634986923404e-05, - "loss": 0.454, - "step": 1160000 - }, - { - "epoch": 0.06, - "eval_loss": 0.43437331914901733, - "eval_runtime": 80.8329, - "eval_samples_per_second": 79.176, - "eval_steps_per_second": 0.619, - "step": 1160000 - }, - { - "epoch": 0.06, - "learning_rate": 7.26148474740616e-05, - "loss": 0.4538, - "step": 1161000 - }, - { - "epoch": 0.06, - "learning_rate": 7.249348904388309e-05, - "loss": 0.4549, - "step": 1162000 - }, - { - "epoch": 0.06, - "learning_rate": 7.237203198861888e-05, - "loss": 0.4542, - "step": 1163000 - }, - { - "epoch": 0.06, - "learning_rate": 7.225059816946814e-05, - "loss": 0.4523, - "step": 1164000 - }, - { - "epoch": 0.06, - "learning_rate": 7.21293093546381e-05, - "loss": 0.4528, - "step": 1165000 - }, - { - "epoch": 0.06, - "eval_loss": 0.43269097805023193, - "eval_runtime": 81.0783, - "eval_samples_per_second": 78.936, - "eval_steps_per_second": 0.617, - "step": 1165000 - }, - { - "epoch": 0.06, - "learning_rate": 7.200792309320717e-05, - "loss": 0.4529, - "step": 1166000 - }, - { - "epoch": 0.06, - "learning_rate": 7.188656117702852e-05, - "loss": 0.4529, - "step": 1167000 - }, - { - "epoch": 0.06, - "learning_rate": 7.17652239757916e-05, - "loss": 0.4545, - "step": 1168000 - }, - { - "epoch": 0.06, - "learning_rate": 7.164391185911055e-05, - "loss": 0.4525, - "step": 1169000 - }, - { - "epoch": 0.06, - "learning_rate": 7.15227464703484e-05, - "loss": 0.453, - "step": 1170000 - }, - { - "epoch": 0.06, - "eval_loss": 0.43528246879577637, - "eval_runtime": 77.7536, - "eval_samples_per_second": 82.311, - "eval_steps_per_second": 0.643, - "step": 1170000 - }, - { - "epoch": 0.06, - "learning_rate": 7.140148560530676e-05, - "loss": 0.454, - "step": 1171000 - }, - { - "epoch": 0.06, - "learning_rate": 7.128049337579044e-05, - "loss": 0.452, - "step": 1172000 - }, - { - "epoch": 0.06, - "learning_rate": 7.115940640646439e-05, - "loss": 0.4537, - "step": 1173000 - }, - { - "epoch": 0.06, - "learning_rate": 7.103822514560007e-05, - "loss": 0.4533, - "step": 1174000 - }, - { - "epoch": 0.06, - "learning_rate": 7.09170711838562e-05, - "loss": 0.4536, - "step": 1175000 - }, - { - "epoch": 0.06, - "eval_loss": 0.43259185552597046, - "eval_runtime": 78.8646, - "eval_samples_per_second": 81.152, - "eval_steps_per_second": 0.634, - "step": 1175000 - }, - { - "epoch": 0.06, - "learning_rate": 7.079594489028868e-05, - "loss": 0.4528, - "step": 1176000 - }, - { - "epoch": 0.06, - "learning_rate": 7.067484663386926e-05, - "loss": 0.4547, - "step": 1177000 - }, - { - "epoch": 0.06, - "learning_rate": 7.055377678348416e-05, - "loss": 0.4527, - "step": 1178000 - }, - { - "epoch": 0.06, - "learning_rate": 7.043285673451301e-05, - "loss": 0.4529, - "step": 1179000 - }, - { - "epoch": 0.07, - "learning_rate": 7.031184477318059e-05, - "loss": 0.4531, - "step": 1180000 - }, - { - "epoch": 0.07, - "eval_loss": 0.43487441539764404, - "eval_runtime": 80.3946, - "eval_samples_per_second": 79.607, - "eval_steps_per_second": 0.622, - "step": 1180000 - }, - { - "epoch": 0.07, - "learning_rate": 7.019098329123491e-05, - "loss": 0.4525, - "step": 1181000 - }, - { - "epoch": 0.07, - "learning_rate": 7.007003069197359e-05, - "loss": 0.4535, - "step": 1182000 - }, - { - "epoch": 0.07, - "learning_rate": 6.99491083411209e-05, - "loss": 0.4525, - "step": 1183000 - }, - { - "epoch": 0.07, - "learning_rate": 6.982833748334574e-05, - "loss": 0.4529, - "step": 1184000 - }, - { - "epoch": 0.07, - "learning_rate": 6.970747670309944e-05, - "loss": 0.4523, - "step": 1185000 - }, - { - "epoch": 0.07, - "eval_loss": 0.43467041850090027, - "eval_runtime": 79.4202, - "eval_samples_per_second": 80.584, - "eval_steps_per_second": 0.63, - "step": 1185000 - }, - { - "epoch": 0.07, - "learning_rate": 6.95868889029842e-05, - "loss": 0.4527, - "step": 1186000 - }, - { - "epoch": 0.07, - "learning_rate": 6.946609113261812e-05, - "loss": 0.4522, - "step": 1187000 - }, - { - "epoch": 0.07, - "learning_rate": 6.934532545036533e-05, - "loss": 0.4523, - "step": 1188000 - }, - { - "epoch": 0.07, - "learning_rate": 6.922471294099106e-05, - "loss": 0.4532, - "step": 1189000 - }, - { - "epoch": 0.07, - "learning_rate": 6.91040125054781e-05, - "loss": 0.4527, - "step": 1190000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4338874816894531, - "eval_runtime": 78.8008, - "eval_samples_per_second": 81.217, - "eval_steps_per_second": 0.635, - "step": 1190000 - }, - { - "epoch": 0.07, - "learning_rate": 6.898346591157581e-05, - "loss": 0.4524, - "step": 1191000 - }, - { - "epoch": 0.07, - "learning_rate": 6.886283219202908e-05, - "loss": 0.4502, - "step": 1192000 - }, - { - "epoch": 0.07, - "learning_rate": 6.874235298095256e-05, - "loss": 0.4521, - "step": 1193000 - }, - { - "epoch": 0.07, - "learning_rate": 6.862178744578633e-05, - "loss": 0.4528, - "step": 1194000 - }, - { - "epoch": 0.07, - "learning_rate": 6.850125657062878e-05, - "loss": 0.4527, - "step": 1195000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4356515109539032, - "eval_runtime": 78.1346, - "eval_samples_per_second": 81.91, - "eval_steps_per_second": 0.64, - "step": 1195000 - }, - { - "epoch": 0.07, - "learning_rate": 6.838088120086767e-05, - "loss": 0.4522, - "step": 1196000 - }, - { - "epoch": 0.07, - "learning_rate": 6.826042071151715e-05, - "loss": 0.4505, - "step": 1197000 - }, - { - "epoch": 0.07, - "learning_rate": 6.814011638970504e-05, - "loss": 0.4519, - "step": 1198000 - }, - { - "epoch": 0.07, - "learning_rate": 6.801972775246722e-05, - "loss": 0.452, - "step": 1199000 - }, - { - "epoch": 0.07, - "learning_rate": 6.789937560921629e-05, - "loss": 0.4534, - "step": 1200000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4337690472602844, - "eval_runtime": 78.2694, - "eval_samples_per_second": 81.769, - "eval_steps_per_second": 0.639, - "step": 1200000 - }, - { - "epoch": 0.08, - "learning_rate": 6.777906032656586e-05, - "loss": 0.4514, - "step": 1201000 - }, - { - "epoch": 0.08, - "learning_rate": 6.765878227101707e-05, - "loss": 0.4509, - "step": 1202000 - }, - { - "epoch": 0.08, - "learning_rate": 6.753866203051995e-05, - "loss": 0.452, - "step": 1203000 - }, - { - "epoch": 0.08, - "learning_rate": 6.741857967353855e-05, - "loss": 0.4519, - "step": 1204000 - }, - { - "epoch": 0.08, - "learning_rate": 6.729841542014526e-05, - "loss": 0.4516, - "step": 1205000 - }, - { - "epoch": 0.08, - "eval_loss": 0.43374085426330566, - "eval_runtime": 78.9278, - "eval_samples_per_second": 81.087, - "eval_steps_per_second": 0.633, - "step": 1205000 - }, - { - "epoch": 0.08, - "learning_rate": 6.717828985798137e-05, - "loss": 0.451, - "step": 1206000 - }, - { - "epoch": 0.08, - "learning_rate": 6.705820335297017e-05, - "loss": 0.4515, - "step": 1207000 - }, - { - "epoch": 0.08, - "learning_rate": 6.693827629818452e-05, - "loss": 0.4499, - "step": 1208000 - }, - { - "epoch": 0.08, - "learning_rate": 6.681838895213777e-05, - "loss": 0.452, - "step": 1209000 - }, - { - "epoch": 0.08, - "learning_rate": 6.669842173225596e-05, - "loss": 0.4505, - "step": 1210000 - }, - { - "epoch": 0.08, - "eval_loss": 0.4336735010147095, - "eval_runtime": 78.5787, - "eval_samples_per_second": 81.447, - "eval_steps_per_second": 0.636, - "step": 1210000 - }, - { - "epoch": 0.08, - "learning_rate": 6.657849503128835e-05, - "loss": 0.4514, - "step": 1211000 - }, - { - "epoch": 0.08, - "learning_rate": 6.645860921455248e-05, - "loss": 0.452, - "step": 1212000 - }, - { - "epoch": 0.08, - "learning_rate": 6.633912411889133e-05, - "loss": 0.4525, - "step": 1213000 - }, - { - "epoch": 0.08, - "learning_rate": 6.621932104068297e-05, - "loss": 0.4506, - "step": 1214000 - }, - { - "epoch": 0.08, - "learning_rate": 6.609955994081265e-05, - "loss": 0.4523, - "step": 1215000 - }, - { - "epoch": 0.08, - "eval_loss": 0.43103519082069397, - "eval_runtime": 79.501, - "eval_samples_per_second": 80.502, - "eval_steps_per_second": 0.629, - "step": 1215000 - }, - { - "epoch": 0.08, - "learning_rate": 6.597984118409344e-05, - "loss": 0.4525, - "step": 1216000 - }, - { - "epoch": 0.08, - "learning_rate": 6.586016513520938e-05, - "loss": 0.4517, - "step": 1217000 - }, - { - "epoch": 0.08, - "learning_rate": 6.574065177005509e-05, - "loss": 0.4514, - "step": 1218000 - }, - { - "epoch": 0.08, - "learning_rate": 6.562106218675344e-05, - "loss": 0.4521, - "step": 1219000 - }, - { - "epoch": 0.09, - "learning_rate": 6.550163592797291e-05, - "loss": 0.4509, - "step": 1220000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4309111535549164, - "eval_runtime": 78.8684, - "eval_samples_per_second": 81.148, - "eval_steps_per_second": 0.634, - "step": 1220000 - }, - { - "epoch": 0.09, - "learning_rate": 6.538213426595817e-05, - "loss": 0.4515, - "step": 1221000 - }, - { - "epoch": 0.09, - "learning_rate": 6.526267713249757e-05, - "loss": 0.4507, - "step": 1222000 - }, - { - "epoch": 0.09, - "learning_rate": 6.51433842811744e-05, - "loss": 0.4505, - "step": 1223000 - }, - { - "epoch": 0.09, - "learning_rate": 6.50240172509089e-05, - "loss": 0.4511, - "step": 1224000 - }, - { - "epoch": 0.09, - "learning_rate": 6.490481513858697e-05, - "loss": 0.4503, - "step": 1225000 - }, - { - "epoch": 0.09, - "eval_loss": 0.43227553367614746, - "eval_runtime": 78.1714, - "eval_samples_per_second": 81.871, - "eval_steps_per_second": 0.64, - "step": 1225000 - }, - { - "epoch": 0.09, - "learning_rate": 6.478553966451205e-05, - "loss": 0.4517, - "step": 1226000 - }, - { - "epoch": 0.09, - "learning_rate": 6.466642974217942e-05, - "loss": 0.4505, - "step": 1227000 - }, - { - "epoch": 0.09, - "learning_rate": 6.454724727617616e-05, - "loss": 0.4515, - "step": 1228000 - }, - { - "epoch": 0.09, - "learning_rate": 6.442811188193757e-05, - "loss": 0.4497, - "step": 1229000 - }, - { - "epoch": 0.09, - "learning_rate": 6.430902392237073e-05, - "loss": 0.4507, - "step": 1230000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4315786361694336, - "eval_runtime": 79.653, - "eval_samples_per_second": 80.348, - "eval_steps_per_second": 0.628, - "step": 1230000 - }, - { - "epoch": 0.09, - "learning_rate": 6.419010277640487e-05, - "loss": 0.4516, - "step": 1231000 - }, - { - "epoch": 0.09, - "learning_rate": 6.407134866177907e-05, - "loss": 0.4508, - "step": 1232000 - }, - { - "epoch": 0.09, - "learning_rate": 6.395240503611e-05, - "loss": 0.4506, - "step": 1233000 - }, - { - "epoch": 0.09, - "learning_rate": 6.38335102941982e-05, - "loss": 0.4513, - "step": 1234000 - }, - { - "epoch": 0.09, - "learning_rate": 6.371466479821773e-05, - "loss": 0.452, - "step": 1235000 - }, - { - "epoch": 0.09, - "eval_loss": 0.43573638796806335, - "eval_runtime": 81.2864, - "eval_samples_per_second": 78.734, - "eval_steps_per_second": 0.615, - "step": 1235000 - }, - { - "epoch": 0.09, - "learning_rate": 6.359598768118097e-05, - "loss": 0.4505, - "step": 1236000 - }, - { - "epoch": 0.09, - "learning_rate": 6.347724171283347e-05, - "loss": 0.4511, - "step": 1237000 - }, - { - "epoch": 0.09, - "learning_rate": 6.335854607567315e-05, - "loss": 0.4513, - "step": 1238000 - }, - { - "epoch": 0.09, - "learning_rate": 6.32399011312675e-05, - "loss": 0.4507, - "step": 1239000 - }, - { - "epoch": 0.1, - "learning_rate": 6.312130724102964e-05, - "loss": 0.4512, - "step": 1240000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4293386936187744, - "eval_runtime": 78.7202, - "eval_samples_per_second": 81.301, - "eval_steps_per_second": 0.635, - "step": 1240000 - }, - { - "epoch": 0.1, - "learning_rate": 6.300276476621711e-05, - "loss": 0.4493, - "step": 1241000 - }, - { - "epoch": 0.1, - "learning_rate": 6.288439253264661e-05, - "loss": 0.451, - "step": 1242000 - }, - { - "epoch": 0.1, - "learning_rate": 6.276595391951207e-05, - "loss": 0.4505, - "step": 1243000 - }, - { - "epoch": 0.1, - "learning_rate": 6.264768616404324e-05, - "loss": 0.45, - "step": 1244000 - }, - { - "epoch": 0.1, - "learning_rate": 6.252935285428112e-05, - "loss": 0.4496, - "step": 1245000 - }, - { - "epoch": 0.1, - "eval_loss": 0.42953261733055115, - "eval_runtime": 80.8366, - "eval_samples_per_second": 79.172, - "eval_steps_per_second": 0.619, - "step": 1245000 - }, - { - "epoch": 0.1, - "learning_rate": 6.241107276314003e-05, - "loss": 0.4495, - "step": 1246000 - }, - { - "epoch": 0.1, - "learning_rate": 6.229284625092165e-05, - "loss": 0.45, - "step": 1247000 - }, - { - "epoch": 0.1, - "learning_rate": 6.217479182327523e-05, - "loss": 0.4507, - "step": 1248000 - }, - { - "epoch": 0.1, - "learning_rate": 6.205679158576122e-05, - "loss": 0.4498, - "step": 1249000 - }, - { - "epoch": 0.1, - "learning_rate": 6.193872786080677e-05, - "loss": 0.4505, - "step": 1250000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4315950870513916, - "eval_runtime": 78.3108, - "eval_samples_per_second": 81.726, - "eval_steps_per_second": 0.638, - "step": 1250000 - }, - { - "epoch": 0.1, - "learning_rate": 6.182071915361952e-05, - "loss": 0.4497, - "step": 1251000 - }, - { - "epoch": 0.1, - "learning_rate": 6.170276582367444e-05, - "loss": 0.4511, - "step": 1252000 - }, - { - "epoch": 0.1, - "learning_rate": 6.158498609991133e-05, - "loss": 0.4506, - "step": 1253000 - }, - { - "epoch": 0.1, - "learning_rate": 6.146714454592476e-05, - "loss": 0.4503, - "step": 1254000 - }, - { - "epoch": 0.1, - "learning_rate": 6.13494772030113e-05, - "loss": 0.4502, - "step": 1255000 - }, - { - "epoch": 0.1, - "eval_loss": 0.43262848258018494, - "eval_runtime": 79.5141, - "eval_samples_per_second": 80.489, - "eval_steps_per_second": 0.629, - "step": 1255000 - }, - { - "epoch": 0.1, - "learning_rate": 6.123174885940984e-05, - "loss": 0.4513, - "step": 1256000 - }, - { - "epoch": 0.1, - "learning_rate": 6.111407768715625e-05, - "loss": 0.4491, - "step": 1257000 - }, - { - "epoch": 0.1, - "learning_rate": 6.099658162948436e-05, - "loss": 0.4495, - "step": 1258000 - }, - { - "epoch": 0.1, - "learning_rate": 6.087902581702466e-05, - "loss": 0.4498, - "step": 1259000 - }, - { - "epoch": 0.1, - "learning_rate": 6.076164571872208e-05, - "loss": 0.4501, - "step": 1260000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4304799735546112, - "eval_runtime": 80.0574, - "eval_samples_per_second": 79.943, - "eval_steps_per_second": 0.625, - "step": 1260000 - }, - { - "epoch": 0.11, - "learning_rate": 6.064420669700471e-05, - "loss": 0.4504, - "step": 1261000 - }, - { - "epoch": 0.11, - "learning_rate": 6.05268266363905e-05, - "loss": 0.4509, - "step": 1262000 - }, - { - "epoch": 0.11, - "learning_rate": 6.0409505894439516e-05, - "loss": 0.4498, - "step": 1263000 - }, - { - "epoch": 0.11, - "learning_rate": 6.029236205966984e-05, - "loss": 0.4514, - "step": 1264000 - }, - { - "epoch": 0.11, - "learning_rate": 6.01751609667898e-05, - "loss": 0.4504, - "step": 1265000 - }, - { - "epoch": 0.11, - "eval_loss": 0.4325391352176666, - "eval_runtime": 78.7597, - "eval_samples_per_second": 81.26, - "eval_steps_per_second": 0.635, - "step": 1265000 - }, - { - "epoch": 0.11, - "learning_rate": 6.005802026380759e-05, - "loss": 0.4504, - "step": 1266000 - }, - { - "epoch": 0.11, - "learning_rate": 5.994094030755408e-05, - "loss": 0.4508, - "step": 1267000 - }, - { - "epoch": 0.11, - "learning_rate": 5.982403844288822e-05, - "loss": 0.4489, - "step": 1268000 - }, - { - "epoch": 0.11, - "learning_rate": 5.9707080988205694e-05, - "loss": 0.4486, - "step": 1269000 - }, - { - "epoch": 0.11, - "learning_rate": 5.959018534927381e-05, - "loss": 0.4496, - "step": 1270000 - }, - { - "epoch": 0.11, - "eval_loss": 0.4292341470718384, - "eval_runtime": 81.0604, - "eval_samples_per_second": 78.953, - "eval_steps_per_second": 0.617, - "step": 1270000 - }, - { - "epoch": 0.11, - "learning_rate": 5.947346868447071e-05, - "loss": 0.4495, - "step": 1271000 - }, - { - "epoch": 0.11, - "learning_rate": 5.9356697682398364e-05, - "loss": 0.4497, - "step": 1272000 - }, - { - "epoch": 0.11, - "learning_rate": 5.9240222916649246e-05, - "loss": 0.4497, - "step": 1273000 - }, - { - "epoch": 0.11, - "learning_rate": 5.9123577909416044e-05, - "loss": 0.4494, - "step": 1274000 - }, - { - "epoch": 0.11, - "learning_rate": 5.9006996495385025e-05, - "loss": 0.4515, - "step": 1275000 - }, - { - "epoch": 0.11, - "eval_loss": 0.4315658211708069, - "eval_runtime": 79.1549, - "eval_samples_per_second": 80.854, - "eval_steps_per_second": 0.632, - "step": 1275000 - }, - { - "epoch": 0.11, - "learning_rate": 5.889047902968337e-05, - "loss": 0.4494, - "step": 1276000 - }, - { - "epoch": 0.11, - "learning_rate": 5.877414228816838e-05, - "loss": 0.4492, - "step": 1277000 - }, - { - "epoch": 0.11, - "learning_rate": 5.8657753718891686e-05, - "loss": 0.4496, - "step": 1278000 - }, - { - "epoch": 0.11, - "learning_rate": 5.8541546452764014e-05, - "loss": 0.4512, - "step": 1279000 - }, - { - "epoch": 0.12, - "learning_rate": 5.84252881966525e-05, - "loss": 0.4492, - "step": 1280000 - }, - { - "epoch": 0.12, - "eval_loss": 0.427651971578598, - "eval_runtime": 79.7313, - "eval_samples_per_second": 80.27, - "eval_steps_per_second": 0.627, - "step": 1280000 - }, - { - "epoch": 0.12, - "learning_rate": 5.830909566085471e-05, - "loss": 0.4483, - "step": 1281000 - }, - { - "epoch": 0.12, - "learning_rate": 5.819308529265298e-05, - "loss": 0.4501, - "step": 1282000 - }, - { - "epoch": 0.12, - "learning_rate": 5.807702519250463e-05, - "loss": 0.45, - "step": 1283000 - }, - { - "epoch": 0.12, - "learning_rate": 5.796103187353951e-05, - "loss": 0.4498, - "step": 1284000 - }, - { - "epoch": 0.12, - "learning_rate": 5.784510568909343e-05, - "loss": 0.4488, - "step": 1285000 - }, - { - "epoch": 0.12, - "eval_loss": 0.43311890959739685, - "eval_runtime": 78.3828, - "eval_samples_per_second": 81.651, - "eval_steps_per_second": 0.638, - "step": 1285000 - }, - { - "epoch": 0.12, - "learning_rate": 5.7729362817166816e-05, - "loss": 0.4488, - "step": 1286000 - }, - { - "epoch": 0.12, - "learning_rate": 5.7613571892930174e-05, - "loss": 0.4495, - "step": 1287000 - }, - { - "epoch": 0.12, - "learning_rate": 5.7497964850187466e-05, - "loss": 0.4489, - "step": 1288000 - }, - { - "epoch": 0.12, - "learning_rate": 5.73823105956256e-05, - "loss": 0.4498, - "step": 1289000 - }, - { - "epoch": 0.12, - "learning_rate": 5.7266725238468133e-05, - "loss": 0.4491, - "step": 1290000 - }, - { - "epoch": 0.12, - "eval_loss": 0.43072545528411865, - "eval_runtime": 79.3715, - "eval_samples_per_second": 80.633, - "eval_steps_per_second": 0.63, - "step": 1290000 - }, - { - "epoch": 0.12, - "learning_rate": 5.715132461220847e-05, - "loss": 0.4488, - "step": 1291000 - }, - { - "epoch": 0.12, - "learning_rate": 5.703587803615088e-05, - "loss": 0.4477, - "step": 1292000 - }, - { - "epoch": 0.12, - "learning_rate": 5.6920501412791343e-05, - "loss": 0.4488, - "step": 1293000 - }, - { - "epoch": 0.12, - "learning_rate": 5.680519509358706e-05, - "loss": 0.4492, - "step": 1294000 - }, - { - "epoch": 0.12, - "learning_rate": 5.669007463003561e-05, - "loss": 0.4482, - "step": 1295000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4276537001132965, - "eval_runtime": 83.0062, - "eval_samples_per_second": 77.103, - "eval_steps_per_second": 0.602, - "step": 1295000 - }, - { - "epoch": 0.12, - "learning_rate": 5.6574909901474036e-05, - "loss": 0.4495, - "step": 1296000 - }, - { - "epoch": 0.12, - "learning_rate": 5.645981652979939e-05, - "loss": 0.4505, - "step": 1297000 - }, - { - "epoch": 0.12, - "learning_rate": 5.634490985133569e-05, - "loss": 0.4487, - "step": 1298000 - }, - { - "epoch": 0.12, - "learning_rate": 5.622996017276679e-05, - "loss": 0.45, - "step": 1299000 - }, - { - "epoch": 0.12, - "learning_rate": 5.6115082901861476e-05, - "loss": 0.4495, - "step": 1300000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4289723336696625, - "eval_runtime": 83.6952, - "eval_samples_per_second": 76.468, - "eval_steps_per_second": 0.597, - "step": 1300000 - }, - { - "epoch": 0.13, - "learning_rate": 5.60002783885559e-05, - "loss": 0.4497, - "step": 1301000 - }, - { - "epoch": 0.13, - "learning_rate": 5.5885776372183025e-05, - "loss": 0.4476, - "step": 1302000 - }, - { - "epoch": 0.13, - "learning_rate": 5.57711182757353e-05, - "loss": 0.4495, - "step": 1303000 - }, - { - "epoch": 0.13, - "learning_rate": 5.565664853197224e-05, - "loss": 0.449, - "step": 1304000 - }, - { - "epoch": 0.13, - "learning_rate": 5.554213832099068e-05, - "loss": 0.4497, - "step": 1305000 - }, - { - "epoch": 0.13, - "eval_loss": 0.4274589419364929, - "eval_runtime": 77.8458, - "eval_samples_per_second": 82.214, - "eval_steps_per_second": 0.642, - "step": 1305000 - }, - { - "epoch": 0.13, - "learning_rate": 5.542781701127524e-05, - "loss": 0.4488, - "step": 1306000 - }, - { - "epoch": 0.13, - "learning_rate": 5.5313456079635704e-05, - "loss": 0.4485, - "step": 1307000 - }, - { - "epoch": 0.13, - "learning_rate": 5.519917034749017e-05, - "loss": 0.4495, - "step": 1308000 - }, - { - "epoch": 0.13, - "learning_rate": 5.5085074335305515e-05, - "loss": 0.45, - "step": 1309000 - }, - { - "epoch": 0.13, - "learning_rate": 5.4970939970251195e-05, - "loss": 0.4494, - "step": 1310000 - }, - { - "epoch": 0.13, - "eval_loss": 0.43162286281585693, - "eval_runtime": 78.176, - "eval_samples_per_second": 81.867, - "eval_steps_per_second": 0.64, - "step": 1310000 - }, - { - "epoch": 0.13, - "learning_rate": 5.485688184805438e-05, - "loss": 0.4483, - "step": 1311000 - }, - { - "epoch": 0.13, - "learning_rate": 5.474290031615589e-05, - "loss": 0.4486, - "step": 1312000 - }, - { - "epoch": 0.13, - "learning_rate": 5.462899572176321e-05, - "loss": 0.4492, - "step": 1313000 - }, - { - "epoch": 0.13, - "learning_rate": 5.4515395989108964e-05, - "loss": 0.447, - "step": 1314000 - }, - { - "epoch": 0.13, - "learning_rate": 5.440164615480369e-05, - "loss": 0.4483, - "step": 1315000 - }, - { - "epoch": 0.13, - "eval_loss": 0.4310395121574402, - "eval_runtime": 78.2586, - "eval_samples_per_second": 81.78, - "eval_steps_per_second": 0.639, - "step": 1315000 - }, - { - "epoch": 0.13, - "learning_rate": 5.428797429752361e-05, - "loss": 0.4486, - "step": 1316000 - }, - { - "epoch": 0.13, - "learning_rate": 5.4174494317829243e-05, - "loss": 0.4494, - "step": 1317000 - }, - { - "epoch": 0.13, - "learning_rate": 5.4060979374311454e-05, - "loss": 0.4483, - "step": 1318000 - }, - { - "epoch": 0.13, - "learning_rate": 5.3947656841894844e-05, - "loss": 0.4478, - "step": 1319000 - }, - { - "epoch": 0.14, - "learning_rate": 5.3834300193900343e-05, - "loss": 0.4478, - "step": 1320000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4294278025627136, - "eval_runtime": 81.6018, - "eval_samples_per_second": 78.43, - "eval_steps_per_second": 0.613, - "step": 1320000 - }, - { - "epoch": 0.14, - "learning_rate": 5.372102325116521e-05, - "loss": 0.4487, - "step": 1321000 - }, - { - "epoch": 0.14, - "learning_rate": 5.3607826358750705e-05, - "loss": 0.4475, - "step": 1322000 - }, - { - "epoch": 0.14, - "learning_rate": 5.3494822937699356e-05, - "loss": 0.4471, - "step": 1323000 - }, - { - "epoch": 0.14, - "learning_rate": 5.3381787099221496e-05, - "loss": 0.4491, - "step": 1324000 - }, - { - "epoch": 0.14, - "learning_rate": 5.326883234443639e-05, - "loss": 0.4479, - "step": 1325000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4298754036426544, - "eval_runtime": 83.1912, - "eval_samples_per_second": 76.931, - "eval_steps_per_second": 0.601, - "step": 1325000 - }, - { - "epoch": 0.14, - "learning_rate": 5.315595901742376e-05, - "loss": 0.4486, - "step": 1326000 - }, - { - "epoch": 0.14, - "learning_rate": 5.304339296328977e-05, - "loss": 0.4472, - "step": 1327000 - }, - { - "epoch": 0.14, - "learning_rate": 5.293068335849521e-05, - "loss": 0.4466, - "step": 1328000 - }, - { - "epoch": 0.14, - "learning_rate": 5.281805621153364e-05, - "loss": 0.4469, - "step": 1329000 - }, - { - "epoch": 0.14, - "learning_rate": 5.270562436835969e-05, - "loss": 0.4483, - "step": 1330000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4302482604980469, - "eval_runtime": 78.9917, - "eval_samples_per_second": 81.021, - "eval_steps_per_second": 0.633, - "step": 1330000 - }, - { - "epoch": 0.14, - "learning_rate": 5.25931630827424e-05, - "loss": 0.4479, - "step": 1331000 - }, - { - "epoch": 0.14, - "learning_rate": 5.248078528310338e-05, - "loss": 0.4493, - "step": 1332000 - }, - { - "epoch": 0.14, - "learning_rate": 5.2368603563750103e-05, - "loss": 0.4483, - "step": 1333000 - }, - { - "epoch": 0.14, - "learning_rate": 5.225639367843791e-05, - "loss": 0.4482, - "step": 1334000 - }, - { - "epoch": 0.14, - "learning_rate": 5.2144380388008114e-05, - "loss": 0.4473, - "step": 1335000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4291970133781433, - "eval_runtime": 82.7148, - "eval_samples_per_second": 77.374, - "eval_steps_per_second": 0.604, - "step": 1335000 - }, - { - "epoch": 0.14, - "learning_rate": 5.203233978289791e-05, - "loss": 0.4475, - "step": 1336000 - }, - { - "epoch": 0.14, - "learning_rate": 5.192038437213101e-05, - "loss": 0.4462, - "step": 1337000 - }, - { - "epoch": 0.14, - "learning_rate": 5.1808514496743016e-05, - "loss": 0.4476, - "step": 1338000 - }, - { - "epoch": 0.14, - "learning_rate": 5.169684223849966e-05, - "loss": 0.4494, - "step": 1339000 - }, - { - "epoch": 0.14, - "learning_rate": 5.158514436954633e-05, - "loss": 0.4471, - "step": 1340000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4304357171058655, - "eval_runtime": 89.5128, - "eval_samples_per_second": 71.498, - "eval_steps_per_second": 0.559, - "step": 1340000 - }, - { - "epoch": 0.15, - "learning_rate": 5.147353305717107e-05, - "loss": 0.4485, - "step": 1341000 - }, - { - "epoch": 0.15, - "learning_rate": 5.1362120122259206e-05, - "loss": 0.4482, - "step": 1342000 - }, - { - "epoch": 0.15, - "learning_rate": 5.125079424891131e-05, - "loss": 0.4483, - "step": 1343000 - }, - { - "epoch": 0.15, - "learning_rate": 5.1139444469644865e-05, - "loss": 0.4466, - "step": 1344000 - }, - { - "epoch": 0.15, - "learning_rate": 5.10282938224747e-05, - "loss": 0.4483, - "step": 1345000 - }, - { - "epoch": 0.15, - "eval_loss": 0.4288882911205292, - "eval_runtime": 86.9362, - "eval_samples_per_second": 73.617, - "eval_steps_per_second": 0.575, - "step": 1345000 - }, - { - "epoch": 0.15, - "learning_rate": 5.091712012222336e-05, - "loss": 0.447, - "step": 1346000 - }, - { - "epoch": 0.15, - "learning_rate": 5.080603501346776e-05, - "loss": 0.4474, - "step": 1347000 - }, - { - "epoch": 0.15, - "learning_rate": 5.069503883459252e-05, - "loss": 0.4467, - "step": 1348000 - }, - { - "epoch": 0.15, - "learning_rate": 5.0584131923711175e-05, - "loss": 0.4483, - "step": 1349000 - }, - { - "epoch": 0.15, - "learning_rate": 5.047331461866543e-05, - "loss": 0.4482, - "step": 1350000 - }, - { - "epoch": 0.15, - "eval_loss": 0.4289691150188446, - "eval_runtime": 155.3438, - "eval_samples_per_second": 41.199, - "eval_steps_per_second": 0.322, - "step": 1350000 - }, - { - "epoch": 0.0, - "learning_rate": 5.036269793934667e-05, - "loss": 0.4474, - "step": 1351000 - }, - { - "epoch": 0.0, - "learning_rate": 5.02520607679552e-05, - "loss": 0.448, - "step": 1352000 - }, - { - "epoch": 0.0, - "learning_rate": 5.014162471512421e-05, - "loss": 0.4481, - "step": 1353000 - }, - { - "epoch": 0.0, - "learning_rate": 5.003116902411837e-05, - "loss": 0.4471, - "step": 1354000 - }, - { - "epoch": 0.0, - "learning_rate": 4.9920914942058365e-05, - "loss": 0.448, - "step": 1355000 - }, - { - "epoch": 0.0, - "eval_loss": 0.4262136220932007, - "eval_runtime": 84.0235, - "eval_samples_per_second": 76.169, - "eval_steps_per_second": 0.595, - "step": 1355000 - }, - { - "epoch": 0.0, - "learning_rate": 4.981075230294731e-05, - "loss": 0.4463, - "step": 1356000 - }, - { - "epoch": 0.0, - "learning_rate": 4.970057130673938e-05, - "loss": 0.4474, - "step": 1357000 - }, - { - "epoch": 0.0, - "learning_rate": 4.959048260784589e-05, - "loss": 0.4467, - "step": 1358000 - }, - { - "epoch": 0.0, - "learning_rate": 4.948048654161614e-05, - "loss": 0.4467, - "step": 1359000 - }, - { - "epoch": 0.01, - "learning_rate": 4.937069329966688e-05, - "loss": 0.447, - "step": 1360000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4244381785392761, - "eval_runtime": 78.4672, - "eval_samples_per_second": 81.563, - "eval_steps_per_second": 0.637, - "step": 1360000 - }, - { - "epoch": 0.01, - "learning_rate": 4.926088341021301e-05, - "loss": 0.4475, - "step": 1361000 - }, - { - "epoch": 0.01, - "learning_rate": 4.915116715743923e-05, - "loss": 0.4463, - "step": 1362000 - }, - { - "epoch": 0.01, - "learning_rate": 4.904154487556023e-05, - "loss": 0.4475, - "step": 1363000 - }, - { - "epoch": 0.01, - "learning_rate": 4.8932126379265246e-05, - "loss": 0.4473, - "step": 1364000 - }, - { - "epoch": 0.01, - "learning_rate": 4.8822692945868926e-05, - "loss": 0.4465, - "step": 1365000 - }, - { - "epoch": 0.01, - "eval_loss": 0.42896711826324463, - "eval_runtime": 78.3264, - "eval_samples_per_second": 81.709, - "eval_steps_per_second": 0.638, - "step": 1365000 - }, - { - "epoch": 0.01, - "learning_rate": 4.871346377486962e-05, - "loss": 0.4473, - "step": 1366000 - }, - { - "epoch": 0.01, - "learning_rate": 4.860422052203485e-05, - "loss": 0.4456, - "step": 1367000 - }, - { - "epoch": 0.01, - "learning_rate": 4.849507290618916e-05, - "loss": 0.4463, - "step": 1368000 - }, - { - "epoch": 0.01, - "learning_rate": 4.838613026341415e-05, - "loss": 0.4464, - "step": 1369000 - }, - { - "epoch": 0.01, - "learning_rate": 4.827717482223451e-05, - "loss": 0.4475, - "step": 1370000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4280179738998413, - "eval_runtime": 78.2271, - "eval_samples_per_second": 81.813, - "eval_steps_per_second": 0.639, - "step": 1370000 - }, - { - "epoch": 0.01, - "learning_rate": 4.8168316014281885e-05, - "loss": 0.4458, - "step": 1371000 - }, - { - "epoch": 0.01, - "learning_rate": 4.805966288445796e-05, - "loss": 0.4461, - "step": 1372000 - }, - { - "epoch": 0.01, - "learning_rate": 4.7950998240010834e-05, - "loss": 0.4456, - "step": 1373000 - }, - { - "epoch": 0.01, - "learning_rate": 4.784243122238121e-05, - "loss": 0.4463, - "step": 1374000 - }, - { - "epoch": 0.01, - "learning_rate": 4.773396216228298e-05, - "loss": 0.4462, - "step": 1375000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4259638488292694, - "eval_runtime": 79.4294, - "eval_samples_per_second": 80.575, - "eval_steps_per_second": 0.629, - "step": 1375000 - }, - { - "epoch": 0.01, - "learning_rate": 4.762569971169915e-05, - "loss": 0.4461, - "step": 1376000 - }, - { - "epoch": 0.01, - "learning_rate": 4.751764390469416e-05, - "loss": 0.4445, - "step": 1377000 - }, - { - "epoch": 0.01, - "learning_rate": 4.740947040114839e-05, - "loss": 0.4467, - "step": 1378000 - }, - { - "epoch": 0.01, - "learning_rate": 4.730139617400774e-05, - "loss": 0.4463, - "step": 1379000 - }, - { - "epoch": 0.01, - "learning_rate": 4.7193421552485106e-05, - "loss": 0.4463, - "step": 1380000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4267512559890747, - "eval_runtime": 79.7271, - "eval_samples_per_second": 80.274, - "eval_steps_per_second": 0.627, - "step": 1380000 - }, - { - "epoch": 0.02, - "learning_rate": 4.7085654690150224e-05, - "loss": 0.4468, - "step": 1381000 - }, - { - "epoch": 0.02, - "learning_rate": 4.697788016586024e-05, - "loss": 0.4458, - "step": 1382000 - }, - { - "epoch": 0.02, - "learning_rate": 4.687020623267417e-05, - "loss": 0.448, - "step": 1383000 - }, - { - "epoch": 0.02, - "learning_rate": 4.6762633218585406e-05, - "loss": 0.4468, - "step": 1384000 - }, - { - "epoch": 0.02, - "learning_rate": 4.6655161451280045e-05, - "loss": 0.4464, - "step": 1385000 - }, - { - "epoch": 0.02, - "eval_loss": 0.42754608392715454, - "eval_runtime": 79.2336, - "eval_samples_per_second": 80.774, - "eval_steps_per_second": 0.631, - "step": 1385000 - }, - { - "epoch": 0.02, - "learning_rate": 4.654779125813573e-05, - "loss": 0.4469, - "step": 1386000 - }, - { - "epoch": 0.02, - "learning_rate": 4.64406301835041e-05, - "loss": 0.446, - "step": 1387000 - }, - { - "epoch": 0.02, - "learning_rate": 4.633357113217997e-05, - "loss": 0.4465, - "step": 1388000 - }, - { - "epoch": 0.02, - "learning_rate": 4.622650741725104e-05, - "loss": 0.4453, - "step": 1389000 - }, - { - "epoch": 0.02, - "learning_rate": 4.6119546582237665e-05, - "loss": 0.4475, - "step": 1390000 - }, - { - "epoch": 0.02, - "eval_loss": 0.42643147706985474, - "eval_runtime": 78.7876, - "eval_samples_per_second": 81.231, - "eval_steps_per_second": 0.635, - "step": 1390000 - }, - { - "epoch": 0.02, - "learning_rate": 4.601279575893072e-05, - "loss": 0.4458, - "step": 1391000 - }, - { - "epoch": 0.02, - "learning_rate": 4.590604155720414e-05, - "loss": 0.4443, - "step": 1392000 - }, - { - "epoch": 0.02, - "learning_rate": 4.5799391211587674e-05, - "loss": 0.4465, - "step": 1393000 - }, - { - "epoch": 0.02, - "learning_rate": 4.569295154097499e-05, - "loss": 0.4457, - "step": 1394000 - }, - { - "epoch": 0.02, - "learning_rate": 4.5586509777220274e-05, - "loss": 0.4459, - "step": 1395000 - }, - { - "epoch": 0.02, - "eval_loss": 0.426248162984848, - "eval_runtime": 81.9909, - "eval_samples_per_second": 78.057, - "eval_steps_per_second": 0.61, - "step": 1395000 - }, - { - "epoch": 0.02, - "learning_rate": 4.54801728429249e-05, - "loss": 0.4461, - "step": 1396000 - }, - { - "epoch": 0.02, - "learning_rate": 4.537394106200966e-05, - "loss": 0.446, - "step": 1397000 - }, - { - "epoch": 0.02, - "learning_rate": 4.526792083158551e-05, - "loss": 0.4458, - "step": 1398000 - }, - { - "epoch": 0.02, - "learning_rate": 4.516190022194884e-05, - "loss": 0.4462, - "step": 1399000 - }, - { - "epoch": 0.03, - "learning_rate": 4.5056091596576796e-05, - "loss": 0.4467, - "step": 1400000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4303908944129944, - "eval_runtime": 78.2681, - "eval_samples_per_second": 81.77, - "eval_steps_per_second": 0.639, - "step": 1400000 - }, - { - "epoch": 0.03, - "learning_rate": 4.495028344875484e-05, - "loss": 0.4471, - "step": 1401000 - }, - { - "epoch": 0.03, - "learning_rate": 4.4844582068448006e-05, - "loss": 0.4459, - "step": 1402000 - }, - { - "epoch": 0.03, - "learning_rate": 4.4739093318333515e-05, - "loss": 0.447, - "step": 1403000 - }, - { - "epoch": 0.03, - "learning_rate": 4.463360633111313e-05, - "loss": 0.4464, - "step": 1404000 - }, - { - "epoch": 0.03, - "learning_rate": 4.452822707606128e-05, - "loss": 0.4466, - "step": 1405000 - }, - { - "epoch": 0.03, - "eval_loss": 0.42388811707496643, - "eval_runtime": 81.8794, - "eval_samples_per_second": 78.164, - "eval_steps_per_second": 0.611, - "step": 1405000 - }, - { - "epoch": 0.03, - "learning_rate": 4.4423061091303984e-05, - "loss": 0.4462, - "step": 1406000 - }, - { - "epoch": 0.03, - "learning_rate": 4.431789815473655e-05, - "loss": 0.4452, - "step": 1407000 - }, - { - "epoch": 0.03, - "learning_rate": 4.421284391203949e-05, - "loss": 0.4451, - "step": 1408000 - }, - { - "epoch": 0.03, - "learning_rate": 4.410789868322626e-05, - "loss": 0.4459, - "step": 1409000 - }, - { - "epoch": 0.03, - "learning_rate": 4.400316756915505e-05, - "loss": 0.4454, - "step": 1410000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4258207678794861, - "eval_runtime": 82.7222, - "eval_samples_per_second": 77.367, - "eval_steps_per_second": 0.604, - "step": 1410000 - }, - { - "epoch": 0.03, - "learning_rate": 4.3898441217008166e-05, - "loss": 0.446, - "step": 1411000 - }, - { - "epoch": 0.03, - "learning_rate": 4.3793929397813896e-05, - "loss": 0.446, - "step": 1412000 - }, - { - "epoch": 0.03, - "learning_rate": 4.368952764812006e-05, - "loss": 0.4444, - "step": 1413000 - }, - { - "epoch": 0.03, - "learning_rate": 4.358513194496827e-05, - "loss": 0.4461, - "step": 1414000 - }, - { - "epoch": 0.03, - "learning_rate": 4.348084716781979e-05, - "loss": 0.4454, - "step": 1415000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4262985587120056, - "eval_runtime": 82.7185, - "eval_samples_per_second": 77.371, - "eval_steps_per_second": 0.604, - "step": 1415000 - }, - { - "epoch": 0.03, - "learning_rate": 4.3376777752205805e-05, - "loss": 0.4457, - "step": 1416000 - }, - { - "epoch": 0.03, - "learning_rate": 4.3272819674268335e-05, - "loss": 0.4455, - "step": 1417000 - }, - { - "epoch": 0.03, - "learning_rate": 4.3168869355718386e-05, - "loss": 0.4452, - "step": 1418000 - }, - { - "epoch": 0.03, - "learning_rate": 4.306503123117983e-05, - "loss": 0.4456, - "step": 1419000 - }, - { - "epoch": 0.04, - "learning_rate": 4.296130561696156e-05, - "loss": 0.4447, - "step": 1420000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4243999421596527, - "eval_runtime": 78.935, - "eval_samples_per_second": 81.079, - "eval_steps_per_second": 0.633, - "step": 1420000 - }, - { - "epoch": 0.04, - "learning_rate": 4.2857692829029776e-05, - "loss": 0.4458, - "step": 1421000 - }, - { - "epoch": 0.04, - "learning_rate": 4.275419318300705e-05, - "loss": 0.4456, - "step": 1422000 - }, - { - "epoch": 0.04, - "learning_rate": 4.265091032358337e-05, - "loss": 0.4459, - "step": 1423000 - }, - { - "epoch": 0.04, - "learning_rate": 4.254774100853429e-05, - "loss": 0.4462, - "step": 1424000 - }, - { - "epoch": 0.04, - "learning_rate": 4.2444582450035355e-05, - "loss": 0.4449, - "step": 1425000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4202989935874939, - "eval_runtime": 81.6727, - "eval_samples_per_second": 78.362, - "eval_steps_per_second": 0.612, - "step": 1425000 - }, - { - "epoch": 0.04, - "learning_rate": 4.234153829185128e-05, - "loss": 0.4439, - "step": 1426000 - }, - { - "epoch": 0.04, - "learning_rate": 4.223871171991236e-05, - "loss": 0.4454, - "step": 1427000 - }, - { - "epoch": 0.04, - "learning_rate": 4.213589718849552e-05, - "loss": 0.446, - "step": 1428000 - }, - { - "epoch": 0.04, - "learning_rate": 4.203319799770243e-05, - "loss": 0.4448, - "step": 1429000 - }, - { - "epoch": 0.04, - "learning_rate": 4.193061446037263e-05, - "loss": 0.4449, - "step": 1430000 - }, - { - "epoch": 0.04, - "eval_loss": 0.423859179019928, - "eval_runtime": 81.673, - "eval_samples_per_second": 78.361, - "eval_steps_per_second": 0.612, - "step": 1430000 - }, - { - "epoch": 0.04, - "learning_rate": 4.182814688899332e-05, - "loss": 0.4453, - "step": 1431000 - }, - { - "epoch": 0.04, - "learning_rate": 4.172589788880715e-05, - "loss": 0.4448, - "step": 1432000 - }, - { - "epoch": 0.04, - "learning_rate": 4.1623663068631195e-05, - "loss": 0.4443, - "step": 1433000 - }, - { - "epoch": 0.04, - "learning_rate": 4.15216472088565e-05, - "loss": 0.4448, - "step": 1434000 - }, - { - "epoch": 0.04, - "learning_rate": 4.141964638433753e-05, - "loss": 0.4447, - "step": 1435000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4245039224624634, - "eval_runtime": 81.4224, - "eval_samples_per_second": 78.602, - "eval_steps_per_second": 0.614, - "step": 1435000 - }, - { - "epoch": 0.04, - "learning_rate": 4.131786490676274e-05, - "loss": 0.4453, - "step": 1436000 - }, - { - "epoch": 0.04, - "learning_rate": 4.1216099319506056e-05, - "loss": 0.4459, - "step": 1437000 - }, - { - "epoch": 0.04, - "learning_rate": 4.1114451874736935e-05, - "loss": 0.4457, - "step": 1438000 - }, - { - "epoch": 0.04, - "learning_rate": 4.101302435181392e-05, - "loss": 0.4445, - "step": 1439000 - }, - { - "epoch": 0.04, - "learning_rate": 4.091161400165054e-05, - "loss": 0.4451, - "step": 1440000 - }, - { - "epoch": 0.04, - "eval_loss": 0.42323625087738037, - "eval_runtime": 91.7597, - "eval_samples_per_second": 69.747, - "eval_steps_per_second": 0.545, - "step": 1440000 - }, - { - "epoch": 0.05, - "learning_rate": 4.0810322721489815e-05, - "loss": 0.4458, - "step": 1441000 - }, - { - "epoch": 0.05, - "learning_rate": 4.070915081988253e-05, - "loss": 0.4433, - "step": 1442000 - }, - { - "epoch": 0.05, - "learning_rate": 4.060819959734469e-05, - "loss": 0.4449, - "step": 1443000 - }, - { - "epoch": 0.05, - "learning_rate": 4.0507267256893e-05, - "loss": 0.4446, - "step": 1444000 - }, - { - "epoch": 0.05, - "learning_rate": 4.0406555970000004e-05, - "loss": 0.4442, - "step": 1445000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4247336685657501, - "eval_runtime": 79.9188, - "eval_samples_per_second": 80.081, - "eval_steps_per_second": 0.626, - "step": 1445000 - }, - { - "epoch": 0.05, - "learning_rate": 4.03059650505077e-05, - "loss": 0.4449, - "step": 1446000 - }, - { - "epoch": 0.05, - "learning_rate": 4.020539429396338e-05, - "loss": 0.4449, - "step": 1447000 - }, - { - "epoch": 0.05, - "learning_rate": 4.010494475868937e-05, - "loss": 0.4444, - "step": 1448000 - }, - { - "epoch": 0.05, - "learning_rate": 4.000461675067245e-05, - "loss": 0.4434, - "step": 1449000 - }, - { - "epoch": 0.05, - "learning_rate": 3.990451072074708e-05, - "loss": 0.4445, - "step": 1450000 - }, - { - "epoch": 0.05, - "eval_loss": 0.42264366149902344, - "eval_runtime": 81.9965, - "eval_samples_per_second": 78.052, - "eval_steps_per_second": 0.61, - "step": 1450000 - }, - { - "epoch": 0.05, - "learning_rate": 3.980442656143221e-05, - "loss": 0.4449, - "step": 1451000 - }, - { - "epoch": 0.05, - "learning_rate": 3.970456474526e-05, - "loss": 0.4452, - "step": 1452000 - }, - { - "epoch": 0.05, - "learning_rate": 3.960472565292187e-05, - "loss": 0.4445, - "step": 1453000 - }, - { - "epoch": 0.05, - "learning_rate": 3.9505009611594935e-05, - "loss": 0.4451, - "step": 1454000 - }, - { - "epoch": 0.05, - "learning_rate": 3.9405416925031554e-05, - "loss": 0.4441, - "step": 1455000 - }, - { - "epoch": 0.05, - "eval_loss": 0.42445144057273865, - "eval_runtime": 84.4516, - "eval_samples_per_second": 75.783, - "eval_steps_per_second": 0.592, - "step": 1455000 - }, - { - "epoch": 0.05, - "learning_rate": 3.930614671105286e-05, - "loss": 0.4443, - "step": 1456000 - }, - { - "epoch": 0.05, - "learning_rate": 3.9206801395545245e-05, - "loss": 0.4444, - "step": 1457000 - }, - { - "epoch": 0.05, - "learning_rate": 3.9107580343195134e-05, - "loss": 0.4442, - "step": 1458000 - }, - { - "epoch": 0.05, - "learning_rate": 3.9008582890413084e-05, - "loss": 0.4441, - "step": 1459000 - }, - { - "epoch": 0.06, - "learning_rate": 3.890961114571417e-05, - "loss": 0.445, - "step": 1460000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4252031743526459, - "eval_runtime": 78.9687, - "eval_samples_per_second": 81.045, - "eval_steps_per_second": 0.633, - "step": 1460000 - }, - { - "epoch": 0.06, - "learning_rate": 3.8810764569465806e-05, - "loss": 0.4443, - "step": 1461000 - }, - { - "epoch": 0.06, - "learning_rate": 3.87120434627718e-05, - "loss": 0.4439, - "step": 1462000 - }, - { - "epoch": 0.06, - "learning_rate": 3.86135466587679e-05, - "loss": 0.4449, - "step": 1463000 - }, - { - "epoch": 0.06, - "learning_rate": 3.85150772667438e-05, - "loss": 0.4434, - "step": 1464000 - }, - { - "epoch": 0.06, - "learning_rate": 3.841673424498882e-05, - "loss": 0.4442, - "step": 1465000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4248676598072052, - "eval_runtime": 80.6767, - "eval_samples_per_second": 79.329, - "eval_steps_per_second": 0.62, - "step": 1465000 - }, - { - "epoch": 0.06, - "learning_rate": 3.8318616046053615e-05, - "loss": 0.4442, - "step": 1466000 - }, - { - "epoch": 0.06, - "learning_rate": 3.822052653604248e-05, - "loss": 0.4435, - "step": 1467000 - }, - { - "epoch": 0.06, - "learning_rate": 3.812256429355308e-05, - "loss": 0.4448, - "step": 1468000 - }, - { - "epoch": 0.06, - "learning_rate": 3.802472961699541e-05, - "loss": 0.443, - "step": 1469000 - }, - { - "epoch": 0.06, - "learning_rate": 3.7927120447236364e-05, - "loss": 0.4449, - "step": 1470000 - }, - { - "epoch": 0.06, - "eval_loss": 0.42114773392677307, - "eval_runtime": 85.6392, - "eval_samples_per_second": 74.732, - "eval_steps_per_second": 0.584, - "step": 1470000 - }, - { - "epoch": 0.06, - "learning_rate": 3.782963918257054e-05, - "loss": 0.4439, - "step": 1471000 - }, - { - "epoch": 0.06, - "learning_rate": 3.7732188733163535e-05, - "loss": 0.4446, - "step": 1472000 - }, - { - "epoch": 0.06, - "learning_rate": 3.763486703883993e-05, - "loss": 0.4437, - "step": 1473000 - }, - { - "epoch": 0.06, - "learning_rate": 3.753777152414142e-05, - "loss": 0.4443, - "step": 1474000 - }, - { - "epoch": 0.06, - "learning_rate": 3.74407080994724e-05, - "loss": 0.4444, - "step": 1475000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4239863455295563, - "eval_runtime": 78.8432, - "eval_samples_per_second": 81.174, - "eval_steps_per_second": 0.634, - "step": 1475000 - }, - { - "epoch": 0.06, - "learning_rate": 3.734387118671403e-05, - "loss": 0.4443, - "step": 1476000 - }, - { - "epoch": 0.06, - "learning_rate": 3.724706721320442e-05, - "loss": 0.4444, - "step": 1477000 - }, - { - "epoch": 0.06, - "learning_rate": 3.715049008113378e-05, - "loss": 0.444, - "step": 1478000 - }, - { - "epoch": 0.06, - "learning_rate": 3.7053946737130206e-05, - "loss": 0.4445, - "step": 1479000 - }, - { - "epoch": 0.07, - "learning_rate": 3.6957534214252755e-05, - "loss": 0.4434, - "step": 1480000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4229904115200043, - "eval_runtime": 119.8175, - "eval_samples_per_second": 53.415, - "eval_steps_per_second": 0.417, - "step": 1480000 - }, - { - "epoch": 0.07, - "learning_rate": 3.6861349022009304e-05, - "loss": 0.4442, - "step": 1481000 - }, - { - "epoch": 0.07, - "learning_rate": 3.676519889049805e-05, - "loss": 0.4441, - "step": 1482000 - }, - { - "epoch": 0.07, - "learning_rate": 3.666918045968899e-05, - "loss": 0.4443, - "step": 1483000 - }, - { - "epoch": 0.07, - "learning_rate": 3.6573294022070874e-05, - "loss": 0.4442, - "step": 1484000 - }, - { - "epoch": 0.07, - "learning_rate": 3.6477635557709146e-05, - "loss": 0.4435, - "step": 1485000 - }, - { - "epoch": 0.07, - "eval_loss": 0.42340168356895447, - "eval_runtime": 87.4539, - "eval_samples_per_second": 73.181, - "eval_steps_per_second": 0.572, - "step": 1485000 - }, - { - "epoch": 0.07, - "learning_rate": 3.638210940499669e-05, - "loss": 0.4425, - "step": 1486000 - }, - { - "epoch": 0.07, - "learning_rate": 3.628662043183205e-05, - "loss": 0.4437, - "step": 1487000 - }, - { - "epoch": 0.07, - "learning_rate": 3.619126461720257e-05, - "loss": 0.4432, - "step": 1488000 - }, - { - "epoch": 0.07, - "learning_rate": 3.6096042251578594e-05, - "loss": 0.4433, - "step": 1489000 - }, - { - "epoch": 0.07, - "learning_rate": 3.600095362502394e-05, - "loss": 0.443, - "step": 1490000 - }, - { - "epoch": 0.07, - "eval_loss": 0.42291781306266785, - "eval_runtime": 80.5128, - "eval_samples_per_second": 79.49, - "eval_steps_per_second": 0.621, - "step": 1490000 - }, - { - "epoch": 0.07, - "learning_rate": 3.59060939147492e-05, - "loss": 0.4439, - "step": 1491000 - }, - { - "epoch": 0.07, - "learning_rate": 3.581127350043195e-05, - "loss": 0.4431, - "step": 1492000 - }, - { - "epoch": 0.07, - "learning_rate": 3.571658769263903e-05, - "loss": 0.4433, - "step": 1493000 - }, - { - "epoch": 0.07, - "learning_rate": 3.562213126323673e-05, - "loss": 0.4423, - "step": 1494000 - }, - { - "epoch": 0.07, - "learning_rate": 3.55278097462887e-05, - "loss": 0.4434, - "step": 1495000 - }, - { - "epoch": 0.07, - "eval_loss": 0.42214784026145935, - "eval_runtime": 79.6252, - "eval_samples_per_second": 80.377, - "eval_steps_per_second": 0.628, - "step": 1495000 - }, - { - "epoch": 0.07, - "learning_rate": 3.5433529215772676e-05, - "loss": 0.4433, - "step": 1496000 - }, - { - "epoch": 0.07, - "learning_rate": 3.533938444245558e-05, - "loss": 0.4442, - "step": 1497000 - }, - { - "epoch": 0.07, - "learning_rate": 3.5245375713118724e-05, - "loss": 0.4433, - "step": 1498000 - }, - { - "epoch": 0.07, - "learning_rate": 3.515159711833576e-05, - "loss": 0.4426, - "step": 1499000 - }, - { - "epoch": 0.07, - "learning_rate": 3.505786119888575e-05, - "loss": 0.4434, - "step": 1500000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4238176643848419, - "eval_runtime": 81.4375, - "eval_samples_per_second": 78.588, - "eval_steps_per_second": 0.614, - "step": 1500000 - }, - { - "epoch": 0.0, - "learning_rate": 3.496426218098463e-05, - "loss": 0.4434, - "step": 1501000 - }, - { - "epoch": 0.0, - "learning_rate": 3.4870893742962914e-05, - "loss": 0.4437, - "step": 1502000 - }, - { - "epoch": 0.0, - "learning_rate": 3.477756924548471e-05, - "loss": 0.4444, - "step": 1503000 - }, - { - "epoch": 0.0, - "learning_rate": 3.468447562121202e-05, - "loss": 0.4427, - "step": 1504000 - }, - { - "epoch": 0.0, - "learning_rate": 3.459142678014942e-05, - "loss": 0.4435, - "step": 1505000 - }, - { - "epoch": 0.0, - "eval_loss": 0.42447999119758606, - "eval_runtime": 81.5993, - "eval_samples_per_second": 78.432, - "eval_steps_per_second": 0.613, - "step": 1505000 - }, - { - "epoch": 0.0, - "learning_rate": 3.4498516261475824e-05, - "loss": 0.4427, - "step": 1506000 - }, - { - "epoch": 0.0, - "learning_rate": 3.440583705079855e-05, - "loss": 0.4426, - "step": 1507000 - }, - { - "epoch": 0.0, - "learning_rate": 3.431320388651632e-05, - "loss": 0.4418, - "step": 1508000 - }, - { - "epoch": 0.0, - "learning_rate": 3.422080231652349e-05, - "loss": 0.4432, - "step": 1509000 - }, - { - "epoch": 0.01, - "learning_rate": 3.412853991914283e-05, - "loss": 0.4435, - "step": 1510000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4248586893081665, - "eval_runtime": 79.0364, - "eval_samples_per_second": 80.975, - "eval_steps_per_second": 0.633, - "step": 1510000 - }, - { - "epoch": 0.01, - "learning_rate": 3.403632482966014e-05, - "loss": 0.4428, - "step": 1511000 - }, - { - "epoch": 0.01, - "learning_rate": 3.394424975350325e-05, - "loss": 0.444, - "step": 1512000 - }, - { - "epoch": 0.01, - "learning_rate": 3.385231497114882e-05, - "loss": 0.4432, - "step": 1513000 - }, - { - "epoch": 0.01, - "learning_rate": 3.376061248654489e-05, - "loss": 0.4421, - "step": 1514000 - }, - { - "epoch": 0.01, - "learning_rate": 3.366895899052203e-05, - "loss": 0.4443, - "step": 1515000 - }, - { - "epoch": 0.01, - "eval_loss": 0.42498764395713806, - "eval_runtime": 78.5642, - "eval_samples_per_second": 81.462, - "eval_steps_per_second": 0.636, - "step": 1515000 - }, - { - "epoch": 0.01, - "learning_rate": 3.357744662688502e-05, - "loss": 0.4423, - "step": 1516000 - }, - { - "epoch": 0.01, - "learning_rate": 3.3486166974621315e-05, - "loss": 0.4445, - "step": 1517000 - }, - { - "epoch": 0.01, - "learning_rate": 3.3395028728323046e-05, - "loss": 0.4426, - "step": 1518000 - }, - { - "epoch": 0.01, - "learning_rate": 3.330394114848308e-05, - "loss": 0.4414, - "step": 1519000 - }, - { - "epoch": 0.01, - "learning_rate": 3.321308668712888e-05, - "loss": 0.4425, - "step": 1520000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4216972291469574, - "eval_runtime": 78.7807, - "eval_samples_per_second": 81.238, - "eval_steps_per_second": 0.635, - "step": 1520000 - }, - { - "epoch": 0.01, - "learning_rate": 3.31222837302516e-05, - "loss": 0.4416, - "step": 1521000 - }, - { - "epoch": 0.01, - "learning_rate": 3.303162357102785e-05, - "loss": 0.4417, - "step": 1522000 - }, - { - "epoch": 0.01, - "learning_rate": 3.294119693115241e-05, - "loss": 0.4432, - "step": 1523000 - }, - { - "epoch": 0.01, - "learning_rate": 3.285082305181241e-05, - "loss": 0.4418, - "step": 1524000 - }, - { - "epoch": 0.01, - "learning_rate": 3.276059279704224e-05, - "loss": 0.4425, - "step": 1525000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4240274727344513, - "eval_runtime": 83.7734, - "eval_samples_per_second": 76.397, - "eval_steps_per_second": 0.597, - "step": 1525000 - }, - { - "epoch": 0.01, - "learning_rate": 3.267059645608504e-05, - "loss": 0.4433, - "step": 1526000 - }, - { - "epoch": 0.01, - "learning_rate": 3.258065413027638e-05, - "loss": 0.4423, - "step": 1527000 - }, - { - "epoch": 0.01, - "learning_rate": 3.249085625201892e-05, - "loss": 0.4416, - "step": 1528000 - }, - { - "epoch": 0.01, - "learning_rate": 3.240120309485257e-05, - "loss": 0.4424, - "step": 1529000 - }, - { - "epoch": 0.01, - "learning_rate": 3.2311694931876376e-05, - "loss": 0.4435, - "step": 1530000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4211861491203308, - "eval_runtime": 78.8009, - "eval_samples_per_second": 81.217, - "eval_steps_per_second": 0.635, - "step": 1530000 - }, - { - "epoch": 0.02, - "learning_rate": 3.222242132599248e-05, - "loss": 0.4433, - "step": 1531000 - }, - { - "epoch": 0.02, - "learning_rate": 3.21332038232514e-05, - "loss": 0.4409, - "step": 1532000 - }, - { - "epoch": 0.02, - "learning_rate": 3.2044221129842254e-05, - "loss": 0.4422, - "step": 1533000 - }, - { - "epoch": 0.02, - "learning_rate": 3.195529537333662e-05, - "loss": 0.442, - "step": 1534000 - }, - { - "epoch": 0.02, - "learning_rate": 3.186651596933338e-05, - "loss": 0.4432, - "step": 1535000 - }, - { - "epoch": 0.02, - "eval_loss": 0.42593932151794434, - "eval_runtime": 82.4195, - "eval_samples_per_second": 77.652, - "eval_steps_per_second": 0.607, - "step": 1535000 - }, - { - "epoch": 0.02, - "learning_rate": 3.177788318827007e-05, - "loss": 0.4427, - "step": 1536000 - }, - { - "epoch": 0.02, - "learning_rate": 3.168948571256281e-05, - "loss": 0.4431, - "step": 1537000 - }, - { - "epoch": 0.02, - "learning_rate": 3.1601146839607305e-05, - "loss": 0.4416, - "step": 1538000 - }, - { - "epoch": 0.02, - "learning_rate": 3.1512955397952055e-05, - "loss": 0.4426, - "step": 1539000 - }, - { - "epoch": 0.02, - "learning_rate": 3.142491165624354e-05, - "loss": 0.442, - "step": 1540000 - }, - { - "epoch": 0.02, - "eval_loss": 0.4219116270542145, - "eval_runtime": 79.7973, - "eval_samples_per_second": 80.203, - "eval_steps_per_second": 0.627, - "step": 1540000 - }, - { - "epoch": 0.02, - "learning_rate": 3.133719152637512e-05, - "loss": 0.4413, - "step": 1541000 - }, - { - "epoch": 0.02, - "learning_rate": 3.1249443691960104e-05, - "loss": 0.4426, - "step": 1542000 - }, - { - "epoch": 0.02, - "learning_rate": 3.1161844360194124e-05, - "loss": 0.4431, - "step": 1543000 - }, - { - "epoch": 0.02, - "learning_rate": 3.107448117408321e-05, - "loss": 0.4427, - "step": 1544000 - }, - { - "epoch": 0.02, - "learning_rate": 3.098717949852167e-05, - "loss": 0.4434, - "step": 1545000 - }, - { - "epoch": 0.02, - "eval_loss": 0.42146673798561096, - "eval_runtime": 79.3482, - "eval_samples_per_second": 80.657, - "eval_steps_per_second": 0.63, - "step": 1545000 - }, - { - "epoch": 0.02, - "learning_rate": 3.090002712451138e-05, - "loss": 0.4422, - "step": 1546000 - }, - { - "epoch": 0.02, - "learning_rate": 3.081311124554366e-05, - "loss": 0.4416, - "step": 1547000 - }, - { - "epoch": 0.02, - "learning_rate": 3.072625812065977e-05, - "loss": 0.4424, - "step": 1548000 - }, - { - "epoch": 0.02, - "learning_rate": 3.063955509213895e-05, - "loss": 0.4428, - "step": 1549000 - }, - { - "epoch": 0.03, - "learning_rate": 3.055317537919465e-05, - "loss": 0.4437, - "step": 1550000 - }, - { - "epoch": 0.03, - "eval_loss": 0.42286813259124756, - "eval_runtime": 78.4047, - "eval_samples_per_second": 81.628, - "eval_steps_per_second": 0.638, - "step": 1550000 - }, - { - "epoch": 0.03, - "learning_rate": 3.04667730337682e-05, - "loss": 0.4412, - "step": 1551000 - }, - { - "epoch": 0.03, - "learning_rate": 3.0380521575141337e-05, - "loss": 0.4416, - "step": 1552000 - }, - { - "epoch": 0.03, - "learning_rate": 3.029442126605103e-05, - "loss": 0.4435, - "step": 1553000 - }, - { - "epoch": 0.03, - "learning_rate": 3.02084723687737e-05, - "loss": 0.4427, - "step": 1554000 - }, - { - "epoch": 0.03, - "learning_rate": 3.0122760866500258e-05, - "loss": 0.442, - "step": 1555000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4226168096065521, - "eval_runtime": 78.4147, - "eval_samples_per_second": 81.617, - "eval_steps_per_second": 0.638, - "step": 1555000 - }, - { - "epoch": 0.03, - "learning_rate": 3.003711542576724e-05, - "loss": 0.4425, - "step": 1556000 - }, - { - "epoch": 0.03, - "learning_rate": 2.9951793015071306e-05, - "loss": 0.4418, - "step": 1557000 - }, - { - "epoch": 0.03, - "learning_rate": 2.986645192081631e-05, - "loss": 0.4432, - "step": 1558000 - }, - { - "epoch": 0.03, - "learning_rate": 2.978126354204314e-05, - "loss": 0.4428, - "step": 1559000 - }, - { - "epoch": 0.03, - "learning_rate": 2.969631309715692e-05, - "loss": 0.4426, - "step": 1560000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4223879873752594, - "eval_runtime": 80.6588, - "eval_samples_per_second": 79.347, - "eval_steps_per_second": 0.62, - "step": 1560000 - }, - { - "epoch": 0.03, - "learning_rate": 2.9611430774013924e-05, - "loss": 0.4416, - "step": 1561000 - }, - { - "epoch": 0.03, - "learning_rate": 2.9526701943191416e-05, - "loss": 0.4434, - "step": 1562000 - }, - { - "epoch": 0.03, - "learning_rate": 2.9442126862788104e-05, - "loss": 0.4407, - "step": 1563000 - }, - { - "epoch": 0.03, - "learning_rate": 2.9357705790434323e-05, - "loss": 0.4412, - "step": 1564000 - }, - { - "epoch": 0.03, - "learning_rate": 2.927352317295752e-05, - "loss": 0.4415, - "step": 1565000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4247082471847534, - "eval_runtime": 78.1913, - "eval_samples_per_second": 81.851, - "eval_steps_per_second": 0.639, - "step": 1565000 - }, - { - "epoch": 0.03, - "learning_rate": 2.9189410733066673e-05, - "loss": 0.443, - "step": 1566000 - }, - { - "epoch": 0.03, - "learning_rate": 2.9105453071042536e-05, - "loss": 0.4409, - "step": 1567000 - }, - { - "epoch": 0.03, - "learning_rate": 2.902165044263471e-05, - "loss": 0.4409, - "step": 1568000 - }, - { - "epoch": 0.03, - "learning_rate": 2.893817024265179e-05, - "loss": 0.4414, - "step": 1569000 - }, - { - "epoch": 0.04, - "learning_rate": 2.885467813549419e-05, - "loss": 0.4429, - "step": 1570000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4239417612552643, - "eval_runtime": 77.6323, - "eval_samples_per_second": 82.44, - "eval_steps_per_second": 0.644, - "step": 1570000 - }, - { - "epoch": 0.04, - "learning_rate": 2.877134182585685e-05, - "loss": 0.4421, - "step": 1571000 - }, - { - "epoch": 0.04, - "learning_rate": 2.8688161567596623e-05, - "loss": 0.4423, - "step": 1572000 - }, - { - "epoch": 0.04, - "learning_rate": 2.8605220559890042e-05, - "loss": 0.4417, - "step": 1573000 - }, - { - "epoch": 0.04, - "learning_rate": 2.8522353007368517e-05, - "loss": 0.4417, - "step": 1574000 - }, - { - "epoch": 0.04, - "learning_rate": 2.843972489701952e-05, - "loss": 0.4405, - "step": 1575000 - }, - { - "epoch": 0.04, - "eval_loss": 0.42241203784942627, - "eval_runtime": 79.4533, - "eval_samples_per_second": 80.55, - "eval_steps_per_second": 0.629, - "step": 1575000 - }, - { - "epoch": 0.04, - "learning_rate": 2.8357171058942266e-05, - "loss": 0.4405, - "step": 1576000 - }, - { - "epoch": 0.04, - "learning_rate": 2.82747745338781e-05, - "loss": 0.4415, - "step": 1577000 - }, - { - "epoch": 0.04, - "learning_rate": 2.819269989332776e-05, - "loss": 0.4408, - "step": 1578000 - }, - { - "epoch": 0.04, - "learning_rate": 2.8110618430913386e-05, - "loss": 0.4413, - "step": 1579000 - }, - { - "epoch": 0.04, - "learning_rate": 2.802869503255417e-05, - "loss": 0.4422, - "step": 1580000 - }, - { - "epoch": 0.04, - "eval_loss": 0.42269888520240784, - "eval_runtime": 80.0177, - "eval_samples_per_second": 79.982, - "eval_steps_per_second": 0.625, - "step": 1580000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7946929947802964e-05, - "loss": 0.4411, - "step": 1581000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7865323425730444e-05, - "loss": 0.4409, - "step": 1582000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7783875714924202e-05, - "loss": 0.4419, - "step": 1583000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7702587063488073e-05, - "loss": 0.4409, - "step": 1584000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7621619818577606e-05, - "loss": 0.4429, - "step": 1585000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4247879087924957, - "eval_runtime": 79.4063, - "eval_samples_per_second": 80.598, - "eval_steps_per_second": 0.63, - "step": 1585000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7540649708899915e-05, - "loss": 0.4414, - "step": 1586000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7459839399500925e-05, - "loss": 0.4403, - "step": 1587000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7379189136542904e-05, - "loss": 0.4419, - "step": 1588000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7298699165700516e-05, - "loss": 0.4409, - "step": 1589000 - }, - { - "epoch": 0.04, - "learning_rate": 2.7218530230648304e-05, - "loss": 0.4421, - "step": 1590000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4219193756580353, - "eval_runtime": 77.0263, - "eval_samples_per_second": 83.089, - "eval_steps_per_second": 0.649, - "step": 1590000 - }, - { - "epoch": 0.05, - "learning_rate": 2.713836125729949e-05, - "loss": 0.4432, - "step": 1591000 - }, - { - "epoch": 0.05, - "learning_rate": 2.7058353309669814e-05, - "loss": 0.4422, - "step": 1592000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6978506631477342e-05, - "loss": 0.4413, - "step": 1593000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6898901070358037e-05, - "loss": 0.4409, - "step": 1594000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6819456941046587e-05, - "loss": 0.4412, - "step": 1595000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4237200617790222, - "eval_runtime": 79.3809, - "eval_samples_per_second": 80.624, - "eval_steps_per_second": 0.63, - "step": 1595000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6740095204321262e-05, - "loss": 0.4407, - "step": 1596000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6660895706502525e-05, - "loss": 0.4408, - "step": 1597000 - }, - { - "epoch": 0.05, - "learning_rate": 2.658185868884578e-05, - "loss": 0.4415, - "step": 1598000 - }, - { - "epoch": 0.05, - "learning_rate": 2.65030631850491e-05, - "loss": 0.4423, - "step": 1599000 - }, - { - "epoch": 0.05, - "learning_rate": 2.642435168642101e-05, - "loss": 0.4398, - "step": 1600000 - }, - { - "epoch": 0.05, - "eval_loss": 0.42158567905426025, - "eval_runtime": 77.7364, - "eval_samples_per_second": 82.329, - "eval_steps_per_second": 0.643, - "step": 1600000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6345803388508994e-05, - "loss": 0.4418, - "step": 1601000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6267418530584848e-05, - "loss": 0.4409, - "step": 1602000 - }, - { - "epoch": 0.05, - "learning_rate": 2.6189275490764684e-05, - "loss": 0.4408, - "step": 1603000 - }, - { - "epoch": 0.05, - "learning_rate": 2.611121806460332e-05, - "loss": 0.4402, - "step": 1604000 - }, - { - "epoch": 0.05, - "learning_rate": 2.603332479301739e-05, - "loss": 0.4404, - "step": 1605000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4226343035697937, - "eval_runtime": 78.8772, - "eval_samples_per_second": 81.139, - "eval_steps_per_second": 0.634, - "step": 1605000 - }, - { - "epoch": 0.05, - "learning_rate": 2.5955751206822222e-05, - "loss": 0.4412, - "step": 1606000 - }, - { - "epoch": 0.05, - "learning_rate": 2.5878186626222375e-05, - "loss": 0.4411, - "step": 1607000 - }, - { - "epoch": 0.05, - "learning_rate": 2.580078691005209e-05, - "loss": 0.4417, - "step": 1608000 - }, - { - "epoch": 0.05, - "learning_rate": 2.5723629446154437e-05, - "loss": 0.4399, - "step": 1609000 - }, - { - "epoch": 0.06, - "learning_rate": 2.5646636986989314e-05, - "loss": 0.4403, - "step": 1610000 - }, - { - "epoch": 0.06, - "eval_loss": 0.42356759309768677, - "eval_runtime": 78.3383, - "eval_samples_per_second": 81.697, - "eval_steps_per_second": 0.638, - "step": 1610000 - }, - { - "epoch": 0.06, - "learning_rate": 2.556973294535873e-05, - "loss": 0.4407, - "step": 1611000 - }, - { - "epoch": 0.06, - "learning_rate": 2.5492994707761112e-05, - "loss": 0.4413, - "step": 1612000 - }, - { - "epoch": 0.06, - "learning_rate": 2.5416422507954494e-05, - "loss": 0.4402, - "step": 1613000 - }, - { - "epoch": 0.06, - "learning_rate": 2.5340016579191055e-05, - "loss": 0.4407, - "step": 1614000 - }, - { - "epoch": 0.06, - "learning_rate": 2.526385331039548e-05, - "loss": 0.4407, - "step": 1615000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4227603077888489, - "eval_runtime": 78.8019, - "eval_samples_per_second": 81.216, - "eval_steps_per_second": 0.635, - "step": 1615000 - }, - { - "epoch": 0.06, - "learning_rate": 2.518778045459646e-05, - "loss": 0.4418, - "step": 1616000 - }, - { - "epoch": 0.06, - "learning_rate": 2.5111874566323936e-05, - "loss": 0.4402, - "step": 1617000 - }, - { - "epoch": 0.06, - "learning_rate": 2.503621153189737e-05, - "loss": 0.4411, - "step": 1618000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4960640104291568e-05, - "loss": 0.4405, - "step": 1619000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4885236336121185e-05, - "loss": 0.4412, - "step": 1620000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4221682846546173, - "eval_runtime": 79.6082, - "eval_samples_per_second": 80.394, - "eval_steps_per_second": 0.628, - "step": 1620000 - }, - { - "epoch": 0.06, - "learning_rate": 2.481000045707917e-05, - "loss": 0.4392, - "step": 1621000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4735007680056548e-05, - "loss": 0.4411, - "step": 1622000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4660182913259782e-05, - "loss": 0.4397, - "step": 1623000 - }, - { - "epoch": 0.06, - "learning_rate": 2.458545173726531e-05, - "loss": 0.4421, - "step": 1624000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4510963841576386e-05, - "loss": 0.4412, - "step": 1625000 - }, - { - "epoch": 0.06, - "eval_loss": 0.41993048787117004, - "eval_runtime": 77.9039, - "eval_samples_per_second": 82.152, - "eval_steps_per_second": 0.642, - "step": 1625000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4436570328217597e-05, - "loss": 0.4407, - "step": 1626000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4362346070701672e-05, - "loss": 0.4406, - "step": 1627000 - }, - { - "epoch": 0.06, - "learning_rate": 2.4288365265172765e-05, - "loss": 0.4405, - "step": 1628000 - }, - { - "epoch": 0.06, - "learning_rate": 2.421448002730611e-05, - "loss": 0.4401, - "step": 1629000 - }, - { - "epoch": 0.07, - "learning_rate": 2.4140838352157365e-05, - "loss": 0.4405, - "step": 1630000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4204855263233185, - "eval_runtime": 79.4366, - "eval_samples_per_second": 80.567, - "eval_steps_per_second": 0.629, - "step": 1630000 - }, - { - "epoch": 0.07, - "learning_rate": 2.4067293033308242e-05, - "loss": 0.4397, - "step": 1631000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3993918095184646e-05, - "loss": 0.4405, - "step": 1632000 - }, - { - "epoch": 0.07, - "learning_rate": 2.392078688034217e-05, - "loss": 0.4397, - "step": 1633000 - }, - { - "epoch": 0.07, - "learning_rate": 2.384782615102526e-05, - "loss": 0.4397, - "step": 1634000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3774963351754417e-05, - "loss": 0.4392, - "step": 1635000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4220183193683624, - "eval_runtime": 78.5206, - "eval_samples_per_second": 81.507, - "eval_steps_per_second": 0.637, - "step": 1635000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3702344429600395e-05, - "loss": 0.4404, - "step": 1636000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3629824222582374e-05, - "loss": 0.4405, - "step": 1637000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3557475728895742e-05, - "loss": 0.4406, - "step": 1638000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3485299168926543e-05, - "loss": 0.4399, - "step": 1639000 - }, - { - "epoch": 0.07, - "learning_rate": 2.341336668087976e-05, - "loss": 0.439, - "step": 1640000 - }, - { - "epoch": 0.07, - "eval_loss": 0.42630383372306824, - "eval_runtime": 79.5813, - "eval_samples_per_second": 80.421, - "eval_steps_per_second": 0.628, - "step": 1640000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3341534474925735e-05, - "loss": 0.4398, - "step": 1641000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3269874860483666e-05, - "loss": 0.4406, - "step": 1642000 - }, - { - "epoch": 0.07, - "learning_rate": 2.319838805584115e-05, - "loss": 0.441, - "step": 1643000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3127074278759437e-05, - "loss": 0.4402, - "step": 1644000 - }, - { - "epoch": 0.07, - "learning_rate": 2.3056004800396985e-05, - "loss": 0.4394, - "step": 1645000 - }, - { - "epoch": 0.07, - "eval_loss": 0.42043250799179077, - "eval_runtime": 77.5183, - "eval_samples_per_second": 82.561, - "eval_steps_per_second": 0.645, - "step": 1645000 - }, - { - "epoch": 0.07, - "learning_rate": 2.298510843657037e-05, - "loss": 0.4404, - "step": 1646000 - }, - { - "epoch": 0.07, - "learning_rate": 2.2914385402810497e-05, - "loss": 0.4403, - "step": 1647000 - }, - { - "epoch": 0.07, - "learning_rate": 2.2843765381030626e-05, - "loss": 0.4401, - "step": 1648000 - }, - { - "epoch": 0.07, - "learning_rate": 2.2773319467053702e-05, - "loss": 0.4388, - "step": 1649000 - }, - { - "epoch": 0.07, - "learning_rate": 2.2703047875470186e-05, - "loss": 0.4388, - "step": 1650000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4251132309436798, - "eval_runtime": 79.2673, - "eval_samples_per_second": 80.739, - "eval_steps_per_second": 0.631, - "step": 1650000 - }, - { - "epoch": 0.08, - "learning_rate": 2.263295082033955e-05, - "loss": 0.4402, - "step": 1651000 - }, - { - "epoch": 0.08, - "learning_rate": 2.256302851518958e-05, - "loss": 0.4408, - "step": 1652000 - }, - { - "epoch": 0.08, - "learning_rate": 2.2493281173015714e-05, - "loss": 0.4392, - "step": 1653000 - }, - { - "epoch": 0.08, - "learning_rate": 2.242370900628049e-05, - "loss": 0.4401, - "step": 1654000 - }, - { - "epoch": 0.08, - "learning_rate": 2.235438153601577e-05, - "loss": 0.4399, - "step": 1655000 - }, - { - "epoch": 0.08, - "eval_loss": 0.42048439383506775, - "eval_runtime": 77.8384, - "eval_samples_per_second": 82.222, - "eval_steps_per_second": 0.642, - "step": 1655000 - }, - { - "epoch": 0.08, - "learning_rate": 2.2285160179706007e-05, - "loss": 0.44, - "step": 1656000 - }, - { - "epoch": 0.08, - "learning_rate": 2.2216114632807524e-05, - "loss": 0.4404, - "step": 1657000 - }, - { - "epoch": 0.08, - "learning_rate": 2.214731388718044e-05, - "loss": 0.4406, - "step": 1658000 - }, - { - "epoch": 0.08, - "learning_rate": 2.2078620413208303e-05, - "loss": 0.4402, - "step": 1659000 - }, - { - "epoch": 0.08, - "learning_rate": 2.201010337780338e-05, - "loss": 0.4405, - "step": 1660000 - }, - { - "epoch": 0.08, - "eval_loss": 0.4228270649909973, - "eval_runtime": 78.2009, - "eval_samples_per_second": 81.841, - "eval_steps_per_second": 0.639, - "step": 1660000 - }, - { - "epoch": 0.08, - "learning_rate": 2.1941831241763897e-05, - "loss": 0.4402, - "step": 1661000 - }, - { - "epoch": 0.08, - "learning_rate": 2.1873667532140358e-05, - "loss": 0.439, - "step": 1662000 - }, - { - "epoch": 0.08, - "learning_rate": 2.1805748783540877e-05, - "loss": 0.44, - "step": 1663000 - }, - { - "epoch": 0.08, - "learning_rate": 2.1737939229421666e-05, - "loss": 0.4407, - "step": 1664000 - }, - { - "epoch": 0.08, - "learning_rate": 2.167037469500335e-05, - "loss": 0.4404, - "step": 1665000 - }, - { - "epoch": 0.08, - "eval_loss": 0.41908711194992065, - "eval_runtime": 78.8241, - "eval_samples_per_second": 81.193, - "eval_steps_per_second": 0.634, - "step": 1665000 - }, - { - "epoch": 0.08, - "learning_rate": 2.160292012180046e-05, - "loss": 0.4405, - "step": 1666000 - }, - { - "epoch": 0.08, - "learning_rate": 2.1535643436230335e-05, - "loss": 0.4401, - "step": 1667000 - }, - { - "epoch": 0.08, - "learning_rate": 2.146854484322948e-05, - "loss": 0.4403, - "step": 1668000 - }, - { - "epoch": 0.08, - "learning_rate": 2.140162454719184e-05, - "loss": 0.4418, - "step": 1669000 - }, - { - "epoch": 0.09, - "learning_rate": 2.1334882751968192e-05, - "loss": 0.4397, - "step": 1670000 - }, - { - "epoch": 0.09, - "eval_loss": 0.42122882604599, - "eval_runtime": 76.417, - "eval_samples_per_second": 83.751, - "eval_steps_per_second": 0.654, - "step": 1670000 - }, - { - "epoch": 0.09, - "learning_rate": 2.126838613462656e-05, - "loss": 0.4387, - "step": 1671000 - }, - { - "epoch": 0.09, - "learning_rate": 2.1202001771399895e-05, - "loss": 0.4387, - "step": 1672000 - }, - { - "epoch": 0.09, - "learning_rate": 2.1135796517072863e-05, - "loss": 0.4394, - "step": 1673000 - }, - { - "epoch": 0.09, - "learning_rate": 2.106977057331812e-05, - "loss": 0.4398, - "step": 1674000 - }, - { - "epoch": 0.09, - "learning_rate": 2.1003989897961326e-05, - "loss": 0.44, - "step": 1675000 - }, - { - "epoch": 0.09, - "eval_loss": 0.41976797580718994, - "eval_runtime": 77.5035, - "eval_samples_per_second": 82.577, - "eval_steps_per_second": 0.645, - "step": 1675000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0938388575438328e-05, - "loss": 0.4403, - "step": 1676000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0872901407947595e-05, - "loss": 0.4413, - "step": 1677000 - }, - { - "epoch": 0.09, - "learning_rate": 2.080759435185324e-05, - "loss": 0.4397, - "step": 1678000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0742467606091935e-05, - "loss": 0.4395, - "step": 1679000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0677586225058045e-05, - "loss": 0.4407, - "step": 1680000 - }, - { - "epoch": 0.09, - "eval_loss": 0.42079994082450867, - "eval_runtime": 79.6958, - "eval_samples_per_second": 80.305, - "eval_steps_per_second": 0.627, - "step": 1680000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0612885189152567e-05, - "loss": 0.4399, - "step": 1681000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0548300200510223e-05, - "loss": 0.4382, - "step": 1682000 - }, - { - "epoch": 0.09, - "learning_rate": 2.048389631205587e-05, - "loss": 0.4393, - "step": 1683000 - }, - { - "epoch": 0.09, - "learning_rate": 2.041967371997491e-05, - "loss": 0.4392, - "step": 1684000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0355760520841843e-05, - "loss": 0.4403, - "step": 1685000 - }, - { - "epoch": 0.09, - "eval_loss": 0.41910338401794434, - "eval_runtime": 79.2458, - "eval_samples_per_second": 80.761, - "eval_steps_per_second": 0.631, - "step": 1685000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0291900744285765e-05, - "loss": 0.4397, - "step": 1686000 - }, - { - "epoch": 0.09, - "learning_rate": 2.022822284895487e-05, - "loss": 0.4401, - "step": 1687000 - }, - { - "epoch": 0.09, - "learning_rate": 2.016472702882308e-05, - "loss": 0.4395, - "step": 1688000 - }, - { - "epoch": 0.09, - "learning_rate": 2.0101476699753774e-05, - "loss": 0.4394, - "step": 1689000 - }, - { - "epoch": 0.1, - "learning_rate": 2.003840846723428e-05, - "loss": 0.4408, - "step": 1690000 - }, - { - "epoch": 0.1, - "eval_loss": 0.41959914565086365, - "eval_runtime": 79.6028, - "eval_samples_per_second": 80.399, - "eval_steps_per_second": 0.628, - "step": 1690000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9975459665494844e-05, - "loss": 0.4406, - "step": 1691000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9912693708915007e-05, - "loss": 0.4403, - "step": 1692000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9850110788690757e-05, - "loss": 0.4391, - "step": 1693000 - }, - { - "epoch": 0.1, - "learning_rate": 1.978771109546051e-05, - "loss": 0.4388, - "step": 1694000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9725681193643978e-05, - "loss": 0.439, - "step": 1695000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4219348132610321, - "eval_runtime": 78.5115, - "eval_samples_per_second": 81.517, - "eval_steps_per_second": 0.637, - "step": 1695000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9663647972981225e-05, - "loss": 0.4389, - "step": 1696000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9601798547310563e-05, - "loss": 0.4396, - "step": 1697000 - }, - { - "epoch": 0.1, - "learning_rate": 1.954019467851605e-05, - "loss": 0.4405, - "step": 1698000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9478713223216454e-05, - "loss": 0.4403, - "step": 1699000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9417416126252245e-05, - "loss": 0.4394, - "step": 1700000 - }, - { - "epoch": 0.1, - "eval_loss": 0.42123520374298096, - "eval_runtime": 78.2147, - "eval_samples_per_second": 81.826, - "eval_steps_per_second": 0.639, - "step": 1700000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9356303574345033e-05, - "loss": 0.44, - "step": 1701000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9295375753654256e-05, - "loss": 0.4406, - "step": 1702000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9234693500252896e-05, - "loss": 0.4392, - "step": 1703000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9174195978495195e-05, - "loss": 0.4389, - "step": 1704000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9113823092023844e-05, - "loss": 0.4395, - "step": 1705000 - }, - { - "epoch": 0.1, - "eval_loss": 0.42103949189186096, - "eval_runtime": 79.7626, - "eval_samples_per_second": 80.238, - "eval_steps_per_second": 0.627, - "step": 1705000 - }, - { - "epoch": 0.1, - "learning_rate": 1.9053635675406527e-05, - "loss": 0.4387, - "step": 1706000 - }, - { - "epoch": 0.1, - "learning_rate": 1.899363391198454e-05, - "loss": 0.4405, - "step": 1707000 - }, - { - "epoch": 0.1, - "learning_rate": 1.893381798453365e-05, - "loss": 0.4395, - "step": 1708000 - }, - { - "epoch": 0.1, - "learning_rate": 1.887418807526355e-05, - "loss": 0.439, - "step": 1709000 - }, - { - "epoch": 0.1, - "learning_rate": 1.8814803716459616e-05, - "loss": 0.439, - "step": 1710000 - }, - { - "epoch": 0.1, - "eval_loss": 0.42225512862205505, - "eval_runtime": 77.1239, - "eval_samples_per_second": 82.983, - "eval_steps_per_second": 0.648, - "step": 1710000 - }, - { - "epoch": 0.11, - "learning_rate": 1.875560536579964e-05, - "loss": 0.439, - "step": 1711000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8696534225358292e-05, - "loss": 0.4392, - "step": 1712000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8637708616967782e-05, - "loss": 0.4397, - "step": 1713000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8579010950865316e-05, - "loss": 0.4385, - "step": 1714000 - }, - { - "epoch": 0.11, - "learning_rate": 1.852050038374741e-05, - "loss": 0.4391, - "step": 1715000 - }, - { - "epoch": 0.11, - "eval_loss": 0.420716792345047, - "eval_runtime": 78.5003, - "eval_samples_per_second": 81.528, - "eval_steps_per_second": 0.637, - "step": 1715000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8462235323533128e-05, - "loss": 0.4395, - "step": 1716000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8404099300970416e-05, - "loss": 0.4378, - "step": 1717000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8346208764813356e-05, - "loss": 0.4408, - "step": 1718000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8288447994466744e-05, - "loss": 0.4388, - "step": 1719000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8230932687039617e-05, - "loss": 0.439, - "step": 1720000 - }, - { - "epoch": 0.11, - "eval_loss": 0.41757285594940186, - "eval_runtime": 79.8473, - "eval_samples_per_second": 80.153, - "eval_steps_per_second": 0.626, - "step": 1720000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8173547872002242e-05, - "loss": 0.4384, - "step": 1721000 - }, - { - "epoch": 0.11, - "learning_rate": 1.811640849341029e-05, - "loss": 0.4401, - "step": 1722000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8059400332198968e-05, - "loss": 0.438, - "step": 1723000 - }, - { - "epoch": 0.11, - "learning_rate": 1.8002580852796262e-05, - "loss": 0.4401, - "step": 1724000 - }, - { - "epoch": 0.11, - "learning_rate": 1.7945950228284155e-05, - "loss": 0.4401, - "step": 1725000 - }, - { - "epoch": 0.11, - "eval_loss": 0.41903457045555115, - "eval_runtime": 77.4134, - "eval_samples_per_second": 82.673, - "eval_steps_per_second": 0.646, - "step": 1725000 - }, - { - "epoch": 0.11, - "learning_rate": 1.788950863116934e-05, - "loss": 0.4383, - "step": 1726000 - }, - { - "epoch": 0.11, - "learning_rate": 1.783331239121836e-05, - "loss": 0.4383, - "step": 1727000 - }, - { - "epoch": 0.11, - "learning_rate": 1.7777305143227536e-05, - "loss": 0.4401, - "step": 1728000 - }, - { - "epoch": 0.11, - "learning_rate": 1.772143127833117e-05, - "loss": 0.4391, - "step": 1729000 - }, - { - "epoch": 0.12, - "learning_rate": 1.766574712475575e-05, - "loss": 0.439, - "step": 1730000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4182310104370117, - "eval_runtime": 75.5121, - "eval_samples_per_second": 84.755, - "eval_steps_per_second": 0.662, - "step": 1730000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7610252852124898e-05, - "loss": 0.4387, - "step": 1731000 - }, - { - "epoch": 0.12, - "learning_rate": 1.755494862948377e-05, - "loss": 0.4382, - "step": 1732000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7499889644232756e-05, - "loss": 0.4385, - "step": 1733000 - }, - { - "epoch": 0.12, - "learning_rate": 1.744496583592041e-05, - "loss": 0.4408, - "step": 1734000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7390287219108524e-05, - "loss": 0.4401, - "step": 1735000 - }, - { - "epoch": 0.12, - "eval_loss": 0.4186602830886841, - "eval_runtime": 80.3977, - "eval_samples_per_second": 79.604, - "eval_steps_per_second": 0.622, - "step": 1735000 - }, - { - "epoch": 0.12, - "learning_rate": 1.733574449368513e-05, - "loss": 0.4391, - "step": 1736000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7281392654451555e-05, - "loss": 0.4401, - "step": 1737000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7227339997768454e-05, - "loss": 0.4405, - "step": 1738000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7173370044430122e-05, - "loss": 0.439, - "step": 1739000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7119591471902336e-05, - "loss": 0.4397, - "step": 1740000 - }, - { - "epoch": 0.12, - "eval_loss": 0.41898131370544434, - "eval_runtime": 77.8556, - "eval_samples_per_second": 82.203, - "eval_steps_per_second": 0.642, - "step": 1740000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7066004444003927e-05, - "loss": 0.4388, - "step": 1741000 - }, - { - "epoch": 0.12, - "learning_rate": 1.7012609123970294e-05, - "loss": 0.4388, - "step": 1742000 - }, - { - "epoch": 0.12, - "learning_rate": 1.6959405674452816e-05, - "loss": 0.4393, - "step": 1743000 - }, - { - "epoch": 0.12, - "learning_rate": 1.6906447172961322e-05, - "loss": 0.4386, - "step": 1744000 - }, - { - "epoch": 0.12, - "learning_rate": 1.6853627757817506e-05, - "loss": 0.4379, - "step": 1745000 - }, - { - "epoch": 0.12, - "eval_loss": 0.41874217987060547, - "eval_runtime": 79.3439, - "eval_samples_per_second": 80.662, - "eval_steps_per_second": 0.63, - "step": 1745000 - }, - { - "epoch": 0.12, - "learning_rate": 1.6801053228400387e-05, - "loss": 0.4407, - "step": 1746000 - }, - { - "epoch": 0.12, - "learning_rate": 1.6748618490574697e-05, - "loss": 0.4398, - "step": 1747000 - }, - { - "epoch": 0.12, - "learning_rate": 1.669637642742642e-05, - "loss": 0.4385, - "step": 1748000 - }, - { - "epoch": 0.12, - "learning_rate": 1.6644327198093962e-05, - "loss": 0.4376, - "step": 1749000 - }, - { - "epoch": 0.12, - "learning_rate": 1.6592522720912954e-05, - "loss": 0.4381, - "step": 1750000 - }, - { - "epoch": 0.12, - "eval_loss": 0.42223626375198364, - "eval_runtime": 79.3599, - "eval_samples_per_second": 80.645, - "eval_steps_per_second": 0.63, - "step": 1750000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6540859441048118e-05, - "loss": 0.4394, - "step": 1751000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6489389468730806e-05, - "loss": 0.439, - "step": 1752000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6438215320582125e-05, - "loss": 0.4382, - "step": 1753000 - }, - { - "epoch": 0.13, - "learning_rate": 1.638713204573334e-05, - "loss": 0.4381, - "step": 1754000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6336293339368757e-05, - "loss": 0.4392, - "step": 1755000 - }, - { - "epoch": 0.13, - "eval_loss": 0.420003205537796, - "eval_runtime": 78.6855, - "eval_samples_per_second": 81.336, - "eval_steps_per_second": 0.635, - "step": 1755000 - }, - { - "epoch": 0.13, - "learning_rate": 1.628559757718579e-05, - "loss": 0.439, - "step": 1756000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6235095900122255e-05, - "loss": 0.4383, - "step": 1757000 - }, - { - "epoch": 0.13, - "learning_rate": 1.61848386723797e-05, - "loss": 0.4386, - "step": 1758000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6134725432005385e-05, - "loss": 0.4395, - "step": 1759000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6084856557803128e-05, - "loss": 0.438, - "step": 1760000 - }, - { - "epoch": 0.13, - "eval_loss": 0.4174318015575409, - "eval_runtime": 76.8661, - "eval_samples_per_second": 83.262, - "eval_steps_per_second": 0.65, - "step": 1760000 - }, - { - "epoch": 0.13, - "learning_rate": 1.6035132364121584e-05, - "loss": 0.4388, - "step": 1761000 - }, - { - "epoch": 0.13, - "learning_rate": 1.5985603018519935e-05, - "loss": 0.4378, - "step": 1762000 - }, - { - "epoch": 0.13, - "learning_rate": 1.5936317908767756e-05, - "loss": 0.4368, - "step": 1763000 - }, - { - "epoch": 0.13, - "learning_rate": 1.5887178516132736e-05, - "loss": 0.4412, - "step": 1764000 - }, - { - "epoch": 0.13, - "learning_rate": 1.5838283268763148e-05, - "loss": 0.4383, - "step": 1765000 - }, - { - "epoch": 0.13, - "eval_loss": 0.41864004731178284, - "eval_runtime": 86.8232, - "eval_samples_per_second": 73.713, - "eval_steps_per_second": 0.576, - "step": 1765000 - }, - { - "epoch": 0.13, - "learning_rate": 1.5789583078410045e-05, - "loss": 0.4389, - "step": 1766000 - }, - { - "epoch": 0.13, - "learning_rate": 1.574102963743466e-05, - "loss": 0.439, - "step": 1767000 - }, - { - "epoch": 0.13, - "learning_rate": 1.5692671940427092e-05, - "loss": 0.4385, - "step": 1768000 - }, - { - "epoch": 0.13, - "learning_rate": 1.5644510134693248e-05, - "loss": 0.4384, - "step": 1769000 - }, - { - "epoch": 0.14, - "learning_rate": 1.559654436694238e-05, - "loss": 0.4392, - "step": 1770000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4172964096069336, - "eval_runtime": 79.9939, - "eval_samples_per_second": 80.006, - "eval_steps_per_second": 0.625, - "step": 1770000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5548822454827717e-05, - "loss": 0.4393, - "step": 1771000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5501249004379188e-05, - "loss": 0.4392, - "step": 1772000 - }, - { - "epoch": 0.14, - "learning_rate": 1.54539193071009e-05, - "loss": 0.4386, - "step": 1773000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5406738753042658e-05, - "loss": 0.4393, - "step": 1774000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5359754961260252e-05, - "loss": 0.4387, - "step": 1775000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4169865548610687, - "eval_runtime": 77.7982, - "eval_samples_per_second": 82.264, - "eval_steps_per_second": 0.643, - "step": 1775000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5312968074874446e-05, - "loss": 0.4381, - "step": 1776000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5266424727771944e-05, - "loss": 0.4396, - "step": 1777000 - }, - { - "epoch": 0.14, - "learning_rate": 1.522003188188146e-05, - "loss": 0.4395, - "step": 1778000 - }, - { - "epoch": 0.14, - "learning_rate": 1.517383636700831e-05, - "loss": 0.4382, - "step": 1779000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5127838323872036e-05, - "loss": 0.4364, - "step": 1780000 - }, - { - "epoch": 0.14, - "eval_loss": 0.41849958896636963, - "eval_runtime": 76.5418, - "eval_samples_per_second": 83.614, - "eval_steps_per_second": 0.653, - "step": 1780000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5082037892590664e-05, - "loss": 0.439, - "step": 1781000 - }, - { - "epoch": 0.14, - "learning_rate": 1.5036480716537045e-05, - "loss": 0.4393, - "step": 1782000 - }, - { - "epoch": 0.14, - "learning_rate": 1.4991121035047137e-05, - "loss": 0.4383, - "step": 1783000 - }, - { - "epoch": 0.14, - "learning_rate": 1.4945913877821996e-05, - "loss": 0.4383, - "step": 1784000 - }, - { - "epoch": 0.14, - "learning_rate": 1.4900904886625165e-05, - "loss": 0.4377, - "step": 1785000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4204372465610504, - "eval_runtime": 76.9476, - "eval_samples_per_second": 83.173, - "eval_steps_per_second": 0.65, - "step": 1785000 - }, - { - "epoch": 0.14, - "learning_rate": 1.4856138910151988e-05, - "loss": 0.4388, - "step": 1786000 - }, - { - "epoch": 0.14, - "learning_rate": 1.4811526463215664e-05, - "loss": 0.4371, - "step": 1787000 - }, - { - "epoch": 0.14, - "learning_rate": 1.476715690631307e-05, - "loss": 0.438, - "step": 1788000 - }, - { - "epoch": 0.14, - "learning_rate": 1.4722941546682392e-05, - "loss": 0.4381, - "step": 1789000 - }, - { - "epoch": 0.14, - "learning_rate": 1.4678968949438921e-05, - "loss": 0.4363, - "step": 1790000 - }, - { - "epoch": 0.14, - "eval_loss": 0.4183988869190216, - "eval_runtime": 76.9826, - "eval_samples_per_second": 83.136, - "eval_steps_per_second": 0.649, - "step": 1790000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4635151215325466e-05, - "loss": 0.4366, - "step": 1791000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4591576112997706e-05, - "loss": 0.4391, - "step": 1792000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4548156537772989e-05, - "loss": 0.4391, - "step": 1793000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4504936340214418e-05, - "loss": 0.4385, - "step": 1794000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4461958572967858e-05, - "loss": 0.4378, - "step": 1795000 - }, - { - "epoch": 0.15, - "eval_loss": 0.4223540425300598, - "eval_runtime": 77.2417, - "eval_samples_per_second": 82.857, - "eval_steps_per_second": 0.647, - "step": 1795000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4419137325396865e-05, - "loss": 0.4389, - "step": 1796000 - }, - { - "epoch": 0.15, - "learning_rate": 1.437651584850691e-05, - "loss": 0.4386, - "step": 1797000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4334094272130413e-05, - "loss": 0.4367, - "step": 1798000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4291872725490842e-05, - "loss": 0.4384, - "step": 1799000 - }, - { - "epoch": 0.15, - "learning_rate": 1.4249893258568889e-05, - "loss": 0.4384, - "step": 1800000 - }, - { - "epoch": 0.15, - "eval_loss": 0.4171189069747925, - "eval_runtime": 76.9772, - "eval_samples_per_second": 83.142, - "eval_steps_per_second": 0.65, - "step": 1800000 - }, - { - "epoch": 0.0, - "learning_rate": 1.4208113677502687e-05, - "loss": 0.4365, - "step": 1801000 - }, - { - "epoch": 0.0, - "learning_rate": 1.4166492588365344e-05, - "loss": 0.4384, - "step": 1802000 - }, - { - "epoch": 0.0, - "learning_rate": 1.4125072039508715e-05, - "loss": 0.4379, - "step": 1803000 - }, - { - "epoch": 0.0, - "learning_rate": 1.4083852157106983e-05, - "loss": 0.4377, - "step": 1804000 - }, - { - "epoch": 0.0, - "learning_rate": 1.4042833066723076e-05, - "loss": 0.4385, - "step": 1805000 - }, - { - "epoch": 0.0, - "eval_loss": 0.42015281319618225, - "eval_runtime": 78.5938, - "eval_samples_per_second": 81.431, - "eval_steps_per_second": 0.636, - "step": 1805000 - }, - { - "epoch": 0.0, - "learning_rate": 1.4002055611082185e-05, - "loss": 0.4387, - "step": 1806000 - }, - { - "epoch": 0.0, - "learning_rate": 1.396143827787245e-05, - "loss": 0.4379, - "step": 1807000 - }, - { - "epoch": 0.0, - "learning_rate": 1.3921022109574423e-05, - "loss": 0.4373, - "step": 1808000 - }, - { - "epoch": 0.0, - "learning_rate": 1.3880847343598854e-05, - "loss": 0.4382, - "step": 1809000 - }, - { - "epoch": 0.01, - "learning_rate": 1.384087358540966e-05, - "loss": 0.438, - "step": 1810000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4181618392467499, - "eval_runtime": 77.0873, - "eval_samples_per_second": 83.023, - "eval_steps_per_second": 0.649, - "step": 1810000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3801061244895656e-05, - "loss": 0.4382, - "step": 1811000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3761450557829634e-05, - "loss": 0.4392, - "step": 1812000 - }, - { - "epoch": 0.01, - "learning_rate": 1.372204164487259e-05, - "loss": 0.4387, - "step": 1813000 - }, - { - "epoch": 0.01, - "learning_rate": 1.368283462607094e-05, - "loss": 0.4388, - "step": 1814000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3643868524915881e-05, - "loss": 0.4392, - "step": 1815000 - }, - { - "epoch": 0.01, - "eval_loss": 0.42043235898017883, - "eval_runtime": 79.2626, - "eval_samples_per_second": 80.744, - "eval_steps_per_second": 0.631, - "step": 1815000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3605065449912204e-05, - "loss": 0.4395, - "step": 1816000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3566464625393676e-05, - "loss": 0.4391, - "step": 1817000 - }, - { - "epoch": 0.01, - "learning_rate": 1.352810446627972e-05, - "loss": 0.4379, - "step": 1818000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3489908292326226e-05, - "loss": 0.4377, - "step": 1819000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3451952611981318e-05, - "loss": 0.4389, - "step": 1820000 - }, - { - "epoch": 0.01, - "eval_loss": 0.41748473048210144, - "eval_runtime": 77.7669, - "eval_samples_per_second": 82.297, - "eval_steps_per_second": 0.643, - "step": 1820000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3414161553535873e-05, - "loss": 0.4386, - "step": 1821000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3376573327101957e-05, - "loss": 0.4383, - "step": 1822000 - }, - { - "epoch": 0.01, - "learning_rate": 1.333918804717982e-05, - "loss": 0.4371, - "step": 1823000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3302079989360922e-05, - "loss": 0.4369, - "step": 1824000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3265100537030001e-05, - "loss": 0.4378, - "step": 1825000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4193136692047119, - "eval_runtime": 79.079, - "eval_samples_per_second": 80.932, - "eval_steps_per_second": 0.632, - "step": 1825000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3228324370776315e-05, - "loss": 0.4385, - "step": 1826000 - }, - { - "epoch": 0.01, - "learning_rate": 1.319175160262646e-05, - "loss": 0.4363, - "step": 1827000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3155418611556128e-05, - "loss": 0.438, - "step": 1828000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3119252769539538e-05, - "loss": 0.4378, - "step": 1829000 - }, - { - "epoch": 0.01, - "learning_rate": 1.3083326518189592e-05, - "loss": 0.4377, - "step": 1830000 - }, - { - "epoch": 0.01, - "eval_loss": 0.4179893732070923, - "eval_runtime": 78.6845, - "eval_samples_per_second": 81.337, - "eval_steps_per_second": 0.635, - "step": 1830000 - }, - { - "epoch": 0.02, - "learning_rate": 1.3047568042535075e-05, - "loss": 0.4388, - "step": 1831000 - }, - { - "epoch": 0.02, - "learning_rate": 1.3012013515599501e-05, - "loss": 0.439, - "step": 1832000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2976698294195656e-05, - "loss": 0.4392, - "step": 1833000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2941586829267356e-05, - "loss": 0.4378, - "step": 1834000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2906644387183456e-05, - "loss": 0.4372, - "step": 1835000 - }, - { - "epoch": 0.02, - "eval_loss": 0.4213528037071228, - "eval_runtime": 77.6423, - "eval_samples_per_second": 82.429, - "eval_steps_per_second": 0.644, - "step": 1835000 - }, - { - "epoch": 0.02, - "learning_rate": 1.287194095903841e-05, - "loss": 0.4367, - "step": 1836000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2837407174229876e-05, - "loss": 0.437, - "step": 1837000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2803077978326747e-05, - "loss": 0.4377, - "step": 1838000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2768953475901701e-05, - "loss": 0.4383, - "step": 1839000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2735101405857255e-05, - "loss": 0.4379, - "step": 1840000 - }, - { - "epoch": 0.02, - "eval_loss": 0.41641688346862793, - "eval_runtime": 79.1386, - "eval_samples_per_second": 80.871, - "eval_steps_per_second": 0.632, - "step": 1840000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2701386191707756e-05, - "loss": 0.4379, - "step": 1841000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2667875980807157e-05, - "loss": 0.4384, - "step": 1842000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2634570875233356e-05, - "loss": 0.4379, - "step": 1843000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2601470976439498e-05, - "loss": 0.4368, - "step": 1844000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2568576385253613e-05, - "loss": 0.4379, - "step": 1845000 - }, - { - "epoch": 0.02, - "eval_loss": 0.41581401228904724, - "eval_runtime": 81.6085, - "eval_samples_per_second": 78.423, - "eval_steps_per_second": 0.613, - "step": 1845000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2535919788427315e-05, - "loss": 0.4365, - "step": 1846000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2503435906882624e-05, - "loss": 0.4374, - "step": 1847000 - }, - { - "epoch": 0.02, - "learning_rate": 1.247115763157773e-05, - "loss": 0.4381, - "step": 1848000 - }, - { - "epoch": 0.02, - "learning_rate": 1.2439117030626584e-05, - "loss": 0.4368, - "step": 1849000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2407250056299487e-05, - "loss": 0.4383, - "step": 1850000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4171139597892761, - "eval_runtime": 78.6968, - "eval_samples_per_second": 81.325, - "eval_steps_per_second": 0.635, - "step": 1850000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2375588981210953e-05, - "loss": 0.4373, - "step": 1851000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2344133901806119e-05, - "loss": 0.4373, - "step": 1852000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2312916059916086e-05, - "loss": 0.4373, - "step": 1853000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2281873052469721e-05, - "loss": 0.4382, - "step": 1854000 - }, - { - "epoch": 0.03, - "learning_rate": 1.225106705983955e-05, - "loss": 0.4364, - "step": 1855000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4179363548755646, - "eval_runtime": 79.5482, - "eval_samples_per_second": 80.454, - "eval_steps_per_second": 0.629, - "step": 1855000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2220436502222735e-05, - "loss": 0.4369, - "step": 1856000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2190012412910979e-05, - "loss": 0.437, - "step": 1857000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2159794884581337e-05, - "loss": 0.4387, - "step": 1858000 - }, - { - "epoch": 0.03, - "learning_rate": 1.212981391690326e-05, - "loss": 0.437, - "step": 1859000 - }, - { - "epoch": 0.03, - "learning_rate": 1.21000392803005e-05, - "loss": 0.4385, - "step": 1860000 - }, - { - "epoch": 0.03, - "eval_loss": 0.41541191935539246, - "eval_runtime": 80.5191, - "eval_samples_per_second": 79.484, - "eval_steps_per_second": 0.621, - "step": 1860000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2070471065290428e-05, - "loss": 0.4373, - "step": 1861000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2041080074160751e-05, - "loss": 0.4387, - "step": 1862000 - }, - { - "epoch": 0.03, - "learning_rate": 1.2011896097686919e-05, - "loss": 0.4385, - "step": 1863000 - }, - { - "epoch": 0.03, - "learning_rate": 1.1982948098163646e-05, - "loss": 0.4385, - "step": 1864000 - }, - { - "epoch": 0.03, - "learning_rate": 1.1954178209833276e-05, - "loss": 0.4375, - "step": 1865000 - }, - { - "epoch": 0.03, - "eval_loss": 0.4207480549812317, - "eval_runtime": 78.9778, - "eval_samples_per_second": 81.035, - "eval_steps_per_second": 0.633, - "step": 1865000 - }, - { - "epoch": 0.03, - "learning_rate": 1.1925615600876905e-05, - "loss": 0.4376, - "step": 1866000 - }, - { - "epoch": 0.03, - "learning_rate": 1.1897260358301189e-05, - "loss": 0.4372, - "step": 1867000 - }, - { - "epoch": 0.03, - "learning_rate": 1.1869112568481112e-05, - "loss": 0.4371, - "step": 1868000 - }, - { - "epoch": 0.03, - "learning_rate": 1.1841172317159714e-05, - "loss": 0.437, - "step": 1869000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1813494947438669e-05, - "loss": 0.4375, - "step": 1870000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4189639985561371, - "eval_runtime": 79.747, - "eval_samples_per_second": 80.254, - "eval_steps_per_second": 0.627, - "step": 1870000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1785969612314684e-05, - "loss": 0.4374, - "step": 1871000 - }, - { - "epoch": 0.04, - "learning_rate": 1.175865206895725e-05, - "loss": 0.4369, - "step": 1872000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1731542400580344e-05, - "loss": 0.4389, - "step": 1873000 - }, - { - "epoch": 0.04, - "learning_rate": 1.17046674875734e-05, - "loss": 0.4373, - "step": 1874000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1677973608186099e-05, - "loss": 0.437, - "step": 1875000 - }, - { - "epoch": 0.04, - "eval_loss": 0.41854363679885864, - "eval_runtime": 77.0323, - "eval_samples_per_second": 83.082, - "eval_steps_per_second": 0.649, - "step": 1875000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1651514231315332e-05, - "loss": 0.4382, - "step": 1876000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1625262639591158e-05, - "loss": 0.4386, - "step": 1877000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1599218912820604e-05, - "loss": 0.4363, - "step": 1878000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1573357372741149e-05, - "loss": 0.4384, - "step": 1879000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1547729821041103e-05, - "loss": 0.437, - "step": 1880000 - }, - { - "epoch": 0.04, - "eval_loss": 0.4172144830226898, - "eval_runtime": 79.3075, - "eval_samples_per_second": 80.699, - "eval_steps_per_second": 0.63, - "step": 1880000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1522285029389667e-05, - "loss": 0.4374, - "step": 1881000 - }, - { - "epoch": 0.04, - "learning_rate": 1.149704883273374e-05, - "loss": 0.4381, - "step": 1882000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1472021307947182e-05, - "loss": 0.4369, - "step": 1883000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1447227245749926e-05, - "loss": 0.4371, - "step": 1884000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1422617083919464e-05, - "loss": 0.4361, - "step": 1885000 - }, - { - "epoch": 0.04, - "eval_loss": 0.41780930757522583, - "eval_runtime": 80.7871, - "eval_samples_per_second": 79.221, - "eval_steps_per_second": 0.619, - "step": 1885000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1398215820690375e-05, - "loss": 0.4367, - "step": 1886000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1374023530393122e-05, - "loss": 0.4376, - "step": 1887000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1350040286721638e-05, - "loss": 0.4385, - "step": 1888000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1326289832377572e-05, - "loss": 0.437, - "step": 1889000 - }, - { - "epoch": 0.04, - "learning_rate": 1.1302724691263888e-05, - "loss": 0.437, - "step": 1890000 - }, - { - "epoch": 0.04, - "eval_loss": 0.41528433561325073, - "eval_runtime": 78.5162, - "eval_samples_per_second": 81.512, - "eval_steps_per_second": 0.637, - "step": 1890000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1279368813964581e-05, - "loss": 0.4374, - "step": 1891000 - }, - { - "epoch": 0.05, - "learning_rate": 1.125624531358172e-05, - "loss": 0.4375, - "step": 1892000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1233307967271214e-05, - "loss": 0.4375, - "step": 1893000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1210625342870107e-05, - "loss": 0.4372, - "step": 1894000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1188106597174481e-05, - "loss": 0.4367, - "step": 1895000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4200913608074188, - "eval_runtime": 79.1042, - "eval_samples_per_second": 80.906, - "eval_steps_per_second": 0.632, - "step": 1895000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1165797464439845e-05, - "loss": 0.4384, - "step": 1896000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1143720007317412e-05, - "loss": 0.4366, - "step": 1897000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1121830093956824e-05, - "loss": 0.4366, - "step": 1898000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1100149995446934e-05, - "loss": 0.4371, - "step": 1899000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1078679777829087e-05, - "loss": 0.4384, - "step": 1900000 - }, - { - "epoch": 0.05, - "eval_loss": 0.41656479239463806, - "eval_runtime": 80.5362, - "eval_samples_per_second": 79.467, - "eval_steps_per_second": 0.621, - "step": 1900000 - }, - { - "epoch": 0.05, - "learning_rate": 1.105744066188684e-05, - "loss": 0.4371, - "step": 1901000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1036390191576373e-05, - "loss": 0.4379, - "step": 1902000 - }, - { - "epoch": 0.05, - "learning_rate": 1.1015549796381372e-05, - "loss": 0.4373, - "step": 1903000 - }, - { - "epoch": 0.05, - "learning_rate": 1.0994960590538279e-05, - "loss": 0.4375, - "step": 1904000 - }, - { - "epoch": 0.05, - "learning_rate": 1.0974540114919287e-05, - "loss": 0.4363, - "step": 1905000 - }, - { - "epoch": 0.05, - "eval_loss": 0.4150693416595459, - "eval_runtime": 80.8571, - "eval_samples_per_second": 79.152, - "eval_steps_per_second": 0.618, - "step": 1905000 - }, - { - "epoch": 0.05, - "learning_rate": 1.0954329902821809e-05, - "loss": 0.4375, - "step": 1906000 - }, - { - "epoch": 0.05, - "learning_rate": 1.0934330015809674e-05, - "loss": 0.437, - "step": 1907000 - }, - { - "epoch": 0.05, - "learning_rate": 1.0914560199199067e-05, - "loss": 0.4379, - "step": 1908000 - }, - { - "epoch": 0.05, - "learning_rate": 1.0894980934009906e-05, - "loss": 0.4372, - "step": 1909000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0875612174693328e-05, - "loss": 0.437, - "step": 1910000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4164562225341797, - "eval_runtime": 79.4864, - "eval_samples_per_second": 80.517, - "eval_steps_per_second": 0.629, - "step": 1910000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0856473033247752e-05, - "loss": 0.4369, - "step": 1911000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0837525251384567e-05, - "loss": 0.4367, - "step": 1912000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0818806782260748e-05, - "loss": 0.4367, - "step": 1913000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0800280208492865e-05, - "loss": 0.4379, - "step": 1914000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0781982639541429e-05, - "loss": 0.4373, - "step": 1915000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4175663888454437, - "eval_runtime": 80.7249, - "eval_samples_per_second": 79.282, - "eval_steps_per_second": 0.619, - "step": 1915000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0763895499185767e-05, - "loss": 0.4362, - "step": 1916000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0746001053331784e-05, - "loss": 0.4367, - "step": 1917000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0728317567168942e-05, - "loss": 0.4373, - "step": 1918000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0710845094564199e-05, - "loss": 0.4377, - "step": 1919000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0693583688741745e-05, - "loss": 0.4364, - "step": 1920000 - }, - { - "epoch": 0.06, - "eval_loss": 0.4133068919181824, - "eval_runtime": 78.4611, - "eval_samples_per_second": 81.569, - "eval_steps_per_second": 0.637, - "step": 1920000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0676550347097805e-05, - "loss": 0.4376, - "step": 1921000 - }, - { - "epoch": 0.06, - "learning_rate": 1.06597110207435e-05, - "loss": 0.437, - "step": 1922000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0643082916934733e-05, - "loss": 0.4378, - "step": 1923000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0626682397606544e-05, - "loss": 0.4365, - "step": 1924000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0610492778999931e-05, - "loss": 0.4366, - "step": 1925000 - }, - { - "epoch": 0.06, - "eval_loss": 0.41611722111701965, - "eval_runtime": 81.6547, - "eval_samples_per_second": 78.379, - "eval_steps_per_second": 0.612, - "step": 1925000 - }, - { - "epoch": 0.06, - "learning_rate": 1.059449822137189e-05, - "loss": 0.4372, - "step": 1926000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0578715084938887e-05, - "loss": 0.4374, - "step": 1927000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0563143417779096e-05, - "loss": 0.4366, - "step": 1928000 - }, - { - "epoch": 0.06, - "learning_rate": 1.0547798521808734e-05, - "loss": 0.437, - "step": 1929000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0532649723266384e-05, - "loss": 0.4365, - "step": 1930000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4162156581878662, - "eval_runtime": 80.1168, - "eval_samples_per_second": 79.883, - "eval_steps_per_second": 0.624, - "step": 1930000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0517727365795085e-05, - "loss": 0.4369, - "step": 1931000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0503001620268975e-05, - "loss": 0.4373, - "step": 1932000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0488487574652423e-05, - "loss": 0.4374, - "step": 1933000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0474199469678468e-05, - "loss": 0.437, - "step": 1934000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0460108744063674e-05, - "loss": 0.4369, - "step": 1935000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4142652451992035, - "eval_runtime": 77.8865, - "eval_samples_per_second": 82.171, - "eval_steps_per_second": 0.642, - "step": 1935000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0446243622089129e-05, - "loss": 0.4389, - "step": 1936000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0432576387995491e-05, - "loss": 0.4371, - "step": 1937000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0419121068338878e-05, - "loss": 0.4372, - "step": 1938000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0405877704106532e-05, - "loss": 0.4366, - "step": 1939000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0392859261103349e-05, - "loss": 0.4355, - "step": 1940000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4190742075443268, - "eval_runtime": 80.8959, - "eval_samples_per_second": 79.114, - "eval_steps_per_second": 0.618, - "step": 1940000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0380039716043426e-05, - "loss": 0.4357, - "step": 1941000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0367432245456347e-05, - "loss": 0.4362, - "step": 1942000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0355049177141353e-05, - "loss": 0.4362, - "step": 1943000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0342865757898152e-05, - "loss": 0.437, - "step": 1944000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0330906391597708e-05, - "loss": 0.4357, - "step": 1945000 - }, - { - "epoch": 0.07, - "eval_loss": 0.4155297577381134, - "eval_runtime": 77.8064, - "eval_samples_per_second": 82.255, - "eval_steps_per_second": 0.643, - "step": 1945000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0319147172001108e-05, - "loss": 0.4367, - "step": 1946000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0307600212366596e-05, - "loss": 0.4355, - "step": 1947000 - }, - { - "epoch": 0.07, - "learning_rate": 1.029627677647975e-05, - "loss": 0.4367, - "step": 1948000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0285154229298157e-05, - "loss": 0.4369, - "step": 1949000 - }, - { - "epoch": 0.07, - "learning_rate": 1.0274244045627054e-05, - "loss": 0.4373, - "step": 1950000 - }, - { - "epoch": 0.07, - "eval_loss": 0.41703131794929504, - "eval_runtime": 77.2751, - "eval_samples_per_second": 82.821, - "eval_steps_per_second": 0.647, - "step": 1950000 - }, - { - "epoch": 0.08, - "learning_rate": 1.026354625870075e-05, - "loss": 0.4364, - "step": 1951000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0253060901106556e-05, - "loss": 0.4361, - "step": 1952000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0242798171546145e-05, - "loss": 0.4365, - "step": 1953000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0232747509747644e-05, - "loss": 0.4373, - "step": 1954000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0222899204125646e-05, - "loss": 0.4362, - "step": 1955000 - }, - { - "epoch": 0.08, - "eval_loss": 0.4164978265762329, - "eval_runtime": 80.0596, - "eval_samples_per_second": 79.94, - "eval_steps_per_second": 0.625, - "step": 1955000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0213263451653737e-05, - "loss": 0.4367, - "step": 1956000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0203849598659497e-05, - "loss": 0.4367, - "step": 1957000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0194638827271399e-05, - "loss": 0.4364, - "step": 1958000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0185640695119401e-05, - "loss": 0.4363, - "step": 1959000 - }, - { - "epoch": 0.08, - "learning_rate": 1.017685522961337e-05, - "loss": 0.4362, - "step": 1960000 - }, - { - "epoch": 0.08, - "eval_loss": 0.42052188515663147, - "eval_runtime": 77.8558, - "eval_samples_per_second": 82.203, - "eval_steps_per_second": 0.642, - "step": 1960000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0168282457515363e-05, - "loss": 0.4369, - "step": 1961000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0159930658730172e-05, - "loss": 0.4364, - "step": 1962000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0151791179631108e-05, - "loss": 0.4359, - "step": 1963000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0143856216286122e-05, - "loss": 0.4368, - "step": 1964000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0136134046869866e-05, - "loss": 0.4357, - "step": 1965000 - }, - { - "epoch": 0.08, - "eval_loss": 0.41740044951438904, - "eval_runtime": 78.1991, - "eval_samples_per_second": 81.842, - "eval_steps_per_second": 0.639, - "step": 1965000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0128632097947403e-05, - "loss": 0.4365, - "step": 1966000 - }, - { - "epoch": 0.08, - "learning_rate": 1.0121335373458022e-05, - "loss": 0.4362, - "step": 1967000 - }, - { - "epoch": 0.08, - "learning_rate": 1.011425151149977e-05, - "loss": 0.4361, - "step": 1968000 - }, - { - "epoch": 0.08, - "learning_rate": 1.010738729828653e-05, - "loss": 0.4375, - "step": 1969000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0100729012562797e-05, - "loss": 0.4372, - "step": 1970000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4145086705684662, - "eval_runtime": 79.8319, - "eval_samples_per_second": 80.168, - "eval_steps_per_second": 0.626, - "step": 1970000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0094289991138392e-05, - "loss": 0.4363, - "step": 1971000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0088057362697175e-05, - "loss": 0.4375, - "step": 1972000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0082049524936494e-05, - "loss": 0.4372, - "step": 1973000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0076242416653332e-05, - "loss": 0.4349, - "step": 1974000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0070648308262255e-05, - "loss": 0.436, - "step": 1975000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4151042103767395, - "eval_runtime": 79.0273, - "eval_samples_per_second": 80.985, - "eval_steps_per_second": 0.633, - "step": 1975000 - }, - { - "epoch": 0.09, - "learning_rate": 1.006526721680391e-05, - "loss": 0.4342, - "step": 1976000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0060099158670026e-05, - "loss": 0.4363, - "step": 1977000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0055148998189381e-05, - "loss": 0.437, - "step": 1978000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0050411475939925e-05, - "loss": 0.436, - "step": 1979000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0045882183469046e-05, - "loss": 0.4355, - "step": 1980000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4141569435596466, - "eval_runtime": 79.5726, - "eval_samples_per_second": 80.43, - "eval_steps_per_second": 0.628, - "step": 1980000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0041565983372807e-05, - "loss": 0.4359, - "step": 1981000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0037462888799093e-05, - "loss": 0.4362, - "step": 1982000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0033576695766748e-05, - "loss": 0.4376, - "step": 1983000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0029899635949539e-05, - "loss": 0.4373, - "step": 1984000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0026435717192568e-05, - "loss": 0.4367, - "step": 1985000 - }, - { - "epoch": 0.09, - "eval_loss": 0.4171934127807617, - "eval_runtime": 77.9474, - "eval_samples_per_second": 82.107, - "eval_steps_per_second": 0.641, - "step": 1985000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0023184950047551e-05, - "loss": 0.4361, - "step": 1986000 - }, - { - "epoch": 0.09, - "learning_rate": 1.002015027554519e-05, - "loss": 0.4377, - "step": 1987000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0017325627506754e-05, - "loss": 0.4373, - "step": 1988000 - }, - { - "epoch": 0.09, - "learning_rate": 1.0014716663814055e-05, - "loss": 0.4368, - "step": 1989000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0012320461270247e-05, - "loss": 0.4358, - "step": 1990000 - }, - { - "epoch": 0.1, - "eval_loss": 0.41612717509269714, - "eval_runtime": 80.5577, - "eval_samples_per_second": 79.446, - "eval_steps_per_second": 0.621, - "step": 1990000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0010134948139825e-05, - "loss": 0.4366, - "step": 1991000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0008162636276321e-05, - "loss": 0.4369, - "step": 1992000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0006403531687724e-05, - "loss": 0.4372, - "step": 1993000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0004859079123212e-05, - "loss": 0.4361, - "step": 1994000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0003526191291106e-05, - "loss": 0.4369, - "step": 1995000 - }, - { - "epoch": 0.1, - "eval_loss": 0.4170204997062683, - "eval_runtime": 80.1918, - "eval_samples_per_second": 79.809, - "eval_steps_per_second": 0.624, - "step": 1995000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0002406524857334e-05, - "loss": 0.436, - "step": 1996000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0001500883167451e-05, - "loss": 0.4372, - "step": 1997000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0000807455884181e-05, - "loss": 0.4369, - "step": 1998000 - }, - { - "epoch": 0.1, - "learning_rate": 1.0000327631969819e-05, - "loss": 0.4362, - "step": 1999000 - }, - { - "epoch": 0.1, - "learning_rate": 1.00000604522778e-05, - "loss": 0.4363, - "step": 2000000 - }, - { - "epoch": 0.1, - "eval_loss": 0.41442054510116577, - "eval_runtime": 79.9098, - "eval_samples_per_second": 80.09, - "eval_steps_per_second": 0.626, - "step": 2000000 - } - ], - "max_steps": 2000000, - "num_train_epochs": 9223372036854775807, - "total_flos": 1.752506547830784e+22, - "trial_name": null, - "trial_params": null -}