qp-mscoco-sbert-lr5e-5 / trainer_state.json
madhavsankar's picture
Upload 11 files
c89dbb8
raw
history blame
42.4 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"global_step": 168750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 4.9851851851851855e-05,
"loss": 0.0172,
"step": 500
},
{
"epoch": 0.02,
"learning_rate": 4.970370370370371e-05,
"loss": 0.0128,
"step": 1000
},
{
"epoch": 0.03,
"learning_rate": 4.955555555555556e-05,
"loss": 0.0124,
"step": 1500
},
{
"epoch": 0.04,
"learning_rate": 4.940740740740741e-05,
"loss": 0.0125,
"step": 2000
},
{
"epoch": 0.04,
"learning_rate": 4.925925925925926e-05,
"loss": 0.0116,
"step": 2500
},
{
"epoch": 0.05,
"learning_rate": 4.9111111111111114e-05,
"loss": 0.0117,
"step": 3000
},
{
"epoch": 0.06,
"learning_rate": 4.896296296296297e-05,
"loss": 0.0119,
"step": 3500
},
{
"epoch": 0.07,
"learning_rate": 4.881481481481482e-05,
"loss": 0.0129,
"step": 4000
},
{
"epoch": 0.08,
"learning_rate": 4.866666666666667e-05,
"loss": 0.0139,
"step": 4500
},
{
"epoch": 0.09,
"learning_rate": 4.851851851851852e-05,
"loss": 0.0131,
"step": 5000
},
{
"epoch": 0.1,
"learning_rate": 4.837037037037037e-05,
"loss": 0.013,
"step": 5500
},
{
"epoch": 0.11,
"learning_rate": 4.8222222222222225e-05,
"loss": 0.0133,
"step": 6000
},
{
"epoch": 0.12,
"learning_rate": 4.807407407407408e-05,
"loss": 0.0132,
"step": 6500
},
{
"epoch": 0.12,
"learning_rate": 4.792592592592593e-05,
"loss": 0.0136,
"step": 7000
},
{
"epoch": 0.13,
"learning_rate": 4.7777777777777784e-05,
"loss": 0.0132,
"step": 7500
},
{
"epoch": 0.14,
"learning_rate": 4.762962962962963e-05,
"loss": 0.0133,
"step": 8000
},
{
"epoch": 0.15,
"learning_rate": 4.7481481481481483e-05,
"loss": 0.013,
"step": 8500
},
{
"epoch": 0.16,
"learning_rate": 4.7333333333333336e-05,
"loss": 0.0131,
"step": 9000
},
{
"epoch": 0.17,
"learning_rate": 4.718518518518519e-05,
"loss": 0.0128,
"step": 9500
},
{
"epoch": 0.18,
"learning_rate": 4.703703703703704e-05,
"loss": 0.013,
"step": 10000
},
{
"epoch": 0.19,
"learning_rate": 4.6888888888888895e-05,
"loss": 0.0132,
"step": 10500
},
{
"epoch": 0.2,
"learning_rate": 4.674074074074074e-05,
"loss": 0.0134,
"step": 11000
},
{
"epoch": 0.2,
"learning_rate": 4.6592592592592595e-05,
"loss": 0.0131,
"step": 11500
},
{
"epoch": 0.21,
"learning_rate": 4.644444444444445e-05,
"loss": 0.0131,
"step": 12000
},
{
"epoch": 0.22,
"learning_rate": 4.62962962962963e-05,
"loss": 0.0131,
"step": 12500
},
{
"epoch": 0.23,
"learning_rate": 4.6148148148148154e-05,
"loss": 0.0128,
"step": 13000
},
{
"epoch": 0.24,
"learning_rate": 4.600000000000001e-05,
"loss": 0.0131,
"step": 13500
},
{
"epoch": 0.25,
"learning_rate": 4.585185185185185e-05,
"loss": 0.0135,
"step": 14000
},
{
"epoch": 0.26,
"learning_rate": 4.5703703703703706e-05,
"loss": 0.0131,
"step": 14500
},
{
"epoch": 0.27,
"learning_rate": 4.555555555555556e-05,
"loss": 0.0134,
"step": 15000
},
{
"epoch": 0.28,
"learning_rate": 4.540740740740741e-05,
"loss": 0.0131,
"step": 15500
},
{
"epoch": 0.28,
"learning_rate": 4.5259259259259265e-05,
"loss": 0.013,
"step": 16000
},
{
"epoch": 0.29,
"learning_rate": 4.511111111111112e-05,
"loss": 0.0128,
"step": 16500
},
{
"epoch": 0.3,
"learning_rate": 4.496296296296297e-05,
"loss": 0.013,
"step": 17000
},
{
"epoch": 0.31,
"learning_rate": 4.481481481481482e-05,
"loss": 0.0127,
"step": 17500
},
{
"epoch": 0.32,
"learning_rate": 4.466666666666667e-05,
"loss": 0.013,
"step": 18000
},
{
"epoch": 0.33,
"learning_rate": 4.4518518518518523e-05,
"loss": 0.0127,
"step": 18500
},
{
"epoch": 0.34,
"learning_rate": 4.4370370370370376e-05,
"loss": 0.0125,
"step": 19000
},
{
"epoch": 0.35,
"learning_rate": 4.422222222222222e-05,
"loss": 0.0131,
"step": 19500
},
{
"epoch": 0.36,
"learning_rate": 4.4074074074074076e-05,
"loss": 0.0129,
"step": 20000
},
{
"epoch": 0.36,
"learning_rate": 4.392592592592593e-05,
"loss": 0.0131,
"step": 20500
},
{
"epoch": 0.37,
"learning_rate": 4.377777777777778e-05,
"loss": 0.0127,
"step": 21000
},
{
"epoch": 0.38,
"learning_rate": 4.3629629629629635e-05,
"loss": 0.0126,
"step": 21500
},
{
"epoch": 0.39,
"learning_rate": 4.348148148148148e-05,
"loss": 0.013,
"step": 22000
},
{
"epoch": 0.4,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.0131,
"step": 22500
},
{
"epoch": 0.41,
"learning_rate": 4.318518518518519e-05,
"loss": 0.0131,
"step": 23000
},
{
"epoch": 0.42,
"learning_rate": 4.303703703703704e-05,
"loss": 0.0129,
"step": 23500
},
{
"epoch": 0.43,
"learning_rate": 4.2888888888888886e-05,
"loss": 0.0129,
"step": 24000
},
{
"epoch": 0.44,
"learning_rate": 4.274074074074074e-05,
"loss": 0.0128,
"step": 24500
},
{
"epoch": 0.44,
"learning_rate": 4.259259259259259e-05,
"loss": 0.0132,
"step": 25000
},
{
"epoch": 0.45,
"learning_rate": 4.2444444444444445e-05,
"loss": 0.0128,
"step": 25500
},
{
"epoch": 0.46,
"learning_rate": 4.22962962962963e-05,
"loss": 0.0131,
"step": 26000
},
{
"epoch": 0.47,
"learning_rate": 4.2148148148148145e-05,
"loss": 0.0131,
"step": 26500
},
{
"epoch": 0.48,
"learning_rate": 4.2e-05,
"loss": 0.0133,
"step": 27000
},
{
"epoch": 0.49,
"learning_rate": 4.185185185185185e-05,
"loss": 0.013,
"step": 27500
},
{
"epoch": 0.5,
"learning_rate": 4.1703703703703704e-05,
"loss": 0.0132,
"step": 28000
},
{
"epoch": 0.51,
"learning_rate": 4.155555555555556e-05,
"loss": 0.0132,
"step": 28500
},
{
"epoch": 0.52,
"learning_rate": 4.140740740740741e-05,
"loss": 0.0125,
"step": 29000
},
{
"epoch": 0.52,
"learning_rate": 4.1259259259259256e-05,
"loss": 0.0128,
"step": 29500
},
{
"epoch": 0.53,
"learning_rate": 4.111111111111111e-05,
"loss": 0.0126,
"step": 30000
},
{
"epoch": 0.54,
"learning_rate": 4.096296296296296e-05,
"loss": 0.0127,
"step": 30500
},
{
"epoch": 0.55,
"learning_rate": 4.0814814814814815e-05,
"loss": 0.0129,
"step": 31000
},
{
"epoch": 0.56,
"learning_rate": 4.066666666666667e-05,
"loss": 0.0129,
"step": 31500
},
{
"epoch": 0.57,
"learning_rate": 4.051851851851852e-05,
"loss": 0.0129,
"step": 32000
},
{
"epoch": 0.58,
"learning_rate": 4.0370370370370374e-05,
"loss": 0.0127,
"step": 32500
},
{
"epoch": 0.59,
"learning_rate": 4.022222222222222e-05,
"loss": 0.0127,
"step": 33000
},
{
"epoch": 0.6,
"learning_rate": 4.007407407407407e-05,
"loss": 0.0133,
"step": 33500
},
{
"epoch": 0.6,
"learning_rate": 3.9925925925925926e-05,
"loss": 0.0131,
"step": 34000
},
{
"epoch": 0.61,
"learning_rate": 3.977777777777778e-05,
"loss": 0.0126,
"step": 34500
},
{
"epoch": 0.62,
"learning_rate": 3.962962962962963e-05,
"loss": 0.013,
"step": 35000
},
{
"epoch": 0.63,
"learning_rate": 3.9481481481481485e-05,
"loss": 0.0128,
"step": 35500
},
{
"epoch": 0.64,
"learning_rate": 3.933333333333333e-05,
"loss": 0.0127,
"step": 36000
},
{
"epoch": 0.65,
"learning_rate": 3.9185185185185185e-05,
"loss": 0.0131,
"step": 36500
},
{
"epoch": 0.66,
"learning_rate": 3.903703703703704e-05,
"loss": 0.0135,
"step": 37000
},
{
"epoch": 0.67,
"learning_rate": 3.888888888888889e-05,
"loss": 0.0125,
"step": 37500
},
{
"epoch": 0.68,
"learning_rate": 3.8740740740740744e-05,
"loss": 0.013,
"step": 38000
},
{
"epoch": 0.68,
"learning_rate": 3.85925925925926e-05,
"loss": 0.0132,
"step": 38500
},
{
"epoch": 0.69,
"learning_rate": 3.844444444444444e-05,
"loss": 0.013,
"step": 39000
},
{
"epoch": 0.7,
"learning_rate": 3.8296296296296296e-05,
"loss": 0.0135,
"step": 39500
},
{
"epoch": 0.71,
"learning_rate": 3.814814814814815e-05,
"loss": 0.0128,
"step": 40000
},
{
"epoch": 0.72,
"learning_rate": 3.8e-05,
"loss": 0.013,
"step": 40500
},
{
"epoch": 0.73,
"learning_rate": 3.7851851851851855e-05,
"loss": 0.0128,
"step": 41000
},
{
"epoch": 0.74,
"learning_rate": 3.770370370370371e-05,
"loss": 0.0129,
"step": 41500
},
{
"epoch": 0.75,
"learning_rate": 3.7555555555555554e-05,
"loss": 0.0126,
"step": 42000
},
{
"epoch": 0.76,
"learning_rate": 3.740740740740741e-05,
"loss": 0.0129,
"step": 42500
},
{
"epoch": 0.76,
"learning_rate": 3.725925925925926e-05,
"loss": 0.0132,
"step": 43000
},
{
"epoch": 0.77,
"learning_rate": 3.7111111111111113e-05,
"loss": 0.0129,
"step": 43500
},
{
"epoch": 0.78,
"learning_rate": 3.6962962962962966e-05,
"loss": 0.0131,
"step": 44000
},
{
"epoch": 0.79,
"learning_rate": 3.681481481481482e-05,
"loss": 0.0126,
"step": 44500
},
{
"epoch": 0.8,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.0132,
"step": 45000
},
{
"epoch": 0.81,
"learning_rate": 3.651851851851852e-05,
"loss": 0.0131,
"step": 45500
},
{
"epoch": 0.82,
"learning_rate": 3.637037037037037e-05,
"loss": 0.0129,
"step": 46000
},
{
"epoch": 0.83,
"learning_rate": 3.6222222222222225e-05,
"loss": 0.0126,
"step": 46500
},
{
"epoch": 0.84,
"learning_rate": 3.607407407407408e-05,
"loss": 0.0128,
"step": 47000
},
{
"epoch": 0.84,
"learning_rate": 3.592592592592593e-05,
"loss": 0.0129,
"step": 47500
},
{
"epoch": 0.85,
"learning_rate": 3.577777777777778e-05,
"loss": 0.0128,
"step": 48000
},
{
"epoch": 0.86,
"learning_rate": 3.562962962962963e-05,
"loss": 0.0127,
"step": 48500
},
{
"epoch": 0.87,
"learning_rate": 3.548148148148148e-05,
"loss": 0.0131,
"step": 49000
},
{
"epoch": 0.88,
"learning_rate": 3.5333333333333336e-05,
"loss": 0.0133,
"step": 49500
},
{
"epoch": 0.89,
"learning_rate": 3.518518518518519e-05,
"loss": 0.0128,
"step": 50000
},
{
"epoch": 0.9,
"learning_rate": 3.503703703703704e-05,
"loss": 0.0133,
"step": 50500
},
{
"epoch": 0.91,
"learning_rate": 3.4888888888888895e-05,
"loss": 0.0129,
"step": 51000
},
{
"epoch": 0.92,
"learning_rate": 3.474074074074074e-05,
"loss": 0.0128,
"step": 51500
},
{
"epoch": 0.92,
"learning_rate": 3.4592592592592594e-05,
"loss": 0.0133,
"step": 52000
},
{
"epoch": 0.93,
"learning_rate": 3.444444444444445e-05,
"loss": 0.013,
"step": 52500
},
{
"epoch": 0.94,
"learning_rate": 3.42962962962963e-05,
"loss": 0.0132,
"step": 53000
},
{
"epoch": 0.95,
"learning_rate": 3.4148148148148153e-05,
"loss": 0.0129,
"step": 53500
},
{
"epoch": 0.96,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.0129,
"step": 54000
},
{
"epoch": 0.97,
"learning_rate": 3.385185185185185e-05,
"loss": 0.0127,
"step": 54500
},
{
"epoch": 0.98,
"learning_rate": 3.3703703703703706e-05,
"loss": 0.0125,
"step": 55000
},
{
"epoch": 0.99,
"learning_rate": 3.355555555555556e-05,
"loss": 0.0132,
"step": 55500
},
{
"epoch": 1.0,
"learning_rate": 3.340740740740741e-05,
"loss": 0.0129,
"step": 56000
},
{
"epoch": 1.0,
"eval_loss": 0.013578644022345543,
"eval_mse": 0.013578643091022968,
"eval_runtime": 99.5064,
"eval_samples_per_second": 140.694,
"step": 56250
},
{
"epoch": 1.0,
"learning_rate": 3.3259259259259265e-05,
"loss": 0.0126,
"step": 56500
},
{
"epoch": 1.01,
"learning_rate": 3.311111111111112e-05,
"loss": 0.013,
"step": 57000
},
{
"epoch": 1.02,
"learning_rate": 3.2962962962962964e-05,
"loss": 0.0127,
"step": 57500
},
{
"epoch": 1.03,
"learning_rate": 3.281481481481482e-05,
"loss": 0.0129,
"step": 58000
},
{
"epoch": 1.04,
"learning_rate": 3.266666666666667e-05,
"loss": 0.0128,
"step": 58500
},
{
"epoch": 1.05,
"learning_rate": 3.251851851851852e-05,
"loss": 0.013,
"step": 59000
},
{
"epoch": 1.06,
"learning_rate": 3.2370370370370376e-05,
"loss": 0.0128,
"step": 59500
},
{
"epoch": 1.07,
"learning_rate": 3.222222222222223e-05,
"loss": 0.0132,
"step": 60000
},
{
"epoch": 1.08,
"learning_rate": 3.2074074074074075e-05,
"loss": 0.0129,
"step": 60500
},
{
"epoch": 1.08,
"learning_rate": 3.192592592592593e-05,
"loss": 0.0128,
"step": 61000
},
{
"epoch": 1.09,
"learning_rate": 3.177777777777778e-05,
"loss": 0.0129,
"step": 61500
},
{
"epoch": 1.1,
"learning_rate": 3.1629629629629634e-05,
"loss": 0.0131,
"step": 62000
},
{
"epoch": 1.11,
"learning_rate": 3.148148148148148e-05,
"loss": 0.0128,
"step": 62500
},
{
"epoch": 1.12,
"learning_rate": 3.1333333333333334e-05,
"loss": 0.0131,
"step": 63000
},
{
"epoch": 1.13,
"learning_rate": 3.118518518518519e-05,
"loss": 0.0134,
"step": 63500
},
{
"epoch": 1.14,
"learning_rate": 3.103703703703704e-05,
"loss": 0.0129,
"step": 64000
},
{
"epoch": 1.15,
"learning_rate": 3.088888888888889e-05,
"loss": 0.0129,
"step": 64500
},
{
"epoch": 1.16,
"learning_rate": 3.074074074074074e-05,
"loss": 0.0128,
"step": 65000
},
{
"epoch": 1.16,
"learning_rate": 3.059259259259259e-05,
"loss": 0.0132,
"step": 65500
},
{
"epoch": 1.17,
"learning_rate": 3.044444444444445e-05,
"loss": 0.0128,
"step": 66000
},
{
"epoch": 1.18,
"learning_rate": 3.02962962962963e-05,
"loss": 0.0131,
"step": 66500
},
{
"epoch": 1.19,
"learning_rate": 3.0148148148148148e-05,
"loss": 0.0128,
"step": 67000
},
{
"epoch": 1.2,
"learning_rate": 3e-05,
"loss": 0.0129,
"step": 67500
},
{
"epoch": 1.21,
"learning_rate": 2.9851851851851854e-05,
"loss": 0.0128,
"step": 68000
},
{
"epoch": 1.22,
"learning_rate": 2.9703703703703707e-05,
"loss": 0.0131,
"step": 68500
},
{
"epoch": 1.23,
"learning_rate": 2.955555555555556e-05,
"loss": 0.0131,
"step": 69000
},
{
"epoch": 1.24,
"learning_rate": 2.9407407407407413e-05,
"loss": 0.0126,
"step": 69500
},
{
"epoch": 1.24,
"learning_rate": 2.925925925925926e-05,
"loss": 0.0133,
"step": 70000
},
{
"epoch": 1.25,
"learning_rate": 2.9111111111111112e-05,
"loss": 0.0131,
"step": 70500
},
{
"epoch": 1.26,
"learning_rate": 2.8962962962962965e-05,
"loss": 0.0129,
"step": 71000
},
{
"epoch": 1.27,
"learning_rate": 2.8814814814814818e-05,
"loss": 0.0131,
"step": 71500
},
{
"epoch": 1.28,
"learning_rate": 2.8666666666666668e-05,
"loss": 0.0126,
"step": 72000
},
{
"epoch": 1.29,
"learning_rate": 2.851851851851852e-05,
"loss": 0.0125,
"step": 72500
},
{
"epoch": 1.3,
"learning_rate": 2.837037037037037e-05,
"loss": 0.0129,
"step": 73000
},
{
"epoch": 1.31,
"learning_rate": 2.8222222222222223e-05,
"loss": 0.013,
"step": 73500
},
{
"epoch": 1.32,
"learning_rate": 2.8074074074074076e-05,
"loss": 0.013,
"step": 74000
},
{
"epoch": 1.32,
"learning_rate": 2.7925925925925926e-05,
"loss": 0.0128,
"step": 74500
},
{
"epoch": 1.33,
"learning_rate": 2.777777777777778e-05,
"loss": 0.0132,
"step": 75000
},
{
"epoch": 1.34,
"learning_rate": 2.7629629629629632e-05,
"loss": 0.0128,
"step": 75500
},
{
"epoch": 1.35,
"learning_rate": 2.7481481481481482e-05,
"loss": 0.0126,
"step": 76000
},
{
"epoch": 1.36,
"learning_rate": 2.733333333333333e-05,
"loss": 0.0127,
"step": 76500
},
{
"epoch": 1.37,
"learning_rate": 2.7185185185185184e-05,
"loss": 0.0127,
"step": 77000
},
{
"epoch": 1.38,
"learning_rate": 2.7037037037037037e-05,
"loss": 0.0126,
"step": 77500
},
{
"epoch": 1.39,
"learning_rate": 2.688888888888889e-05,
"loss": 0.013,
"step": 78000
},
{
"epoch": 1.4,
"learning_rate": 2.6740740740740743e-05,
"loss": 0.0129,
"step": 78500
},
{
"epoch": 1.4,
"learning_rate": 2.659259259259259e-05,
"loss": 0.013,
"step": 79000
},
{
"epoch": 1.41,
"learning_rate": 2.6444444444444443e-05,
"loss": 0.0123,
"step": 79500
},
{
"epoch": 1.42,
"learning_rate": 2.6296296296296296e-05,
"loss": 0.0132,
"step": 80000
},
{
"epoch": 1.43,
"learning_rate": 2.614814814814815e-05,
"loss": 0.0135,
"step": 80500
},
{
"epoch": 1.44,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.0128,
"step": 81000
},
{
"epoch": 1.45,
"learning_rate": 2.5851851851851855e-05,
"loss": 0.013,
"step": 81500
},
{
"epoch": 1.46,
"learning_rate": 2.5703703703703708e-05,
"loss": 0.013,
"step": 82000
},
{
"epoch": 1.47,
"learning_rate": 2.5555555555555554e-05,
"loss": 0.0129,
"step": 82500
},
{
"epoch": 1.48,
"learning_rate": 2.5407407407407407e-05,
"loss": 0.0128,
"step": 83000
},
{
"epoch": 1.48,
"learning_rate": 2.525925925925926e-05,
"loss": 0.0124,
"step": 83500
},
{
"epoch": 1.49,
"learning_rate": 2.5111111111111113e-05,
"loss": 0.0132,
"step": 84000
},
{
"epoch": 1.5,
"learning_rate": 2.4962962962962963e-05,
"loss": 0.0129,
"step": 84500
},
{
"epoch": 1.51,
"learning_rate": 2.4814814814814816e-05,
"loss": 0.0126,
"step": 85000
},
{
"epoch": 1.52,
"learning_rate": 2.466666666666667e-05,
"loss": 0.0129,
"step": 85500
},
{
"epoch": 1.53,
"learning_rate": 2.451851851851852e-05,
"loss": 0.013,
"step": 86000
},
{
"epoch": 1.54,
"learning_rate": 2.437037037037037e-05,
"loss": 0.013,
"step": 86500
},
{
"epoch": 1.55,
"learning_rate": 2.4222222222222224e-05,
"loss": 0.0131,
"step": 87000
},
{
"epoch": 1.56,
"learning_rate": 2.4074074074074074e-05,
"loss": 0.0127,
"step": 87500
},
{
"epoch": 1.56,
"learning_rate": 2.3925925925925927e-05,
"loss": 0.013,
"step": 88000
},
{
"epoch": 1.57,
"learning_rate": 2.377777777777778e-05,
"loss": 0.013,
"step": 88500
},
{
"epoch": 1.58,
"learning_rate": 2.3629629629629633e-05,
"loss": 0.0124,
"step": 89000
},
{
"epoch": 1.59,
"learning_rate": 2.3481481481481483e-05,
"loss": 0.0131,
"step": 89500
},
{
"epoch": 1.6,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.0132,
"step": 90000
},
{
"epoch": 1.61,
"learning_rate": 2.318518518518519e-05,
"loss": 0.0126,
"step": 90500
},
{
"epoch": 1.62,
"learning_rate": 2.303703703703704e-05,
"loss": 0.0129,
"step": 91000
},
{
"epoch": 1.63,
"learning_rate": 2.288888888888889e-05,
"loss": 0.0131,
"step": 91500
},
{
"epoch": 1.64,
"learning_rate": 2.2740740740740744e-05,
"loss": 0.0127,
"step": 92000
},
{
"epoch": 1.64,
"learning_rate": 2.2592592592592594e-05,
"loss": 0.0131,
"step": 92500
},
{
"epoch": 1.65,
"learning_rate": 2.2444444444444447e-05,
"loss": 0.013,
"step": 93000
},
{
"epoch": 1.66,
"learning_rate": 2.2296296296296297e-05,
"loss": 0.0129,
"step": 93500
},
{
"epoch": 1.67,
"learning_rate": 2.214814814814815e-05,
"loss": 0.0127,
"step": 94000
},
{
"epoch": 1.68,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.0123,
"step": 94500
},
{
"epoch": 1.69,
"learning_rate": 2.1851851851851852e-05,
"loss": 0.0127,
"step": 95000
},
{
"epoch": 1.7,
"learning_rate": 2.1703703703703705e-05,
"loss": 0.0128,
"step": 95500
},
{
"epoch": 1.71,
"learning_rate": 2.1555555555555555e-05,
"loss": 0.0129,
"step": 96000
},
{
"epoch": 1.72,
"learning_rate": 2.1407407407407408e-05,
"loss": 0.0126,
"step": 96500
},
{
"epoch": 1.72,
"learning_rate": 2.1259259259259258e-05,
"loss": 0.0126,
"step": 97000
},
{
"epoch": 1.73,
"learning_rate": 2.111111111111111e-05,
"loss": 0.013,
"step": 97500
},
{
"epoch": 1.74,
"learning_rate": 2.0962962962962964e-05,
"loss": 0.0132,
"step": 98000
},
{
"epoch": 1.75,
"learning_rate": 2.0814814814814813e-05,
"loss": 0.0127,
"step": 98500
},
{
"epoch": 1.76,
"learning_rate": 2.0666666666666666e-05,
"loss": 0.0126,
"step": 99000
},
{
"epoch": 1.77,
"learning_rate": 2.051851851851852e-05,
"loss": 0.0127,
"step": 99500
},
{
"epoch": 1.78,
"learning_rate": 2.037037037037037e-05,
"loss": 0.0128,
"step": 100000
},
{
"epoch": 1.79,
"learning_rate": 2.0222222222222222e-05,
"loss": 0.0129,
"step": 100500
},
{
"epoch": 1.8,
"learning_rate": 2.0074074074074075e-05,
"loss": 0.0125,
"step": 101000
},
{
"epoch": 1.8,
"learning_rate": 1.9925925925925925e-05,
"loss": 0.0126,
"step": 101500
},
{
"epoch": 1.81,
"learning_rate": 1.9777777777777778e-05,
"loss": 0.0126,
"step": 102000
},
{
"epoch": 1.82,
"learning_rate": 1.962962962962963e-05,
"loss": 0.013,
"step": 102500
},
{
"epoch": 1.83,
"learning_rate": 1.948148148148148e-05,
"loss": 0.013,
"step": 103000
},
{
"epoch": 1.84,
"learning_rate": 1.9333333333333333e-05,
"loss": 0.0128,
"step": 103500
},
{
"epoch": 1.85,
"learning_rate": 1.9185185185185186e-05,
"loss": 0.0129,
"step": 104000
},
{
"epoch": 1.86,
"learning_rate": 1.903703703703704e-05,
"loss": 0.0127,
"step": 104500
},
{
"epoch": 1.87,
"learning_rate": 1.888888888888889e-05,
"loss": 0.0129,
"step": 105000
},
{
"epoch": 1.88,
"learning_rate": 1.8740740740740742e-05,
"loss": 0.0128,
"step": 105500
},
{
"epoch": 1.88,
"learning_rate": 1.8592592592592595e-05,
"loss": 0.0127,
"step": 106000
},
{
"epoch": 1.89,
"learning_rate": 1.8444444444444445e-05,
"loss": 0.0128,
"step": 106500
},
{
"epoch": 1.9,
"learning_rate": 1.8296296296296298e-05,
"loss": 0.0124,
"step": 107000
},
{
"epoch": 1.91,
"learning_rate": 1.814814814814815e-05,
"loss": 0.0126,
"step": 107500
},
{
"epoch": 1.92,
"learning_rate": 1.8e-05,
"loss": 0.0128,
"step": 108000
},
{
"epoch": 1.93,
"learning_rate": 1.7851851851851853e-05,
"loss": 0.0128,
"step": 108500
},
{
"epoch": 1.94,
"learning_rate": 1.7703703703703706e-05,
"loss": 0.0131,
"step": 109000
},
{
"epoch": 1.95,
"learning_rate": 1.7555555555555556e-05,
"loss": 0.0131,
"step": 109500
},
{
"epoch": 1.96,
"learning_rate": 1.740740740740741e-05,
"loss": 0.0127,
"step": 110000
},
{
"epoch": 1.96,
"learning_rate": 1.7259259259259262e-05,
"loss": 0.0129,
"step": 110500
},
{
"epoch": 1.97,
"learning_rate": 1.7111111111111112e-05,
"loss": 0.0128,
"step": 111000
},
{
"epoch": 1.98,
"learning_rate": 1.6962962962962965e-05,
"loss": 0.0127,
"step": 111500
},
{
"epoch": 1.99,
"learning_rate": 1.6814814814814818e-05,
"loss": 0.0125,
"step": 112000
},
{
"epoch": 2.0,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0126,
"step": 112500
},
{
"epoch": 2.0,
"eval_loss": 0.013725973665714264,
"eval_mse": 0.013725974597036839,
"eval_runtime": 99.3457,
"eval_samples_per_second": 140.922,
"step": 112500
},
{
"epoch": 2.01,
"learning_rate": 1.651851851851852e-05,
"loss": 0.0128,
"step": 113000
},
{
"epoch": 2.02,
"learning_rate": 1.6370370370370374e-05,
"loss": 0.0127,
"step": 113500
},
{
"epoch": 2.03,
"learning_rate": 1.6222222222222223e-05,
"loss": 0.0129,
"step": 114000
},
{
"epoch": 2.04,
"learning_rate": 1.6074074074074076e-05,
"loss": 0.0128,
"step": 114500
},
{
"epoch": 2.04,
"learning_rate": 1.5925925925925926e-05,
"loss": 0.0126,
"step": 115000
},
{
"epoch": 2.05,
"learning_rate": 1.577777777777778e-05,
"loss": 0.0131,
"step": 115500
},
{
"epoch": 2.06,
"learning_rate": 1.5629629629629632e-05,
"loss": 0.0129,
"step": 116000
},
{
"epoch": 2.07,
"learning_rate": 1.548148148148148e-05,
"loss": 0.0129,
"step": 116500
},
{
"epoch": 2.08,
"learning_rate": 1.5333333333333334e-05,
"loss": 0.0127,
"step": 117000
},
{
"epoch": 2.09,
"learning_rate": 1.5185185185185186e-05,
"loss": 0.013,
"step": 117500
},
{
"epoch": 2.1,
"learning_rate": 1.5037037037037039e-05,
"loss": 0.0129,
"step": 118000
},
{
"epoch": 2.11,
"learning_rate": 1.4888888888888888e-05,
"loss": 0.0126,
"step": 118500
},
{
"epoch": 2.12,
"learning_rate": 1.4740740740740741e-05,
"loss": 0.0129,
"step": 119000
},
{
"epoch": 2.12,
"learning_rate": 1.4592592592592594e-05,
"loss": 0.0125,
"step": 119500
},
{
"epoch": 2.13,
"learning_rate": 1.4444444444444444e-05,
"loss": 0.013,
"step": 120000
},
{
"epoch": 2.14,
"learning_rate": 1.4296296296296297e-05,
"loss": 0.0128,
"step": 120500
},
{
"epoch": 2.15,
"learning_rate": 1.4148148148148148e-05,
"loss": 0.0125,
"step": 121000
},
{
"epoch": 2.16,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.0129,
"step": 121500
},
{
"epoch": 2.17,
"learning_rate": 1.3851851851851853e-05,
"loss": 0.0132,
"step": 122000
},
{
"epoch": 2.18,
"learning_rate": 1.3703703703703704e-05,
"loss": 0.0126,
"step": 122500
},
{
"epoch": 2.19,
"learning_rate": 1.3555555555555557e-05,
"loss": 0.0127,
"step": 123000
},
{
"epoch": 2.2,
"learning_rate": 1.3407407407407407e-05,
"loss": 0.013,
"step": 123500
},
{
"epoch": 2.2,
"learning_rate": 1.325925925925926e-05,
"loss": 0.0134,
"step": 124000
},
{
"epoch": 2.21,
"learning_rate": 1.3111111111111113e-05,
"loss": 0.0129,
"step": 124500
},
{
"epoch": 2.22,
"learning_rate": 1.2962962962962962e-05,
"loss": 0.0125,
"step": 125000
},
{
"epoch": 2.23,
"learning_rate": 1.2814814814814815e-05,
"loss": 0.0128,
"step": 125500
},
{
"epoch": 2.24,
"learning_rate": 1.2666666666666668e-05,
"loss": 0.0125,
"step": 126000
},
{
"epoch": 2.25,
"learning_rate": 1.2518518518518518e-05,
"loss": 0.0127,
"step": 126500
},
{
"epoch": 2.26,
"learning_rate": 1.2370370370370371e-05,
"loss": 0.0127,
"step": 127000
},
{
"epoch": 2.27,
"learning_rate": 1.2222222222222222e-05,
"loss": 0.0127,
"step": 127500
},
{
"epoch": 2.28,
"learning_rate": 1.2074074074074075e-05,
"loss": 0.013,
"step": 128000
},
{
"epoch": 2.28,
"learning_rate": 1.1925925925925927e-05,
"loss": 0.0125,
"step": 128500
},
{
"epoch": 2.29,
"learning_rate": 1.1777777777777778e-05,
"loss": 0.0128,
"step": 129000
},
{
"epoch": 2.3,
"learning_rate": 1.1629629629629631e-05,
"loss": 0.0128,
"step": 129500
},
{
"epoch": 2.31,
"learning_rate": 1.1481481481481482e-05,
"loss": 0.013,
"step": 130000
},
{
"epoch": 2.32,
"learning_rate": 1.1333333333333334e-05,
"loss": 0.0128,
"step": 130500
},
{
"epoch": 2.33,
"learning_rate": 1.1185185185185187e-05,
"loss": 0.0131,
"step": 131000
},
{
"epoch": 2.34,
"learning_rate": 1.1037037037037038e-05,
"loss": 0.013,
"step": 131500
},
{
"epoch": 2.35,
"learning_rate": 1.088888888888889e-05,
"loss": 0.0126,
"step": 132000
},
{
"epoch": 2.36,
"learning_rate": 1.074074074074074e-05,
"loss": 0.0127,
"step": 132500
},
{
"epoch": 2.36,
"learning_rate": 1.0592592592592592e-05,
"loss": 0.0127,
"step": 133000
},
{
"epoch": 2.37,
"learning_rate": 1.0444444444444445e-05,
"loss": 0.0135,
"step": 133500
},
{
"epoch": 2.38,
"learning_rate": 1.0296296296296296e-05,
"loss": 0.0127,
"step": 134000
},
{
"epoch": 2.39,
"learning_rate": 1.0148148148148148e-05,
"loss": 0.0128,
"step": 134500
},
{
"epoch": 2.4,
"learning_rate": 1e-05,
"loss": 0.0127,
"step": 135000
},
{
"epoch": 2.41,
"learning_rate": 9.851851851851852e-06,
"loss": 0.0126,
"step": 135500
},
{
"epoch": 2.42,
"learning_rate": 9.703703703703703e-06,
"loss": 0.0129,
"step": 136000
},
{
"epoch": 2.43,
"learning_rate": 9.555555555555556e-06,
"loss": 0.0128,
"step": 136500
},
{
"epoch": 2.44,
"learning_rate": 9.407407407407408e-06,
"loss": 0.0129,
"step": 137000
},
{
"epoch": 2.44,
"learning_rate": 9.259259259259259e-06,
"loss": 0.0127,
"step": 137500
},
{
"epoch": 2.45,
"learning_rate": 9.111111111111112e-06,
"loss": 0.013,
"step": 138000
},
{
"epoch": 2.46,
"learning_rate": 8.962962962962963e-06,
"loss": 0.0129,
"step": 138500
},
{
"epoch": 2.47,
"learning_rate": 8.814814814814815e-06,
"loss": 0.0126,
"step": 139000
},
{
"epoch": 2.48,
"learning_rate": 8.666666666666668e-06,
"loss": 0.0125,
"step": 139500
},
{
"epoch": 2.49,
"learning_rate": 8.518518518518519e-06,
"loss": 0.0129,
"step": 140000
},
{
"epoch": 2.5,
"learning_rate": 8.37037037037037e-06,
"loss": 0.013,
"step": 140500
},
{
"epoch": 2.51,
"learning_rate": 8.222222222222223e-06,
"loss": 0.0131,
"step": 141000
},
{
"epoch": 2.52,
"learning_rate": 8.074074074074075e-06,
"loss": 0.0127,
"step": 141500
},
{
"epoch": 2.52,
"learning_rate": 7.925925925925926e-06,
"loss": 0.0129,
"step": 142000
},
{
"epoch": 2.53,
"learning_rate": 7.777777777777777e-06,
"loss": 0.0129,
"step": 142500
},
{
"epoch": 2.54,
"learning_rate": 7.629629629629629e-06,
"loss": 0.0124,
"step": 143000
},
{
"epoch": 2.55,
"learning_rate": 7.481481481481483e-06,
"loss": 0.0128,
"step": 143500
},
{
"epoch": 2.56,
"learning_rate": 7.333333333333334e-06,
"loss": 0.0126,
"step": 144000
},
{
"epoch": 2.57,
"learning_rate": 7.185185185185185e-06,
"loss": 0.0127,
"step": 144500
},
{
"epoch": 2.58,
"learning_rate": 7.0370370370370375e-06,
"loss": 0.0124,
"step": 145000
},
{
"epoch": 2.59,
"learning_rate": 6.888888888888889e-06,
"loss": 0.0126,
"step": 145500
},
{
"epoch": 2.6,
"learning_rate": 6.74074074074074e-06,
"loss": 0.0126,
"step": 146000
},
{
"epoch": 2.6,
"learning_rate": 6.592592592592593e-06,
"loss": 0.0126,
"step": 146500
},
{
"epoch": 2.61,
"learning_rate": 6.4444444444444445e-06,
"loss": 0.0125,
"step": 147000
},
{
"epoch": 2.62,
"learning_rate": 6.296296296296296e-06,
"loss": 0.0125,
"step": 147500
},
{
"epoch": 2.63,
"learning_rate": 6.148148148148149e-06,
"loss": 0.0128,
"step": 148000
},
{
"epoch": 2.64,
"learning_rate": 6e-06,
"loss": 0.0127,
"step": 148500
},
{
"epoch": 2.65,
"learning_rate": 5.851851851851852e-06,
"loss": 0.0129,
"step": 149000
},
{
"epoch": 2.66,
"learning_rate": 5.7037037037037045e-06,
"loss": 0.0127,
"step": 149500
},
{
"epoch": 2.67,
"learning_rate": 5.555555555555556e-06,
"loss": 0.0126,
"step": 150000
},
{
"epoch": 2.68,
"learning_rate": 5.407407407407407e-06,
"loss": 0.0127,
"step": 150500
},
{
"epoch": 2.68,
"learning_rate": 5.259259259259259e-06,
"loss": 0.0127,
"step": 151000
},
{
"epoch": 2.69,
"learning_rate": 5.1111111111111115e-06,
"loss": 0.0127,
"step": 151500
},
{
"epoch": 2.7,
"learning_rate": 4.962962962962963e-06,
"loss": 0.0124,
"step": 152000
},
{
"epoch": 2.71,
"learning_rate": 4.814814814814815e-06,
"loss": 0.0125,
"step": 152500
},
{
"epoch": 2.72,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0126,
"step": 153000
},
{
"epoch": 2.73,
"learning_rate": 4.5185185185185185e-06,
"loss": 0.0126,
"step": 153500
},
{
"epoch": 2.74,
"learning_rate": 4.370370370370371e-06,
"loss": 0.0123,
"step": 154000
},
{
"epoch": 2.75,
"learning_rate": 4.222222222222223e-06,
"loss": 0.0126,
"step": 154500
},
{
"epoch": 2.76,
"learning_rate": 4.074074074074075e-06,
"loss": 0.0129,
"step": 155000
},
{
"epoch": 2.76,
"learning_rate": 3.925925925925926e-06,
"loss": 0.0124,
"step": 155500
},
{
"epoch": 2.77,
"learning_rate": 3.777777777777778e-06,
"loss": 0.0125,
"step": 156000
},
{
"epoch": 2.78,
"learning_rate": 3.6296296296296302e-06,
"loss": 0.0126,
"step": 156500
},
{
"epoch": 2.79,
"learning_rate": 3.4814814814814816e-06,
"loss": 0.0127,
"step": 157000
},
{
"epoch": 2.8,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0131,
"step": 157500
},
{
"epoch": 2.81,
"learning_rate": 3.1851851851851855e-06,
"loss": 0.0124,
"step": 158000
},
{
"epoch": 2.82,
"learning_rate": 3.0370370370370372e-06,
"loss": 0.0124,
"step": 158500
},
{
"epoch": 2.83,
"learning_rate": 2.888888888888889e-06,
"loss": 0.0125,
"step": 159000
},
{
"epoch": 2.84,
"learning_rate": 2.7407407407407407e-06,
"loss": 0.0129,
"step": 159500
},
{
"epoch": 2.84,
"learning_rate": 2.5925925925925925e-06,
"loss": 0.0129,
"step": 160000
},
{
"epoch": 2.85,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.0126,
"step": 160500
},
{
"epoch": 2.86,
"learning_rate": 2.2962962962962964e-06,
"loss": 0.0127,
"step": 161000
},
{
"epoch": 2.87,
"learning_rate": 2.148148148148148e-06,
"loss": 0.0125,
"step": 161500
},
{
"epoch": 2.88,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0128,
"step": 162000
},
{
"epoch": 2.89,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.0125,
"step": 162500
},
{
"epoch": 2.9,
"learning_rate": 1.7037037037037038e-06,
"loss": 0.0127,
"step": 163000
},
{
"epoch": 2.91,
"learning_rate": 1.5555555555555556e-06,
"loss": 0.0124,
"step": 163500
},
{
"epoch": 2.92,
"learning_rate": 1.4074074074074075e-06,
"loss": 0.0129,
"step": 164000
},
{
"epoch": 2.92,
"learning_rate": 1.2592592592592593e-06,
"loss": 0.0128,
"step": 164500
},
{
"epoch": 2.93,
"learning_rate": 1.1111111111111112e-06,
"loss": 0.0124,
"step": 165000
},
{
"epoch": 2.94,
"learning_rate": 9.62962962962963e-07,
"loss": 0.0123,
"step": 165500
},
{
"epoch": 2.95,
"learning_rate": 8.148148148148147e-07,
"loss": 0.0128,
"step": 166000
},
{
"epoch": 2.96,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0129,
"step": 166500
},
{
"epoch": 2.97,
"learning_rate": 5.185185185185186e-07,
"loss": 0.0122,
"step": 167000
},
{
"epoch": 2.98,
"learning_rate": 3.703703703703704e-07,
"loss": 0.0128,
"step": 167500
},
{
"epoch": 2.99,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.0127,
"step": 168000
},
{
"epoch": 3.0,
"learning_rate": 7.407407407407407e-08,
"loss": 0.0127,
"step": 168500
},
{
"epoch": 3.0,
"eval_loss": 0.013624305836856365,
"eval_mse": 0.013624305836856365,
"eval_runtime": 98.4295,
"eval_samples_per_second": 142.234,
"step": 168750
},
{
"epoch": 3.0,
"step": 168750,
"total_flos": 5045118796800000.0,
"train_runtime": 51961.6794,
"train_samples_per_second": 3.248
}
],
"max_steps": 168750,
"num_train_epochs": 3,
"total_flos": 5045118796800000.0,
"trial_name": null,
"trial_params": null
}