jais-13b-poem-generation / trainer_state.json
boda's picture
upload jais
f1d02b4
raw
history blame
40.3 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0000058823875435,
"eval_steps": 10000,
"global_step": 170000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 0.0002,
"loss": 2.848,
"step": 500
},
{
"epoch": 0.01,
"learning_rate": 0.0002,
"loss": 2.7223,
"step": 1000
},
{
"epoch": 0.01,
"learning_rate": 0.0002,
"loss": 2.6764,
"step": 1500
},
{
"epoch": 0.01,
"learning_rate": 0.0002,
"loss": 2.7175,
"step": 2000
},
{
"epoch": 0.01,
"learning_rate": 0.0002,
"loss": 2.7574,
"step": 2500
},
{
"epoch": 0.02,
"learning_rate": 0.0002,
"loss": 2.7264,
"step": 3000
},
{
"epoch": 0.02,
"learning_rate": 0.0002,
"loss": 2.663,
"step": 3500
},
{
"epoch": 0.02,
"learning_rate": 0.0002,
"loss": 2.6491,
"step": 4000
},
{
"epoch": 0.03,
"learning_rate": 0.0002,
"loss": 2.6539,
"step": 4500
},
{
"epoch": 0.03,
"learning_rate": 0.0002,
"loss": 2.6336,
"step": 5000
},
{
"epoch": 0.03,
"learning_rate": 0.0002,
"loss": 2.714,
"step": 5500
},
{
"epoch": 0.04,
"learning_rate": 0.0002,
"loss": 2.6978,
"step": 6000
},
{
"epoch": 0.04,
"learning_rate": 0.0002,
"loss": 2.6446,
"step": 6500
},
{
"epoch": 0.04,
"learning_rate": 0.0002,
"loss": 2.6985,
"step": 7000
},
{
"epoch": 0.04,
"learning_rate": 0.0002,
"loss": 2.717,
"step": 7500
},
{
"epoch": 0.05,
"learning_rate": 0.0002,
"loss": 2.665,
"step": 8000
},
{
"epoch": 0.05,
"learning_rate": 0.0002,
"loss": 2.6201,
"step": 8500
},
{
"epoch": 0.05,
"learning_rate": 0.0002,
"loss": 2.6889,
"step": 9000
},
{
"epoch": 0.06,
"learning_rate": 0.0002,
"loss": 2.6772,
"step": 9500
},
{
"epoch": 0.06,
"learning_rate": 0.0002,
"loss": 2.677,
"step": 10000
},
{
"epoch": 0.06,
"eval_loss": 1.0160539150238037,
"eval_runtime": 10760.1771,
"eval_samples_per_second": 1.975,
"eval_steps_per_second": 1.975,
"step": 10000
},
{
"epoch": 0.06,
"learning_rate": 0.0002,
"loss": 2.6959,
"step": 10500
},
{
"epoch": 0.06,
"learning_rate": 0.0002,
"loss": 2.6537,
"step": 11000
},
{
"epoch": 0.07,
"learning_rate": 0.0002,
"loss": 2.646,
"step": 11500
},
{
"epoch": 0.07,
"learning_rate": 0.0002,
"loss": 2.6385,
"step": 12000
},
{
"epoch": 0.07,
"learning_rate": 0.0002,
"loss": 2.7758,
"step": 12500
},
{
"epoch": 0.08,
"learning_rate": 0.0002,
"loss": 2.6346,
"step": 13000
},
{
"epoch": 0.08,
"learning_rate": 0.0002,
"loss": 2.6672,
"step": 13500
},
{
"epoch": 0.08,
"learning_rate": 0.0002,
"loss": 2.6466,
"step": 14000
},
{
"epoch": 0.09,
"learning_rate": 0.0002,
"loss": 2.6491,
"step": 14500
},
{
"epoch": 0.09,
"learning_rate": 0.0002,
"loss": 2.6354,
"step": 15000
},
{
"epoch": 0.09,
"learning_rate": 0.0002,
"loss": 2.6093,
"step": 15500
},
{
"epoch": 0.09,
"learning_rate": 0.0002,
"loss": 2.6614,
"step": 16000
},
{
"epoch": 0.1,
"learning_rate": 0.0002,
"loss": 2.6479,
"step": 16500
},
{
"epoch": 0.1,
"learning_rate": 0.0002,
"loss": 2.6402,
"step": 17000
},
{
"epoch": 0.1,
"learning_rate": 0.0002,
"loss": 2.6736,
"step": 17500
},
{
"epoch": 0.11,
"learning_rate": 0.0002,
"loss": 2.7083,
"step": 18000
},
{
"epoch": 0.11,
"learning_rate": 0.0002,
"loss": 2.5751,
"step": 18500
},
{
"epoch": 0.11,
"learning_rate": 0.0002,
"loss": 2.7049,
"step": 19000
},
{
"epoch": 0.11,
"learning_rate": 0.0002,
"loss": 2.6079,
"step": 19500
},
{
"epoch": 0.12,
"learning_rate": 0.0002,
"loss": 2.6031,
"step": 20000
},
{
"epoch": 0.12,
"eval_loss": 0.9962247014045715,
"eval_runtime": 10758.8023,
"eval_samples_per_second": 1.975,
"eval_steps_per_second": 1.975,
"step": 20000
},
{
"epoch": 0.12,
"learning_rate": 0.0002,
"loss": 2.6583,
"step": 20500
},
{
"epoch": 0.12,
"learning_rate": 0.0002,
"loss": 2.6182,
"step": 21000
},
{
"epoch": 0.13,
"learning_rate": 0.0002,
"loss": 2.6299,
"step": 21500
},
{
"epoch": 0.13,
"learning_rate": 0.0002,
"loss": 2.6448,
"step": 22000
},
{
"epoch": 0.13,
"learning_rate": 0.0002,
"loss": 2.6013,
"step": 22500
},
{
"epoch": 0.14,
"learning_rate": 0.0002,
"loss": 2.6863,
"step": 23000
},
{
"epoch": 0.14,
"learning_rate": 0.0002,
"loss": 2.6656,
"step": 23500
},
{
"epoch": 0.14,
"learning_rate": 0.0002,
"loss": 2.6429,
"step": 24000
},
{
"epoch": 0.14,
"learning_rate": 0.0002,
"loss": 2.6574,
"step": 24500
},
{
"epoch": 0.15,
"learning_rate": 0.0002,
"loss": 2.6068,
"step": 25000
},
{
"epoch": 0.15,
"learning_rate": 0.0002,
"loss": 2.608,
"step": 25500
},
{
"epoch": 0.15,
"learning_rate": 0.0002,
"loss": 2.5827,
"step": 26000
},
{
"epoch": 0.16,
"learning_rate": 0.0002,
"loss": 2.5467,
"step": 26500
},
{
"epoch": 0.16,
"learning_rate": 0.0002,
"loss": 2.5671,
"step": 27000
},
{
"epoch": 0.16,
"learning_rate": 0.0002,
"loss": 2.5771,
"step": 27500
},
{
"epoch": 0.16,
"learning_rate": 0.0002,
"loss": 2.5697,
"step": 28000
},
{
"epoch": 0.17,
"learning_rate": 0.0002,
"loss": 2.5959,
"step": 28500
},
{
"epoch": 0.17,
"learning_rate": 0.0002,
"loss": 2.6184,
"step": 29000
},
{
"epoch": 0.17,
"learning_rate": 0.0002,
"loss": 2.5886,
"step": 29500
},
{
"epoch": 0.18,
"learning_rate": 0.0002,
"loss": 2.6198,
"step": 30000
},
{
"epoch": 0.18,
"eval_loss": 0.9876086115837097,
"eval_runtime": 4361.4057,
"eval_samples_per_second": 4.872,
"eval_steps_per_second": 4.872,
"step": 30000
},
{
"epoch": 0.18,
"learning_rate": 0.0002,
"loss": 2.6259,
"step": 30500
},
{
"epoch": 0.18,
"learning_rate": 0.0002,
"loss": 2.531,
"step": 31000
},
{
"epoch": 0.19,
"learning_rate": 0.0002,
"loss": 2.6404,
"step": 31500
},
{
"epoch": 0.19,
"learning_rate": 0.0002,
"loss": 2.5938,
"step": 32000
},
{
"epoch": 0.19,
"learning_rate": 0.0002,
"loss": 2.6468,
"step": 32500
},
{
"epoch": 0.19,
"learning_rate": 0.0002,
"loss": 2.5516,
"step": 33000
},
{
"epoch": 0.2,
"learning_rate": 0.0002,
"loss": 2.5667,
"step": 33500
},
{
"epoch": 0.2,
"learning_rate": 0.0002,
"loss": 2.5937,
"step": 34000
},
{
"epoch": 0.2,
"learning_rate": 0.0002,
"loss": 2.554,
"step": 34500
},
{
"epoch": 0.21,
"learning_rate": 0.0002,
"loss": 2.6059,
"step": 35000
},
{
"epoch": 0.21,
"learning_rate": 0.0002,
"loss": 2.6047,
"step": 35500
},
{
"epoch": 0.21,
"learning_rate": 0.0002,
"loss": 2.6003,
"step": 36000
},
{
"epoch": 0.21,
"learning_rate": 0.0002,
"loss": 2.5939,
"step": 36500
},
{
"epoch": 0.22,
"learning_rate": 0.0002,
"loss": 2.6079,
"step": 37000
},
{
"epoch": 0.22,
"learning_rate": 0.0002,
"loss": 2.5879,
"step": 37500
},
{
"epoch": 0.22,
"learning_rate": 0.0002,
"loss": 2.5627,
"step": 38000
},
{
"epoch": 0.23,
"learning_rate": 0.0002,
"loss": 2.5913,
"step": 38500
},
{
"epoch": 0.23,
"learning_rate": 0.0002,
"loss": 2.5924,
"step": 39000
},
{
"epoch": 0.23,
"learning_rate": 0.0002,
"loss": 2.541,
"step": 39500
},
{
"epoch": 0.24,
"learning_rate": 0.0002,
"loss": 2.5684,
"step": 40000
},
{
"epoch": 0.24,
"eval_loss": 0.9882351160049438,
"eval_runtime": 4362.1091,
"eval_samples_per_second": 4.871,
"eval_steps_per_second": 4.871,
"step": 40000
},
{
"epoch": 0.24,
"learning_rate": 0.0002,
"loss": 2.6912,
"step": 40500
},
{
"epoch": 0.24,
"learning_rate": 0.0002,
"loss": 2.5589,
"step": 41000
},
{
"epoch": 0.24,
"learning_rate": 0.0002,
"loss": 2.5529,
"step": 41500
},
{
"epoch": 0.25,
"learning_rate": 0.0002,
"loss": 2.6136,
"step": 42000
},
{
"epoch": 0.25,
"learning_rate": 0.0002,
"loss": 2.5734,
"step": 42500
},
{
"epoch": 0.25,
"learning_rate": 0.0002,
"loss": 2.5784,
"step": 43000
},
{
"epoch": 0.26,
"learning_rate": 0.0002,
"loss": 2.571,
"step": 43500
},
{
"epoch": 0.26,
"learning_rate": 0.0002,
"loss": 2.5876,
"step": 44000
},
{
"epoch": 0.26,
"learning_rate": 0.0002,
"loss": 2.5619,
"step": 44500
},
{
"epoch": 0.26,
"learning_rate": 0.0002,
"loss": 2.5827,
"step": 45000
},
{
"epoch": 0.27,
"learning_rate": 0.0002,
"loss": 2.5261,
"step": 45500
},
{
"epoch": 0.27,
"learning_rate": 0.0002,
"loss": 2.5892,
"step": 46000
},
{
"epoch": 0.27,
"learning_rate": 0.0002,
"loss": 2.5985,
"step": 46500
},
{
"epoch": 0.28,
"learning_rate": 0.0002,
"loss": 2.5743,
"step": 47000
},
{
"epoch": 0.28,
"learning_rate": 0.0002,
"loss": 2.5872,
"step": 47500
},
{
"epoch": 0.28,
"learning_rate": 0.0002,
"loss": 2.631,
"step": 48000
},
{
"epoch": 0.29,
"learning_rate": 0.0002,
"loss": 2.5579,
"step": 48500
},
{
"epoch": 0.29,
"learning_rate": 0.0002,
"loss": 2.5076,
"step": 49000
},
{
"epoch": 0.29,
"learning_rate": 0.0002,
"loss": 2.5812,
"step": 49500
},
{
"epoch": 0.29,
"learning_rate": 0.0002,
"loss": 2.5464,
"step": 50000
},
{
"epoch": 0.29,
"eval_loss": 0.9824422597885132,
"eval_runtime": 4360.8255,
"eval_samples_per_second": 4.873,
"eval_steps_per_second": 4.873,
"step": 50000
},
{
"epoch": 0.3,
"learning_rate": 0.0002,
"loss": 2.5652,
"step": 50500
},
{
"epoch": 0.3,
"learning_rate": 0.0002,
"loss": 2.6049,
"step": 51000
},
{
"epoch": 0.3,
"learning_rate": 0.0002,
"loss": 2.5393,
"step": 51500
},
{
"epoch": 0.31,
"learning_rate": 0.0002,
"loss": 2.567,
"step": 52000
},
{
"epoch": 0.31,
"learning_rate": 0.0002,
"loss": 2.6241,
"step": 52500
},
{
"epoch": 0.31,
"learning_rate": 0.0002,
"loss": 2.5549,
"step": 53000
},
{
"epoch": 0.31,
"learning_rate": 0.0002,
"loss": 2.5779,
"step": 53500
},
{
"epoch": 0.32,
"learning_rate": 0.0002,
"loss": 2.5542,
"step": 54000
},
{
"epoch": 0.32,
"learning_rate": 0.0002,
"loss": 2.5893,
"step": 54500
},
{
"epoch": 0.32,
"learning_rate": 0.0002,
"loss": 2.5856,
"step": 55000
},
{
"epoch": 0.33,
"learning_rate": 0.0002,
"loss": 2.6168,
"step": 55500
},
{
"epoch": 0.33,
"learning_rate": 0.0002,
"loss": 2.537,
"step": 56000
},
{
"epoch": 0.33,
"learning_rate": 0.0002,
"loss": 2.4988,
"step": 56500
},
{
"epoch": 0.34,
"learning_rate": 0.0002,
"loss": 2.5916,
"step": 57000
},
{
"epoch": 0.34,
"learning_rate": 0.0002,
"loss": 2.5199,
"step": 57500
},
{
"epoch": 0.34,
"learning_rate": 0.0002,
"loss": 2.5634,
"step": 58000
},
{
"epoch": 0.34,
"learning_rate": 0.0002,
"loss": 2.5705,
"step": 58500
},
{
"epoch": 0.35,
"learning_rate": 0.0002,
"loss": 2.5698,
"step": 59000
},
{
"epoch": 0.35,
"learning_rate": 0.0002,
"loss": 2.5691,
"step": 59500
},
{
"epoch": 0.35,
"learning_rate": 0.0002,
"loss": 2.5616,
"step": 60000
},
{
"epoch": 0.35,
"eval_loss": 0.982897162437439,
"eval_runtime": 4362.1163,
"eval_samples_per_second": 4.871,
"eval_steps_per_second": 4.871,
"step": 60000
},
{
"epoch": 0.36,
"learning_rate": 0.0002,
"loss": 2.5571,
"step": 60500
},
{
"epoch": 0.36,
"learning_rate": 0.0002,
"loss": 2.5538,
"step": 61000
},
{
"epoch": 0.36,
"learning_rate": 0.0002,
"loss": 2.5666,
"step": 61500
},
{
"epoch": 0.36,
"learning_rate": 0.0002,
"loss": 2.5532,
"step": 62000
},
{
"epoch": 0.37,
"learning_rate": 0.0002,
"loss": 2.5578,
"step": 62500
},
{
"epoch": 0.37,
"learning_rate": 0.0002,
"loss": 2.5467,
"step": 63000
},
{
"epoch": 0.37,
"learning_rate": 0.0002,
"loss": 2.5936,
"step": 63500
},
{
"epoch": 0.38,
"learning_rate": 0.0002,
"loss": 2.5476,
"step": 64000
},
{
"epoch": 0.38,
"learning_rate": 0.0002,
"loss": 2.584,
"step": 64500
},
{
"epoch": 0.38,
"learning_rate": 0.0002,
"loss": 2.4837,
"step": 65000
},
{
"epoch": 0.39,
"learning_rate": 0.0002,
"loss": 2.5666,
"step": 65500
},
{
"epoch": 0.39,
"learning_rate": 0.0002,
"loss": 2.5333,
"step": 66000
},
{
"epoch": 0.39,
"learning_rate": 0.0002,
"loss": 2.4762,
"step": 66500
},
{
"epoch": 0.39,
"learning_rate": 0.0002,
"loss": 2.5221,
"step": 67000
},
{
"epoch": 0.4,
"learning_rate": 0.0002,
"loss": 2.5362,
"step": 67500
},
{
"epoch": 0.4,
"learning_rate": 0.0002,
"loss": 2.5554,
"step": 68000
},
{
"epoch": 0.4,
"learning_rate": 0.0002,
"loss": 2.5525,
"step": 68500
},
{
"epoch": 0.41,
"learning_rate": 0.0002,
"loss": 2.6418,
"step": 69000
},
{
"epoch": 0.41,
"learning_rate": 0.0002,
"loss": 2.5773,
"step": 69500
},
{
"epoch": 0.41,
"learning_rate": 0.0002,
"loss": 2.4553,
"step": 70000
},
{
"epoch": 0.41,
"eval_loss": 0.9774429202079773,
"eval_runtime": 4360.3352,
"eval_samples_per_second": 4.873,
"eval_steps_per_second": 4.873,
"step": 70000
},
{
"epoch": 0.41,
"learning_rate": 0.0002,
"loss": 2.5563,
"step": 70500
},
{
"epoch": 0.42,
"learning_rate": 0.0002,
"loss": 2.5383,
"step": 71000
},
{
"epoch": 0.42,
"learning_rate": 0.0002,
"loss": 2.569,
"step": 71500
},
{
"epoch": 0.42,
"learning_rate": 0.0002,
"loss": 2.5254,
"step": 72000
},
{
"epoch": 0.43,
"learning_rate": 0.0002,
"loss": 2.5403,
"step": 72500
},
{
"epoch": 0.43,
"learning_rate": 0.0002,
"loss": 2.5346,
"step": 73000
},
{
"epoch": 0.43,
"learning_rate": 0.0002,
"loss": 2.5149,
"step": 73500
},
{
"epoch": 0.44,
"learning_rate": 0.0002,
"loss": 2.5542,
"step": 74000
},
{
"epoch": 0.44,
"learning_rate": 0.0002,
"loss": 2.4683,
"step": 74500
},
{
"epoch": 0.44,
"learning_rate": 0.0002,
"loss": 2.5426,
"step": 75000
},
{
"epoch": 0.44,
"learning_rate": 0.0002,
"loss": 2.5678,
"step": 75500
},
{
"epoch": 0.45,
"learning_rate": 0.0002,
"loss": 2.5233,
"step": 76000
},
{
"epoch": 0.45,
"learning_rate": 0.0002,
"loss": 2.5101,
"step": 76500
},
{
"epoch": 0.45,
"learning_rate": 0.0002,
"loss": 2.5563,
"step": 77000
},
{
"epoch": 0.46,
"learning_rate": 0.0002,
"loss": 2.4892,
"step": 77500
},
{
"epoch": 0.46,
"learning_rate": 0.0002,
"loss": 2.5376,
"step": 78000
},
{
"epoch": 0.46,
"learning_rate": 0.0002,
"loss": 2.5388,
"step": 78500
},
{
"epoch": 0.46,
"learning_rate": 0.0002,
"loss": 2.6034,
"step": 79000
},
{
"epoch": 0.47,
"learning_rate": 0.0002,
"loss": 2.5005,
"step": 79500
},
{
"epoch": 0.47,
"learning_rate": 0.0002,
"loss": 2.5341,
"step": 80000
},
{
"epoch": 0.47,
"eval_loss": 0.9903130531311035,
"eval_runtime": 4360.6192,
"eval_samples_per_second": 4.873,
"eval_steps_per_second": 4.873,
"step": 80000
},
{
"epoch": 0.47,
"learning_rate": 0.0002,
"loss": 2.524,
"step": 80500
},
{
"epoch": 0.48,
"learning_rate": 0.0002,
"loss": 2.5629,
"step": 81000
},
{
"epoch": 0.48,
"learning_rate": 0.0002,
"loss": 2.5062,
"step": 81500
},
{
"epoch": 0.48,
"learning_rate": 0.0002,
"loss": 2.5454,
"step": 82000
},
{
"epoch": 0.49,
"learning_rate": 0.0002,
"loss": 2.5476,
"step": 82500
},
{
"epoch": 0.49,
"learning_rate": 0.0002,
"loss": 2.5474,
"step": 83000
},
{
"epoch": 0.49,
"learning_rate": 0.0002,
"loss": 2.5022,
"step": 83500
},
{
"epoch": 0.49,
"learning_rate": 0.0002,
"loss": 2.5171,
"step": 84000
},
{
"epoch": 0.5,
"learning_rate": 0.0002,
"loss": 2.5552,
"step": 84500
},
{
"epoch": 0.5,
"learning_rate": 0.0002,
"loss": 2.5438,
"step": 85000
},
{
"epoch": 0.5,
"learning_rate": 0.0002,
"loss": 2.5372,
"step": 85500
},
{
"epoch": 0.51,
"learning_rate": 0.0002,
"loss": 2.5573,
"step": 86000
},
{
"epoch": 0.51,
"learning_rate": 0.0002,
"loss": 2.529,
"step": 86500
},
{
"epoch": 0.51,
"learning_rate": 0.0002,
"loss": 2.5663,
"step": 87000
},
{
"epoch": 0.51,
"learning_rate": 0.0002,
"loss": 2.6346,
"step": 87500
},
{
"epoch": 0.52,
"learning_rate": 0.0002,
"loss": 2.5278,
"step": 88000
},
{
"epoch": 0.52,
"learning_rate": 0.0002,
"loss": 2.3625,
"step": 88500
},
{
"epoch": 0.52,
"learning_rate": 0.0002,
"loss": 2.3439,
"step": 89000
},
{
"epoch": 0.53,
"learning_rate": 0.0002,
"loss": 2.3465,
"step": 89500
},
{
"epoch": 0.53,
"learning_rate": 0.0002,
"loss": 2.3382,
"step": 90000
},
{
"epoch": 0.53,
"eval_loss": 2.6655304431915283,
"eval_runtime": 4818.1263,
"eval_samples_per_second": 4.41,
"eval_steps_per_second": 4.41,
"step": 90000
},
{
"epoch": 0.53,
"learning_rate": 0.0002,
"loss": 2.3589,
"step": 90500
},
{
"epoch": 0.54,
"learning_rate": 0.0002,
"loss": 2.3548,
"step": 91000
},
{
"epoch": 0.54,
"learning_rate": 0.0002,
"loss": 2.3523,
"step": 91500
},
{
"epoch": 0.54,
"learning_rate": 0.0002,
"loss": 2.3466,
"step": 92000
},
{
"epoch": 0.54,
"learning_rate": 0.0002,
"loss": 2.3472,
"step": 92500
},
{
"epoch": 0.55,
"learning_rate": 0.0002,
"loss": 2.3599,
"step": 93000
},
{
"epoch": 0.55,
"learning_rate": 0.0002,
"loss": 2.3891,
"step": 93500
},
{
"epoch": 0.55,
"learning_rate": 0.0002,
"loss": 2.3451,
"step": 94000
},
{
"epoch": 0.56,
"learning_rate": 0.0002,
"loss": 2.3753,
"step": 94500
},
{
"epoch": 0.56,
"learning_rate": 0.0002,
"loss": 2.3492,
"step": 95000
},
{
"epoch": 0.56,
"learning_rate": 0.0002,
"loss": 2.315,
"step": 95500
},
{
"epoch": 0.56,
"learning_rate": 0.0002,
"loss": 2.361,
"step": 96000
},
{
"epoch": 0.57,
"learning_rate": 0.0002,
"loss": 2.3837,
"step": 96500
},
{
"epoch": 0.57,
"learning_rate": 0.0002,
"loss": 2.3445,
"step": 97000
},
{
"epoch": 0.57,
"learning_rate": 0.0002,
"loss": 2.349,
"step": 97500
},
{
"epoch": 0.58,
"learning_rate": 0.0002,
"loss": 2.3372,
"step": 98000
},
{
"epoch": 0.58,
"learning_rate": 0.0002,
"loss": 2.3443,
"step": 98500
},
{
"epoch": 0.58,
"learning_rate": 0.0002,
"loss": 2.3264,
"step": 99000
},
{
"epoch": 0.59,
"learning_rate": 0.0002,
"loss": 2.3284,
"step": 99500
},
{
"epoch": 0.59,
"learning_rate": 0.0002,
"loss": 2.364,
"step": 100000
},
{
"epoch": 0.59,
"eval_loss": 2.6744492053985596,
"eval_runtime": 4817.3407,
"eval_samples_per_second": 4.411,
"eval_steps_per_second": 4.411,
"step": 100000
},
{
"epoch": 0.59,
"learning_rate": 0.0002,
"loss": 2.3644,
"step": 100500
},
{
"epoch": 0.59,
"learning_rate": 0.0002,
"loss": 2.3559,
"step": 101000
},
{
"epoch": 0.6,
"learning_rate": 0.0002,
"loss": 2.3435,
"step": 101500
},
{
"epoch": 0.6,
"learning_rate": 0.0002,
"loss": 2.3936,
"step": 102000
},
{
"epoch": 0.6,
"learning_rate": 0.0002,
"loss": 2.3371,
"step": 102500
},
{
"epoch": 0.61,
"learning_rate": 0.0002,
"loss": 2.3501,
"step": 103000
},
{
"epoch": 0.61,
"learning_rate": 0.0002,
"loss": 2.3649,
"step": 103500
},
{
"epoch": 0.61,
"learning_rate": 0.0002,
"loss": 2.3642,
"step": 104000
},
{
"epoch": 0.61,
"learning_rate": 0.0002,
"loss": 2.3534,
"step": 104500
},
{
"epoch": 0.62,
"learning_rate": 0.0002,
"loss": 2.3466,
"step": 105000
},
{
"epoch": 0.62,
"learning_rate": 0.0002,
"loss": 2.2974,
"step": 105500
},
{
"epoch": 0.62,
"learning_rate": 0.0002,
"loss": 2.3481,
"step": 106000
},
{
"epoch": 0.63,
"learning_rate": 0.0002,
"loss": 2.3419,
"step": 106500
},
{
"epoch": 0.63,
"learning_rate": 0.0002,
"loss": 2.3576,
"step": 107000
},
{
"epoch": 0.63,
"learning_rate": 0.0002,
"loss": 2.3623,
"step": 107500
},
{
"epoch": 0.64,
"learning_rate": 0.0002,
"loss": 2.3495,
"step": 108000
},
{
"epoch": 0.64,
"learning_rate": 0.0002,
"loss": 2.3712,
"step": 108500
},
{
"epoch": 0.64,
"learning_rate": 0.0002,
"loss": 2.3906,
"step": 109000
},
{
"epoch": 0.64,
"learning_rate": 0.0002,
"loss": 2.4141,
"step": 109500
},
{
"epoch": 0.65,
"learning_rate": 0.0002,
"loss": 2.3699,
"step": 110000
},
{
"epoch": 0.65,
"eval_loss": 2.673431634902954,
"eval_runtime": 4815.7605,
"eval_samples_per_second": 4.413,
"eval_steps_per_second": 4.413,
"step": 110000
},
{
"epoch": 0.65,
"learning_rate": 0.0002,
"loss": 2.3651,
"step": 110500
},
{
"epoch": 0.65,
"learning_rate": 0.0002,
"loss": 2.3447,
"step": 111000
},
{
"epoch": 0.66,
"learning_rate": 0.0002,
"loss": 2.3671,
"step": 111500
},
{
"epoch": 0.66,
"learning_rate": 0.0002,
"loss": 2.356,
"step": 112000
},
{
"epoch": 0.66,
"learning_rate": 0.0002,
"loss": 2.3909,
"step": 112500
},
{
"epoch": 0.66,
"learning_rate": 0.0002,
"loss": 2.3871,
"step": 113000
},
{
"epoch": 0.67,
"learning_rate": 0.0002,
"loss": 2.3786,
"step": 113500
},
{
"epoch": 0.67,
"learning_rate": 0.0002,
"loss": 2.3473,
"step": 114000
},
{
"epoch": 0.67,
"learning_rate": 0.0002,
"loss": 2.3424,
"step": 114500
},
{
"epoch": 0.68,
"learning_rate": 0.0002,
"loss": 2.3396,
"step": 115000
},
{
"epoch": 0.68,
"learning_rate": 0.0002,
"loss": 2.3395,
"step": 115500
},
{
"epoch": 0.68,
"learning_rate": 0.0002,
"loss": 2.3115,
"step": 116000
},
{
"epoch": 0.69,
"learning_rate": 0.0002,
"loss": 2.3832,
"step": 116500
},
{
"epoch": 0.69,
"learning_rate": 0.0002,
"loss": 2.3606,
"step": 117000
},
{
"epoch": 0.69,
"learning_rate": 0.0002,
"loss": 2.3634,
"step": 117500
},
{
"epoch": 0.69,
"learning_rate": 0.0002,
"loss": 2.365,
"step": 118000
},
{
"epoch": 0.7,
"learning_rate": 0.0002,
"loss": 2.3456,
"step": 118500
},
{
"epoch": 0.7,
"learning_rate": 0.0002,
"loss": 2.3678,
"step": 119000
},
{
"epoch": 0.7,
"learning_rate": 0.0002,
"loss": 2.3941,
"step": 119500
},
{
"epoch": 0.71,
"learning_rate": 0.0002,
"loss": 2.3415,
"step": 120000
},
{
"epoch": 0.71,
"eval_loss": 2.6743311882019043,
"eval_runtime": 4816.0272,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 4.412,
"step": 120000
},
{
"epoch": 0.71,
"learning_rate": 0.0002,
"loss": 2.3634,
"step": 120500
},
{
"epoch": 0.71,
"learning_rate": 0.0002,
"loss": 2.3707,
"step": 121000
},
{
"epoch": 0.71,
"learning_rate": 0.0002,
"loss": 2.3635,
"step": 121500
},
{
"epoch": 0.72,
"learning_rate": 0.0002,
"loss": 2.351,
"step": 122000
},
{
"epoch": 0.72,
"learning_rate": 0.0002,
"loss": 2.3491,
"step": 122500
},
{
"epoch": 0.72,
"learning_rate": 0.0002,
"loss": 2.3578,
"step": 123000
},
{
"epoch": 0.73,
"learning_rate": 0.0002,
"loss": 2.3551,
"step": 123500
},
{
"epoch": 0.73,
"learning_rate": 0.0002,
"loss": 2.3329,
"step": 124000
},
{
"epoch": 0.73,
"learning_rate": 0.0002,
"loss": 2.3587,
"step": 124500
},
{
"epoch": 0.74,
"learning_rate": 0.0002,
"loss": 2.3825,
"step": 125000
},
{
"epoch": 0.74,
"learning_rate": 0.0002,
"loss": 2.3499,
"step": 125500
},
{
"epoch": 0.74,
"learning_rate": 0.0002,
"loss": 2.3618,
"step": 126000
},
{
"epoch": 0.74,
"learning_rate": 0.0002,
"loss": 2.3611,
"step": 126500
},
{
"epoch": 0.75,
"learning_rate": 0.0002,
"loss": 2.3519,
"step": 127000
},
{
"epoch": 0.75,
"learning_rate": 0.0002,
"loss": 2.3942,
"step": 127500
},
{
"epoch": 0.75,
"learning_rate": 0.0002,
"loss": 2.3654,
"step": 128000
},
{
"epoch": 0.76,
"learning_rate": 0.0002,
"loss": 2.34,
"step": 128500
},
{
"epoch": 0.76,
"learning_rate": 0.0002,
"loss": 2.3108,
"step": 129000
},
{
"epoch": 0.76,
"learning_rate": 0.0002,
"loss": 2.378,
"step": 129500
},
{
"epoch": 0.76,
"learning_rate": 0.0002,
"loss": 2.356,
"step": 130000
},
{
"epoch": 0.76,
"eval_loss": 2.6768763065338135,
"eval_runtime": 4816.6925,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 4.412,
"step": 130000
},
{
"epoch": 0.77,
"learning_rate": 0.0002,
"loss": 2.3822,
"step": 130500
},
{
"epoch": 0.77,
"learning_rate": 0.0002,
"loss": 2.3459,
"step": 131000
},
{
"epoch": 0.77,
"learning_rate": 0.0002,
"loss": 2.3586,
"step": 131500
},
{
"epoch": 0.78,
"learning_rate": 0.0002,
"loss": 2.3454,
"step": 132000
},
{
"epoch": 0.78,
"learning_rate": 0.0002,
"loss": 2.3247,
"step": 132500
},
{
"epoch": 0.78,
"learning_rate": 0.0002,
"loss": 2.4073,
"step": 133000
},
{
"epoch": 0.79,
"learning_rate": 0.0002,
"loss": 2.3414,
"step": 133500
},
{
"epoch": 0.79,
"learning_rate": 0.0002,
"loss": 2.3574,
"step": 134000
},
{
"epoch": 0.79,
"learning_rate": 0.0002,
"loss": 2.3388,
"step": 134500
},
{
"epoch": 0.79,
"learning_rate": 0.0002,
"loss": 2.3689,
"step": 135000
},
{
"epoch": 0.8,
"learning_rate": 0.0002,
"loss": 2.3185,
"step": 135500
},
{
"epoch": 0.8,
"learning_rate": 0.0002,
"loss": 2.337,
"step": 136000
},
{
"epoch": 0.8,
"learning_rate": 0.0002,
"loss": 2.3462,
"step": 136500
},
{
"epoch": 0.81,
"learning_rate": 0.0002,
"loss": 2.3661,
"step": 137000
},
{
"epoch": 0.81,
"learning_rate": 0.0002,
"loss": 2.3596,
"step": 137500
},
{
"epoch": 0.81,
"learning_rate": 0.0002,
"loss": 2.3333,
"step": 138000
},
{
"epoch": 0.81,
"learning_rate": 0.0002,
"loss": 2.3709,
"step": 138500
},
{
"epoch": 0.82,
"learning_rate": 0.0002,
"loss": 2.3717,
"step": 139000
},
{
"epoch": 0.82,
"learning_rate": 0.0002,
"loss": 2.3709,
"step": 139500
},
{
"epoch": 0.82,
"learning_rate": 0.0002,
"loss": 2.3056,
"step": 140000
},
{
"epoch": 0.82,
"eval_loss": 2.681723117828369,
"eval_runtime": 4815.0048,
"eval_samples_per_second": 4.413,
"eval_steps_per_second": 4.413,
"step": 140000
},
{
"epoch": 0.83,
"learning_rate": 0.0002,
"loss": 2.3498,
"step": 140500
},
{
"epoch": 0.83,
"learning_rate": 0.0002,
"loss": 2.372,
"step": 141000
},
{
"epoch": 0.83,
"learning_rate": 0.0002,
"loss": 2.3824,
"step": 141500
},
{
"epoch": 0.84,
"learning_rate": 0.0002,
"loss": 2.3486,
"step": 142000
},
{
"epoch": 0.84,
"learning_rate": 0.0002,
"loss": 2.3141,
"step": 142500
},
{
"epoch": 0.84,
"learning_rate": 0.0002,
"loss": 2.3696,
"step": 143000
},
{
"epoch": 0.84,
"learning_rate": 0.0002,
"loss": 2.3762,
"step": 143500
},
{
"epoch": 0.85,
"learning_rate": 0.0002,
"loss": 2.3766,
"step": 144000
},
{
"epoch": 0.85,
"learning_rate": 0.0002,
"loss": 2.3511,
"step": 144500
},
{
"epoch": 0.85,
"learning_rate": 0.0002,
"loss": 2.3493,
"step": 145000
},
{
"epoch": 0.86,
"learning_rate": 0.0002,
"loss": 2.3244,
"step": 145500
},
{
"epoch": 0.86,
"learning_rate": 0.0002,
"loss": 2.3146,
"step": 146000
},
{
"epoch": 0.86,
"learning_rate": 0.0002,
"loss": 2.3676,
"step": 146500
},
{
"epoch": 0.86,
"learning_rate": 0.0002,
"loss": 2.3497,
"step": 147000
},
{
"epoch": 0.87,
"learning_rate": 0.0002,
"loss": 2.375,
"step": 147500
},
{
"epoch": 0.87,
"learning_rate": 0.0002,
"loss": 2.2942,
"step": 148000
},
{
"epoch": 0.87,
"learning_rate": 0.0002,
"loss": 2.384,
"step": 148500
},
{
"epoch": 0.88,
"learning_rate": 0.0002,
"loss": 2.3533,
"step": 149000
},
{
"epoch": 0.88,
"learning_rate": 0.0002,
"loss": 2.3455,
"step": 149500
},
{
"epoch": 0.88,
"learning_rate": 0.0002,
"loss": 2.3398,
"step": 150000
},
{
"epoch": 0.88,
"eval_loss": 2.672765016555786,
"eval_runtime": 4814.8179,
"eval_samples_per_second": 4.413,
"eval_steps_per_second": 4.413,
"step": 150000
},
{
"epoch": 0.89,
"learning_rate": 0.0002,
"loss": 2.318,
"step": 150500
},
{
"epoch": 0.89,
"learning_rate": 0.0002,
"loss": 2.3638,
"step": 151000
},
{
"epoch": 0.89,
"learning_rate": 0.0002,
"loss": 2.3714,
"step": 151500
},
{
"epoch": 0.89,
"learning_rate": 0.0002,
"loss": 2.334,
"step": 152000
},
{
"epoch": 0.9,
"learning_rate": 0.0002,
"loss": 2.3208,
"step": 152500
},
{
"epoch": 0.9,
"learning_rate": 0.0002,
"loss": 2.3684,
"step": 153000
},
{
"epoch": 0.9,
"learning_rate": 0.0002,
"loss": 2.3963,
"step": 153500
},
{
"epoch": 0.91,
"learning_rate": 0.0002,
"loss": 2.3087,
"step": 154000
},
{
"epoch": 0.91,
"learning_rate": 0.0002,
"loss": 2.3193,
"step": 154500
},
{
"epoch": 0.91,
"learning_rate": 0.0002,
"loss": 2.3295,
"step": 155000
},
{
"epoch": 0.91,
"learning_rate": 0.0002,
"loss": 2.3711,
"step": 155500
},
{
"epoch": 0.92,
"learning_rate": 0.0002,
"loss": 2.3697,
"step": 156000
},
{
"epoch": 0.92,
"learning_rate": 0.0002,
"loss": 2.3053,
"step": 156500
},
{
"epoch": 0.92,
"learning_rate": 0.0002,
"loss": 2.3356,
"step": 157000
},
{
"epoch": 0.93,
"learning_rate": 0.0002,
"loss": 2.3786,
"step": 157500
},
{
"epoch": 0.93,
"learning_rate": 0.0002,
"loss": 2.3693,
"step": 158000
},
{
"epoch": 0.93,
"learning_rate": 0.0002,
"loss": 2.3334,
"step": 158500
},
{
"epoch": 0.94,
"learning_rate": 0.0002,
"loss": 2.3657,
"step": 159000
},
{
"epoch": 0.94,
"learning_rate": 0.0002,
"loss": 2.3242,
"step": 159500
},
{
"epoch": 0.94,
"learning_rate": 0.0002,
"loss": 2.3508,
"step": 160000
},
{
"epoch": 0.94,
"eval_loss": 2.671438694000244,
"eval_runtime": 4816.9535,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 4.412,
"step": 160000
},
{
"epoch": 0.94,
"learning_rate": 0.0002,
"loss": 2.345,
"step": 160500
},
{
"epoch": 0.95,
"learning_rate": 0.0002,
"loss": 2.3409,
"step": 161000
},
{
"epoch": 0.95,
"learning_rate": 0.0002,
"loss": 2.3205,
"step": 161500
},
{
"epoch": 0.95,
"learning_rate": 0.0002,
"loss": 2.3881,
"step": 162000
},
{
"epoch": 0.96,
"learning_rate": 0.0002,
"loss": 2.3543,
"step": 162500
},
{
"epoch": 0.96,
"learning_rate": 0.0002,
"loss": 2.3467,
"step": 163000
},
{
"epoch": 0.96,
"learning_rate": 0.0002,
"loss": 2.3788,
"step": 163500
},
{
"epoch": 0.96,
"learning_rate": 0.0002,
"loss": 2.359,
"step": 164000
},
{
"epoch": 0.97,
"learning_rate": 0.0002,
"loss": 2.339,
"step": 164500
},
{
"epoch": 0.97,
"learning_rate": 0.0002,
"loss": 2.3557,
"step": 165000
},
{
"epoch": 0.97,
"learning_rate": 0.0002,
"loss": 2.3321,
"step": 165500
},
{
"epoch": 0.98,
"learning_rate": 0.0002,
"loss": 2.4048,
"step": 166000
},
{
"epoch": 0.98,
"learning_rate": 0.0002,
"loss": 2.3428,
"step": 166500
},
{
"epoch": 0.98,
"learning_rate": 0.0002,
"loss": 2.3249,
"step": 167000
},
{
"epoch": 0.99,
"learning_rate": 0.0002,
"loss": 2.3743,
"step": 167500
},
{
"epoch": 0.99,
"learning_rate": 0.0002,
"loss": 2.3225,
"step": 168000
},
{
"epoch": 0.99,
"learning_rate": 0.0002,
"loss": 2.3038,
"step": 168500
},
{
"epoch": 0.99,
"learning_rate": 0.0002,
"loss": 2.3347,
"step": 169000
},
{
"epoch": 1.0,
"learning_rate": 0.0002,
"loss": 2.3798,
"step": 169500
},
{
"epoch": 1.0,
"learning_rate": 0.0002,
"loss": 2.3568,
"step": 170000
},
{
"epoch": 1.0,
"eval_loss": 2.66947865486145,
"eval_runtime": 4817.1455,
"eval_samples_per_second": 4.411,
"eval_steps_per_second": 4.411,
"step": 170000
}
],
"logging_steps": 500,
"max_steps": 1699990,
"num_train_epochs": 10,
"save_steps": 1000,
"total_flos": 1.772080615538583e+18,
"trial_name": null,
"trial_params": null
}