2020-Q2-full_tweets_combined90 / trainer_state.json
DouglasPontes's picture
Training in progress, step 64000
b4ceef1 verified
raw
history blame
80.8 kB
{
"best_metric": 2.000014066696167,
"best_model_checkpoint": "./model_tweets_2020_Q2_full/checkpoint-2400000",
"epoch": 2.618552224296455,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"eval_loss": 2.1043460369110107,
"eval_runtime": 841.4118,
"eval_samples_per_second": 917.292,
"eval_steps_per_second": 57.331,
"step": 8000
},
{
"epoch": 0.02,
"learning_rate": 4.0726666666666665e-07,
"loss": 2.2608,
"step": 16000
},
{
"epoch": 0.02,
"eval_loss": 2.093374013900757,
"eval_runtime": 839.5651,
"eval_samples_per_second": 919.309,
"eval_steps_per_second": 57.457,
"step": 16000
},
{
"epoch": 0.03,
"eval_loss": 2.0861904621124268,
"eval_runtime": 840.327,
"eval_samples_per_second": 918.476,
"eval_steps_per_second": 57.405,
"step": 24000
},
{
"epoch": 0.03,
"learning_rate": 4.0453333333333336e-07,
"loss": 2.2409,
"step": 32000
},
{
"epoch": 0.03,
"eval_loss": 2.080547332763672,
"eval_runtime": 841.1888,
"eval_samples_per_second": 917.535,
"eval_steps_per_second": 57.346,
"step": 32000
},
{
"epoch": 0.04,
"eval_loss": 2.079263210296631,
"eval_runtime": 841.5773,
"eval_samples_per_second": 917.111,
"eval_steps_per_second": 57.32,
"step": 40000
},
{
"epoch": 0.05,
"learning_rate": 4.018e-07,
"loss": 2.2278,
"step": 48000
},
{
"epoch": 0.05,
"eval_loss": 2.071790933609009,
"eval_runtime": 841.5993,
"eval_samples_per_second": 917.087,
"eval_steps_per_second": 57.318,
"step": 48000
},
{
"epoch": 0.06,
"eval_loss": 2.0752639770507812,
"eval_runtime": 841.4195,
"eval_samples_per_second": 917.283,
"eval_steps_per_second": 57.33,
"step": 56000
},
{
"epoch": 0.07,
"learning_rate": 3.9906666666666667e-07,
"loss": 2.2059,
"step": 64000
},
{
"epoch": 0.07,
"eval_loss": 2.0668046474456787,
"eval_runtime": 844.6529,
"eval_samples_per_second": 913.772,
"eval_steps_per_second": 57.111,
"step": 64000
},
{
"epoch": 0.08,
"eval_loss": 2.0657169818878174,
"eval_runtime": 844.5291,
"eval_samples_per_second": 913.906,
"eval_steps_per_second": 57.119,
"step": 72000
},
{
"epoch": 0.09,
"learning_rate": 3.963333333333333e-07,
"loss": 2.1997,
"step": 80000
},
{
"epoch": 0.09,
"eval_loss": 2.062004566192627,
"eval_runtime": 845.6772,
"eval_samples_per_second": 912.665,
"eval_steps_per_second": 57.042,
"step": 80000
},
{
"epoch": 0.1,
"eval_loss": 2.0553247928619385,
"eval_runtime": 846.4058,
"eval_samples_per_second": 911.879,
"eval_steps_per_second": 56.993,
"step": 88000
},
{
"epoch": 0.1,
"learning_rate": 3.936e-07,
"loss": 2.1988,
"step": 96000
},
{
"epoch": 0.1,
"eval_loss": 2.0569465160369873,
"eval_runtime": 841.5807,
"eval_samples_per_second": 917.108,
"eval_steps_per_second": 57.32,
"step": 96000
},
{
"epoch": 0.11,
"eval_loss": 2.052541971206665,
"eval_runtime": 845.3721,
"eval_samples_per_second": 912.994,
"eval_steps_per_second": 57.062,
"step": 104000
},
{
"epoch": 0.12,
"learning_rate": 3.908666666666667e-07,
"loss": 2.1861,
"step": 112000
},
{
"epoch": 0.12,
"eval_loss": 2.05564284324646,
"eval_runtime": 847.9385,
"eval_samples_per_second": 910.231,
"eval_steps_per_second": 56.89,
"step": 112000
},
{
"epoch": 0.13,
"eval_loss": 2.04929256439209,
"eval_runtime": 842.77,
"eval_samples_per_second": 915.813,
"eval_steps_per_second": 57.239,
"step": 120000
},
{
"epoch": 0.14,
"learning_rate": 3.8813333333333334e-07,
"loss": 2.1823,
"step": 128000
},
{
"epoch": 0.14,
"eval_loss": 2.0508854389190674,
"eval_runtime": 846.1188,
"eval_samples_per_second": 912.189,
"eval_steps_per_second": 57.012,
"step": 128000
},
{
"epoch": 0.15,
"eval_loss": 2.0460989475250244,
"eval_runtime": 845.0745,
"eval_samples_per_second": 913.316,
"eval_steps_per_second": 57.083,
"step": 136000
},
{
"epoch": 0.16,
"learning_rate": 3.854e-07,
"loss": 2.1851,
"step": 144000
},
{
"epoch": 0.16,
"eval_loss": 2.0476059913635254,
"eval_runtime": 844.5648,
"eval_samples_per_second": 913.867,
"eval_steps_per_second": 57.117,
"step": 144000
},
{
"epoch": 0.17,
"eval_loss": 2.04502010345459,
"eval_runtime": 845.4437,
"eval_samples_per_second": 912.917,
"eval_steps_per_second": 57.058,
"step": 152000
},
{
"epoch": 0.17,
"learning_rate": 3.8266666666666665e-07,
"loss": 2.1862,
"step": 160000
},
{
"epoch": 0.17,
"eval_loss": 2.046872854232788,
"eval_runtime": 843.2469,
"eval_samples_per_second": 915.295,
"eval_steps_per_second": 57.206,
"step": 160000
},
{
"epoch": 0.18,
"eval_loss": 2.0441744327545166,
"eval_runtime": 845.6614,
"eval_samples_per_second": 912.682,
"eval_steps_per_second": 57.043,
"step": 168000
},
{
"epoch": 0.19,
"learning_rate": 3.799333333333333e-07,
"loss": 2.1741,
"step": 176000
},
{
"epoch": 0.19,
"eval_loss": 2.0456435680389404,
"eval_runtime": 847.979,
"eval_samples_per_second": 910.188,
"eval_steps_per_second": 56.887,
"step": 176000
},
{
"epoch": 0.2,
"eval_loss": 2.0441741943359375,
"eval_runtime": 846.0243,
"eval_samples_per_second": 912.291,
"eval_steps_per_second": 57.018,
"step": 184000
},
{
"epoch": 0.21,
"learning_rate": 3.772e-07,
"loss": 2.181,
"step": 192000
},
{
"epoch": 0.21,
"eval_loss": 2.040196418762207,
"eval_runtime": 851.9304,
"eval_samples_per_second": 905.966,
"eval_steps_per_second": 56.623,
"step": 192000
},
{
"epoch": 0.22,
"eval_loss": 2.0422918796539307,
"eval_runtime": 847.5458,
"eval_samples_per_second": 910.653,
"eval_steps_per_second": 56.916,
"step": 200000
},
{
"epoch": 0.23,
"learning_rate": 3.7446666666666667e-07,
"loss": 2.1692,
"step": 208000
},
{
"epoch": 0.23,
"eval_loss": 2.041342258453369,
"eval_runtime": 847.0952,
"eval_samples_per_second": 911.137,
"eval_steps_per_second": 56.946,
"step": 208000
},
{
"epoch": 0.24,
"eval_loss": 2.0448198318481445,
"eval_runtime": 846.694,
"eval_samples_per_second": 911.569,
"eval_steps_per_second": 56.973,
"step": 216000
},
{
"epoch": 0.24,
"learning_rate": 3.7173333333333333e-07,
"loss": 2.1678,
"step": 224000
},
{
"epoch": 0.24,
"eval_loss": 2.0417792797088623,
"eval_runtime": 846.2034,
"eval_samples_per_second": 912.098,
"eval_steps_per_second": 57.006,
"step": 224000
},
{
"epoch": 0.25,
"eval_loss": 2.041692018508911,
"eval_runtime": 848.6147,
"eval_samples_per_second": 909.506,
"eval_steps_per_second": 56.844,
"step": 232000
},
{
"epoch": 0.26,
"learning_rate": 3.69e-07,
"loss": 2.1756,
"step": 240000
},
{
"epoch": 0.26,
"eval_loss": 2.034193754196167,
"eval_runtime": 847.0585,
"eval_samples_per_second": 911.177,
"eval_steps_per_second": 56.949,
"step": 240000
},
{
"epoch": 0.27,
"eval_loss": 2.037684202194214,
"eval_runtime": 846.2239,
"eval_samples_per_second": 912.075,
"eval_steps_per_second": 57.005,
"step": 248000
},
{
"epoch": 0.28,
"learning_rate": 3.6626666666666664e-07,
"loss": 2.1752,
"step": 256000
},
{
"epoch": 0.28,
"eval_loss": 2.0381019115448,
"eval_runtime": 846.7119,
"eval_samples_per_second": 911.55,
"eval_steps_per_second": 56.972,
"step": 256000
},
{
"epoch": 0.29,
"eval_loss": 2.035405158996582,
"eval_runtime": 851.4653,
"eval_samples_per_second": 906.461,
"eval_steps_per_second": 56.654,
"step": 264000
},
{
"epoch": 0.3,
"learning_rate": 3.6353333333333335e-07,
"loss": 2.1673,
"step": 272000
},
{
"epoch": 0.3,
"eval_loss": 2.038097858428955,
"eval_runtime": 846.955,
"eval_samples_per_second": 911.288,
"eval_steps_per_second": 56.956,
"step": 272000
},
{
"epoch": 0.31,
"eval_loss": 2.0375349521636963,
"eval_runtime": 846.7581,
"eval_samples_per_second": 911.5,
"eval_steps_per_second": 56.969,
"step": 280000
},
{
"epoch": 0.31,
"learning_rate": 3.608e-07,
"loss": 2.1585,
"step": 288000
},
{
"epoch": 0.31,
"eval_loss": 2.033590078353882,
"eval_runtime": 848.1336,
"eval_samples_per_second": 910.022,
"eval_steps_per_second": 56.877,
"step": 288000
},
{
"epoch": 0.32,
"eval_loss": 2.0344314575195312,
"eval_runtime": 847.2304,
"eval_samples_per_second": 910.992,
"eval_steps_per_second": 56.937,
"step": 296000
},
{
"epoch": 0.33,
"learning_rate": 3.5806666666666666e-07,
"loss": 2.1703,
"step": 304000
},
{
"epoch": 0.33,
"eval_loss": 2.034810781478882,
"eval_runtime": 846.3544,
"eval_samples_per_second": 911.935,
"eval_steps_per_second": 56.996,
"step": 304000
},
{
"epoch": 0.34,
"eval_loss": 2.0329954624176025,
"eval_runtime": 847.3997,
"eval_samples_per_second": 910.81,
"eval_steps_per_second": 56.926,
"step": 312000
},
{
"epoch": 0.35,
"learning_rate": 3.553333333333333e-07,
"loss": 2.1667,
"step": 320000
},
{
"epoch": 0.35,
"eval_loss": 2.0352213382720947,
"eval_runtime": 846.3586,
"eval_samples_per_second": 911.93,
"eval_steps_per_second": 56.996,
"step": 320000
},
{
"epoch": 0.36,
"eval_loss": 2.0359089374542236,
"eval_runtime": 848.8487,
"eval_samples_per_second": 909.255,
"eval_steps_per_second": 56.829,
"step": 328000
},
{
"epoch": 0.37,
"learning_rate": 3.5259999999999997e-07,
"loss": 2.1649,
"step": 336000
},
{
"epoch": 0.37,
"eval_loss": 2.031733512878418,
"eval_runtime": 848.6246,
"eval_samples_per_second": 909.495,
"eval_steps_per_second": 56.844,
"step": 336000
},
{
"epoch": 0.38,
"eval_loss": 2.0314059257507324,
"eval_runtime": 851.4761,
"eval_samples_per_second": 906.449,
"eval_steps_per_second": 56.653,
"step": 344000
},
{
"epoch": 0.38,
"learning_rate": 3.498666666666667e-07,
"loss": 2.1564,
"step": 352000
},
{
"epoch": 0.38,
"eval_loss": 2.030597686767578,
"eval_runtime": 850.4287,
"eval_samples_per_second": 907.566,
"eval_steps_per_second": 56.723,
"step": 352000
},
{
"epoch": 0.39,
"eval_loss": 2.029878616333008,
"eval_runtime": 850.6967,
"eval_samples_per_second": 907.28,
"eval_steps_per_second": 56.705,
"step": 360000
},
{
"epoch": 0.4,
"learning_rate": 3.4713333333333333e-07,
"loss": 2.161,
"step": 368000
},
{
"epoch": 0.4,
"eval_loss": 2.0317320823669434,
"eval_runtime": 851.1347,
"eval_samples_per_second": 906.813,
"eval_steps_per_second": 56.676,
"step": 368000
},
{
"epoch": 0.41,
"eval_loss": 2.032505989074707,
"eval_runtime": 854.4271,
"eval_samples_per_second": 903.319,
"eval_steps_per_second": 56.458,
"step": 376000
},
{
"epoch": 0.42,
"learning_rate": 3.444e-07,
"loss": 2.1551,
"step": 384000
},
{
"epoch": 0.42,
"eval_loss": 2.0273916721343994,
"eval_runtime": 850.6274,
"eval_samples_per_second": 907.354,
"eval_steps_per_second": 56.71,
"step": 384000
},
{
"epoch": 0.43,
"eval_loss": 2.0281741619110107,
"eval_runtime": 850.1523,
"eval_samples_per_second": 907.861,
"eval_steps_per_second": 56.742,
"step": 392000
},
{
"epoch": 0.44,
"learning_rate": 3.416666666666667e-07,
"loss": 2.1602,
"step": 400000
},
{
"epoch": 0.44,
"eval_loss": 2.0300543308258057,
"eval_runtime": 852.8839,
"eval_samples_per_second": 904.953,
"eval_steps_per_second": 56.56,
"step": 400000
},
{
"epoch": 0.45,
"eval_loss": 2.0302786827087402,
"eval_runtime": 854.7636,
"eval_samples_per_second": 902.963,
"eval_steps_per_second": 56.435,
"step": 408000
},
{
"epoch": 0.45,
"learning_rate": 3.3893333333333335e-07,
"loss": 2.1581,
"step": 416000
},
{
"epoch": 0.45,
"eval_loss": 2.026031732559204,
"eval_runtime": 852.2087,
"eval_samples_per_second": 905.67,
"eval_steps_per_second": 56.605,
"step": 416000
},
{
"epoch": 0.46,
"eval_loss": 2.0248208045959473,
"eval_runtime": 850.4117,
"eval_samples_per_second": 907.584,
"eval_steps_per_second": 56.724,
"step": 424000
},
{
"epoch": 0.47,
"learning_rate": 3.3619999999999995e-07,
"loss": 2.1494,
"step": 432000
},
{
"epoch": 0.47,
"eval_loss": 2.026501178741455,
"eval_runtime": 848.7671,
"eval_samples_per_second": 909.343,
"eval_steps_per_second": 56.834,
"step": 432000
},
{
"epoch": 0.48,
"eval_loss": 2.0246880054473877,
"eval_runtime": 849.7267,
"eval_samples_per_second": 908.316,
"eval_steps_per_second": 56.77,
"step": 440000
},
{
"epoch": 0.49,
"learning_rate": 3.3346666666666666e-07,
"loss": 2.1508,
"step": 448000
},
{
"epoch": 0.49,
"eval_loss": 2.0231027603149414,
"eval_runtime": 849.0484,
"eval_samples_per_second": 909.041,
"eval_steps_per_second": 56.815,
"step": 448000
},
{
"epoch": 0.5,
"eval_loss": 2.0276315212249756,
"eval_runtime": 849.4168,
"eval_samples_per_second": 908.647,
"eval_steps_per_second": 56.791,
"step": 456000
},
{
"epoch": 0.51,
"learning_rate": 3.307333333333333e-07,
"loss": 2.153,
"step": 464000
},
{
"epoch": 0.51,
"eval_loss": 2.0275754928588867,
"eval_runtime": 848.4629,
"eval_samples_per_second": 909.669,
"eval_steps_per_second": 56.855,
"step": 464000
},
{
"epoch": 0.51,
"eval_loss": 2.0241763591766357,
"eval_runtime": 849.6091,
"eval_samples_per_second": 908.441,
"eval_steps_per_second": 56.778,
"step": 472000
},
{
"epoch": 0.52,
"learning_rate": 3.28e-07,
"loss": 2.1489,
"step": 480000
},
{
"epoch": 0.52,
"eval_loss": 2.0259480476379395,
"eval_runtime": 849.4664,
"eval_samples_per_second": 908.594,
"eval_steps_per_second": 56.787,
"step": 480000
},
{
"epoch": 0.53,
"eval_loss": 2.025740623474121,
"eval_runtime": 850.1732,
"eval_samples_per_second": 907.839,
"eval_steps_per_second": 56.74,
"step": 488000
},
{
"epoch": 0.54,
"learning_rate": 3.252666666666667e-07,
"loss": 2.1468,
"step": 496000
},
{
"epoch": 0.54,
"eval_loss": 2.027461528778076,
"eval_runtime": 850.2923,
"eval_samples_per_second": 907.711,
"eval_steps_per_second": 56.732,
"step": 496000
},
{
"epoch": 0.55,
"eval_loss": 2.030271053314209,
"eval_runtime": 851.4114,
"eval_samples_per_second": 906.518,
"eval_steps_per_second": 56.658,
"step": 504000
},
{
"epoch": 0.56,
"learning_rate": 3.2253333333333334e-07,
"loss": 2.1446,
"step": 512000
},
{
"epoch": 0.56,
"eval_loss": 2.0248193740844727,
"eval_runtime": 852.1182,
"eval_samples_per_second": 905.766,
"eval_steps_per_second": 56.611,
"step": 512000
},
{
"epoch": 0.57,
"eval_loss": 2.0285604000091553,
"eval_runtime": 849.8013,
"eval_samples_per_second": 908.236,
"eval_steps_per_second": 56.765,
"step": 520000
},
{
"epoch": 0.58,
"learning_rate": 3.198e-07,
"loss": 2.1409,
"step": 528000
},
{
"epoch": 0.58,
"eval_loss": 2.0211498737335205,
"eval_runtime": 855.0597,
"eval_samples_per_second": 902.65,
"eval_steps_per_second": 56.416,
"step": 528000
},
{
"epoch": 0.58,
"eval_loss": 2.0204012393951416,
"eval_runtime": 856.0145,
"eval_samples_per_second": 901.644,
"eval_steps_per_second": 56.353,
"step": 536000
},
{
"epoch": 0.59,
"learning_rate": 3.1706666666666665e-07,
"loss": 2.1536,
"step": 544000
},
{
"epoch": 0.59,
"eval_loss": 2.0198850631713867,
"eval_runtime": 856.7067,
"eval_samples_per_second": 900.915,
"eval_steps_per_second": 56.307,
"step": 544000
},
{
"epoch": 0.6,
"eval_loss": 2.0281307697296143,
"eval_runtime": 867.0343,
"eval_samples_per_second": 890.184,
"eval_steps_per_second": 55.637,
"step": 552000
},
{
"epoch": 0.61,
"learning_rate": 3.1433333333333336e-07,
"loss": 2.1416,
"step": 560000
},
{
"epoch": 0.61,
"eval_loss": 2.0237483978271484,
"eval_runtime": 866.1166,
"eval_samples_per_second": 891.127,
"eval_steps_per_second": 55.696,
"step": 560000
},
{
"epoch": 0.62,
"eval_loss": 2.0231337547302246,
"eval_runtime": 863.3507,
"eval_samples_per_second": 893.982,
"eval_steps_per_second": 55.874,
"step": 568000
},
{
"epoch": 0.63,
"learning_rate": 3.116e-07,
"loss": 2.1502,
"step": 576000
},
{
"epoch": 0.63,
"eval_loss": 2.0205323696136475,
"eval_runtime": 857.8171,
"eval_samples_per_second": 899.749,
"eval_steps_per_second": 56.235,
"step": 576000
},
{
"epoch": 0.64,
"eval_loss": 2.021655559539795,
"eval_runtime": 853.6943,
"eval_samples_per_second": 904.094,
"eval_steps_per_second": 56.506,
"step": 584000
},
{
"epoch": 0.65,
"learning_rate": 3.0886666666666667e-07,
"loss": 2.1424,
"step": 592000
},
{
"epoch": 0.65,
"eval_loss": 2.024162769317627,
"eval_runtime": 861.2895,
"eval_samples_per_second": 896.121,
"eval_steps_per_second": 56.008,
"step": 592000
},
{
"epoch": 0.65,
"eval_loss": 2.0237643718719482,
"eval_runtime": 859.5317,
"eval_samples_per_second": 897.954,
"eval_steps_per_second": 56.122,
"step": 600000
},
{
"epoch": 0.66,
"learning_rate": 3.061333333333333e-07,
"loss": 2.1469,
"step": 608000
},
{
"epoch": 0.66,
"eval_loss": 2.0191547870635986,
"eval_runtime": 855.9495,
"eval_samples_per_second": 901.712,
"eval_steps_per_second": 56.357,
"step": 608000
},
{
"epoch": 0.67,
"eval_loss": 2.024866819381714,
"eval_runtime": 857.0469,
"eval_samples_per_second": 900.557,
"eval_steps_per_second": 56.285,
"step": 616000
},
{
"epoch": 0.68,
"learning_rate": 3.034e-07,
"loss": 2.145,
"step": 624000
},
{
"epoch": 0.68,
"eval_loss": 2.0195770263671875,
"eval_runtime": 858.8544,
"eval_samples_per_second": 898.662,
"eval_steps_per_second": 56.167,
"step": 624000
},
{
"epoch": 0.69,
"eval_loss": 2.022365093231201,
"eval_runtime": 854.0414,
"eval_samples_per_second": 903.727,
"eval_steps_per_second": 56.483,
"step": 632000
},
{
"epoch": 0.7,
"learning_rate": 3.0066666666666663e-07,
"loss": 2.1503,
"step": 640000
},
{
"epoch": 0.7,
"eval_loss": 2.0216493606567383,
"eval_runtime": 854.8203,
"eval_samples_per_second": 902.903,
"eval_steps_per_second": 56.432,
"step": 640000
},
{
"epoch": 0.71,
"eval_loss": 2.022836208343506,
"eval_runtime": 857.6145,
"eval_samples_per_second": 899.962,
"eval_steps_per_second": 56.248,
"step": 648000
},
{
"epoch": 0.72,
"learning_rate": 2.9793333333333334e-07,
"loss": 2.1355,
"step": 656000
},
{
"epoch": 0.72,
"eval_loss": 2.019666910171509,
"eval_runtime": 859.7029,
"eval_samples_per_second": 897.775,
"eval_steps_per_second": 56.111,
"step": 656000
},
{
"epoch": 0.72,
"eval_loss": 2.0240182876586914,
"eval_runtime": 858.0715,
"eval_samples_per_second": 899.482,
"eval_steps_per_second": 56.218,
"step": 664000
},
{
"epoch": 0.73,
"learning_rate": 2.952e-07,
"loss": 2.1392,
"step": 672000
},
{
"epoch": 0.73,
"eval_loss": 2.0232093334198,
"eval_runtime": 856.593,
"eval_samples_per_second": 901.035,
"eval_steps_per_second": 56.315,
"step": 672000
},
{
"epoch": 0.74,
"eval_loss": 2.020932912826538,
"eval_runtime": 858.8309,
"eval_samples_per_second": 898.687,
"eval_steps_per_second": 56.168,
"step": 680000
},
{
"epoch": 0.75,
"learning_rate": 2.9246666666666665e-07,
"loss": 2.1378,
"step": 688000
},
{
"epoch": 0.75,
"eval_loss": 2.0219063758850098,
"eval_runtime": 860.0126,
"eval_samples_per_second": 897.452,
"eval_steps_per_second": 56.091,
"step": 688000
},
{
"epoch": 0.76,
"eval_loss": 2.019192695617676,
"eval_runtime": 861.8149,
"eval_samples_per_second": 895.575,
"eval_steps_per_second": 55.974,
"step": 696000
},
{
"epoch": 0.77,
"learning_rate": 2.897333333333333e-07,
"loss": 2.1446,
"step": 704000
},
{
"epoch": 0.77,
"eval_loss": 2.0194740295410156,
"eval_runtime": 857.8914,
"eval_samples_per_second": 899.671,
"eval_steps_per_second": 56.23,
"step": 704000
},
{
"epoch": 0.78,
"eval_loss": 2.01971173286438,
"eval_runtime": 857.8638,
"eval_samples_per_second": 899.7,
"eval_steps_per_second": 56.232,
"step": 712000
},
{
"epoch": 0.79,
"learning_rate": 2.8699999999999996e-07,
"loss": 2.1351,
"step": 720000
},
{
"epoch": 0.79,
"eval_loss": 2.0183634757995605,
"eval_runtime": 857.8713,
"eval_samples_per_second": 899.692,
"eval_steps_per_second": 56.231,
"step": 720000
},
{
"epoch": 0.79,
"eval_loss": 2.0162270069122314,
"eval_runtime": 857.9238,
"eval_samples_per_second": 899.637,
"eval_steps_per_second": 56.228,
"step": 728000
},
{
"epoch": 0.8,
"learning_rate": 2.8426666666666667e-07,
"loss": 2.1437,
"step": 736000
},
{
"epoch": 0.8,
"eval_loss": 2.015068531036377,
"eval_runtime": 857.7851,
"eval_samples_per_second": 899.783,
"eval_steps_per_second": 56.237,
"step": 736000
},
{
"epoch": 0.81,
"eval_loss": 2.0202245712280273,
"eval_runtime": 857.6732,
"eval_samples_per_second": 899.9,
"eval_steps_per_second": 56.244,
"step": 744000
},
{
"epoch": 0.82,
"learning_rate": 2.815333333333333e-07,
"loss": 2.1249,
"step": 752000
},
{
"epoch": 0.82,
"eval_loss": 2.0169003009796143,
"eval_runtime": 860.8823,
"eval_samples_per_second": 896.545,
"eval_steps_per_second": 56.034,
"step": 752000
},
{
"epoch": 0.83,
"eval_loss": 2.018857002258301,
"eval_runtime": 856.9399,
"eval_samples_per_second": 900.67,
"eval_steps_per_second": 56.292,
"step": 760000
},
{
"epoch": 0.84,
"learning_rate": 2.7880000000000003e-07,
"loss": 2.1355,
"step": 768000
},
{
"epoch": 0.84,
"eval_loss": 2.022115707397461,
"eval_runtime": 860.0914,
"eval_samples_per_second": 897.37,
"eval_steps_per_second": 56.086,
"step": 768000
},
{
"epoch": 0.85,
"eval_loss": 2.0194284915924072,
"eval_runtime": 858.1451,
"eval_samples_per_second": 899.405,
"eval_steps_per_second": 56.213,
"step": 776000
},
{
"epoch": 0.86,
"learning_rate": 2.7606666666666664e-07,
"loss": 2.1387,
"step": 784000
},
{
"epoch": 0.86,
"eval_loss": 2.018942356109619,
"eval_runtime": 862.7177,
"eval_samples_per_second": 894.638,
"eval_steps_per_second": 55.915,
"step": 784000
},
{
"epoch": 0.86,
"eval_loss": 2.016535520553589,
"eval_runtime": 858.1148,
"eval_samples_per_second": 899.437,
"eval_steps_per_second": 56.215,
"step": 792000
},
{
"epoch": 0.87,
"learning_rate": 2.733333333333333e-07,
"loss": 2.1334,
"step": 800000
},
{
"epoch": 0.87,
"eval_loss": 2.0169451236724854,
"eval_runtime": 860.3041,
"eval_samples_per_second": 897.148,
"eval_steps_per_second": 56.072,
"step": 800000
},
{
"epoch": 0.88,
"eval_loss": 2.0188918113708496,
"eval_runtime": 861.004,
"eval_samples_per_second": 896.419,
"eval_steps_per_second": 56.026,
"step": 808000
},
{
"epoch": 0.89,
"learning_rate": 2.706e-07,
"loss": 2.137,
"step": 816000
},
{
"epoch": 0.89,
"eval_loss": 2.016237258911133,
"eval_runtime": 862.2544,
"eval_samples_per_second": 895.119,
"eval_steps_per_second": 55.945,
"step": 816000
},
{
"epoch": 0.9,
"eval_loss": 2.0168325901031494,
"eval_runtime": 860.8877,
"eval_samples_per_second": 896.54,
"eval_steps_per_second": 56.034,
"step": 824000
},
{
"epoch": 0.91,
"learning_rate": 2.6786666666666666e-07,
"loss": 2.1331,
"step": 832000
},
{
"epoch": 0.91,
"eval_loss": 2.0192737579345703,
"eval_runtime": 859.4597,
"eval_samples_per_second": 898.029,
"eval_steps_per_second": 56.127,
"step": 832000
},
{
"epoch": 0.92,
"eval_loss": 2.016619920730591,
"eval_runtime": 863.1851,
"eval_samples_per_second": 894.153,
"eval_steps_per_second": 55.885,
"step": 840000
},
{
"epoch": 0.93,
"learning_rate": 2.651333333333333e-07,
"loss": 2.1293,
"step": 848000
},
{
"epoch": 0.93,
"eval_loss": 2.013720989227295,
"eval_runtime": 863.4541,
"eval_samples_per_second": 893.875,
"eval_steps_per_second": 55.867,
"step": 848000
},
{
"epoch": 0.93,
"eval_loss": 2.018291711807251,
"eval_runtime": 877.742,
"eval_samples_per_second": 879.324,
"eval_steps_per_second": 54.958,
"step": 856000
},
{
"epoch": 0.94,
"learning_rate": 2.624e-07,
"loss": 2.1358,
"step": 864000
},
{
"epoch": 0.94,
"eval_loss": 2.018421173095703,
"eval_runtime": 873.6563,
"eval_samples_per_second": 883.437,
"eval_steps_per_second": 55.215,
"step": 864000
},
{
"epoch": 0.95,
"eval_loss": 2.017104387283325,
"eval_runtime": 874.261,
"eval_samples_per_second": 882.826,
"eval_steps_per_second": 55.177,
"step": 872000
},
{
"epoch": 0.96,
"learning_rate": 2.596666666666667e-07,
"loss": 2.1296,
"step": 880000
},
{
"epoch": 0.96,
"eval_loss": 2.0179190635681152,
"eval_runtime": 874.7051,
"eval_samples_per_second": 882.377,
"eval_steps_per_second": 55.149,
"step": 880000
},
{
"epoch": 0.97,
"eval_loss": 2.015188455581665,
"eval_runtime": 875.6595,
"eval_samples_per_second": 881.416,
"eval_steps_per_second": 55.089,
"step": 888000
},
{
"epoch": 0.98,
"learning_rate": 2.5693333333333333e-07,
"loss": 2.1319,
"step": 896000
},
{
"epoch": 0.98,
"eval_loss": 2.0173678398132324,
"eval_runtime": 877.4749,
"eval_samples_per_second": 879.592,
"eval_steps_per_second": 54.975,
"step": 896000
},
{
"epoch": 0.99,
"eval_loss": 2.020580291748047,
"eval_runtime": 874.219,
"eval_samples_per_second": 882.868,
"eval_steps_per_second": 55.18,
"step": 904000
},
{
"epoch": 1.0,
"learning_rate": 2.542e-07,
"loss": 2.1344,
"step": 912000
},
{
"epoch": 1.0,
"eval_loss": 2.0178616046905518,
"eval_runtime": 871.4372,
"eval_samples_per_second": 885.686,
"eval_steps_per_second": 55.356,
"step": 912000
},
{
"epoch": 1.0,
"eval_loss": 2.0153729915618896,
"eval_runtime": 874.8229,
"eval_samples_per_second": 882.259,
"eval_steps_per_second": 55.141,
"step": 920000
},
{
"epoch": 1.01,
"learning_rate": 2.5146666666666664e-07,
"loss": 2.1352,
"step": 928000
},
{
"epoch": 1.01,
"eval_loss": 2.018483877182007,
"eval_runtime": 876.0163,
"eval_samples_per_second": 881.057,
"eval_steps_per_second": 55.066,
"step": 928000
},
{
"epoch": 1.02,
"eval_loss": 2.016976833343506,
"eval_runtime": 878.2619,
"eval_samples_per_second": 878.804,
"eval_steps_per_second": 54.926,
"step": 936000
},
{
"epoch": 1.03,
"learning_rate": 2.4873333333333335e-07,
"loss": 2.1336,
"step": 944000
},
{
"epoch": 1.03,
"eval_loss": 2.016388416290283,
"eval_runtime": 877.6593,
"eval_samples_per_second": 879.407,
"eval_steps_per_second": 54.963,
"step": 944000
},
{
"epoch": 1.04,
"eval_loss": 2.013742208480835,
"eval_runtime": 871.0407,
"eval_samples_per_second": 886.09,
"eval_steps_per_second": 55.381,
"step": 952000
},
{
"epoch": 1.05,
"learning_rate": 2.46e-07,
"loss": 2.1315,
"step": 960000
},
{
"epoch": 1.05,
"eval_loss": 2.0176327228546143,
"eval_runtime": 877.004,
"eval_samples_per_second": 880.064,
"eval_steps_per_second": 55.004,
"step": 960000
},
{
"epoch": 1.06,
"eval_loss": 2.0155346393585205,
"eval_runtime": 872.5922,
"eval_samples_per_second": 884.514,
"eval_steps_per_second": 55.282,
"step": 968000
},
{
"epoch": 1.06,
"learning_rate": 2.4326666666666666e-07,
"loss": 2.1255,
"step": 976000
},
{
"epoch": 1.06,
"eval_loss": 2.014533281326294,
"eval_runtime": 871.4139,
"eval_samples_per_second": 885.71,
"eval_steps_per_second": 55.357,
"step": 976000
},
{
"epoch": 1.07,
"eval_loss": 2.023314952850342,
"eval_runtime": 879.3224,
"eval_samples_per_second": 877.744,
"eval_steps_per_second": 54.859,
"step": 984000
},
{
"epoch": 1.08,
"learning_rate": 2.405333333333333e-07,
"loss": 2.1249,
"step": 992000
},
{
"epoch": 1.08,
"eval_loss": 2.0147762298583984,
"eval_runtime": 866.8225,
"eval_samples_per_second": 890.401,
"eval_steps_per_second": 55.65,
"step": 992000
},
{
"epoch": 1.09,
"eval_loss": 2.016249895095825,
"eval_runtime": 867.6683,
"eval_samples_per_second": 889.533,
"eval_steps_per_second": 55.596,
"step": 1000000
},
{
"epoch": 1.1,
"learning_rate": 2.3779999999999997e-07,
"loss": 2.123,
"step": 1008000
},
{
"epoch": 1.1,
"eval_loss": 2.017381191253662,
"eval_runtime": 868.2141,
"eval_samples_per_second": 888.974,
"eval_steps_per_second": 55.561,
"step": 1008000
},
{
"epoch": 1.11,
"eval_loss": 2.015009880065918,
"eval_runtime": 865.5792,
"eval_samples_per_second": 891.68,
"eval_steps_per_second": 55.73,
"step": 1016000
},
{
"epoch": 1.12,
"learning_rate": 2.3506666666666668e-07,
"loss": 2.1263,
"step": 1024000
},
{
"epoch": 1.12,
"eval_loss": 2.0160863399505615,
"eval_runtime": 869.2474,
"eval_samples_per_second": 887.917,
"eval_steps_per_second": 55.495,
"step": 1024000
},
{
"epoch": 1.13,
"eval_loss": 2.0128889083862305,
"eval_runtime": 866.9502,
"eval_samples_per_second": 890.27,
"eval_steps_per_second": 55.642,
"step": 1032000
},
{
"epoch": 1.13,
"learning_rate": 2.3233333333333334e-07,
"loss": 2.1232,
"step": 1040000
},
{
"epoch": 1.13,
"eval_loss": 2.0166754722595215,
"eval_runtime": 901.7962,
"eval_samples_per_second": 855.87,
"eval_steps_per_second": 53.492,
"step": 1040000
},
{
"epoch": 1.14,
"eval_loss": 2.012477397918701,
"eval_runtime": 911.6669,
"eval_samples_per_second": 846.603,
"eval_steps_per_second": 52.913,
"step": 1048000
},
{
"epoch": 1.15,
"learning_rate": 2.2960000000000002e-07,
"loss": 2.1168,
"step": 1056000
},
{
"epoch": 1.15,
"eval_loss": 2.0113391876220703,
"eval_runtime": 912.2557,
"eval_samples_per_second": 846.057,
"eval_steps_per_second": 52.879,
"step": 1056000
},
{
"epoch": 1.16,
"eval_loss": 2.013575792312622,
"eval_runtime": 901.3301,
"eval_samples_per_second": 856.312,
"eval_steps_per_second": 53.52,
"step": 1064000
},
{
"epoch": 1.17,
"learning_rate": 2.2686666666666667e-07,
"loss": 2.1307,
"step": 1072000
},
{
"epoch": 1.17,
"eval_loss": 2.014338254928589,
"eval_runtime": 891.2807,
"eval_samples_per_second": 865.967,
"eval_steps_per_second": 54.123,
"step": 1072000
},
{
"epoch": 1.18,
"eval_loss": 2.0166401863098145,
"eval_runtime": 886.4005,
"eval_samples_per_second": 870.735,
"eval_steps_per_second": 54.421,
"step": 1080000
},
{
"epoch": 1.19,
"learning_rate": 2.2413333333333333e-07,
"loss": 2.1336,
"step": 1088000
},
{
"epoch": 1.19,
"eval_loss": 2.0103185176849365,
"eval_runtime": 886.4458,
"eval_samples_per_second": 870.691,
"eval_steps_per_second": 54.418,
"step": 1088000
},
{
"epoch": 1.2,
"eval_loss": 2.0129764080047607,
"eval_runtime": 890.355,
"eval_samples_per_second": 866.868,
"eval_steps_per_second": 54.18,
"step": 1096000
},
{
"epoch": 1.2,
"learning_rate": 2.214e-07,
"loss": 2.1227,
"step": 1104000
},
{
"epoch": 1.2,
"eval_loss": 2.012451648712158,
"eval_runtime": 895.3428,
"eval_samples_per_second": 862.039,
"eval_steps_per_second": 53.878,
"step": 1104000
},
{
"epoch": 1.21,
"eval_loss": 2.0183231830596924,
"eval_runtime": 888.3913,
"eval_samples_per_second": 868.784,
"eval_steps_per_second": 54.299,
"step": 1112000
},
{
"epoch": 1.22,
"learning_rate": 2.1866666666666667e-07,
"loss": 2.1223,
"step": 1120000
},
{
"epoch": 1.22,
"eval_loss": 2.014848470687866,
"eval_runtime": 889.5583,
"eval_samples_per_second": 867.644,
"eval_steps_per_second": 54.228,
"step": 1120000
},
{
"epoch": 1.23,
"eval_loss": 2.0147109031677246,
"eval_runtime": 884.3146,
"eval_samples_per_second": 872.789,
"eval_steps_per_second": 54.55,
"step": 1128000
},
{
"epoch": 1.24,
"learning_rate": 2.1593333333333332e-07,
"loss": 2.1289,
"step": 1136000
},
{
"epoch": 1.24,
"eval_loss": 2.0108699798583984,
"eval_runtime": 888.3584,
"eval_samples_per_second": 868.816,
"eval_steps_per_second": 54.301,
"step": 1136000
},
{
"epoch": 1.25,
"eval_loss": 2.0163819789886475,
"eval_runtime": 887.4195,
"eval_samples_per_second": 869.735,
"eval_steps_per_second": 54.359,
"step": 1144000
},
{
"epoch": 1.26,
"learning_rate": 2.132e-07,
"loss": 2.1278,
"step": 1152000
},
{
"epoch": 1.26,
"eval_loss": 2.0163345336914062,
"eval_runtime": 886.1604,
"eval_samples_per_second": 870.971,
"eval_steps_per_second": 54.436,
"step": 1152000
},
{
"epoch": 1.27,
"eval_loss": 2.012103319168091,
"eval_runtime": 889.5174,
"eval_samples_per_second": 867.684,
"eval_steps_per_second": 54.231,
"step": 1160000
},
{
"epoch": 1.27,
"learning_rate": 2.1046666666666666e-07,
"loss": 2.1261,
"step": 1168000
},
{
"epoch": 1.27,
"eval_loss": 2.011343240737915,
"eval_runtime": 890.9332,
"eval_samples_per_second": 866.305,
"eval_steps_per_second": 54.144,
"step": 1168000
},
{
"epoch": 1.28,
"eval_loss": 2.0137104988098145,
"eval_runtime": 883.4659,
"eval_samples_per_second": 873.627,
"eval_steps_per_second": 54.602,
"step": 1176000
},
{
"epoch": 1.29,
"learning_rate": 2.0773333333333334e-07,
"loss": 2.126,
"step": 1184000
},
{
"epoch": 1.29,
"eval_loss": 2.015174627304077,
"eval_runtime": 885.9678,
"eval_samples_per_second": 871.16,
"eval_steps_per_second": 54.448,
"step": 1184000
},
{
"epoch": 1.3,
"eval_loss": 2.010411500930786,
"eval_runtime": 888.6748,
"eval_samples_per_second": 868.507,
"eval_steps_per_second": 54.282,
"step": 1192000
},
{
"epoch": 1.31,
"learning_rate": 2.05e-07,
"loss": 2.1235,
"step": 1200000
},
{
"epoch": 1.31,
"eval_loss": 2.013165235519409,
"eval_runtime": 888.6503,
"eval_samples_per_second": 868.531,
"eval_steps_per_second": 54.283,
"step": 1200000
},
{
"epoch": 1.32,
"eval_loss": 2.0113847255706787,
"eval_runtime": 884.261,
"eval_samples_per_second": 872.842,
"eval_steps_per_second": 54.553,
"step": 1208000
},
{
"epoch": 1.33,
"learning_rate": 2.0226666666666668e-07,
"loss": 2.1229,
"step": 1216000
},
{
"epoch": 1.33,
"eval_loss": 2.010532855987549,
"eval_runtime": 887.5065,
"eval_samples_per_second": 869.65,
"eval_steps_per_second": 54.353,
"step": 1216000
},
{
"epoch": 1.34,
"eval_loss": 2.0130858421325684,
"eval_runtime": 881.1399,
"eval_samples_per_second": 875.934,
"eval_steps_per_second": 54.746,
"step": 1224000
},
{
"epoch": 1.34,
"learning_rate": 1.9953333333333333e-07,
"loss": 2.1213,
"step": 1232000
},
{
"epoch": 1.34,
"eval_loss": 2.0141072273254395,
"eval_runtime": 882.2467,
"eval_samples_per_second": 874.835,
"eval_steps_per_second": 54.677,
"step": 1232000
},
{
"epoch": 1.35,
"eval_loss": 2.010868549346924,
"eval_runtime": 881.7078,
"eval_samples_per_second": 875.369,
"eval_steps_per_second": 54.711,
"step": 1240000
},
{
"epoch": 1.36,
"learning_rate": 1.968e-07,
"loss": 2.1185,
"step": 1248000
},
{
"epoch": 1.36,
"eval_loss": 2.0129363536834717,
"eval_runtime": 886.2455,
"eval_samples_per_second": 870.887,
"eval_steps_per_second": 54.431,
"step": 1248000
},
{
"epoch": 1.37,
"eval_loss": 2.011003017425537,
"eval_runtime": 888.1974,
"eval_samples_per_second": 868.974,
"eval_steps_per_second": 54.311,
"step": 1256000
},
{
"epoch": 1.38,
"learning_rate": 1.9406666666666667e-07,
"loss": 2.131,
"step": 1264000
},
{
"epoch": 1.38,
"eval_loss": 2.01228928565979,
"eval_runtime": 884.9282,
"eval_samples_per_second": 872.184,
"eval_steps_per_second": 54.512,
"step": 1264000
},
{
"epoch": 1.39,
"eval_loss": 2.0104737281799316,
"eval_runtime": 881.1611,
"eval_samples_per_second": 875.912,
"eval_steps_per_second": 54.745,
"step": 1272000
},
{
"epoch": 1.4,
"learning_rate": 1.9133333333333333e-07,
"loss": 2.1141,
"step": 1280000
},
{
"epoch": 1.4,
"eval_loss": 2.010425090789795,
"eval_runtime": 882.3806,
"eval_samples_per_second": 874.702,
"eval_steps_per_second": 54.669,
"step": 1280000
},
{
"epoch": 1.41,
"eval_loss": 2.015007734298706,
"eval_runtime": 879.3909,
"eval_samples_per_second": 877.676,
"eval_steps_per_second": 54.855,
"step": 1288000
},
{
"epoch": 1.41,
"learning_rate": 1.886e-07,
"loss": 2.1219,
"step": 1296000
},
{
"epoch": 1.41,
"eval_loss": 2.0161073207855225,
"eval_runtime": 879.4904,
"eval_samples_per_second": 877.576,
"eval_steps_per_second": 54.849,
"step": 1296000
},
{
"epoch": 1.42,
"eval_loss": 2.00930118560791,
"eval_runtime": 882.5935,
"eval_samples_per_second": 874.491,
"eval_steps_per_second": 54.656,
"step": 1304000
},
{
"epoch": 1.43,
"learning_rate": 1.8586666666666666e-07,
"loss": 2.1203,
"step": 1312000
},
{
"epoch": 1.43,
"eval_loss": 2.0104291439056396,
"eval_runtime": 882.9969,
"eval_samples_per_second": 874.091,
"eval_steps_per_second": 54.631,
"step": 1312000
},
{
"epoch": 1.44,
"eval_loss": 2.0144429206848145,
"eval_runtime": 878.5955,
"eval_samples_per_second": 878.47,
"eval_steps_per_second": 54.905,
"step": 1320000
},
{
"epoch": 1.45,
"learning_rate": 1.8313333333333332e-07,
"loss": 2.1264,
"step": 1328000
},
{
"epoch": 1.45,
"eval_loss": 2.0084986686706543,
"eval_runtime": 878.8817,
"eval_samples_per_second": 878.184,
"eval_steps_per_second": 54.887,
"step": 1328000
},
{
"epoch": 1.46,
"eval_loss": 2.0118672847747803,
"eval_runtime": 880.8514,
"eval_samples_per_second": 876.22,
"eval_steps_per_second": 54.764,
"step": 1336000
},
{
"epoch": 1.47,
"learning_rate": 1.804e-07,
"loss": 2.1194,
"step": 1344000
},
{
"epoch": 1.47,
"eval_loss": 2.011784076690674,
"eval_runtime": 878.874,
"eval_samples_per_second": 878.192,
"eval_steps_per_second": 54.887,
"step": 1344000
},
{
"epoch": 1.48,
"eval_loss": 2.0109827518463135,
"eval_runtime": 893.715,
"eval_samples_per_second": 863.609,
"eval_steps_per_second": 53.976,
"step": 1352000
},
{
"epoch": 1.48,
"learning_rate": 1.7766666666666666e-07,
"loss": 2.117,
"step": 1360000
},
{
"epoch": 1.48,
"eval_loss": 2.014660596847534,
"eval_runtime": 915.8924,
"eval_samples_per_second": 842.697,
"eval_steps_per_second": 52.669,
"step": 1360000
},
{
"epoch": 1.49,
"eval_loss": 2.013535261154175,
"eval_runtime": 909.1816,
"eval_samples_per_second": 848.917,
"eval_steps_per_second": 53.058,
"step": 1368000
},
{
"epoch": 1.5,
"learning_rate": 1.7493333333333334e-07,
"loss": 2.1311,
"step": 1376000
},
{
"epoch": 1.5,
"eval_loss": 2.0076611042022705,
"eval_runtime": 909.3083,
"eval_samples_per_second": 848.799,
"eval_steps_per_second": 53.05,
"step": 1376000
},
{
"epoch": 1.51,
"eval_loss": 2.006574869155884,
"eval_runtime": 904.8344,
"eval_samples_per_second": 852.996,
"eval_steps_per_second": 53.313,
"step": 1384000
},
{
"epoch": 1.52,
"learning_rate": 1.722e-07,
"loss": 2.1215,
"step": 1392000
},
{
"epoch": 1.52,
"eval_loss": 2.008929967880249,
"eval_runtime": 903.4488,
"eval_samples_per_second": 854.304,
"eval_steps_per_second": 53.394,
"step": 1392000
},
{
"epoch": 1.53,
"eval_loss": 2.0118260383605957,
"eval_runtime": 913.6278,
"eval_samples_per_second": 844.786,
"eval_steps_per_second": 52.799,
"step": 1400000
},
{
"epoch": 1.54,
"learning_rate": 1.6946666666666668e-07,
"loss": 2.1185,
"step": 1408000
},
{
"epoch": 1.54,
"eval_loss": 2.0105414390563965,
"eval_runtime": 907.6551,
"eval_samples_per_second": 850.345,
"eval_steps_per_second": 53.147,
"step": 1408000
},
{
"epoch": 1.54,
"eval_loss": 2.012268304824829,
"eval_runtime": 903.9952,
"eval_samples_per_second": 853.788,
"eval_steps_per_second": 53.362,
"step": 1416000
},
{
"epoch": 1.55,
"learning_rate": 1.6673333333333333e-07,
"loss": 2.1284,
"step": 1424000
},
{
"epoch": 1.55,
"eval_loss": 2.0133912563323975,
"eval_runtime": 910.6028,
"eval_samples_per_second": 847.592,
"eval_steps_per_second": 52.975,
"step": 1424000
},
{
"epoch": 1.56,
"eval_loss": 2.009307861328125,
"eval_runtime": 904.2587,
"eval_samples_per_second": 853.539,
"eval_steps_per_second": 53.346,
"step": 1432000
},
{
"epoch": 1.57,
"learning_rate": 1.64e-07,
"loss": 2.1174,
"step": 1440000
},
{
"epoch": 1.57,
"eval_loss": 2.0101728439331055,
"eval_runtime": 912.2693,
"eval_samples_per_second": 846.044,
"eval_steps_per_second": 52.878,
"step": 1440000
},
{
"epoch": 1.58,
"eval_loss": 2.00759220123291,
"eval_runtime": 910.2393,
"eval_samples_per_second": 847.931,
"eval_steps_per_second": 52.996,
"step": 1448000
},
{
"epoch": 1.59,
"learning_rate": 1.6126666666666667e-07,
"loss": 2.1108,
"step": 1456000
},
{
"epoch": 1.59,
"eval_loss": 2.00740909576416,
"eval_runtime": 914.6796,
"eval_samples_per_second": 843.815,
"eval_steps_per_second": 52.739,
"step": 1456000
},
{
"epoch": 1.6,
"eval_loss": 2.007056474685669,
"eval_runtime": 908.0025,
"eval_samples_per_second": 850.02,
"eval_steps_per_second": 53.127,
"step": 1464000
},
{
"epoch": 1.61,
"learning_rate": 1.5853333333333332e-07,
"loss": 2.1252,
"step": 1472000
},
{
"epoch": 1.61,
"eval_loss": 2.0092082023620605,
"eval_runtime": 905.6872,
"eval_samples_per_second": 852.193,
"eval_steps_per_second": 53.262,
"step": 1472000
},
{
"epoch": 1.61,
"eval_loss": 2.007967233657837,
"eval_runtime": 910.9272,
"eval_samples_per_second": 847.291,
"eval_steps_per_second": 52.956,
"step": 1480000
},
{
"epoch": 1.62,
"learning_rate": 1.558e-07,
"loss": 2.121,
"step": 1488000
},
{
"epoch": 1.62,
"eval_loss": 2.0052874088287354,
"eval_runtime": 908.8472,
"eval_samples_per_second": 849.23,
"eval_steps_per_second": 53.077,
"step": 1488000
},
{
"epoch": 1.63,
"eval_loss": 2.0071661472320557,
"eval_runtime": 907.693,
"eval_samples_per_second": 850.31,
"eval_steps_per_second": 53.145,
"step": 1496000
},
{
"epoch": 1.64,
"learning_rate": 1.5306666666666666e-07,
"loss": 2.1178,
"step": 1504000
},
{
"epoch": 1.64,
"eval_loss": 2.0059070587158203,
"eval_runtime": 908.356,
"eval_samples_per_second": 849.689,
"eval_steps_per_second": 53.106,
"step": 1504000
},
{
"epoch": 1.65,
"eval_loss": 2.00836443901062,
"eval_runtime": 908.0246,
"eval_samples_per_second": 849.999,
"eval_steps_per_second": 53.125,
"step": 1512000
},
{
"epoch": 1.66,
"learning_rate": 1.5033333333333332e-07,
"loss": 2.1154,
"step": 1520000
},
{
"epoch": 1.66,
"eval_loss": 2.0105550289154053,
"eval_runtime": 903.6608,
"eval_samples_per_second": 854.104,
"eval_steps_per_second": 53.382,
"step": 1520000
},
{
"epoch": 1.67,
"eval_loss": 2.0116729736328125,
"eval_runtime": 909.1515,
"eval_samples_per_second": 848.945,
"eval_steps_per_second": 53.059,
"step": 1528000
},
{
"epoch": 1.68,
"learning_rate": 1.476e-07,
"loss": 2.1214,
"step": 1536000
},
{
"epoch": 1.68,
"eval_loss": 2.006955146789551,
"eval_runtime": 907.2355,
"eval_samples_per_second": 850.738,
"eval_steps_per_second": 53.171,
"step": 1536000
},
{
"epoch": 1.68,
"eval_loss": 2.0078775882720947,
"eval_runtime": 908.5609,
"eval_samples_per_second": 849.497,
"eval_steps_per_second": 53.094,
"step": 1544000
},
{
"epoch": 1.69,
"learning_rate": 1.4486666666666665e-07,
"loss": 2.1175,
"step": 1552000
},
{
"epoch": 1.69,
"eval_loss": 2.0101876258850098,
"eval_runtime": 901.3076,
"eval_samples_per_second": 856.334,
"eval_steps_per_second": 53.521,
"step": 1552000
},
{
"epoch": 1.7,
"eval_loss": 2.009697675704956,
"eval_runtime": 906.1011,
"eval_samples_per_second": 851.803,
"eval_steps_per_second": 53.238,
"step": 1560000
},
{
"epoch": 1.71,
"learning_rate": 1.4213333333333334e-07,
"loss": 2.1206,
"step": 1568000
},
{
"epoch": 1.71,
"eval_loss": 2.0092358589172363,
"eval_runtime": 901.2376,
"eval_samples_per_second": 856.4,
"eval_steps_per_second": 53.525,
"step": 1568000
},
{
"epoch": 1.72,
"eval_loss": 2.005527973175049,
"eval_runtime": 896.3075,
"eval_samples_per_second": 861.111,
"eval_steps_per_second": 53.82,
"step": 1576000
},
{
"epoch": 1.73,
"learning_rate": 1.3940000000000002e-07,
"loss": 2.1302,
"step": 1584000
},
{
"epoch": 1.73,
"eval_loss": 2.008502244949341,
"eval_runtime": 899.3251,
"eval_samples_per_second": 858.221,
"eval_steps_per_second": 53.639,
"step": 1584000
},
{
"epoch": 1.74,
"eval_loss": 2.0109806060791016,
"eval_runtime": 906.7205,
"eval_samples_per_second": 851.222,
"eval_steps_per_second": 53.202,
"step": 1592000
},
{
"epoch": 1.75,
"learning_rate": 1.3666666666666665e-07,
"loss": 2.1177,
"step": 1600000
},
{
"epoch": 1.75,
"eval_loss": 2.006521701812744,
"eval_runtime": 898.4764,
"eval_samples_per_second": 859.032,
"eval_steps_per_second": 53.69,
"step": 1600000
},
{
"epoch": 1.75,
"eval_loss": 2.0131704807281494,
"eval_runtime": 906.0839,
"eval_samples_per_second": 851.82,
"eval_steps_per_second": 53.239,
"step": 1608000
},
{
"epoch": 1.76,
"learning_rate": 1.3393333333333333e-07,
"loss": 2.1101,
"step": 1616000
},
{
"epoch": 1.76,
"eval_loss": 2.0085511207580566,
"eval_runtime": 896.2709,
"eval_samples_per_second": 861.146,
"eval_steps_per_second": 53.822,
"step": 1616000
},
{
"epoch": 1.77,
"eval_loss": 2.0077245235443115,
"eval_runtime": 897.3988,
"eval_samples_per_second": 860.064,
"eval_steps_per_second": 53.754,
"step": 1624000
},
{
"epoch": 1.78,
"learning_rate": 1.312e-07,
"loss": 2.1194,
"step": 1632000
},
{
"epoch": 1.78,
"eval_loss": 2.008148431777954,
"eval_runtime": 896.5575,
"eval_samples_per_second": 860.871,
"eval_steps_per_second": 53.805,
"step": 1632000
},
{
"epoch": 1.79,
"eval_loss": 2.008798122406006,
"eval_runtime": 897.2787,
"eval_samples_per_second": 860.179,
"eval_steps_per_second": 53.761,
"step": 1640000
},
{
"epoch": 1.8,
"learning_rate": 1.2846666666666667e-07,
"loss": 2.1167,
"step": 1648000
},
{
"epoch": 1.8,
"eval_loss": 2.002239942550659,
"eval_runtime": 893.5655,
"eval_samples_per_second": 863.753,
"eval_steps_per_second": 53.985,
"step": 1648000
},
{
"epoch": 1.81,
"eval_loss": 2.007662296295166,
"eval_runtime": 895.7141,
"eval_samples_per_second": 861.681,
"eval_steps_per_second": 53.855,
"step": 1656000
},
{
"epoch": 1.82,
"learning_rate": 1.2573333333333332e-07,
"loss": 2.1083,
"step": 1664000
},
{
"epoch": 1.82,
"eval_loss": 2.0065953731536865,
"eval_runtime": 890.9713,
"eval_samples_per_second": 866.268,
"eval_steps_per_second": 54.142,
"step": 1664000
},
{
"epoch": 1.82,
"eval_loss": 2.0137040615081787,
"eval_runtime": 885.7627,
"eval_samples_per_second": 871.362,
"eval_steps_per_second": 54.46,
"step": 1672000
},
{
"epoch": 1.83,
"learning_rate": 1.23e-07,
"loss": 2.1232,
"step": 1680000
},
{
"epoch": 1.83,
"eval_loss": 2.0067014694213867,
"eval_runtime": 890.51,
"eval_samples_per_second": 866.717,
"eval_steps_per_second": 54.17,
"step": 1680000
},
{
"epoch": 1.84,
"eval_loss": 2.0039150714874268,
"eval_runtime": 889.3586,
"eval_samples_per_second": 867.839,
"eval_steps_per_second": 54.24,
"step": 1688000
},
{
"epoch": 1.85,
"learning_rate": 1.2026666666666666e-07,
"loss": 2.1212,
"step": 1696000
},
{
"epoch": 1.85,
"eval_loss": 2.008970022201538,
"eval_runtime": 893.785,
"eval_samples_per_second": 863.541,
"eval_steps_per_second": 53.972,
"step": 1696000
},
{
"epoch": 1.86,
"eval_loss": 2.0079498291015625,
"eval_runtime": 882.5613,
"eval_samples_per_second": 874.523,
"eval_steps_per_second": 54.658,
"step": 1704000
},
{
"epoch": 1.87,
"learning_rate": 1.1753333333333334e-07,
"loss": 2.1246,
"step": 1712000
},
{
"epoch": 1.87,
"eval_loss": 2.0082814693450928,
"eval_runtime": 886.133,
"eval_samples_per_second": 870.998,
"eval_steps_per_second": 54.438,
"step": 1712000
},
{
"epoch": 1.88,
"eval_loss": 2.003898859024048,
"eval_runtime": 887.1853,
"eval_samples_per_second": 869.965,
"eval_steps_per_second": 54.373,
"step": 1720000
},
{
"epoch": 1.89,
"learning_rate": 1.1480000000000001e-07,
"loss": 2.1129,
"step": 1728000
},
{
"epoch": 1.89,
"eval_loss": 2.0069074630737305,
"eval_runtime": 891.3907,
"eval_samples_per_second": 865.86,
"eval_steps_per_second": 54.117,
"step": 1728000
},
{
"epoch": 1.89,
"eval_loss": 2.007922410964966,
"eval_runtime": 884.1175,
"eval_samples_per_second": 872.984,
"eval_steps_per_second": 54.562,
"step": 1736000
},
{
"epoch": 1.9,
"learning_rate": 1.1206666666666666e-07,
"loss": 2.1209,
"step": 1744000
},
{
"epoch": 1.9,
"eval_loss": 2.00584077835083,
"eval_runtime": 888.6359,
"eval_samples_per_second": 868.545,
"eval_steps_per_second": 54.284,
"step": 1744000
},
{
"epoch": 1.91,
"eval_loss": 2.0071957111358643,
"eval_runtime": 891.8674,
"eval_samples_per_second": 865.398,
"eval_steps_per_second": 54.088,
"step": 1752000
},
{
"epoch": 1.92,
"learning_rate": 1.0933333333333333e-07,
"loss": 2.1209,
"step": 1760000
},
{
"epoch": 1.92,
"eval_loss": 2.0067615509033203,
"eval_runtime": 884.8141,
"eval_samples_per_second": 872.296,
"eval_steps_per_second": 54.519,
"step": 1760000
},
{
"epoch": 1.93,
"eval_loss": 2.0078628063201904,
"eval_runtime": 888.3025,
"eval_samples_per_second": 868.871,
"eval_steps_per_second": 54.305,
"step": 1768000
},
{
"epoch": 1.94,
"learning_rate": 1.066e-07,
"loss": 2.1184,
"step": 1776000
},
{
"epoch": 1.94,
"eval_loss": 2.0036442279815674,
"eval_runtime": 887.5766,
"eval_samples_per_second": 869.581,
"eval_steps_per_second": 54.349,
"step": 1776000
},
{
"epoch": 1.95,
"eval_loss": 2.0064985752105713,
"eval_runtime": 890.3705,
"eval_samples_per_second": 866.853,
"eval_steps_per_second": 54.179,
"step": 1784000
},
{
"epoch": 1.96,
"learning_rate": 1.0386666666666667e-07,
"loss": 2.1065,
"step": 1792000
},
{
"epoch": 1.96,
"eval_loss": 2.007737159729004,
"eval_runtime": 889.1985,
"eval_samples_per_second": 867.995,
"eval_steps_per_second": 54.25,
"step": 1792000
},
{
"epoch": 1.96,
"eval_loss": 2.006197452545166,
"eval_runtime": 889.8901,
"eval_samples_per_second": 867.321,
"eval_steps_per_second": 54.208,
"step": 1800000
},
{
"epoch": 1.97,
"learning_rate": 1.0113333333333334e-07,
"loss": 2.109,
"step": 1808000
},
{
"epoch": 1.97,
"eval_loss": 2.0090434551239014,
"eval_runtime": 888.3297,
"eval_samples_per_second": 868.844,
"eval_steps_per_second": 54.303,
"step": 1808000
},
{
"epoch": 1.98,
"eval_loss": 2.012356758117676,
"eval_runtime": 893.3256,
"eval_samples_per_second": 863.985,
"eval_steps_per_second": 53.999,
"step": 1816000
},
{
"epoch": 1.99,
"learning_rate": 9.84e-08,
"loss": 2.1081,
"step": 1824000
},
{
"epoch": 1.99,
"eval_loss": 2.0065596103668213,
"eval_runtime": 893.6122,
"eval_samples_per_second": 863.708,
"eval_steps_per_second": 53.982,
"step": 1824000
},
{
"epoch": 2.0,
"eval_loss": 2.008080005645752,
"eval_runtime": 891.4247,
"eval_samples_per_second": 865.828,
"eval_steps_per_second": 54.115,
"step": 1832000
},
{
"epoch": 2.01,
"learning_rate": 9.566666666666666e-08,
"loss": 2.1151,
"step": 1840000
},
{
"epoch": 2.01,
"eval_loss": 2.008512258529663,
"eval_runtime": 884.9554,
"eval_samples_per_second": 872.157,
"eval_steps_per_second": 54.51,
"step": 1840000
},
{
"epoch": 2.02,
"eval_loss": 2.0054173469543457,
"eval_runtime": 886.9049,
"eval_samples_per_second": 870.24,
"eval_steps_per_second": 54.39,
"step": 1848000
},
{
"epoch": 2.03,
"learning_rate": 9.293333333333333e-08,
"loss": 2.1178,
"step": 1856000
},
{
"epoch": 2.03,
"eval_loss": 2.005777359008789,
"eval_runtime": 886.5315,
"eval_samples_per_second": 870.606,
"eval_steps_per_second": 54.413,
"step": 1856000
},
{
"epoch": 2.03,
"eval_loss": 2.0048415660858154,
"eval_runtime": 893.5519,
"eval_samples_per_second": 863.766,
"eval_steps_per_second": 53.986,
"step": 1864000
},
{
"epoch": 2.04,
"learning_rate": 9.02e-08,
"loss": 2.1035,
"step": 1872000
},
{
"epoch": 2.04,
"eval_loss": 2.004007339477539,
"eval_runtime": 890.5358,
"eval_samples_per_second": 866.692,
"eval_steps_per_second": 54.169,
"step": 1872000
},
{
"epoch": 2.05,
"eval_loss": 2.0059244632720947,
"eval_runtime": 887.0437,
"eval_samples_per_second": 870.104,
"eval_steps_per_second": 54.382,
"step": 1880000
},
{
"epoch": 2.06,
"learning_rate": 8.746666666666667e-08,
"loss": 2.1197,
"step": 1888000
},
{
"epoch": 2.06,
"eval_loss": 2.0071017742156982,
"eval_runtime": 889.191,
"eval_samples_per_second": 868.003,
"eval_steps_per_second": 54.25,
"step": 1888000
},
{
"epoch": 2.07,
"eval_loss": 2.005682945251465,
"eval_runtime": 888.8818,
"eval_samples_per_second": 868.304,
"eval_steps_per_second": 54.269,
"step": 1896000
},
{
"epoch": 2.08,
"learning_rate": 8.473333333333334e-08,
"loss": 2.1143,
"step": 1904000
},
{
"epoch": 2.08,
"eval_loss": 2.005943536758423,
"eval_runtime": 884.5437,
"eval_samples_per_second": 872.563,
"eval_steps_per_second": 54.535,
"step": 1904000
},
{
"epoch": 2.09,
"eval_loss": 2.0042991638183594,
"eval_runtime": 884.1715,
"eval_samples_per_second": 872.93,
"eval_steps_per_second": 54.558,
"step": 1912000
},
{
"epoch": 2.09,
"learning_rate": 8.2e-08,
"loss": 2.1082,
"step": 1920000
},
{
"epoch": 2.09,
"eval_loss": 2.0067648887634277,
"eval_runtime": 885.4828,
"eval_samples_per_second": 871.637,
"eval_steps_per_second": 54.478,
"step": 1920000
},
{
"epoch": 2.1,
"eval_loss": 2.0057313442230225,
"eval_runtime": 887.8665,
"eval_samples_per_second": 869.297,
"eval_steps_per_second": 54.331,
"step": 1928000
},
{
"epoch": 2.11,
"learning_rate": 7.926666666666666e-08,
"loss": 2.1202,
"step": 1936000
},
{
"epoch": 2.11,
"eval_loss": 2.007241725921631,
"eval_runtime": 885.5971,
"eval_samples_per_second": 871.525,
"eval_steps_per_second": 54.471,
"step": 1936000
},
{
"epoch": 2.12,
"eval_loss": 2.0057430267333984,
"eval_runtime": 888.4045,
"eval_samples_per_second": 868.771,
"eval_steps_per_second": 54.298,
"step": 1944000
},
{
"epoch": 2.13,
"learning_rate": 7.653333333333333e-08,
"loss": 2.1138,
"step": 1952000
},
{
"epoch": 2.13,
"eval_loss": 2.0051097869873047,
"eval_runtime": 889.7536,
"eval_samples_per_second": 867.454,
"eval_steps_per_second": 54.216,
"step": 1952000
},
{
"epoch": 2.14,
"eval_loss": 2.008528709411621,
"eval_runtime": 887.8548,
"eval_samples_per_second": 869.309,
"eval_steps_per_second": 54.332,
"step": 1960000
},
{
"epoch": 2.15,
"learning_rate": 7.38e-08,
"loss": 2.1082,
"step": 1968000
},
{
"epoch": 2.15,
"eval_loss": 2.007629871368408,
"eval_runtime": 886.2101,
"eval_samples_per_second": 870.922,
"eval_steps_per_second": 54.433,
"step": 1968000
},
{
"epoch": 2.16,
"eval_loss": 2.0076658725738525,
"eval_runtime": 886.4111,
"eval_samples_per_second": 870.725,
"eval_steps_per_second": 54.421,
"step": 1976000
},
{
"epoch": 2.16,
"learning_rate": 7.106666666666667e-08,
"loss": 2.1084,
"step": 1984000
},
{
"epoch": 2.16,
"eval_loss": 2.001997470855713,
"eval_runtime": 885.1567,
"eval_samples_per_second": 871.959,
"eval_steps_per_second": 54.498,
"step": 1984000
},
{
"epoch": 2.17,
"eval_loss": 2.005009651184082,
"eval_runtime": 889.5629,
"eval_samples_per_second": 867.64,
"eval_steps_per_second": 54.228,
"step": 1992000
},
{
"epoch": 2.18,
"learning_rate": 6.833333333333332e-08,
"loss": 2.1151,
"step": 2000000
},
{
"epoch": 2.18,
"eval_loss": 2.0065817832946777,
"eval_runtime": 885.7641,
"eval_samples_per_second": 871.361,
"eval_steps_per_second": 54.46,
"step": 2000000
},
{
"epoch": 2.19,
"eval_loss": 2.003136396408081,
"eval_runtime": 886.578,
"eval_samples_per_second": 870.561,
"eval_steps_per_second": 54.41,
"step": 2008000
},
{
"epoch": 2.2,
"learning_rate": 6.56e-08,
"loss": 2.1141,
"step": 2016000
},
{
"epoch": 2.2,
"eval_loss": 2.0128238201141357,
"eval_runtime": 891.0219,
"eval_samples_per_second": 866.219,
"eval_steps_per_second": 54.139,
"step": 2016000
},
{
"epoch": 2.21,
"eval_loss": 2.0021839141845703,
"eval_runtime": 895.8435,
"eval_samples_per_second": 861.557,
"eval_steps_per_second": 53.848,
"step": 2024000
},
{
"epoch": 2.22,
"learning_rate": 6.286666666666666e-08,
"loss": 2.1129,
"step": 2032000
},
{
"epoch": 2.22,
"eval_loss": 2.0065131187438965,
"eval_runtime": 890.2528,
"eval_samples_per_second": 866.967,
"eval_steps_per_second": 54.186,
"step": 2032000
},
{
"epoch": 2.23,
"eval_loss": 2.005363941192627,
"eval_runtime": 890.9681,
"eval_samples_per_second": 866.271,
"eval_steps_per_second": 54.142,
"step": 2040000
},
{
"epoch": 2.23,
"learning_rate": 6.013333333333333e-08,
"loss": 2.1164,
"step": 2048000
},
{
"epoch": 2.23,
"eval_loss": 2.0038933753967285,
"eval_runtime": 892.3995,
"eval_samples_per_second": 864.882,
"eval_steps_per_second": 54.055,
"step": 2048000
},
{
"epoch": 2.24,
"eval_loss": 2.003117561340332,
"eval_runtime": 894.495,
"eval_samples_per_second": 862.856,
"eval_steps_per_second": 53.929,
"step": 2056000
},
{
"epoch": 2.25,
"learning_rate": 5.7400000000000004e-08,
"loss": 2.1121,
"step": 2064000
},
{
"epoch": 2.25,
"eval_loss": 2.0101029872894287,
"eval_runtime": 886.6646,
"eval_samples_per_second": 870.476,
"eval_steps_per_second": 54.405,
"step": 2064000
},
{
"epoch": 2.26,
"eval_loss": 2.0098650455474854,
"eval_runtime": 887.3882,
"eval_samples_per_second": 869.766,
"eval_steps_per_second": 54.361,
"step": 2072000
},
{
"epoch": 2.27,
"learning_rate": 5.4666666666666666e-08,
"loss": 2.1071,
"step": 2080000
},
{
"epoch": 2.27,
"eval_loss": 2.0041701793670654,
"eval_runtime": 891.5578,
"eval_samples_per_second": 865.698,
"eval_steps_per_second": 54.106,
"step": 2080000
},
{
"epoch": 2.28,
"eval_loss": 2.0030367374420166,
"eval_runtime": 886.7055,
"eval_samples_per_second": 870.436,
"eval_steps_per_second": 54.403,
"step": 2088000
},
{
"epoch": 2.29,
"learning_rate": 5.1933333333333335e-08,
"loss": 2.1094,
"step": 2096000
},
{
"epoch": 2.29,
"eval_loss": 2.00482439994812,
"eval_runtime": 887.8886,
"eval_samples_per_second": 869.276,
"eval_steps_per_second": 54.33,
"step": 2096000
},
{
"epoch": 2.3,
"eval_loss": 2.004595994949341,
"eval_runtime": 887.4455,
"eval_samples_per_second": 869.71,
"eval_steps_per_second": 54.357,
"step": 2104000
},
{
"epoch": 2.3,
"learning_rate": 4.92e-08,
"loss": 2.1017,
"step": 2112000
},
{
"epoch": 2.3,
"eval_loss": 2.0038633346557617,
"eval_runtime": 888.4121,
"eval_samples_per_second": 868.764,
"eval_steps_per_second": 54.298,
"step": 2112000
},
{
"epoch": 2.31,
"eval_loss": 2.0011472702026367,
"eval_runtime": 889.7748,
"eval_samples_per_second": 867.433,
"eval_steps_per_second": 54.215,
"step": 2120000
},
{
"epoch": 2.32,
"learning_rate": 4.6466666666666666e-08,
"loss": 2.1124,
"step": 2128000
},
{
"epoch": 2.32,
"eval_loss": 2.007091522216797,
"eval_runtime": 892.2658,
"eval_samples_per_second": 865.011,
"eval_steps_per_second": 54.063,
"step": 2128000
},
{
"epoch": 2.33,
"eval_loss": 2.0060718059539795,
"eval_runtime": 887.502,
"eval_samples_per_second": 869.654,
"eval_steps_per_second": 54.354,
"step": 2136000
},
{
"epoch": 2.34,
"learning_rate": 4.3733333333333335e-08,
"loss": 2.1064,
"step": 2144000
},
{
"epoch": 2.34,
"eval_loss": 2.0040297508239746,
"eval_runtime": 888.8512,
"eval_samples_per_second": 868.334,
"eval_steps_per_second": 54.271,
"step": 2144000
},
{
"epoch": 2.35,
"eval_loss": 2.007528066635132,
"eval_runtime": 895.8909,
"eval_samples_per_second": 861.511,
"eval_steps_per_second": 53.845,
"step": 2152000
},
{
"epoch": 2.36,
"learning_rate": 4.1e-08,
"loss": 2.115,
"step": 2160000
},
{
"epoch": 2.36,
"eval_loss": 2.0025811195373535,
"eval_runtime": 894.6822,
"eval_samples_per_second": 862.675,
"eval_steps_per_second": 53.917,
"step": 2160000
},
{
"epoch": 2.37,
"eval_loss": 2.006788492202759,
"eval_runtime": 885.9111,
"eval_samples_per_second": 871.216,
"eval_steps_per_second": 54.451,
"step": 2168000
},
{
"epoch": 2.37,
"learning_rate": 3.8266666666666665e-08,
"loss": 2.114,
"step": 2176000
},
{
"epoch": 2.37,
"eval_loss": 2.006558418273926,
"eval_runtime": 889.8092,
"eval_samples_per_second": 867.399,
"eval_steps_per_second": 54.213,
"step": 2176000
},
{
"epoch": 2.38,
"eval_loss": 2.0079538822174072,
"eval_runtime": 889.2248,
"eval_samples_per_second": 867.97,
"eval_steps_per_second": 54.248,
"step": 2184000
},
{
"epoch": 2.39,
"learning_rate": 3.5533333333333334e-08,
"loss": 2.1171,
"step": 2192000
},
{
"epoch": 2.39,
"eval_loss": 2.0031957626342773,
"eval_runtime": 891.062,
"eval_samples_per_second": 866.18,
"eval_steps_per_second": 54.137,
"step": 2192000
},
{
"epoch": 2.4,
"eval_loss": 2.0036396980285645,
"eval_runtime": 889.4858,
"eval_samples_per_second": 867.715,
"eval_steps_per_second": 54.232,
"step": 2200000
},
{
"epoch": 2.41,
"learning_rate": 3.28e-08,
"loss": 2.1119,
"step": 2208000
},
{
"epoch": 2.41,
"eval_loss": 2.004848003387451,
"eval_runtime": 890.2659,
"eval_samples_per_second": 866.954,
"eval_steps_per_second": 54.185,
"step": 2208000
},
{
"epoch": 2.42,
"eval_loss": 2.0058629512786865,
"eval_runtime": 890.6135,
"eval_samples_per_second": 866.616,
"eval_steps_per_second": 54.164,
"step": 2216000
},
{
"epoch": 2.43,
"learning_rate": 3.0066666666666665e-08,
"loss": 2.1097,
"step": 2224000
},
{
"epoch": 2.43,
"eval_loss": 2.005845546722412,
"eval_runtime": 889.9256,
"eval_samples_per_second": 867.286,
"eval_steps_per_second": 54.206,
"step": 2224000
},
{
"epoch": 2.44,
"eval_loss": 2.004934310913086,
"eval_runtime": 893.1468,
"eval_samples_per_second": 864.158,
"eval_steps_per_second": 54.01,
"step": 2232000
},
{
"epoch": 2.44,
"learning_rate": 2.7333333333333333e-08,
"loss": 2.1091,
"step": 2240000
},
{
"epoch": 2.44,
"eval_loss": 2.005760669708252,
"eval_runtime": 893.6832,
"eval_samples_per_second": 863.639,
"eval_steps_per_second": 53.978,
"step": 2240000
},
{
"epoch": 2.45,
"eval_loss": 2.0032405853271484,
"eval_runtime": 894.8171,
"eval_samples_per_second": 862.545,
"eval_steps_per_second": 53.909,
"step": 2248000
},
{
"epoch": 2.46,
"learning_rate": 2.46e-08,
"loss": 2.1107,
"step": 2256000
},
{
"epoch": 2.46,
"eval_loss": 2.00769305229187,
"eval_runtime": 893.4774,
"eval_samples_per_second": 863.838,
"eval_steps_per_second": 53.99,
"step": 2256000
},
{
"epoch": 2.47,
"eval_loss": 2.0032243728637695,
"eval_runtime": 893.6019,
"eval_samples_per_second": 863.718,
"eval_steps_per_second": 53.983,
"step": 2264000
},
{
"epoch": 2.48,
"learning_rate": 2.1866666666666667e-08,
"loss": 2.1126,
"step": 2272000
},
{
"epoch": 2.48,
"eval_loss": 2.0055274963378906,
"eval_runtime": 891.7304,
"eval_samples_per_second": 865.531,
"eval_steps_per_second": 54.096,
"step": 2272000
},
{
"epoch": 2.49,
"eval_loss": 2.002612590789795,
"eval_runtime": 892.1014,
"eval_samples_per_second": 865.171,
"eval_steps_per_second": 54.073,
"step": 2280000
},
{
"epoch": 2.5,
"learning_rate": 1.9133333333333333e-08,
"loss": 2.1173,
"step": 2288000
},
{
"epoch": 2.5,
"eval_loss": 2.0062429904937744,
"eval_runtime": 891.9249,
"eval_samples_per_second": 865.342,
"eval_steps_per_second": 54.084,
"step": 2288000
},
{
"epoch": 2.51,
"eval_loss": 2.003859043121338,
"eval_runtime": 892.8008,
"eval_samples_per_second": 864.493,
"eval_steps_per_second": 54.031,
"step": 2296000
},
{
"epoch": 2.51,
"learning_rate": 1.64e-08,
"loss": 2.114,
"step": 2304000
},
{
"epoch": 2.51,
"eval_loss": 2.006359100341797,
"eval_runtime": 891.1547,
"eval_samples_per_second": 866.09,
"eval_steps_per_second": 54.131,
"step": 2304000
},
{
"epoch": 2.52,
"eval_loss": 2.0113308429718018,
"eval_runtime": 890.136,
"eval_samples_per_second": 867.081,
"eval_steps_per_second": 54.193,
"step": 2312000
},
{
"epoch": 2.53,
"learning_rate": 1.3666666666666667e-08,
"loss": 2.1131,
"step": 2320000
},
{
"epoch": 2.53,
"eval_loss": 2.0065314769744873,
"eval_runtime": 890.6924,
"eval_samples_per_second": 866.539,
"eval_steps_per_second": 54.159,
"step": 2320000
},
{
"epoch": 2.54,
"eval_loss": 2.0098392963409424,
"eval_runtime": 892.2668,
"eval_samples_per_second": 865.01,
"eval_steps_per_second": 54.063,
"step": 2328000
},
{
"epoch": 2.55,
"learning_rate": 1.0933333333333334e-08,
"loss": 2.1045,
"step": 2336000
},
{
"epoch": 2.55,
"eval_loss": 2.0060501098632812,
"eval_runtime": 891.9301,
"eval_samples_per_second": 865.337,
"eval_steps_per_second": 54.084,
"step": 2336000
},
{
"epoch": 2.56,
"eval_loss": 2.006572961807251,
"eval_runtime": 894.7549,
"eval_samples_per_second": 862.605,
"eval_steps_per_second": 53.913,
"step": 2344000
},
{
"epoch": 2.57,
"learning_rate": 8.2e-09,
"loss": 2.1144,
"step": 2352000
},
{
"epoch": 2.57,
"eval_loss": 2.006028175354004,
"eval_runtime": 899.347,
"eval_samples_per_second": 858.2,
"eval_steps_per_second": 53.638,
"step": 2352000
},
{
"epoch": 2.57,
"eval_loss": 2.00589656829834,
"eval_runtime": 893.5452,
"eval_samples_per_second": 863.773,
"eval_steps_per_second": 53.986,
"step": 2360000
},
{
"epoch": 2.58,
"learning_rate": 5.466666666666667e-09,
"loss": 2.1086,
"step": 2368000
},
{
"epoch": 2.58,
"eval_loss": 2.0038540363311768,
"eval_runtime": 893.2561,
"eval_samples_per_second": 864.052,
"eval_steps_per_second": 54.004,
"step": 2368000
},
{
"epoch": 2.59,
"eval_loss": 2.0076115131378174,
"eval_runtime": 895.0756,
"eval_samples_per_second": 862.296,
"eval_steps_per_second": 53.894,
"step": 2376000
},
{
"epoch": 2.6,
"learning_rate": 2.7333333333333334e-09,
"loss": 2.1058,
"step": 2384000
},
{
"epoch": 2.6,
"eval_loss": 2.0035552978515625,
"eval_runtime": 895.3228,
"eval_samples_per_second": 862.058,
"eval_steps_per_second": 53.879,
"step": 2384000
},
{
"epoch": 2.61,
"eval_loss": 2.0077223777770996,
"eval_runtime": 896.1834,
"eval_samples_per_second": 861.23,
"eval_steps_per_second": 53.827,
"step": 2392000
},
{
"epoch": 2.62,
"learning_rate": 0.0,
"loss": 2.1112,
"step": 2400000
},
{
"epoch": 2.62,
"eval_loss": 2.000014066696167,
"eval_runtime": 893.9091,
"eval_samples_per_second": 863.421,
"eval_steps_per_second": 53.964,
"step": 2400000
},
{
"epoch": 2.62,
"step": 2400000,
"total_flos": 7.571300080769916e+17,
"train_loss": 2.133689431966146,
"train_runtime": 416842.919,
"train_samples_per_second": 92.121,
"train_steps_per_second": 5.758
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 3,
"save_steps": 32000,
"total_flos": 7.571300080769916e+17,
"trial_name": null,
"trial_params": null
}