2020-Q1-90p-filtered-random / trainer_state.json
DouglasPontes's picture
Training in progress, step 32000
46eb42d verified
raw
history blame
81.3 kB
{
"best_metric": 3.23770809173584,
"best_model_checkpoint": "./model_tweets_2020_Q1_90/checkpoint-128000",
"epoch": 49.171259398881354,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.16,
"eval_loss": 3.4494731426239014,
"eval_runtime": 46.3964,
"eval_samples_per_second": 885.888,
"eval_steps_per_second": 55.371,
"step": 8000
},
{
"epoch": 0.33,
"learning_rate": 9.939131159843243e-06,
"loss": 3.5684,
"step": 16000
},
{
"epoch": 0.33,
"eval_loss": 3.416630744934082,
"eval_runtime": 46.3565,
"eval_samples_per_second": 886.65,
"eval_steps_per_second": 55.418,
"step": 16000
},
{
"epoch": 0.49,
"eval_loss": 3.3847219944000244,
"eval_runtime": 47.2297,
"eval_samples_per_second": 870.258,
"eval_steps_per_second": 54.394,
"step": 24000
},
{
"epoch": 0.66,
"learning_rate": 9.872425581589261e-06,
"loss": 3.3755,
"step": 32000
},
{
"epoch": 0.66,
"eval_loss": 3.3664660453796387,
"eval_runtime": 47.0495,
"eval_samples_per_second": 873.591,
"eval_steps_per_second": 54.602,
"step": 32000
},
{
"epoch": 0.82,
"eval_loss": 3.3654134273529053,
"eval_runtime": 46.3932,
"eval_samples_per_second": 885.949,
"eval_steps_per_second": 55.374,
"step": 40000
},
{
"epoch": 0.98,
"learning_rate": 9.80572000333528e-06,
"loss": 3.3533,
"step": 48000
},
{
"epoch": 0.98,
"eval_loss": 3.3654167652130127,
"eval_runtime": 46.5322,
"eval_samples_per_second": 883.301,
"eval_steps_per_second": 55.209,
"step": 48000
},
{
"epoch": 1.15,
"eval_loss": 3.332759380340576,
"eval_runtime": 46.4492,
"eval_samples_per_second": 884.88,
"eval_steps_per_second": 55.308,
"step": 56000
},
{
"epoch": 1.31,
"learning_rate": 9.739014425081299e-06,
"loss": 3.3014,
"step": 64000
},
{
"epoch": 1.31,
"eval_loss": 3.3209590911865234,
"eval_runtime": 45.8973,
"eval_samples_per_second": 895.521,
"eval_steps_per_second": 55.973,
"step": 64000
},
{
"epoch": 1.48,
"eval_loss": 3.3491690158843994,
"eval_runtime": 46.3252,
"eval_samples_per_second": 887.249,
"eval_steps_per_second": 55.456,
"step": 72000
},
{
"epoch": 1.64,
"learning_rate": 9.672308846827316e-06,
"loss": 3.2888,
"step": 80000
},
{
"epoch": 1.64,
"eval_loss": 3.3213465213775635,
"eval_runtime": 45.915,
"eval_samples_per_second": 895.177,
"eval_steps_per_second": 55.951,
"step": 80000
},
{
"epoch": 1.8,
"eval_loss": 3.2708065509796143,
"eval_runtime": 45.9723,
"eval_samples_per_second": 894.061,
"eval_steps_per_second": 55.882,
"step": 88000
},
{
"epoch": 1.97,
"learning_rate": 9.605603268573334e-06,
"loss": 3.2609,
"step": 96000
},
{
"epoch": 1.97,
"eval_loss": 3.290764808654785,
"eval_runtime": 46.6916,
"eval_samples_per_second": 880.287,
"eval_steps_per_second": 55.021,
"step": 96000
},
{
"epoch": 2.13,
"eval_loss": 3.2766778469085693,
"eval_runtime": 45.6527,
"eval_samples_per_second": 900.318,
"eval_steps_per_second": 56.273,
"step": 104000
},
{
"epoch": 2.29,
"learning_rate": 9.538897690319354e-06,
"loss": 3.2159,
"step": 112000
},
{
"epoch": 2.29,
"eval_loss": 3.259241819381714,
"eval_runtime": 45.9077,
"eval_samples_per_second": 895.319,
"eval_steps_per_second": 55.96,
"step": 112000
},
{
"epoch": 2.46,
"eval_loss": 3.2411258220672607,
"eval_runtime": 46.8974,
"eval_samples_per_second": 876.424,
"eval_steps_per_second": 54.779,
"step": 120000
},
{
"epoch": 2.62,
"learning_rate": 9.472192112065373e-06,
"loss": 3.2167,
"step": 128000
},
{
"epoch": 2.62,
"eval_loss": 3.23770809173584,
"eval_runtime": 46.0285,
"eval_samples_per_second": 892.969,
"eval_steps_per_second": 55.813,
"step": 128000
},
{
"epoch": 2.79,
"eval_loss": 3.2485291957855225,
"eval_runtime": 46.313,
"eval_samples_per_second": 887.483,
"eval_steps_per_second": 55.47,
"step": 136000
},
{
"epoch": 2.95,
"learning_rate": 9.405486533811392e-06,
"loss": 3.199,
"step": 144000
},
{
"epoch": 2.95,
"eval_loss": 3.2608513832092285,
"eval_runtime": 46.3737,
"eval_samples_per_second": 886.322,
"eval_steps_per_second": 55.398,
"step": 144000
},
{
"epoch": 3.11,
"eval_loss": 3.2552711963653564,
"eval_runtime": 45.7073,
"eval_samples_per_second": 899.243,
"eval_steps_per_second": 56.205,
"step": 152000
},
{
"epoch": 3.28,
"learning_rate": 9.338780955557409e-06,
"loss": 3.1905,
"step": 160000
},
{
"epoch": 3.28,
"eval_loss": 3.2425193786621094,
"eval_runtime": 46.3189,
"eval_samples_per_second": 887.37,
"eval_steps_per_second": 55.463,
"step": 160000
},
{
"epoch": 3.44,
"eval_loss": 3.2421696186065674,
"eval_runtime": 46.3489,
"eval_samples_per_second": 886.796,
"eval_steps_per_second": 55.427,
"step": 168000
},
{
"epoch": 3.61,
"learning_rate": 9.272075377303427e-06,
"loss": 3.1822,
"step": 176000
},
{
"epoch": 3.61,
"eval_loss": 3.262392997741699,
"eval_runtime": 46.6763,
"eval_samples_per_second": 880.575,
"eval_steps_per_second": 55.039,
"step": 176000
},
{
"epoch": 3.77,
"eval_loss": 3.2507119178771973,
"eval_runtime": 46.8277,
"eval_samples_per_second": 877.728,
"eval_steps_per_second": 54.861,
"step": 184000
},
{
"epoch": 3.93,
"learning_rate": 9.205369799049446e-06,
"loss": 3.1852,
"step": 192000
},
{
"epoch": 3.93,
"eval_loss": 3.2483315467834473,
"eval_runtime": 45.7607,
"eval_samples_per_second": 898.195,
"eval_steps_per_second": 56.14,
"step": 192000
},
{
"epoch": 4.1,
"eval_loss": 3.251424789428711,
"eval_runtime": 46.3642,
"eval_samples_per_second": 886.503,
"eval_steps_per_second": 55.409,
"step": 200000
},
{
"epoch": 4.26,
"learning_rate": 9.138664220795464e-06,
"loss": 3.1767,
"step": 208000
},
{
"epoch": 4.26,
"eval_loss": 3.242562770843506,
"eval_runtime": 46.886,
"eval_samples_per_second": 876.637,
"eval_steps_per_second": 54.792,
"step": 208000
},
{
"epoch": 4.43,
"eval_loss": 3.234778642654419,
"eval_runtime": 46.4949,
"eval_samples_per_second": 884.01,
"eval_steps_per_second": 55.253,
"step": 216000
},
{
"epoch": 4.59,
"learning_rate": 9.071958642541483e-06,
"loss": 3.1767,
"step": 224000
},
{
"epoch": 4.59,
"eval_loss": 3.2734625339508057,
"eval_runtime": 46.0486,
"eval_samples_per_second": 892.58,
"eval_steps_per_second": 55.789,
"step": 224000
},
{
"epoch": 4.75,
"eval_loss": 3.2471694946289062,
"eval_runtime": 46.5054,
"eval_samples_per_second": 883.811,
"eval_steps_per_second": 55.241,
"step": 232000
},
{
"epoch": 4.92,
"learning_rate": 9.005253064287502e-06,
"loss": 3.1973,
"step": 240000
},
{
"epoch": 4.92,
"eval_loss": 3.259644031524658,
"eval_runtime": 45.8405,
"eval_samples_per_second": 896.631,
"eval_steps_per_second": 56.042,
"step": 240000
},
{
"epoch": 5.08,
"eval_loss": 3.2605602741241455,
"eval_runtime": 45.4485,
"eval_samples_per_second": 904.365,
"eval_steps_per_second": 56.526,
"step": 248000
},
{
"epoch": 5.24,
"learning_rate": 8.93854748603352e-06,
"loss": 3.1781,
"step": 256000
},
{
"epoch": 5.24,
"eval_loss": 3.281527519226074,
"eval_runtime": 46.3336,
"eval_samples_per_second": 887.089,
"eval_steps_per_second": 55.446,
"step": 256000
},
{
"epoch": 5.41,
"eval_loss": 3.273421049118042,
"eval_runtime": 45.4558,
"eval_samples_per_second": 904.218,
"eval_steps_per_second": 56.516,
"step": 264000
},
{
"epoch": 5.57,
"learning_rate": 8.871841907779539e-06,
"loss": 3.1803,
"step": 272000
},
{
"epoch": 5.57,
"eval_loss": 3.2739477157592773,
"eval_runtime": 45.6455,
"eval_samples_per_second": 900.462,
"eval_steps_per_second": 56.282,
"step": 272000
},
{
"epoch": 5.74,
"eval_loss": 3.2712481021881104,
"eval_runtime": 46.5973,
"eval_samples_per_second": 882.068,
"eval_steps_per_second": 55.132,
"step": 280000
},
{
"epoch": 5.9,
"learning_rate": 8.805136329525557e-06,
"loss": 3.1989,
"step": 288000
},
{
"epoch": 5.9,
"eval_loss": 3.273439884185791,
"eval_runtime": 46.318,
"eval_samples_per_second": 887.387,
"eval_steps_per_second": 55.464,
"step": 288000
},
{
"epoch": 6.06,
"eval_loss": 3.293893814086914,
"eval_runtime": 45.8003,
"eval_samples_per_second": 897.418,
"eval_steps_per_second": 56.091,
"step": 296000
},
{
"epoch": 6.23,
"learning_rate": 8.738430751271576e-06,
"loss": 3.1929,
"step": 304000
},
{
"epoch": 6.23,
"eval_loss": 3.288043737411499,
"eval_runtime": 46.6462,
"eval_samples_per_second": 881.144,
"eval_steps_per_second": 55.074,
"step": 304000
},
{
"epoch": 6.39,
"eval_loss": 3.289358139038086,
"eval_runtime": 45.8512,
"eval_samples_per_second": 896.422,
"eval_steps_per_second": 56.029,
"step": 312000
},
{
"epoch": 6.56,
"learning_rate": 8.671725173017595e-06,
"loss": 3.2083,
"step": 320000
},
{
"epoch": 6.56,
"eval_loss": 3.308645725250244,
"eval_runtime": 46.2317,
"eval_samples_per_second": 889.043,
"eval_steps_per_second": 55.568,
"step": 320000
},
{
"epoch": 6.72,
"eval_loss": 3.3066623210906982,
"eval_runtime": 46.8669,
"eval_samples_per_second": 876.995,
"eval_steps_per_second": 54.815,
"step": 328000
},
{
"epoch": 6.88,
"learning_rate": 8.605019594763613e-06,
"loss": 3.2013,
"step": 336000
},
{
"epoch": 6.88,
"eval_loss": 3.278655529022217,
"eval_runtime": 45.904,
"eval_samples_per_second": 895.391,
"eval_steps_per_second": 55.965,
"step": 336000
},
{
"epoch": 7.05,
"eval_loss": 3.3152964115142822,
"eval_runtime": 46.5312,
"eval_samples_per_second": 883.322,
"eval_steps_per_second": 55.21,
"step": 344000
},
{
"epoch": 7.21,
"learning_rate": 8.538314016509632e-06,
"loss": 3.2111,
"step": 352000
},
{
"epoch": 7.21,
"eval_loss": 3.3246278762817383,
"eval_runtime": 46.7247,
"eval_samples_per_second": 879.664,
"eval_steps_per_second": 54.982,
"step": 352000
},
{
"epoch": 7.38,
"eval_loss": 3.3322579860687256,
"eval_runtime": 45.9989,
"eval_samples_per_second": 893.543,
"eval_steps_per_second": 55.849,
"step": 360000
},
{
"epoch": 7.54,
"learning_rate": 8.471608438255649e-06,
"loss": 3.2186,
"step": 368000
},
{
"epoch": 7.54,
"eval_loss": 3.2938337326049805,
"eval_runtime": 46.144,
"eval_samples_per_second": 890.734,
"eval_steps_per_second": 55.674,
"step": 368000
},
{
"epoch": 7.7,
"eval_loss": 3.3499817848205566,
"eval_runtime": 45.582,
"eval_samples_per_second": 901.717,
"eval_steps_per_second": 56.36,
"step": 376000
},
{
"epoch": 7.87,
"learning_rate": 8.404902860001667e-06,
"loss": 3.2268,
"step": 384000
},
{
"epoch": 7.87,
"eval_loss": 3.3179759979248047,
"eval_runtime": 45.2091,
"eval_samples_per_second": 909.153,
"eval_steps_per_second": 56.825,
"step": 384000
},
{
"epoch": 8.03,
"eval_loss": 3.3171069622039795,
"eval_runtime": 46.0196,
"eval_samples_per_second": 893.141,
"eval_steps_per_second": 55.824,
"step": 392000
},
{
"epoch": 8.2,
"learning_rate": 8.338197281747686e-06,
"loss": 3.233,
"step": 400000
},
{
"epoch": 8.2,
"eval_loss": 3.3461642265319824,
"eval_runtime": 45.5487,
"eval_samples_per_second": 902.375,
"eval_steps_per_second": 56.401,
"step": 400000
},
{
"epoch": 8.36,
"eval_loss": 3.341256618499756,
"eval_runtime": 45.4264,
"eval_samples_per_second": 904.804,
"eval_steps_per_second": 56.553,
"step": 408000
},
{
"epoch": 8.52,
"learning_rate": 8.271491703493705e-06,
"loss": 3.2432,
"step": 416000
},
{
"epoch": 8.52,
"eval_loss": 3.328122615814209,
"eval_runtime": 45.9787,
"eval_samples_per_second": 893.936,
"eval_steps_per_second": 55.874,
"step": 416000
},
{
"epoch": 8.69,
"eval_loss": 3.342041492462158,
"eval_runtime": 45.4274,
"eval_samples_per_second": 904.784,
"eval_steps_per_second": 56.552,
"step": 424000
},
{
"epoch": 8.85,
"learning_rate": 8.204786125239725e-06,
"loss": 3.2586,
"step": 432000
},
{
"epoch": 8.85,
"eval_loss": 3.3609066009521484,
"eval_runtime": 45.3913,
"eval_samples_per_second": 905.504,
"eval_steps_per_second": 56.597,
"step": 432000
},
{
"epoch": 9.01,
"eval_loss": 3.352691173553467,
"eval_runtime": 46.0515,
"eval_samples_per_second": 892.522,
"eval_steps_per_second": 55.785,
"step": 440000
},
{
"epoch": 9.18,
"learning_rate": 8.138080546985743e-06,
"loss": 3.2567,
"step": 448000
},
{
"epoch": 9.18,
"eval_loss": 3.359393358230591,
"eval_runtime": 45.57,
"eval_samples_per_second": 901.953,
"eval_steps_per_second": 56.375,
"step": 448000
},
{
"epoch": 9.34,
"eval_loss": 3.3497443199157715,
"eval_runtime": 45.4208,
"eval_samples_per_second": 904.915,
"eval_steps_per_second": 56.56,
"step": 456000
},
{
"epoch": 9.51,
"learning_rate": 8.07137496873176e-06,
"loss": 3.2592,
"step": 464000
},
{
"epoch": 9.51,
"eval_loss": 3.3606550693511963,
"eval_runtime": 46.15,
"eval_samples_per_second": 890.617,
"eval_steps_per_second": 55.666,
"step": 464000
},
{
"epoch": 9.67,
"eval_loss": 3.3839540481567383,
"eval_runtime": 45.5702,
"eval_samples_per_second": 901.95,
"eval_steps_per_second": 56.375,
"step": 472000
},
{
"epoch": 9.83,
"learning_rate": 8.004669390477779e-06,
"loss": 3.2793,
"step": 480000
},
{
"epoch": 9.83,
"eval_loss": 3.366785764694214,
"eval_runtime": 45.749,
"eval_samples_per_second": 898.424,
"eval_steps_per_second": 56.154,
"step": 480000
},
{
"epoch": 10.0,
"eval_loss": 3.3609416484832764,
"eval_runtime": 47.1383,
"eval_samples_per_second": 871.945,
"eval_steps_per_second": 54.499,
"step": 488000
},
{
"epoch": 10.16,
"learning_rate": 7.937963812223798e-06,
"loss": 3.257,
"step": 496000
},
{
"epoch": 10.16,
"eval_loss": 3.368229389190674,
"eval_runtime": 45.5778,
"eval_samples_per_second": 901.798,
"eval_steps_per_second": 56.365,
"step": 496000
},
{
"epoch": 10.33,
"eval_loss": 3.4005918502807617,
"eval_runtime": 46.5843,
"eval_samples_per_second": 882.314,
"eval_steps_per_second": 55.147,
"step": 504000
},
{
"epoch": 10.49,
"learning_rate": 7.871258233969816e-06,
"loss": 3.2656,
"step": 512000
},
{
"epoch": 10.49,
"eval_loss": 3.358835220336914,
"eval_runtime": 46.2545,
"eval_samples_per_second": 888.605,
"eval_steps_per_second": 55.541,
"step": 512000
},
{
"epoch": 10.65,
"eval_loss": 3.379861831665039,
"eval_runtime": 45.613,
"eval_samples_per_second": 901.103,
"eval_steps_per_second": 56.322,
"step": 520000
},
{
"epoch": 10.82,
"learning_rate": 7.804552655715835e-06,
"loss": 3.2727,
"step": 528000
},
{
"epoch": 10.82,
"eval_loss": 3.383315086364746,
"eval_runtime": 46.0041,
"eval_samples_per_second": 893.442,
"eval_steps_per_second": 55.843,
"step": 528000
},
{
"epoch": 10.98,
"eval_loss": 3.356590747833252,
"eval_runtime": 45.9202,
"eval_samples_per_second": 895.074,
"eval_steps_per_second": 55.945,
"step": 536000
},
{
"epoch": 11.15,
"learning_rate": 7.737847077461853e-06,
"loss": 3.2705,
"step": 544000
},
{
"epoch": 11.15,
"eval_loss": 3.3793959617614746,
"eval_runtime": 45.6075,
"eval_samples_per_second": 901.211,
"eval_steps_per_second": 56.328,
"step": 544000
},
{
"epoch": 11.31,
"eval_loss": 3.3838233947753906,
"eval_runtime": 46.1859,
"eval_samples_per_second": 889.925,
"eval_steps_per_second": 55.623,
"step": 552000
},
{
"epoch": 11.47,
"learning_rate": 7.671141499207872e-06,
"loss": 3.2676,
"step": 560000
},
{
"epoch": 11.47,
"eval_loss": 3.3659656047821045,
"eval_runtime": 45.7183,
"eval_samples_per_second": 899.027,
"eval_steps_per_second": 56.192,
"step": 560000
},
{
"epoch": 11.64,
"eval_loss": 3.3937699794769287,
"eval_runtime": 45.9326,
"eval_samples_per_second": 894.832,
"eval_steps_per_second": 55.93,
"step": 568000
},
{
"epoch": 11.8,
"learning_rate": 7.604435920953891e-06,
"loss": 3.258,
"step": 576000
},
{
"epoch": 11.8,
"eval_loss": 3.3661420345306396,
"eval_runtime": 46.4625,
"eval_samples_per_second": 884.627,
"eval_steps_per_second": 55.292,
"step": 576000
},
{
"epoch": 11.97,
"eval_loss": 3.3490447998046875,
"eval_runtime": 45.8318,
"eval_samples_per_second": 896.801,
"eval_steps_per_second": 56.053,
"step": 584000
},
{
"epoch": 12.13,
"learning_rate": 7.537730342699909e-06,
"loss": 3.2646,
"step": 592000
},
{
"epoch": 12.13,
"eval_loss": 3.3716230392456055,
"eval_runtime": 45.6734,
"eval_samples_per_second": 899.91,
"eval_steps_per_second": 56.247,
"step": 592000
},
{
"epoch": 12.29,
"eval_loss": 3.3877346515655518,
"eval_runtime": 46.2161,
"eval_samples_per_second": 889.344,
"eval_steps_per_second": 55.587,
"step": 600000
},
{
"epoch": 12.46,
"learning_rate": 7.471024764445928e-06,
"loss": 3.2578,
"step": 608000
},
{
"epoch": 12.46,
"eval_loss": 3.3930206298828125,
"eval_runtime": 45.3985,
"eval_samples_per_second": 905.361,
"eval_steps_per_second": 56.588,
"step": 608000
},
{
"epoch": 12.62,
"eval_loss": 3.392077922821045,
"eval_runtime": 45.1724,
"eval_samples_per_second": 909.893,
"eval_steps_per_second": 56.871,
"step": 616000
},
{
"epoch": 12.78,
"learning_rate": 7.4043191861919465e-06,
"loss": 3.2719,
"step": 624000
},
{
"epoch": 12.78,
"eval_loss": 3.395730495452881,
"eval_runtime": 45.8195,
"eval_samples_per_second": 897.042,
"eval_steps_per_second": 56.068,
"step": 624000
},
{
"epoch": 12.95,
"eval_loss": 3.4196434020996094,
"eval_runtime": 45.2614,
"eval_samples_per_second": 908.103,
"eval_steps_per_second": 56.759,
"step": 632000
},
{
"epoch": 13.11,
"learning_rate": 7.337613607937964e-06,
"loss": 3.2828,
"step": 640000
},
{
"epoch": 13.11,
"eval_loss": 3.4077515602111816,
"eval_runtime": 45.5674,
"eval_samples_per_second": 902.004,
"eval_steps_per_second": 56.378,
"step": 640000
},
{
"epoch": 13.28,
"eval_loss": 3.4202864170074463,
"eval_runtime": 46.3249,
"eval_samples_per_second": 887.255,
"eval_steps_per_second": 55.456,
"step": 648000
},
{
"epoch": 13.44,
"learning_rate": 7.270908029683983e-06,
"loss": 3.2805,
"step": 656000
},
{
"epoch": 13.44,
"eval_loss": 3.3899548053741455,
"eval_runtime": 46.1588,
"eval_samples_per_second": 890.448,
"eval_steps_per_second": 55.656,
"step": 656000
},
{
"epoch": 13.6,
"eval_loss": 3.4037835597991943,
"eval_runtime": 46.9454,
"eval_samples_per_second": 875.527,
"eval_steps_per_second": 54.723,
"step": 664000
},
{
"epoch": 13.77,
"learning_rate": 7.2042024514300015e-06,
"loss": 3.2975,
"step": 672000
},
{
"epoch": 13.77,
"eval_loss": 3.405585765838623,
"eval_runtime": 46.2706,
"eval_samples_per_second": 888.297,
"eval_steps_per_second": 55.521,
"step": 672000
},
{
"epoch": 13.93,
"eval_loss": 3.428373336791992,
"eval_runtime": 45.9889,
"eval_samples_per_second": 893.738,
"eval_steps_per_second": 55.861,
"step": 680000
},
{
"epoch": 14.1,
"learning_rate": 7.13749687317602e-06,
"loss": 3.2965,
"step": 688000
},
{
"epoch": 14.1,
"eval_loss": 3.41803240776062,
"eval_runtime": 46.9126,
"eval_samples_per_second": 876.14,
"eval_steps_per_second": 54.761,
"step": 688000
},
{
"epoch": 14.26,
"eval_loss": 3.419599771499634,
"eval_runtime": 46.1796,
"eval_samples_per_second": 890.047,
"eval_steps_per_second": 55.631,
"step": 696000
},
{
"epoch": 14.42,
"learning_rate": 7.070791294922038e-06,
"loss": 3.3069,
"step": 704000
},
{
"epoch": 14.42,
"eval_loss": 3.425711154937744,
"eval_runtime": 46.2298,
"eval_samples_per_second": 889.08,
"eval_steps_per_second": 55.57,
"step": 704000
},
{
"epoch": 14.59,
"eval_loss": 3.4299447536468506,
"eval_runtime": 46.768,
"eval_samples_per_second": 878.85,
"eval_steps_per_second": 54.931,
"step": 712000
},
{
"epoch": 14.75,
"learning_rate": 7.0040857166680564e-06,
"loss": 3.3152,
"step": 720000
},
{
"epoch": 14.75,
"eval_loss": 3.4787514209747314,
"eval_runtime": 46.0913,
"eval_samples_per_second": 891.752,
"eval_steps_per_second": 55.737,
"step": 720000
},
{
"epoch": 14.92,
"eval_loss": 3.4424662590026855,
"eval_runtime": 46.3411,
"eval_samples_per_second": 886.945,
"eval_steps_per_second": 55.437,
"step": 728000
},
{
"epoch": 15.08,
"learning_rate": 6.937380138414076e-06,
"loss": 3.3125,
"step": 736000
},
{
"epoch": 15.08,
"eval_loss": 3.430126667022705,
"eval_runtime": 46.9882,
"eval_samples_per_second": 874.73,
"eval_steps_per_second": 54.673,
"step": 736000
},
{
"epoch": 15.24,
"eval_loss": 3.4440979957580566,
"eval_runtime": 46.1825,
"eval_samples_per_second": 889.99,
"eval_steps_per_second": 55.627,
"step": 744000
},
{
"epoch": 15.41,
"learning_rate": 6.8706745601600945e-06,
"loss": 3.3174,
"step": 752000
},
{
"epoch": 15.41,
"eval_loss": 3.4396116733551025,
"eval_runtime": 46.2686,
"eval_samples_per_second": 888.334,
"eval_steps_per_second": 55.524,
"step": 752000
},
{
"epoch": 15.57,
"eval_loss": 3.463931083679199,
"eval_runtime": 46.7798,
"eval_samples_per_second": 878.627,
"eval_steps_per_second": 54.917,
"step": 760000
},
{
"epoch": 15.73,
"learning_rate": 6.803968981906113e-06,
"loss": 3.3242,
"step": 768000
},
{
"epoch": 15.73,
"eval_loss": 3.4523837566375732,
"eval_runtime": 45.7867,
"eval_samples_per_second": 897.685,
"eval_steps_per_second": 56.108,
"step": 768000
},
{
"epoch": 15.9,
"eval_loss": 3.455958366394043,
"eval_runtime": 45.3124,
"eval_samples_per_second": 907.08,
"eval_steps_per_second": 56.695,
"step": 776000
},
{
"epoch": 16.06,
"learning_rate": 6.737263403652131e-06,
"loss": 3.3385,
"step": 784000
},
{
"epoch": 16.06,
"eval_loss": 3.4779999256134033,
"eval_runtime": 46.0072,
"eval_samples_per_second": 893.383,
"eval_steps_per_second": 55.839,
"step": 784000
},
{
"epoch": 16.23,
"eval_loss": 3.4773714542388916,
"eval_runtime": 45.131,
"eval_samples_per_second": 910.727,
"eval_steps_per_second": 56.923,
"step": 792000
},
{
"epoch": 16.39,
"learning_rate": 6.6705578253981495e-06,
"loss": 3.3371,
"step": 800000
},
{
"epoch": 16.39,
"eval_loss": 3.47719669342041,
"eval_runtime": 45.6308,
"eval_samples_per_second": 900.751,
"eval_steps_per_second": 56.3,
"step": 800000
},
{
"epoch": 16.55,
"eval_loss": 3.4955241680145264,
"eval_runtime": 46.0477,
"eval_samples_per_second": 892.597,
"eval_steps_per_second": 55.79,
"step": 808000
},
{
"epoch": 16.72,
"learning_rate": 6.603852247144168e-06,
"loss": 3.3633,
"step": 816000
},
{
"epoch": 16.72,
"eval_loss": 3.486057996749878,
"eval_runtime": 44.9231,
"eval_samples_per_second": 914.941,
"eval_steps_per_second": 57.187,
"step": 816000
},
{
"epoch": 16.88,
"eval_loss": 3.506316661834717,
"eval_runtime": 45.7078,
"eval_samples_per_second": 899.234,
"eval_steps_per_second": 56.205,
"step": 824000
},
{
"epoch": 17.05,
"learning_rate": 6.537146668890187e-06,
"loss": 3.3678,
"step": 832000
},
{
"epoch": 17.05,
"eval_loss": 3.50439190864563,
"eval_runtime": 45.0245,
"eval_samples_per_second": 912.882,
"eval_steps_per_second": 57.058,
"step": 832000
},
{
"epoch": 17.21,
"eval_loss": 3.520247220993042,
"eval_runtime": 45.2071,
"eval_samples_per_second": 909.193,
"eval_steps_per_second": 56.827,
"step": 840000
},
{
"epoch": 17.37,
"learning_rate": 6.4704410906362044e-06,
"loss": 3.3634,
"step": 848000
},
{
"epoch": 17.37,
"eval_loss": 3.4941418170928955,
"eval_runtime": 46.4208,
"eval_samples_per_second": 885.423,
"eval_steps_per_second": 55.342,
"step": 848000
},
{
"epoch": 17.54,
"eval_loss": 3.522303819656372,
"eval_runtime": 46.164,
"eval_samples_per_second": 890.347,
"eval_steps_per_second": 55.649,
"step": 856000
},
{
"epoch": 17.7,
"learning_rate": 6.403735512382223e-06,
"loss": 3.3797,
"step": 864000
},
{
"epoch": 17.7,
"eval_loss": 3.502774715423584,
"eval_runtime": 45.8285,
"eval_samples_per_second": 896.865,
"eval_steps_per_second": 56.057,
"step": 864000
},
{
"epoch": 17.87,
"eval_loss": 3.526393175125122,
"eval_runtime": 46.6422,
"eval_samples_per_second": 881.219,
"eval_steps_per_second": 55.079,
"step": 872000
},
{
"epoch": 18.03,
"learning_rate": 6.337029934128242e-06,
"loss": 3.3802,
"step": 880000
},
{
"epoch": 18.03,
"eval_loss": 3.531257152557373,
"eval_runtime": 46.217,
"eval_samples_per_second": 889.327,
"eval_steps_per_second": 55.586,
"step": 880000
},
{
"epoch": 18.19,
"eval_loss": 3.496319055557251,
"eval_runtime": 45.9803,
"eval_samples_per_second": 893.904,
"eval_steps_per_second": 55.872,
"step": 888000
},
{
"epoch": 18.36,
"learning_rate": 6.270324355874261e-06,
"loss": 3.357,
"step": 896000
},
{
"epoch": 18.36,
"eval_loss": 3.5171141624450684,
"eval_runtime": 47.1622,
"eval_samples_per_second": 871.504,
"eval_steps_per_second": 54.472,
"step": 896000
},
{
"epoch": 18.52,
"eval_loss": 3.530701160430908,
"eval_runtime": 46.113,
"eval_samples_per_second": 891.332,
"eval_steps_per_second": 55.711,
"step": 904000
},
{
"epoch": 18.69,
"learning_rate": 6.20361877762028e-06,
"loss": 3.3866,
"step": 912000
},
{
"epoch": 18.69,
"eval_loss": 3.5221967697143555,
"eval_runtime": 46.035,
"eval_samples_per_second": 892.843,
"eval_steps_per_second": 55.805,
"step": 912000
},
{
"epoch": 18.85,
"eval_loss": 3.5319056510925293,
"eval_runtime": 46.8446,
"eval_samples_per_second": 877.412,
"eval_steps_per_second": 54.841,
"step": 920000
},
{
"epoch": 19.01,
"learning_rate": 6.1369131993662975e-06,
"loss": 3.3818,
"step": 928000
},
{
"epoch": 19.01,
"eval_loss": 3.532552480697632,
"eval_runtime": 46.3901,
"eval_samples_per_second": 886.007,
"eval_steps_per_second": 55.378,
"step": 928000
},
{
"epoch": 19.18,
"eval_loss": 3.5116307735443115,
"eval_runtime": 45.2931,
"eval_samples_per_second": 907.466,
"eval_steps_per_second": 56.719,
"step": 936000
},
{
"epoch": 19.34,
"learning_rate": 6.070207621112316e-06,
"loss": 3.3754,
"step": 944000
},
{
"epoch": 19.34,
"eval_loss": 3.5228991508483887,
"eval_runtime": 47.0715,
"eval_samples_per_second": 873.183,
"eval_steps_per_second": 54.577,
"step": 944000
},
{
"epoch": 19.5,
"eval_loss": 3.538318634033203,
"eval_runtime": 45.9256,
"eval_samples_per_second": 894.97,
"eval_steps_per_second": 55.938,
"step": 952000
},
{
"epoch": 19.67,
"learning_rate": 6.003502042858335e-06,
"loss": 3.3893,
"step": 960000
},
{
"epoch": 19.67,
"eval_loss": 3.544513463973999,
"eval_runtime": 46.8245,
"eval_samples_per_second": 877.788,
"eval_steps_per_second": 54.864,
"step": 960000
},
{
"epoch": 19.83,
"eval_loss": 3.5230634212493896,
"eval_runtime": 47.3348,
"eval_samples_per_second": 868.325,
"eval_steps_per_second": 54.273,
"step": 968000
},
{
"epoch": 20.0,
"learning_rate": 5.936796464604353e-06,
"loss": 3.3899,
"step": 976000
},
{
"epoch": 20.0,
"eval_loss": 3.531026840209961,
"eval_runtime": 45.7886,
"eval_samples_per_second": 897.647,
"eval_steps_per_second": 56.106,
"step": 976000
},
{
"epoch": 20.16,
"eval_loss": 3.53287935256958,
"eval_runtime": 46.7771,
"eval_samples_per_second": 878.677,
"eval_steps_per_second": 54.92,
"step": 984000
},
{
"epoch": 20.32,
"learning_rate": 5.870090886350371e-06,
"loss": 3.3918,
"step": 992000
},
{
"epoch": 20.32,
"eval_loss": 3.5158653259277344,
"eval_runtime": 46.2173,
"eval_samples_per_second": 889.32,
"eval_steps_per_second": 55.585,
"step": 992000
},
{
"epoch": 20.49,
"eval_loss": 3.562788486480713,
"eval_runtime": 45.7474,
"eval_samples_per_second": 898.456,
"eval_steps_per_second": 56.156,
"step": 1000000
},
{
"epoch": 20.65,
"learning_rate": 5.80338530809639e-06,
"loss": 3.3786,
"step": 1008000
},
{
"epoch": 20.65,
"eval_loss": 3.5290534496307373,
"eval_runtime": 46.4581,
"eval_samples_per_second": 884.711,
"eval_steps_per_second": 55.297,
"step": 1008000
},
{
"epoch": 20.82,
"eval_loss": 3.5163111686706543,
"eval_runtime": 45.899,
"eval_samples_per_second": 895.487,
"eval_steps_per_second": 55.971,
"step": 1016000
},
{
"epoch": 20.98,
"learning_rate": 5.736679729842408e-06,
"loss": 3.3862,
"step": 1024000
},
{
"epoch": 20.98,
"eval_loss": 3.531219959259033,
"eval_runtime": 45.4959,
"eval_samples_per_second": 903.423,
"eval_steps_per_second": 56.467,
"step": 1024000
},
{
"epoch": 21.14,
"eval_loss": 3.514033317565918,
"eval_runtime": 46.6408,
"eval_samples_per_second": 881.245,
"eval_steps_per_second": 55.08,
"step": 1032000
},
{
"epoch": 21.31,
"learning_rate": 5.669974151588427e-06,
"loss": 3.3855,
"step": 1040000
},
{
"epoch": 21.31,
"eval_loss": 3.5617153644561768,
"eval_runtime": 45.7071,
"eval_samples_per_second": 899.248,
"eval_steps_per_second": 56.206,
"step": 1040000
},
{
"epoch": 21.47,
"eval_loss": 3.5374927520751953,
"eval_runtime": 45.668,
"eval_samples_per_second": 900.018,
"eval_steps_per_second": 56.254,
"step": 1048000
},
{
"epoch": 21.64,
"learning_rate": 5.603268573334446e-06,
"loss": 3.3872,
"step": 1056000
},
{
"epoch": 21.64,
"eval_loss": 3.532823085784912,
"eval_runtime": 46.5514,
"eval_samples_per_second": 882.938,
"eval_steps_per_second": 55.186,
"step": 1056000
},
{
"epoch": 21.8,
"eval_loss": 3.561626434326172,
"eval_runtime": 45.9586,
"eval_samples_per_second": 894.327,
"eval_steps_per_second": 55.898,
"step": 1064000
},
{
"epoch": 21.96,
"learning_rate": 5.536562995080464e-06,
"loss": 3.3931,
"step": 1072000
},
{
"epoch": 21.96,
"eval_loss": 3.5647873878479004,
"eval_runtime": 46.8936,
"eval_samples_per_second": 876.495,
"eval_steps_per_second": 54.784,
"step": 1072000
},
{
"epoch": 22.13,
"eval_loss": 3.544335126876831,
"eval_runtime": 46.3686,
"eval_samples_per_second": 886.419,
"eval_steps_per_second": 55.404,
"step": 1080000
},
{
"epoch": 22.29,
"learning_rate": 5.469857416826483e-06,
"loss": 3.3708,
"step": 1088000
},
{
"epoch": 22.29,
"eval_loss": 3.5400941371917725,
"eval_runtime": 45.8359,
"eval_samples_per_second": 896.72,
"eval_steps_per_second": 56.048,
"step": 1088000
},
{
"epoch": 22.45,
"eval_loss": 3.55292010307312,
"eval_runtime": 46.8082,
"eval_samples_per_second": 878.095,
"eval_steps_per_second": 54.884,
"step": 1096000
},
{
"epoch": 22.62,
"learning_rate": 5.403151838572501e-06,
"loss": 3.4099,
"step": 1104000
},
{
"epoch": 22.62,
"eval_loss": 3.533414602279663,
"eval_runtime": 46.1107,
"eval_samples_per_second": 891.377,
"eval_steps_per_second": 55.714,
"step": 1104000
},
{
"epoch": 22.78,
"eval_loss": 3.5325212478637695,
"eval_runtime": 46.1351,
"eval_samples_per_second": 890.905,
"eval_steps_per_second": 55.684,
"step": 1112000
},
{
"epoch": 22.95,
"learning_rate": 5.33644626031852e-06,
"loss": 3.4027,
"step": 1120000
},
{
"epoch": 22.95,
"eval_loss": 3.5818660259246826,
"eval_runtime": 46.7428,
"eval_samples_per_second": 879.323,
"eval_steps_per_second": 54.96,
"step": 1120000
},
{
"epoch": 23.11,
"eval_loss": 3.5470829010009766,
"eval_runtime": 46.1344,
"eval_samples_per_second": 890.92,
"eval_steps_per_second": 55.685,
"step": 1128000
},
{
"epoch": 23.27,
"learning_rate": 5.269740682064538e-06,
"loss": 3.4035,
"step": 1136000
},
{
"epoch": 23.27,
"eval_loss": 3.548552989959717,
"eval_runtime": 46.1071,
"eval_samples_per_second": 891.446,
"eval_steps_per_second": 55.718,
"step": 1136000
},
{
"epoch": 23.44,
"eval_loss": 3.5470151901245117,
"eval_runtime": 46.849,
"eval_samples_per_second": 877.33,
"eval_steps_per_second": 54.836,
"step": 1144000
},
{
"epoch": 23.6,
"learning_rate": 5.203035103810556e-06,
"loss": 3.3964,
"step": 1152000
},
{
"epoch": 23.6,
"eval_loss": 3.572176694869995,
"eval_runtime": 46.3661,
"eval_samples_per_second": 886.467,
"eval_steps_per_second": 55.407,
"step": 1152000
},
{
"epoch": 23.77,
"eval_loss": 3.55098295211792,
"eval_runtime": 46.1812,
"eval_samples_per_second": 890.015,
"eval_steps_per_second": 55.629,
"step": 1160000
},
{
"epoch": 23.93,
"learning_rate": 5.136329525556575e-06,
"loss": 3.4115,
"step": 1168000
},
{
"epoch": 23.93,
"eval_loss": 3.561007499694824,
"eval_runtime": 47.5429,
"eval_samples_per_second": 864.525,
"eval_steps_per_second": 54.035,
"step": 1168000
},
{
"epoch": 24.09,
"eval_loss": 3.5757482051849365,
"eval_runtime": 46.3962,
"eval_samples_per_second": 885.891,
"eval_steps_per_second": 55.371,
"step": 1176000
},
{
"epoch": 24.26,
"learning_rate": 5.0696239473025935e-06,
"loss": 3.4173,
"step": 1184000
},
{
"epoch": 24.26,
"eval_loss": 3.554094076156616,
"eval_runtime": 45.5708,
"eval_samples_per_second": 901.936,
"eval_steps_per_second": 56.374,
"step": 1184000
},
{
"epoch": 24.42,
"eval_loss": 3.577660083770752,
"eval_runtime": 47.0565,
"eval_samples_per_second": 873.461,
"eval_steps_per_second": 54.594,
"step": 1192000
},
{
"epoch": 24.59,
"learning_rate": 5.002918369048611e-06,
"loss": 3.4169,
"step": 1200000
},
{
"epoch": 24.59,
"eval_loss": 3.5637948513031006,
"eval_runtime": 47.0711,
"eval_samples_per_second": 873.19,
"eval_steps_per_second": 54.577,
"step": 1200000
},
{
"epoch": 24.75,
"eval_loss": 3.5462896823883057,
"eval_runtime": 46.8215,
"eval_samples_per_second": 877.845,
"eval_steps_per_second": 54.868,
"step": 1208000
},
{
"epoch": 24.91,
"learning_rate": 4.936212790794631e-06,
"loss": 3.4031,
"step": 1216000
},
{
"epoch": 24.91,
"eval_loss": 3.5299670696258545,
"eval_runtime": 46.9742,
"eval_samples_per_second": 874.99,
"eval_steps_per_second": 54.69,
"step": 1216000
},
{
"epoch": 25.08,
"eval_loss": 3.558427333831787,
"eval_runtime": 46.1322,
"eval_samples_per_second": 890.961,
"eval_steps_per_second": 55.688,
"step": 1224000
},
{
"epoch": 25.24,
"learning_rate": 4.869507212540649e-06,
"loss": 3.4094,
"step": 1232000
},
{
"epoch": 25.24,
"eval_loss": 3.568174123764038,
"eval_runtime": 46.3049,
"eval_samples_per_second": 887.638,
"eval_steps_per_second": 55.48,
"step": 1232000
},
{
"epoch": 25.41,
"eval_loss": 3.555844783782959,
"eval_runtime": 46.0676,
"eval_samples_per_second": 892.211,
"eval_steps_per_second": 55.766,
"step": 1240000
},
{
"epoch": 25.57,
"learning_rate": 4.802801634286667e-06,
"loss": 3.4116,
"step": 1248000
},
{
"epoch": 25.57,
"eval_loss": 3.5629091262817383,
"eval_runtime": 45.5765,
"eval_samples_per_second": 901.825,
"eval_steps_per_second": 56.367,
"step": 1248000
},
{
"epoch": 25.73,
"eval_loss": 3.5490224361419678,
"eval_runtime": 46.4409,
"eval_samples_per_second": 885.039,
"eval_steps_per_second": 55.318,
"step": 1256000
},
{
"epoch": 25.9,
"learning_rate": 4.7360960560326865e-06,
"loss": 3.4199,
"step": 1264000
},
{
"epoch": 25.9,
"eval_loss": 3.567878484725952,
"eval_runtime": 46.1595,
"eval_samples_per_second": 890.434,
"eval_steps_per_second": 55.655,
"step": 1264000
},
{
"epoch": 26.06,
"eval_loss": 3.5885465145111084,
"eval_runtime": 45.9316,
"eval_samples_per_second": 894.853,
"eval_steps_per_second": 55.931,
"step": 1272000
},
{
"epoch": 26.22,
"learning_rate": 4.669390477778704e-06,
"loss": 3.412,
"step": 1280000
},
{
"epoch": 26.22,
"eval_loss": 3.5578629970550537,
"eval_runtime": 46.4337,
"eval_samples_per_second": 885.176,
"eval_steps_per_second": 55.326,
"step": 1280000
},
{
"epoch": 26.39,
"eval_loss": 3.5465352535247803,
"eval_runtime": 45.7517,
"eval_samples_per_second": 898.371,
"eval_steps_per_second": 56.151,
"step": 1288000
},
{
"epoch": 26.55,
"learning_rate": 4.602684899524723e-06,
"loss": 3.4123,
"step": 1296000
},
{
"epoch": 26.55,
"eval_loss": 3.572610855102539,
"eval_runtime": 45.5426,
"eval_samples_per_second": 902.496,
"eval_steps_per_second": 56.409,
"step": 1296000
},
{
"epoch": 26.72,
"eval_loss": 3.577484130859375,
"eval_runtime": 46.4204,
"eval_samples_per_second": 885.431,
"eval_steps_per_second": 55.342,
"step": 1304000
},
{
"epoch": 26.88,
"learning_rate": 4.5359793212707415e-06,
"loss": 3.4132,
"step": 1312000
},
{
"epoch": 26.88,
"eval_loss": 3.5477850437164307,
"eval_runtime": 45.6512,
"eval_samples_per_second": 900.348,
"eval_steps_per_second": 56.275,
"step": 1312000
},
{
"epoch": 27.04,
"eval_loss": 3.5588574409484863,
"eval_runtime": 46.0446,
"eval_samples_per_second": 892.657,
"eval_steps_per_second": 55.794,
"step": 1320000
},
{
"epoch": 27.21,
"learning_rate": 4.46927374301676e-06,
"loss": 3.4161,
"step": 1328000
},
{
"epoch": 27.21,
"eval_loss": 3.56620717048645,
"eval_runtime": 46.4839,
"eval_samples_per_second": 884.22,
"eval_steps_per_second": 55.266,
"step": 1328000
},
{
"epoch": 27.37,
"eval_loss": 3.589487075805664,
"eval_runtime": 46.3966,
"eval_samples_per_second": 885.884,
"eval_steps_per_second": 55.37,
"step": 1336000
},
{
"epoch": 27.54,
"learning_rate": 4.402568164762779e-06,
"loss": 3.4097,
"step": 1344000
},
{
"epoch": 27.54,
"eval_loss": 3.5940632820129395,
"eval_runtime": 46.4364,
"eval_samples_per_second": 885.125,
"eval_steps_per_second": 55.323,
"step": 1344000
},
{
"epoch": 27.7,
"eval_loss": 3.5912110805511475,
"eval_runtime": 45.9687,
"eval_samples_per_second": 894.131,
"eval_steps_per_second": 55.886,
"step": 1352000
},
{
"epoch": 27.86,
"learning_rate": 4.335862586508797e-06,
"loss": 3.415,
"step": 1360000
},
{
"epoch": 27.86,
"eval_loss": 3.565756320953369,
"eval_runtime": 45.7621,
"eval_samples_per_second": 898.168,
"eval_steps_per_second": 56.138,
"step": 1360000
},
{
"epoch": 28.03,
"eval_loss": 3.5553781986236572,
"eval_runtime": 46.2903,
"eval_samples_per_second": 887.919,
"eval_steps_per_second": 55.498,
"step": 1368000
},
{
"epoch": 28.19,
"learning_rate": 4.269157008254816e-06,
"loss": 3.4193,
"step": 1376000
},
{
"epoch": 28.19,
"eval_loss": 3.589851140975952,
"eval_runtime": 45.8411,
"eval_samples_per_second": 896.618,
"eval_steps_per_second": 56.041,
"step": 1376000
},
{
"epoch": 28.36,
"eval_loss": 3.5652260780334473,
"eval_runtime": 45.5538,
"eval_samples_per_second": 902.275,
"eval_steps_per_second": 56.395,
"step": 1384000
},
{
"epoch": 28.52,
"learning_rate": 4.202451430000834e-06,
"loss": 3.4136,
"step": 1392000
},
{
"epoch": 28.52,
"eval_loss": 3.5832390785217285,
"eval_runtime": 46.575,
"eval_samples_per_second": 882.491,
"eval_steps_per_second": 55.158,
"step": 1392000
},
{
"epoch": 28.68,
"eval_loss": 3.5885210037231445,
"eval_runtime": 45.9659,
"eval_samples_per_second": 894.184,
"eval_steps_per_second": 55.889,
"step": 1400000
},
{
"epoch": 28.85,
"learning_rate": 4.135745851746852e-06,
"loss": 3.4294,
"step": 1408000
},
{
"epoch": 28.85,
"eval_loss": 3.583249807357788,
"eval_runtime": 45.7927,
"eval_samples_per_second": 897.568,
"eval_steps_per_second": 56.101,
"step": 1408000
},
{
"epoch": 29.01,
"eval_loss": 3.6025209426879883,
"eval_runtime": 46.362,
"eval_samples_per_second": 886.546,
"eval_steps_per_second": 55.412,
"step": 1416000
},
{
"epoch": 29.17,
"learning_rate": 4.069040273492872e-06,
"loss": 3.4243,
"step": 1424000
},
{
"epoch": 29.17,
"eval_loss": 3.6040360927581787,
"eval_runtime": 45.7855,
"eval_samples_per_second": 897.708,
"eval_steps_per_second": 56.11,
"step": 1424000
},
{
"epoch": 29.34,
"eval_loss": 3.5890395641326904,
"eval_runtime": 46.5109,
"eval_samples_per_second": 883.707,
"eval_steps_per_second": 55.234,
"step": 1432000
},
{
"epoch": 29.5,
"learning_rate": 4.0023346952388895e-06,
"loss": 3.4427,
"step": 1440000
},
{
"epoch": 29.5,
"eval_loss": 3.58347749710083,
"eval_runtime": 46.2896,
"eval_samples_per_second": 887.931,
"eval_steps_per_second": 55.498,
"step": 1440000
},
{
"epoch": 29.67,
"eval_loss": 3.6185286045074463,
"eval_runtime": 46.4189,
"eval_samples_per_second": 885.459,
"eval_steps_per_second": 55.344,
"step": 1448000
},
{
"epoch": 29.83,
"learning_rate": 3.935629116984908e-06,
"loss": 3.4293,
"step": 1456000
},
{
"epoch": 29.83,
"eval_loss": 3.6028919219970703,
"eval_runtime": 46.7251,
"eval_samples_per_second": 879.656,
"eval_steps_per_second": 54.981,
"step": 1456000
},
{
"epoch": 29.99,
"eval_loss": 3.616161823272705,
"eval_runtime": 45.7265,
"eval_samples_per_second": 898.865,
"eval_steps_per_second": 56.182,
"step": 1464000
},
{
"epoch": 30.16,
"learning_rate": 3.868923538730927e-06,
"loss": 3.4363,
"step": 1472000
},
{
"epoch": 30.16,
"eval_loss": 3.6257941722869873,
"eval_runtime": 45.6532,
"eval_samples_per_second": 900.308,
"eval_steps_per_second": 56.272,
"step": 1472000
},
{
"epoch": 30.32,
"eval_loss": 3.6038014888763428,
"eval_runtime": 46.717,
"eval_samples_per_second": 879.808,
"eval_steps_per_second": 54.991,
"step": 1480000
},
{
"epoch": 30.49,
"learning_rate": 3.8022179604769453e-06,
"loss": 3.4532,
"step": 1488000
},
{
"epoch": 30.49,
"eval_loss": 3.6039483547210693,
"eval_runtime": 45.742,
"eval_samples_per_second": 898.562,
"eval_steps_per_second": 56.163,
"step": 1488000
},
{
"epoch": 30.65,
"eval_loss": 3.605367422103882,
"eval_runtime": 45.7078,
"eval_samples_per_second": 899.234,
"eval_steps_per_second": 56.205,
"step": 1496000
},
{
"epoch": 30.81,
"learning_rate": 3.735512382222964e-06,
"loss": 3.4401,
"step": 1504000
},
{
"epoch": 30.81,
"eval_loss": 3.6269376277923584,
"eval_runtime": 46.6124,
"eval_samples_per_second": 881.783,
"eval_steps_per_second": 55.114,
"step": 1504000
},
{
"epoch": 30.98,
"eval_loss": 3.600417137145996,
"eval_runtime": 47.0146,
"eval_samples_per_second": 874.239,
"eval_steps_per_second": 54.643,
"step": 1512000
},
{
"epoch": 31.14,
"learning_rate": 3.668806803968982e-06,
"loss": 3.4491,
"step": 1520000
},
{
"epoch": 31.14,
"eval_loss": 3.6095597743988037,
"eval_runtime": 47.1653,
"eval_samples_per_second": 871.446,
"eval_steps_per_second": 54.468,
"step": 1520000
},
{
"epoch": 31.31,
"eval_loss": 3.6216766834259033,
"eval_runtime": 48.343,
"eval_samples_per_second": 850.216,
"eval_steps_per_second": 53.141,
"step": 1528000
},
{
"epoch": 31.47,
"learning_rate": 3.6021012257150007e-06,
"loss": 3.4438,
"step": 1536000
},
{
"epoch": 31.47,
"eval_loss": 3.6081080436706543,
"eval_runtime": 47.4804,
"eval_samples_per_second": 865.663,
"eval_steps_per_second": 54.107,
"step": 1536000
},
{
"epoch": 31.63,
"eval_loss": 3.6190168857574463,
"eval_runtime": 48.3587,
"eval_samples_per_second": 849.941,
"eval_steps_per_second": 53.124,
"step": 1544000
},
{
"epoch": 31.8,
"learning_rate": 3.535395647461019e-06,
"loss": 3.4337,
"step": 1552000
},
{
"epoch": 31.8,
"eval_loss": 3.611992835998535,
"eval_runtime": 47.5342,
"eval_samples_per_second": 864.683,
"eval_steps_per_second": 54.045,
"step": 1552000
},
{
"epoch": 31.96,
"eval_loss": 3.586127996444702,
"eval_runtime": 46.8726,
"eval_samples_per_second": 876.888,
"eval_steps_per_second": 54.808,
"step": 1560000
},
{
"epoch": 32.13,
"learning_rate": 3.468690069207038e-06,
"loss": 3.4475,
"step": 1568000
},
{
"epoch": 32.13,
"eval_loss": 3.620932102203369,
"eval_runtime": 48.2654,
"eval_samples_per_second": 851.582,
"eval_steps_per_second": 53.226,
"step": 1568000
},
{
"epoch": 32.29,
"eval_loss": 3.6301937103271484,
"eval_runtime": 47.2416,
"eval_samples_per_second": 870.039,
"eval_steps_per_second": 54.38,
"step": 1576000
},
{
"epoch": 32.45,
"learning_rate": 3.4019844909530565e-06,
"loss": 3.4406,
"step": 1584000
},
{
"epoch": 32.45,
"eval_loss": 3.6052932739257812,
"eval_runtime": 46.0861,
"eval_samples_per_second": 891.852,
"eval_steps_per_second": 55.743,
"step": 1584000
},
{
"epoch": 32.62,
"eval_loss": 3.593369960784912,
"eval_runtime": 49.6475,
"eval_samples_per_second": 827.876,
"eval_steps_per_second": 51.745,
"step": 1592000
},
{
"epoch": 32.78,
"learning_rate": 3.3352789126990747e-06,
"loss": 3.4392,
"step": 1600000
},
{
"epoch": 32.78,
"eval_loss": 3.594203472137451,
"eval_runtime": 47.8907,
"eval_samples_per_second": 858.246,
"eval_steps_per_second": 53.643,
"step": 1600000
},
{
"epoch": 32.94,
"eval_loss": 3.601329803466797,
"eval_runtime": 46.6549,
"eval_samples_per_second": 880.98,
"eval_steps_per_second": 55.064,
"step": 1608000
},
{
"epoch": 33.11,
"learning_rate": 3.2685733344450933e-06,
"loss": 3.4514,
"step": 1616000
},
{
"epoch": 33.11,
"eval_loss": 3.6505630016326904,
"eval_runtime": 47.3453,
"eval_samples_per_second": 868.132,
"eval_steps_per_second": 54.261,
"step": 1616000
},
{
"epoch": 33.27,
"eval_loss": 3.604905128479004,
"eval_runtime": 47.3478,
"eval_samples_per_second": 868.087,
"eval_steps_per_second": 54.258,
"step": 1624000
},
{
"epoch": 33.44,
"learning_rate": 3.2018677561911115e-06,
"loss": 3.4406,
"step": 1632000
},
{
"epoch": 33.44,
"eval_loss": 3.6285159587860107,
"eval_runtime": 45.2665,
"eval_samples_per_second": 908.001,
"eval_steps_per_second": 56.753,
"step": 1632000
},
{
"epoch": 33.6,
"eval_loss": 3.6107122898101807,
"eval_runtime": 47.0075,
"eval_samples_per_second": 874.372,
"eval_steps_per_second": 54.651,
"step": 1640000
},
{
"epoch": 33.76,
"learning_rate": 3.1351621779371306e-06,
"loss": 3.4522,
"step": 1648000
},
{
"epoch": 33.76,
"eval_loss": 3.6080775260925293,
"eval_runtime": 46.384,
"eval_samples_per_second": 886.124,
"eval_steps_per_second": 55.385,
"step": 1648000
},
{
"epoch": 33.93,
"eval_loss": 3.6121394634246826,
"eval_runtime": 47.5808,
"eval_samples_per_second": 863.836,
"eval_steps_per_second": 53.992,
"step": 1656000
},
{
"epoch": 34.09,
"learning_rate": 3.0684565996831487e-06,
"loss": 3.4592,
"step": 1664000
},
{
"epoch": 34.09,
"eval_loss": 3.639568567276001,
"eval_runtime": 47.4907,
"eval_samples_per_second": 865.474,
"eval_steps_per_second": 54.095,
"step": 1664000
},
{
"epoch": 34.26,
"eval_loss": 3.628408432006836,
"eval_runtime": 45.8805,
"eval_samples_per_second": 895.849,
"eval_steps_per_second": 55.993,
"step": 1672000
},
{
"epoch": 34.42,
"learning_rate": 3.0017510214291673e-06,
"loss": 3.4587,
"step": 1680000
},
{
"epoch": 34.42,
"eval_loss": 3.619464635848999,
"eval_runtime": 46.7813,
"eval_samples_per_second": 878.599,
"eval_steps_per_second": 54.915,
"step": 1680000
},
{
"epoch": 34.58,
"eval_loss": 3.6168148517608643,
"eval_runtime": 46.0408,
"eval_samples_per_second": 892.731,
"eval_steps_per_second": 55.798,
"step": 1688000
},
{
"epoch": 34.75,
"learning_rate": 2.9350454431751855e-06,
"loss": 3.4589,
"step": 1696000
},
{
"epoch": 34.75,
"eval_loss": 3.631527900695801,
"eval_runtime": 45.9831,
"eval_samples_per_second": 893.85,
"eval_steps_per_second": 55.868,
"step": 1696000
},
{
"epoch": 34.91,
"eval_loss": 3.6044745445251465,
"eval_runtime": 46.5293,
"eval_samples_per_second": 883.356,
"eval_steps_per_second": 55.212,
"step": 1704000
},
{
"epoch": 35.08,
"learning_rate": 2.868339864921204e-06,
"loss": 3.4703,
"step": 1712000
},
{
"epoch": 35.08,
"eval_loss": 3.6251227855682373,
"eval_runtime": 45.5912,
"eval_samples_per_second": 901.533,
"eval_steps_per_second": 56.349,
"step": 1712000
},
{
"epoch": 35.24,
"eval_loss": 3.6251931190490723,
"eval_runtime": 45.7404,
"eval_samples_per_second": 898.593,
"eval_steps_per_second": 56.165,
"step": 1720000
},
{
"epoch": 35.4,
"learning_rate": 2.801634286667223e-06,
"loss": 3.4565,
"step": 1728000
},
{
"epoch": 35.4,
"eval_loss": 3.62538743019104,
"eval_runtime": 46.4207,
"eval_samples_per_second": 885.423,
"eval_steps_per_second": 55.342,
"step": 1728000
},
{
"epoch": 35.57,
"eval_loss": 3.6544113159179688,
"eval_runtime": 45.7864,
"eval_samples_per_second": 897.691,
"eval_steps_per_second": 56.108,
"step": 1736000
},
{
"epoch": 35.73,
"learning_rate": 2.7349287084132413e-06,
"loss": 3.4634,
"step": 1744000
},
{
"epoch": 35.73,
"eval_loss": 3.629049062728882,
"eval_runtime": 46.556,
"eval_samples_per_second": 882.85,
"eval_steps_per_second": 55.181,
"step": 1744000
},
{
"epoch": 35.9,
"eval_loss": 3.612429618835449,
"eval_runtime": 46.5059,
"eval_samples_per_second": 883.802,
"eval_steps_per_second": 55.24,
"step": 1752000
},
{
"epoch": 36.06,
"learning_rate": 2.66822313015926e-06,
"loss": 3.4625,
"step": 1760000
},
{
"epoch": 36.06,
"eval_loss": 3.6262378692626953,
"eval_runtime": 45.8554,
"eval_samples_per_second": 896.34,
"eval_steps_per_second": 56.024,
"step": 1760000
},
{
"epoch": 36.22,
"eval_loss": 3.6317975521087646,
"eval_runtime": 46.7318,
"eval_samples_per_second": 879.529,
"eval_steps_per_second": 54.973,
"step": 1768000
},
{
"epoch": 36.39,
"learning_rate": 2.601517551905278e-06,
"loss": 3.457,
"step": 1776000
},
{
"epoch": 36.39,
"eval_loss": 3.640812397003174,
"eval_runtime": 45.9688,
"eval_samples_per_second": 894.129,
"eval_steps_per_second": 55.886,
"step": 1776000
},
{
"epoch": 36.55,
"eval_loss": 3.6433026790618896,
"eval_runtime": 45.8154,
"eval_samples_per_second": 897.122,
"eval_steps_per_second": 56.073,
"step": 1784000
},
{
"epoch": 36.71,
"learning_rate": 2.5348119736512967e-06,
"loss": 3.4618,
"step": 1792000
},
{
"epoch": 36.71,
"eval_loss": 3.627612352371216,
"eval_runtime": 46.6149,
"eval_samples_per_second": 881.735,
"eval_steps_per_second": 55.111,
"step": 1792000
},
{
"epoch": 36.88,
"eval_loss": 3.631366014480591,
"eval_runtime": 46.0925,
"eval_samples_per_second": 891.729,
"eval_steps_per_second": 55.736,
"step": 1800000
},
{
"epoch": 37.04,
"learning_rate": 2.4681063953973154e-06,
"loss": 3.4611,
"step": 1808000
},
{
"epoch": 37.04,
"eval_loss": 3.6415860652923584,
"eval_runtime": 46.287,
"eval_samples_per_second": 887.982,
"eval_steps_per_second": 55.502,
"step": 1808000
},
{
"epoch": 37.21,
"eval_loss": 3.665800094604492,
"eval_runtime": 46.839,
"eval_samples_per_second": 877.517,
"eval_steps_per_second": 54.847,
"step": 1816000
},
{
"epoch": 37.37,
"learning_rate": 2.4014008171433335e-06,
"loss": 3.4651,
"step": 1824000
},
{
"epoch": 37.37,
"eval_loss": 3.638195037841797,
"eval_runtime": 46.0815,
"eval_samples_per_second": 891.942,
"eval_steps_per_second": 55.749,
"step": 1824000
},
{
"epoch": 37.53,
"eval_loss": 3.656243085861206,
"eval_runtime": 45.3257,
"eval_samples_per_second": 906.815,
"eval_steps_per_second": 56.679,
"step": 1832000
},
{
"epoch": 37.7,
"learning_rate": 2.334695238889352e-06,
"loss": 3.4625,
"step": 1840000
},
{
"epoch": 37.7,
"eval_loss": 3.6376214027404785,
"eval_runtime": 47.1734,
"eval_samples_per_second": 871.296,
"eval_steps_per_second": 54.459,
"step": 1840000
},
{
"epoch": 37.86,
"eval_loss": 3.651963710784912,
"eval_runtime": 46.059,
"eval_samples_per_second": 892.377,
"eval_steps_per_second": 55.776,
"step": 1848000
},
{
"epoch": 38.03,
"learning_rate": 2.2679896606353707e-06,
"loss": 3.4561,
"step": 1856000
},
{
"epoch": 38.03,
"eval_loss": 3.6300716400146484,
"eval_runtime": 46.8158,
"eval_samples_per_second": 877.951,
"eval_steps_per_second": 54.875,
"step": 1856000
},
{
"epoch": 38.19,
"eval_loss": 3.619462728500366,
"eval_runtime": 45.8596,
"eval_samples_per_second": 896.258,
"eval_steps_per_second": 56.019,
"step": 1864000
},
{
"epoch": 38.35,
"learning_rate": 2.2012840823813894e-06,
"loss": 3.4655,
"step": 1872000
},
{
"epoch": 38.35,
"eval_loss": 3.6279447078704834,
"eval_runtime": 46.2215,
"eval_samples_per_second": 889.241,
"eval_steps_per_second": 55.58,
"step": 1872000
},
{
"epoch": 38.52,
"eval_loss": 3.636460542678833,
"eval_runtime": 46.7533,
"eval_samples_per_second": 879.125,
"eval_steps_per_second": 54.948,
"step": 1880000
},
{
"epoch": 38.68,
"learning_rate": 2.134578504127408e-06,
"loss": 3.4637,
"step": 1888000
},
{
"epoch": 38.68,
"eval_loss": 3.638620138168335,
"eval_runtime": 46.2177,
"eval_samples_per_second": 889.313,
"eval_steps_per_second": 55.585,
"step": 1888000
},
{
"epoch": 38.85,
"eval_loss": 3.643373489379883,
"eval_runtime": 45.9947,
"eval_samples_per_second": 893.624,
"eval_steps_per_second": 55.854,
"step": 1896000
},
{
"epoch": 39.01,
"learning_rate": 2.067872925873426e-06,
"loss": 3.458,
"step": 1904000
},
{
"epoch": 39.01,
"eval_loss": 3.65189266204834,
"eval_runtime": 46.7003,
"eval_samples_per_second": 880.122,
"eval_steps_per_second": 55.01,
"step": 1904000
},
{
"epoch": 39.17,
"eval_loss": 3.6438076496124268,
"eval_runtime": 46.3785,
"eval_samples_per_second": 886.229,
"eval_steps_per_second": 55.392,
"step": 1912000
},
{
"epoch": 39.34,
"learning_rate": 2.0011673476194448e-06,
"loss": 3.4523,
"step": 1920000
},
{
"epoch": 39.34,
"eval_loss": 3.640777349472046,
"eval_runtime": 46.701,
"eval_samples_per_second": 880.109,
"eval_steps_per_second": 55.01,
"step": 1920000
},
{
"epoch": 39.5,
"eval_loss": 3.6513171195983887,
"eval_runtime": 46.884,
"eval_samples_per_second": 876.675,
"eval_steps_per_second": 54.795,
"step": 1928000
},
{
"epoch": 39.66,
"learning_rate": 1.9344617693654634e-06,
"loss": 3.4743,
"step": 1936000
},
{
"epoch": 39.66,
"eval_loss": 3.6177797317504883,
"eval_runtime": 46.0686,
"eval_samples_per_second": 892.192,
"eval_steps_per_second": 55.765,
"step": 1936000
},
{
"epoch": 39.83,
"eval_loss": 3.6398518085479736,
"eval_runtime": 46.8575,
"eval_samples_per_second": 877.171,
"eval_steps_per_second": 54.826,
"step": 1944000
},
{
"epoch": 39.99,
"learning_rate": 1.867756191111482e-06,
"loss": 3.4626,
"step": 1952000
},
{
"epoch": 39.99,
"eval_loss": 3.624283790588379,
"eval_runtime": 46.1682,
"eval_samples_per_second": 890.266,
"eval_steps_per_second": 55.644,
"step": 1952000
},
{
"epoch": 40.16,
"eval_loss": 3.6325714588165283,
"eval_runtime": 45.9837,
"eval_samples_per_second": 893.838,
"eval_steps_per_second": 55.868,
"step": 1960000
},
{
"epoch": 40.32,
"learning_rate": 1.8010506128575004e-06,
"loss": 3.4692,
"step": 1968000
},
{
"epoch": 40.32,
"eval_loss": 3.6723103523254395,
"eval_runtime": 46.8787,
"eval_samples_per_second": 876.773,
"eval_steps_per_second": 54.801,
"step": 1968000
},
{
"epoch": 40.48,
"eval_loss": 3.6456410884857178,
"eval_runtime": 46.0442,
"eval_samples_per_second": 892.664,
"eval_steps_per_second": 55.794,
"step": 1976000
},
{
"epoch": 40.65,
"learning_rate": 1.734345034603519e-06,
"loss": 3.4765,
"step": 1984000
},
{
"epoch": 40.65,
"eval_loss": 3.6437156200408936,
"eval_runtime": 45.2826,
"eval_samples_per_second": 907.678,
"eval_steps_per_second": 56.733,
"step": 1984000
},
{
"epoch": 40.81,
"eval_loss": 3.647704839706421,
"eval_runtime": 46.8981,
"eval_samples_per_second": 876.41,
"eval_steps_per_second": 54.778,
"step": 1992000
},
{
"epoch": 40.98,
"learning_rate": 1.6676394563495374e-06,
"loss": 3.4747,
"step": 2000000
},
{
"epoch": 40.98,
"eval_loss": 3.638388156890869,
"eval_runtime": 46.0328,
"eval_samples_per_second": 892.886,
"eval_steps_per_second": 55.808,
"step": 2000000
},
{
"epoch": 41.14,
"eval_loss": 3.6370368003845215,
"eval_runtime": 46.7372,
"eval_samples_per_second": 879.427,
"eval_steps_per_second": 54.967,
"step": 2008000
},
{
"epoch": 41.3,
"learning_rate": 1.6009338780955558e-06,
"loss": 3.4683,
"step": 2016000
},
{
"epoch": 41.3,
"eval_loss": 3.662468433380127,
"eval_runtime": 46.61,
"eval_samples_per_second": 881.828,
"eval_steps_per_second": 55.117,
"step": 2016000
},
{
"epoch": 41.47,
"eval_loss": 3.6453213691711426,
"eval_runtime": 45.8611,
"eval_samples_per_second": 896.229,
"eval_steps_per_second": 56.017,
"step": 2024000
},
{
"epoch": 41.63,
"learning_rate": 1.5342282998415744e-06,
"loss": 3.4599,
"step": 2032000
},
{
"epoch": 41.63,
"eval_loss": 3.64886212348938,
"eval_runtime": 46.762,
"eval_samples_per_second": 878.962,
"eval_steps_per_second": 54.938,
"step": 2032000
},
{
"epoch": 41.8,
"eval_loss": 3.6310884952545166,
"eval_runtime": 46.4576,
"eval_samples_per_second": 884.72,
"eval_steps_per_second": 55.298,
"step": 2040000
},
{
"epoch": 41.96,
"learning_rate": 1.4675227215875928e-06,
"loss": 3.4713,
"step": 2048000
},
{
"epoch": 41.96,
"eval_loss": 3.619154691696167,
"eval_runtime": 45.9184,
"eval_samples_per_second": 895.109,
"eval_steps_per_second": 55.947,
"step": 2048000
},
{
"epoch": 42.12,
"eval_loss": 3.651060104370117,
"eval_runtime": 47.0032,
"eval_samples_per_second": 874.451,
"eval_steps_per_second": 54.656,
"step": 2056000
},
{
"epoch": 42.29,
"learning_rate": 1.4008171433336116e-06,
"loss": 3.4677,
"step": 2064000
},
{
"epoch": 42.29,
"eval_loss": 3.6425869464874268,
"eval_runtime": 46.3503,
"eval_samples_per_second": 886.769,
"eval_steps_per_second": 55.426,
"step": 2064000
},
{
"epoch": 42.45,
"eval_loss": 3.6362836360931396,
"eval_runtime": 46.2845,
"eval_samples_per_second": 888.029,
"eval_steps_per_second": 55.505,
"step": 2072000
},
{
"epoch": 42.62,
"learning_rate": 1.33411156507963e-06,
"loss": 3.4689,
"step": 2080000
},
{
"epoch": 42.62,
"eval_loss": 3.6378438472747803,
"eval_runtime": 47.0132,
"eval_samples_per_second": 874.265,
"eval_steps_per_second": 54.644,
"step": 2080000
},
{
"epoch": 42.78,
"eval_loss": 3.6450445652008057,
"eval_runtime": 46.1055,
"eval_samples_per_second": 891.478,
"eval_steps_per_second": 55.72,
"step": 2088000
},
{
"epoch": 42.94,
"learning_rate": 1.2674059868256484e-06,
"loss": 3.4598,
"step": 2096000
},
{
"epoch": 42.94,
"eval_loss": 3.64805006980896,
"eval_runtime": 46.8684,
"eval_samples_per_second": 876.967,
"eval_steps_per_second": 54.813,
"step": 2096000
},
{
"epoch": 43.11,
"eval_loss": 3.6675028800964355,
"eval_runtime": 46.4765,
"eval_samples_per_second": 884.36,
"eval_steps_per_second": 55.275,
"step": 2104000
},
{
"epoch": 43.27,
"learning_rate": 1.2007004085716668e-06,
"loss": 3.4487,
"step": 2112000
},
{
"epoch": 43.27,
"eval_loss": 3.6557657718658447,
"eval_runtime": 46.0356,
"eval_samples_per_second": 892.83,
"eval_steps_per_second": 55.805,
"step": 2112000
},
{
"epoch": 43.43,
"eval_loss": 3.6451427936553955,
"eval_runtime": 47.3121,
"eval_samples_per_second": 868.741,
"eval_steps_per_second": 54.299,
"step": 2120000
},
{
"epoch": 43.6,
"learning_rate": 1.1339948303176854e-06,
"loss": 3.4555,
"step": 2128000
},
{
"epoch": 43.6,
"eval_loss": 3.643132448196411,
"eval_runtime": 46.2499,
"eval_samples_per_second": 888.694,
"eval_steps_per_second": 55.546,
"step": 2128000
},
{
"epoch": 43.76,
"eval_loss": 3.6470389366149902,
"eval_runtime": 45.8331,
"eval_samples_per_second": 896.776,
"eval_steps_per_second": 56.051,
"step": 2136000
},
{
"epoch": 43.93,
"learning_rate": 1.067289252063704e-06,
"loss": 3.4727,
"step": 2144000
},
{
"epoch": 43.93,
"eval_loss": 3.6265406608581543,
"eval_runtime": 47.1162,
"eval_samples_per_second": 872.353,
"eval_steps_per_second": 54.525,
"step": 2144000
},
{
"epoch": 44.09,
"eval_loss": 3.6335132122039795,
"eval_runtime": 45.9499,
"eval_samples_per_second": 894.497,
"eval_steps_per_second": 55.909,
"step": 2152000
},
{
"epoch": 44.25,
"learning_rate": 1.0005836738097224e-06,
"loss": 3.4626,
"step": 2160000
},
{
"epoch": 44.25,
"eval_loss": 3.639557123184204,
"eval_runtime": 46.75,
"eval_samples_per_second": 879.187,
"eval_steps_per_second": 54.952,
"step": 2160000
},
{
"epoch": 44.42,
"eval_loss": 3.653687000274658,
"eval_runtime": 47.165,
"eval_samples_per_second": 871.452,
"eval_steps_per_second": 54.468,
"step": 2168000
},
{
"epoch": 44.58,
"learning_rate": 9.33878095555741e-07,
"loss": 3.4724,
"step": 2176000
},
{
"epoch": 44.58,
"eval_loss": 3.61678409576416,
"eval_runtime": 46.2585,
"eval_samples_per_second": 888.528,
"eval_steps_per_second": 55.536,
"step": 2176000
},
{
"epoch": 44.75,
"eval_loss": 3.644352674484253,
"eval_runtime": 47.0469,
"eval_samples_per_second": 873.64,
"eval_steps_per_second": 54.605,
"step": 2184000
},
{
"epoch": 44.91,
"learning_rate": 8.671725173017595e-07,
"loss": 3.4545,
"step": 2192000
},
{
"epoch": 44.91,
"eval_loss": 3.6440114974975586,
"eval_runtime": 46.2426,
"eval_samples_per_second": 888.835,
"eval_steps_per_second": 55.555,
"step": 2192000
},
{
"epoch": 45.07,
"eval_loss": 3.6327061653137207,
"eval_runtime": 46.09,
"eval_samples_per_second": 891.776,
"eval_steps_per_second": 55.739,
"step": 2200000
},
{
"epoch": 45.24,
"learning_rate": 8.004669390477779e-07,
"loss": 3.461,
"step": 2208000
},
{
"epoch": 45.24,
"eval_loss": 3.6362533569335938,
"eval_runtime": 47.1445,
"eval_samples_per_second": 871.831,
"eval_steps_per_second": 54.492,
"step": 2208000
},
{
"epoch": 45.4,
"eval_loss": 3.653747081756592,
"eval_runtime": 46.2235,
"eval_samples_per_second": 889.202,
"eval_steps_per_second": 55.578,
"step": 2216000
},
{
"epoch": 45.57,
"learning_rate": 7.337613607937964e-07,
"loss": 3.4702,
"step": 2224000
},
{
"epoch": 45.57,
"eval_loss": 3.6123247146606445,
"eval_runtime": 46.168,
"eval_samples_per_second": 890.27,
"eval_steps_per_second": 55.645,
"step": 2224000
},
{
"epoch": 45.73,
"eval_loss": 3.6554455757141113,
"eval_runtime": 47.1193,
"eval_samples_per_second": 872.296,
"eval_steps_per_second": 54.521,
"step": 2232000
},
{
"epoch": 45.89,
"learning_rate": 6.67055782539815e-07,
"loss": 3.4565,
"step": 2240000
},
{
"epoch": 45.89,
"eval_loss": 3.6522979736328125,
"eval_runtime": 46.1449,
"eval_samples_per_second": 890.716,
"eval_steps_per_second": 55.672,
"step": 2240000
},
{
"epoch": 46.06,
"eval_loss": 3.6339659690856934,
"eval_runtime": 47.2579,
"eval_samples_per_second": 869.739,
"eval_steps_per_second": 54.361,
"step": 2248000
},
{
"epoch": 46.22,
"learning_rate": 6.003502042858334e-07,
"loss": 3.4517,
"step": 2256000
},
{
"epoch": 46.22,
"eval_loss": 3.6459498405456543,
"eval_runtime": 46.9038,
"eval_samples_per_second": 876.305,
"eval_steps_per_second": 54.772,
"step": 2256000
},
{
"epoch": 46.38,
"eval_loss": 3.656141996383667,
"eval_runtime": 46.3654,
"eval_samples_per_second": 886.48,
"eval_steps_per_second": 55.408,
"step": 2264000
},
{
"epoch": 46.55,
"learning_rate": 5.33644626031852e-07,
"loss": 3.4631,
"step": 2272000
},
{
"epoch": 46.55,
"eval_loss": 3.6547927856445312,
"eval_runtime": 47.1154,
"eval_samples_per_second": 872.368,
"eval_steps_per_second": 54.526,
"step": 2272000
},
{
"epoch": 46.71,
"eval_loss": 3.6228716373443604,
"eval_runtime": 46.2908,
"eval_samples_per_second": 887.908,
"eval_steps_per_second": 55.497,
"step": 2280000
},
{
"epoch": 46.88,
"learning_rate": 4.669390477778705e-07,
"loss": 3.4518,
"step": 2288000
},
{
"epoch": 46.88,
"eval_loss": 3.6350128650665283,
"eval_runtime": 46.3584,
"eval_samples_per_second": 886.613,
"eval_steps_per_second": 55.416,
"step": 2288000
},
{
"epoch": 47.04,
"eval_loss": 3.6483192443847656,
"eval_runtime": 47.24,
"eval_samples_per_second": 870.067,
"eval_steps_per_second": 54.382,
"step": 2296000
},
{
"epoch": 47.2,
"learning_rate": 4.0023346952388894e-07,
"loss": 3.4592,
"step": 2304000
},
{
"epoch": 47.2,
"eval_loss": 3.6263089179992676,
"eval_runtime": 47.0185,
"eval_samples_per_second": 874.166,
"eval_steps_per_second": 54.638,
"step": 2304000
},
{
"epoch": 47.37,
"eval_loss": 3.6339097023010254,
"eval_runtime": 46.0199,
"eval_samples_per_second": 893.135,
"eval_steps_per_second": 55.824,
"step": 2312000
},
{
"epoch": 47.53,
"learning_rate": 3.335278912699075e-07,
"loss": 3.4569,
"step": 2320000
},
{
"epoch": 47.53,
"eval_loss": 3.659444808959961,
"eval_runtime": 47.1636,
"eval_samples_per_second": 871.477,
"eval_steps_per_second": 54.47,
"step": 2320000
},
{
"epoch": 47.7,
"eval_loss": 3.638535737991333,
"eval_runtime": 46.1693,
"eval_samples_per_second": 890.246,
"eval_steps_per_second": 55.643,
"step": 2328000
},
{
"epoch": 47.86,
"learning_rate": 2.66822313015926e-07,
"loss": 3.4524,
"step": 2336000
},
{
"epoch": 47.86,
"eval_loss": 3.6434078216552734,
"eval_runtime": 47.0318,
"eval_samples_per_second": 873.919,
"eval_steps_per_second": 54.623,
"step": 2336000
},
{
"epoch": 48.02,
"eval_loss": 3.650230646133423,
"eval_runtime": 46.5514,
"eval_samples_per_second": 882.938,
"eval_steps_per_second": 55.186,
"step": 2344000
},
{
"epoch": 48.19,
"learning_rate": 2.0011673476194447e-07,
"loss": 3.4644,
"step": 2352000
},
{
"epoch": 48.19,
"eval_loss": 3.617619276046753,
"eval_runtime": 46.2116,
"eval_samples_per_second": 889.43,
"eval_steps_per_second": 55.592,
"step": 2352000
},
{
"epoch": 48.35,
"eval_loss": 3.6293184757232666,
"eval_runtime": 47.399,
"eval_samples_per_second": 867.15,
"eval_steps_per_second": 54.199,
"step": 2360000
},
{
"epoch": 48.52,
"learning_rate": 1.33411156507963e-07,
"loss": 3.4586,
"step": 2368000
},
{
"epoch": 48.52,
"eval_loss": 3.630380392074585,
"eval_runtime": 46.3912,
"eval_samples_per_second": 885.987,
"eval_steps_per_second": 55.377,
"step": 2368000
},
{
"epoch": 48.68,
"eval_loss": 3.6343326568603516,
"eval_runtime": 46.2144,
"eval_samples_per_second": 889.376,
"eval_steps_per_second": 55.589,
"step": 2376000
},
{
"epoch": 48.84,
"learning_rate": 6.67055782539815e-08,
"loss": 3.4439,
"step": 2384000
},
{
"epoch": 48.84,
"eval_loss": 3.6090333461761475,
"eval_runtime": 47.3482,
"eval_samples_per_second": 868.08,
"eval_steps_per_second": 54.258,
"step": 2384000
},
{
"epoch": 49.01,
"eval_loss": 3.6414153575897217,
"eval_runtime": 46.5994,
"eval_samples_per_second": 882.029,
"eval_steps_per_second": 55.13,
"step": 2392000
},
{
"epoch": 49.17,
"learning_rate": 0.0,
"loss": 3.4474,
"step": 2400000
},
{
"epoch": 49.17,
"eval_loss": 3.620838165283203,
"eval_runtime": 46.9825,
"eval_samples_per_second": 874.835,
"eval_steps_per_second": 54.68,
"step": 2400000
},
{
"epoch": 49.17,
"step": 2400000,
"total_flos": 6.906141294629226e+17,
"train_loss": 3.376089767252604,
"train_runtime": 158003.2062,
"train_samples_per_second": 243.033,
"train_steps_per_second": 15.19
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 50,
"save_steps": 32000,
"total_flos": 6.906141294629226e+17,
"trial_name": null,
"trial_params": null
}