2020-Q4-50p-filtered-random / trainer_state.json
DouglasPontes's picture
Training in progress, step 32000
6eda8d9 verified
raw
history blame
80.8 kB
{
"best_metric": 2.596395254135132,
"best_model_checkpoint": "./model_tweets_2020_Q4_50/checkpoint-2112000",
"epoch": 10.105348255564257,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"eval_loss": 2.9659674167633057,
"eval_runtime": 427.2887,
"eval_samples_per_second": 468.063,
"eval_steps_per_second": 29.254,
"step": 8000
},
{
"epoch": 0.07,
"learning_rate": 4.0726666666666665e-07,
"loss": 3.1627,
"step": 16000
},
{
"epoch": 0.07,
"eval_loss": 2.875443458557129,
"eval_runtime": 428.9511,
"eval_samples_per_second": 466.249,
"eval_steps_per_second": 29.141,
"step": 16000
},
{
"epoch": 0.1,
"eval_loss": 2.8262600898742676,
"eval_runtime": 428.7189,
"eval_samples_per_second": 466.501,
"eval_steps_per_second": 29.157,
"step": 24000
},
{
"epoch": 0.13,
"learning_rate": 4.0453333333333336e-07,
"loss": 2.9611,
"step": 32000
},
{
"epoch": 0.13,
"eval_loss": 2.7973239421844482,
"eval_runtime": 427.7192,
"eval_samples_per_second": 467.592,
"eval_steps_per_second": 29.225,
"step": 32000
},
{
"epoch": 0.17,
"eval_loss": 2.774101734161377,
"eval_runtime": 430.9135,
"eval_samples_per_second": 464.126,
"eval_steps_per_second": 29.008,
"step": 40000
},
{
"epoch": 0.2,
"learning_rate": 4.018e-07,
"loss": 2.8986,
"step": 48000
},
{
"epoch": 0.2,
"eval_loss": 2.757391929626465,
"eval_runtime": 434.8184,
"eval_samples_per_second": 459.958,
"eval_steps_per_second": 28.748,
"step": 48000
},
{
"epoch": 0.24,
"eval_loss": 2.741281509399414,
"eval_runtime": 431.3008,
"eval_samples_per_second": 463.709,
"eval_steps_per_second": 28.982,
"step": 56000
},
{
"epoch": 0.27,
"learning_rate": 3.9906666666666667e-07,
"loss": 2.8726,
"step": 64000
},
{
"epoch": 0.27,
"eval_loss": 2.724008798599243,
"eval_runtime": 431.9972,
"eval_samples_per_second": 462.961,
"eval_steps_per_second": 28.935,
"step": 64000
},
{
"epoch": 0.3,
"eval_loss": 2.7238657474517822,
"eval_runtime": 428.6784,
"eval_samples_per_second": 466.546,
"eval_steps_per_second": 29.159,
"step": 72000
},
{
"epoch": 0.34,
"learning_rate": 3.963333333333333e-07,
"loss": 2.8558,
"step": 80000
},
{
"epoch": 0.34,
"eval_loss": 2.7132034301757812,
"eval_runtime": 428.5232,
"eval_samples_per_second": 466.714,
"eval_steps_per_second": 29.17,
"step": 80000
},
{
"epoch": 0.37,
"eval_loss": 2.702976942062378,
"eval_runtime": 434.0957,
"eval_samples_per_second": 460.723,
"eval_steps_per_second": 28.795,
"step": 88000
},
{
"epoch": 0.4,
"learning_rate": 3.936e-07,
"loss": 2.8459,
"step": 96000
},
{
"epoch": 0.4,
"eval_loss": 2.7112441062927246,
"eval_runtime": 430.6879,
"eval_samples_per_second": 464.369,
"eval_steps_per_second": 29.023,
"step": 96000
},
{
"epoch": 0.44,
"eval_loss": 2.6918396949768066,
"eval_runtime": 432.9686,
"eval_samples_per_second": 461.923,
"eval_steps_per_second": 28.87,
"step": 104000
},
{
"epoch": 0.47,
"learning_rate": 3.908666666666667e-07,
"loss": 2.8379,
"step": 112000
},
{
"epoch": 0.47,
"eval_loss": 2.701714515686035,
"eval_runtime": 431.7861,
"eval_samples_per_second": 463.188,
"eval_steps_per_second": 28.95,
"step": 112000
},
{
"epoch": 0.51,
"eval_loss": 2.6920413970947266,
"eval_runtime": 432.0079,
"eval_samples_per_second": 462.95,
"eval_steps_per_second": 28.935,
"step": 120000
},
{
"epoch": 0.54,
"learning_rate": 3.8813333333333334e-07,
"loss": 2.8265,
"step": 128000
},
{
"epoch": 0.54,
"eval_loss": 2.6970582008361816,
"eval_runtime": 428.3996,
"eval_samples_per_second": 466.849,
"eval_steps_per_second": 29.178,
"step": 128000
},
{
"epoch": 0.57,
"eval_loss": 2.692416191101074,
"eval_runtime": 431.1394,
"eval_samples_per_second": 463.882,
"eval_steps_per_second": 28.993,
"step": 136000
},
{
"epoch": 0.61,
"learning_rate": 3.854e-07,
"loss": 2.8227,
"step": 144000
},
{
"epoch": 0.61,
"eval_loss": 2.69515323638916,
"eval_runtime": 428.2249,
"eval_samples_per_second": 467.04,
"eval_steps_per_second": 29.19,
"step": 144000
},
{
"epoch": 0.64,
"eval_loss": 2.681142807006836,
"eval_runtime": 427.7295,
"eval_samples_per_second": 467.581,
"eval_steps_per_second": 29.224,
"step": 152000
},
{
"epoch": 0.67,
"learning_rate": 3.8266666666666665e-07,
"loss": 2.8209,
"step": 160000
},
{
"epoch": 0.67,
"eval_loss": 2.6828596591949463,
"eval_runtime": 429.1179,
"eval_samples_per_second": 466.068,
"eval_steps_per_second": 29.13,
"step": 160000
},
{
"epoch": 0.71,
"eval_loss": 2.6882851123809814,
"eval_runtime": 430.1324,
"eval_samples_per_second": 464.968,
"eval_steps_per_second": 29.061,
"step": 168000
},
{
"epoch": 0.74,
"learning_rate": 3.799333333333333e-07,
"loss": 2.8147,
"step": 176000
},
{
"epoch": 0.74,
"eval_loss": 2.6675167083740234,
"eval_runtime": 428.1706,
"eval_samples_per_second": 467.099,
"eval_steps_per_second": 29.194,
"step": 176000
},
{
"epoch": 0.77,
"eval_loss": 2.66744327545166,
"eval_runtime": 427.87,
"eval_samples_per_second": 467.427,
"eval_steps_per_second": 29.214,
"step": 184000
},
{
"epoch": 0.81,
"learning_rate": 3.772e-07,
"loss": 2.8077,
"step": 192000
},
{
"epoch": 0.81,
"eval_loss": 2.6661171913146973,
"eval_runtime": 426.1667,
"eval_samples_per_second": 469.295,
"eval_steps_per_second": 29.331,
"step": 192000
},
{
"epoch": 0.84,
"eval_loss": 2.6773271560668945,
"eval_runtime": 430.5929,
"eval_samples_per_second": 464.471,
"eval_steps_per_second": 29.03,
"step": 200000
},
{
"epoch": 0.88,
"learning_rate": 3.7446666666666667e-07,
"loss": 2.8058,
"step": 208000
},
{
"epoch": 0.88,
"eval_loss": 2.6734445095062256,
"eval_runtime": 431.7928,
"eval_samples_per_second": 463.18,
"eval_steps_per_second": 28.949,
"step": 208000
},
{
"epoch": 0.91,
"eval_loss": 2.6741647720336914,
"eval_runtime": 433.2608,
"eval_samples_per_second": 461.611,
"eval_steps_per_second": 28.851,
"step": 216000
},
{
"epoch": 0.94,
"learning_rate": 3.7173333333333333e-07,
"loss": 2.812,
"step": 224000
},
{
"epoch": 0.94,
"eval_loss": 2.6665947437286377,
"eval_runtime": 429.9196,
"eval_samples_per_second": 465.199,
"eval_steps_per_second": 29.075,
"step": 224000
},
{
"epoch": 0.98,
"eval_loss": 2.6641786098480225,
"eval_runtime": 432.0068,
"eval_samples_per_second": 462.951,
"eval_steps_per_second": 28.935,
"step": 232000
},
{
"epoch": 1.01,
"learning_rate": 3.69e-07,
"loss": 2.8025,
"step": 240000
},
{
"epoch": 1.01,
"eval_loss": 2.668074607849121,
"eval_runtime": 429.4276,
"eval_samples_per_second": 465.732,
"eval_steps_per_second": 29.109,
"step": 240000
},
{
"epoch": 1.04,
"eval_loss": 2.6663010120391846,
"eval_runtime": 430.5757,
"eval_samples_per_second": 464.49,
"eval_steps_per_second": 29.031,
"step": 248000
},
{
"epoch": 1.08,
"learning_rate": 3.6626666666666664e-07,
"loss": 2.809,
"step": 256000
},
{
"epoch": 1.08,
"eval_loss": 2.6645491123199463,
"eval_runtime": 429.4305,
"eval_samples_per_second": 465.728,
"eval_steps_per_second": 29.108,
"step": 256000
},
{
"epoch": 1.11,
"eval_loss": 2.6528775691986084,
"eval_runtime": 430.2815,
"eval_samples_per_second": 464.807,
"eval_steps_per_second": 29.051,
"step": 264000
},
{
"epoch": 1.15,
"learning_rate": 3.6353333333333335e-07,
"loss": 2.8073,
"step": 272000
},
{
"epoch": 1.15,
"eval_loss": 2.662346839904785,
"eval_runtime": 427.2132,
"eval_samples_per_second": 468.146,
"eval_steps_per_second": 29.259,
"step": 272000
},
{
"epoch": 1.18,
"eval_loss": 2.655132293701172,
"eval_runtime": 427.6981,
"eval_samples_per_second": 467.615,
"eval_steps_per_second": 29.226,
"step": 280000
},
{
"epoch": 1.21,
"learning_rate": 3.608e-07,
"loss": 2.8005,
"step": 288000
},
{
"epoch": 1.21,
"eval_loss": 2.664275646209717,
"eval_runtime": 428.4044,
"eval_samples_per_second": 466.844,
"eval_steps_per_second": 29.178,
"step": 288000
},
{
"epoch": 1.25,
"eval_loss": 2.6627519130706787,
"eval_runtime": 432.9315,
"eval_samples_per_second": 461.962,
"eval_steps_per_second": 28.873,
"step": 296000
},
{
"epoch": 1.28,
"learning_rate": 3.5806666666666666e-07,
"loss": 2.7988,
"step": 304000
},
{
"epoch": 1.28,
"eval_loss": 2.658301591873169,
"eval_runtime": 430.042,
"eval_samples_per_second": 465.066,
"eval_steps_per_second": 29.067,
"step": 304000
},
{
"epoch": 1.31,
"eval_loss": 2.659410238265991,
"eval_runtime": 430.2191,
"eval_samples_per_second": 464.875,
"eval_steps_per_second": 29.055,
"step": 312000
},
{
"epoch": 1.35,
"learning_rate": 3.553333333333333e-07,
"loss": 2.7887,
"step": 320000
},
{
"epoch": 1.35,
"eval_loss": 2.654367685317993,
"eval_runtime": 428.3466,
"eval_samples_per_second": 466.907,
"eval_steps_per_second": 29.182,
"step": 320000
},
{
"epoch": 1.38,
"eval_loss": 2.6515774726867676,
"eval_runtime": 428.1496,
"eval_samples_per_second": 467.122,
"eval_steps_per_second": 29.195,
"step": 328000
},
{
"epoch": 1.41,
"learning_rate": 3.5259999999999997e-07,
"loss": 2.7964,
"step": 336000
},
{
"epoch": 1.41,
"eval_loss": 2.655459403991699,
"eval_runtime": 428.9032,
"eval_samples_per_second": 466.301,
"eval_steps_per_second": 29.144,
"step": 336000
},
{
"epoch": 1.45,
"eval_loss": 2.65506911277771,
"eval_runtime": 431.185,
"eval_samples_per_second": 463.833,
"eval_steps_per_second": 28.99,
"step": 344000
},
{
"epoch": 1.48,
"learning_rate": 3.498666666666667e-07,
"loss": 2.7919,
"step": 352000
},
{
"epoch": 1.48,
"eval_loss": 2.6507833003997803,
"eval_runtime": 429.9054,
"eval_samples_per_second": 465.214,
"eval_steps_per_second": 29.076,
"step": 352000
},
{
"epoch": 1.52,
"eval_loss": 2.648573637008667,
"eval_runtime": 426.8576,
"eval_samples_per_second": 468.536,
"eval_steps_per_second": 29.284,
"step": 360000
},
{
"epoch": 1.55,
"learning_rate": 3.4713333333333333e-07,
"loss": 2.8058,
"step": 368000
},
{
"epoch": 1.55,
"eval_loss": 2.648383140563965,
"eval_runtime": 426.9677,
"eval_samples_per_second": 468.415,
"eval_steps_per_second": 29.276,
"step": 368000
},
{
"epoch": 1.58,
"eval_loss": 2.653191566467285,
"eval_runtime": 429.3713,
"eval_samples_per_second": 465.793,
"eval_steps_per_second": 29.112,
"step": 376000
},
{
"epoch": 1.62,
"learning_rate": 3.444e-07,
"loss": 2.796,
"step": 384000
},
{
"epoch": 1.62,
"eval_loss": 2.6472740173339844,
"eval_runtime": 426.8024,
"eval_samples_per_second": 468.596,
"eval_steps_per_second": 29.288,
"step": 384000
},
{
"epoch": 1.65,
"eval_loss": 2.6489272117614746,
"eval_runtime": 428.8708,
"eval_samples_per_second": 466.336,
"eval_steps_per_second": 29.146,
"step": 392000
},
{
"epoch": 1.68,
"learning_rate": 3.416666666666667e-07,
"loss": 2.799,
"step": 400000
},
{
"epoch": 1.68,
"eval_loss": 2.6475629806518555,
"eval_runtime": 427.9765,
"eval_samples_per_second": 467.311,
"eval_steps_per_second": 29.207,
"step": 400000
},
{
"epoch": 1.72,
"eval_loss": 2.6416804790496826,
"eval_runtime": 426.9442,
"eval_samples_per_second": 468.441,
"eval_steps_per_second": 29.278,
"step": 408000
},
{
"epoch": 1.75,
"learning_rate": 3.3893333333333335e-07,
"loss": 2.7991,
"step": 416000
},
{
"epoch": 1.75,
"eval_loss": 2.6544759273529053,
"eval_runtime": 427.1377,
"eval_samples_per_second": 468.228,
"eval_steps_per_second": 29.265,
"step": 416000
},
{
"epoch": 1.79,
"eval_loss": 2.6465859413146973,
"eval_runtime": 428.9124,
"eval_samples_per_second": 466.291,
"eval_steps_per_second": 29.143,
"step": 424000
},
{
"epoch": 1.82,
"learning_rate": 3.3619999999999995e-07,
"loss": 2.792,
"step": 432000
},
{
"epoch": 1.82,
"eval_loss": 2.6396875381469727,
"eval_runtime": 428.572,
"eval_samples_per_second": 466.661,
"eval_steps_per_second": 29.167,
"step": 432000
},
{
"epoch": 1.85,
"eval_loss": 2.642756938934326,
"eval_runtime": 428.9912,
"eval_samples_per_second": 466.205,
"eval_steps_per_second": 29.138,
"step": 440000
},
{
"epoch": 1.89,
"learning_rate": 3.3346666666666666e-07,
"loss": 2.7972,
"step": 448000
},
{
"epoch": 1.89,
"eval_loss": 2.644583225250244,
"eval_runtime": 427.6258,
"eval_samples_per_second": 467.694,
"eval_steps_per_second": 29.231,
"step": 448000
},
{
"epoch": 1.92,
"eval_loss": 2.6433892250061035,
"eval_runtime": 429.7813,
"eval_samples_per_second": 465.348,
"eval_steps_per_second": 29.085,
"step": 456000
},
{
"epoch": 1.95,
"learning_rate": 3.307333333333333e-07,
"loss": 2.798,
"step": 464000
},
{
"epoch": 1.95,
"eval_loss": 2.648988962173462,
"eval_runtime": 429.7597,
"eval_samples_per_second": 465.372,
"eval_steps_per_second": 29.086,
"step": 464000
},
{
"epoch": 1.99,
"eval_loss": 2.6502037048339844,
"eval_runtime": 430.0385,
"eval_samples_per_second": 465.07,
"eval_steps_per_second": 29.067,
"step": 472000
},
{
"epoch": 2.02,
"learning_rate": 3.28e-07,
"loss": 2.7914,
"step": 480000
},
{
"epoch": 2.02,
"eval_loss": 2.6407032012939453,
"eval_runtime": 430.7536,
"eval_samples_per_second": 464.298,
"eval_steps_per_second": 29.019,
"step": 480000
},
{
"epoch": 2.05,
"eval_loss": 2.6283910274505615,
"eval_runtime": 433.2696,
"eval_samples_per_second": 461.602,
"eval_steps_per_second": 28.85,
"step": 488000
},
{
"epoch": 2.09,
"learning_rate": 3.252666666666667e-07,
"loss": 2.7932,
"step": 496000
},
{
"epoch": 2.09,
"eval_loss": 2.642556667327881,
"eval_runtime": 432.5252,
"eval_samples_per_second": 462.396,
"eval_steps_per_second": 28.9,
"step": 496000
},
{
"epoch": 2.12,
"eval_loss": 2.6423070430755615,
"eval_runtime": 432.6328,
"eval_samples_per_second": 462.281,
"eval_steps_per_second": 28.893,
"step": 504000
},
{
"epoch": 2.16,
"learning_rate": 3.2253333333333334e-07,
"loss": 2.787,
"step": 512000
},
{
"epoch": 2.16,
"eval_loss": 2.638451337814331,
"eval_runtime": 436.1841,
"eval_samples_per_second": 458.517,
"eval_steps_per_second": 28.658,
"step": 512000
},
{
"epoch": 2.19,
"eval_loss": 2.6388065814971924,
"eval_runtime": 430.5381,
"eval_samples_per_second": 464.53,
"eval_steps_per_second": 29.033,
"step": 520000
},
{
"epoch": 2.22,
"learning_rate": 3.198e-07,
"loss": 2.7893,
"step": 528000
},
{
"epoch": 2.22,
"eval_loss": 2.642207145690918,
"eval_runtime": 428.0052,
"eval_samples_per_second": 467.279,
"eval_steps_per_second": 29.205,
"step": 528000
},
{
"epoch": 2.26,
"eval_loss": 2.6409590244293213,
"eval_runtime": 429.1736,
"eval_samples_per_second": 466.007,
"eval_steps_per_second": 29.126,
"step": 536000
},
{
"epoch": 2.29,
"learning_rate": 3.1706666666666665e-07,
"loss": 2.7889,
"step": 544000
},
{
"epoch": 2.29,
"eval_loss": 2.633716583251953,
"eval_runtime": 430.2951,
"eval_samples_per_second": 464.793,
"eval_steps_per_second": 29.05,
"step": 544000
},
{
"epoch": 2.32,
"eval_loss": 2.627978801727295,
"eval_runtime": 437.5919,
"eval_samples_per_second": 457.042,
"eval_steps_per_second": 28.565,
"step": 552000
},
{
"epoch": 2.36,
"learning_rate": 3.1433333333333336e-07,
"loss": 2.791,
"step": 560000
},
{
"epoch": 2.36,
"eval_loss": 2.6364176273345947,
"eval_runtime": 435.9143,
"eval_samples_per_second": 458.801,
"eval_steps_per_second": 28.675,
"step": 560000
},
{
"epoch": 2.39,
"eval_loss": 2.6341359615325928,
"eval_runtime": 436.6584,
"eval_samples_per_second": 458.019,
"eval_steps_per_second": 28.626,
"step": 568000
},
{
"epoch": 2.43,
"learning_rate": 3.116e-07,
"loss": 2.7883,
"step": 576000
},
{
"epoch": 2.43,
"eval_loss": 2.6317176818847656,
"eval_runtime": 433.1883,
"eval_samples_per_second": 461.688,
"eval_steps_per_second": 28.856,
"step": 576000
},
{
"epoch": 2.46,
"eval_loss": 2.6277999877929688,
"eval_runtime": 430.7277,
"eval_samples_per_second": 464.326,
"eval_steps_per_second": 29.021,
"step": 584000
},
{
"epoch": 2.49,
"learning_rate": 3.0886666666666667e-07,
"loss": 2.7889,
"step": 592000
},
{
"epoch": 2.49,
"eval_loss": 2.635715961456299,
"eval_runtime": 431.7357,
"eval_samples_per_second": 463.242,
"eval_steps_per_second": 28.953,
"step": 592000
},
{
"epoch": 2.53,
"eval_loss": 2.6340713500976562,
"eval_runtime": 431.0234,
"eval_samples_per_second": 464.007,
"eval_steps_per_second": 29.001,
"step": 600000
},
{
"epoch": 2.56,
"learning_rate": 3.061333333333333e-07,
"loss": 2.7838,
"step": 608000
},
{
"epoch": 2.56,
"eval_loss": 2.633284091949463,
"eval_runtime": 430.0207,
"eval_samples_per_second": 465.089,
"eval_steps_per_second": 29.068,
"step": 608000
},
{
"epoch": 2.59,
"eval_loss": 2.638205051422119,
"eval_runtime": 430.4236,
"eval_samples_per_second": 464.654,
"eval_steps_per_second": 29.041,
"step": 616000
},
{
"epoch": 2.63,
"learning_rate": 3.034e-07,
"loss": 2.7873,
"step": 624000
},
{
"epoch": 2.63,
"eval_loss": 2.6274592876434326,
"eval_runtime": 429.469,
"eval_samples_per_second": 465.687,
"eval_steps_per_second": 29.106,
"step": 624000
},
{
"epoch": 2.66,
"eval_loss": 2.6260039806365967,
"eval_runtime": 430.4416,
"eval_samples_per_second": 464.634,
"eval_steps_per_second": 29.04,
"step": 632000
},
{
"epoch": 2.69,
"learning_rate": 3.0066666666666663e-07,
"loss": 2.7813,
"step": 640000
},
{
"epoch": 2.69,
"eval_loss": 2.63725209236145,
"eval_runtime": 430.3994,
"eval_samples_per_second": 464.68,
"eval_steps_per_second": 29.043,
"step": 640000
},
{
"epoch": 2.73,
"eval_loss": 2.6348888874053955,
"eval_runtime": 430.1402,
"eval_samples_per_second": 464.96,
"eval_steps_per_second": 29.06,
"step": 648000
},
{
"epoch": 2.76,
"learning_rate": 2.9793333333333334e-07,
"loss": 2.7858,
"step": 656000
},
{
"epoch": 2.76,
"eval_loss": 2.622281789779663,
"eval_runtime": 429.8357,
"eval_samples_per_second": 465.289,
"eval_steps_per_second": 29.081,
"step": 656000
},
{
"epoch": 2.8,
"eval_loss": 2.6275925636291504,
"eval_runtime": 429.5654,
"eval_samples_per_second": 465.582,
"eval_steps_per_second": 29.099,
"step": 664000
},
{
"epoch": 2.83,
"learning_rate": 2.952e-07,
"loss": 2.7895,
"step": 672000
},
{
"epoch": 2.83,
"eval_loss": 2.6354682445526123,
"eval_runtime": 432.8071,
"eval_samples_per_second": 462.095,
"eval_steps_per_second": 28.881,
"step": 672000
},
{
"epoch": 2.86,
"eval_loss": 2.6269936561584473,
"eval_runtime": 431.2646,
"eval_samples_per_second": 463.748,
"eval_steps_per_second": 28.985,
"step": 680000
},
{
"epoch": 2.9,
"learning_rate": 2.9246666666666665e-07,
"loss": 2.7873,
"step": 688000
},
{
"epoch": 2.9,
"eval_loss": 2.624408006668091,
"eval_runtime": 431.1254,
"eval_samples_per_second": 463.898,
"eval_steps_per_second": 28.994,
"step": 688000
},
{
"epoch": 2.93,
"eval_loss": 2.6397435665130615,
"eval_runtime": 428.8853,
"eval_samples_per_second": 466.32,
"eval_steps_per_second": 29.145,
"step": 696000
},
{
"epoch": 2.96,
"learning_rate": 2.897333333333333e-07,
"loss": 2.7866,
"step": 704000
},
{
"epoch": 2.96,
"eval_loss": 2.6303048133850098,
"eval_runtime": 428.8276,
"eval_samples_per_second": 466.383,
"eval_steps_per_second": 29.149,
"step": 704000
},
{
"epoch": 3.0,
"eval_loss": 2.6166625022888184,
"eval_runtime": 430.118,
"eval_samples_per_second": 464.984,
"eval_steps_per_second": 29.062,
"step": 712000
},
{
"epoch": 3.03,
"learning_rate": 2.8699999999999996e-07,
"loss": 2.7865,
"step": 720000
},
{
"epoch": 3.03,
"eval_loss": 2.6264894008636475,
"eval_runtime": 430.8627,
"eval_samples_per_second": 464.18,
"eval_steps_per_second": 29.012,
"step": 720000
},
{
"epoch": 3.07,
"eval_loss": 2.640347957611084,
"eval_runtime": 433.7872,
"eval_samples_per_second": 461.051,
"eval_steps_per_second": 28.816,
"step": 728000
},
{
"epoch": 3.1,
"learning_rate": 2.8426666666666667e-07,
"loss": 2.7716,
"step": 736000
},
{
"epoch": 3.1,
"eval_loss": 2.6247291564941406,
"eval_runtime": 428.0225,
"eval_samples_per_second": 467.26,
"eval_steps_per_second": 29.204,
"step": 736000
},
{
"epoch": 3.13,
"eval_loss": 2.625520706176758,
"eval_runtime": 430.348,
"eval_samples_per_second": 464.736,
"eval_steps_per_second": 29.046,
"step": 744000
},
{
"epoch": 3.17,
"learning_rate": 2.815333333333333e-07,
"loss": 2.779,
"step": 752000
},
{
"epoch": 3.17,
"eval_loss": 2.6315715312957764,
"eval_runtime": 429.049,
"eval_samples_per_second": 466.143,
"eval_steps_per_second": 29.134,
"step": 752000
},
{
"epoch": 3.2,
"eval_loss": 2.6269681453704834,
"eval_runtime": 429.2822,
"eval_samples_per_second": 465.889,
"eval_steps_per_second": 29.118,
"step": 760000
},
{
"epoch": 3.23,
"learning_rate": 2.7880000000000003e-07,
"loss": 2.7811,
"step": 768000
},
{
"epoch": 3.23,
"eval_loss": 2.626842498779297,
"eval_runtime": 430.4013,
"eval_samples_per_second": 464.678,
"eval_steps_per_second": 29.043,
"step": 768000
},
{
"epoch": 3.27,
"eval_loss": 2.6146929264068604,
"eval_runtime": 427.5353,
"eval_samples_per_second": 467.793,
"eval_steps_per_second": 29.237,
"step": 776000
},
{
"epoch": 3.3,
"learning_rate": 2.7606666666666664e-07,
"loss": 2.7797,
"step": 784000
},
{
"epoch": 3.3,
"eval_loss": 2.6270837783813477,
"eval_runtime": 429.6573,
"eval_samples_per_second": 465.483,
"eval_steps_per_second": 29.093,
"step": 784000
},
{
"epoch": 3.33,
"eval_loss": 2.6242685317993164,
"eval_runtime": 427.6055,
"eval_samples_per_second": 467.716,
"eval_steps_per_second": 29.233,
"step": 792000
},
{
"epoch": 3.37,
"learning_rate": 2.733333333333333e-07,
"loss": 2.7798,
"step": 800000
},
{
"epoch": 3.37,
"eval_loss": 2.623974323272705,
"eval_runtime": 428.5522,
"eval_samples_per_second": 466.683,
"eval_steps_per_second": 29.168,
"step": 800000
},
{
"epoch": 3.4,
"eval_loss": 2.622472047805786,
"eval_runtime": 434.6013,
"eval_samples_per_second": 460.187,
"eval_steps_per_second": 28.762,
"step": 808000
},
{
"epoch": 3.44,
"learning_rate": 2.706e-07,
"loss": 2.7774,
"step": 816000
},
{
"epoch": 3.44,
"eval_loss": 2.6231884956359863,
"eval_runtime": 429.2208,
"eval_samples_per_second": 465.956,
"eval_steps_per_second": 29.123,
"step": 816000
},
{
"epoch": 3.47,
"eval_loss": 2.6246674060821533,
"eval_runtime": 427.3461,
"eval_samples_per_second": 468.0,
"eval_steps_per_second": 29.25,
"step": 824000
},
{
"epoch": 3.5,
"learning_rate": 2.6786666666666666e-07,
"loss": 2.7744,
"step": 832000
},
{
"epoch": 3.5,
"eval_loss": 2.6269562244415283,
"eval_runtime": 427.4671,
"eval_samples_per_second": 467.868,
"eval_steps_per_second": 29.242,
"step": 832000
},
{
"epoch": 3.54,
"eval_loss": 2.6175224781036377,
"eval_runtime": 431.5837,
"eval_samples_per_second": 463.405,
"eval_steps_per_second": 28.963,
"step": 840000
},
{
"epoch": 3.57,
"learning_rate": 2.651333333333333e-07,
"loss": 2.7786,
"step": 848000
},
{
"epoch": 3.57,
"eval_loss": 2.6263880729675293,
"eval_runtime": 429.7254,
"eval_samples_per_second": 465.409,
"eval_steps_per_second": 29.088,
"step": 848000
},
{
"epoch": 3.6,
"eval_loss": 2.6192069053649902,
"eval_runtime": 429.8114,
"eval_samples_per_second": 465.316,
"eval_steps_per_second": 29.083,
"step": 856000
},
{
"epoch": 3.64,
"learning_rate": 2.624e-07,
"loss": 2.7829,
"step": 864000
},
{
"epoch": 3.64,
"eval_loss": 2.6278185844421387,
"eval_runtime": 429.9283,
"eval_samples_per_second": 465.189,
"eval_steps_per_second": 29.075,
"step": 864000
},
{
"epoch": 3.67,
"eval_loss": 2.623713731765747,
"eval_runtime": 439.0054,
"eval_samples_per_second": 455.571,
"eval_steps_per_second": 28.473,
"step": 872000
},
{
"epoch": 3.71,
"learning_rate": 2.596666666666667e-07,
"loss": 2.776,
"step": 880000
},
{
"epoch": 3.71,
"eval_loss": 2.6201553344726562,
"eval_runtime": 437.9731,
"eval_samples_per_second": 456.644,
"eval_steps_per_second": 28.541,
"step": 880000
},
{
"epoch": 3.74,
"eval_loss": 2.621598482131958,
"eval_runtime": 438.0301,
"eval_samples_per_second": 456.585,
"eval_steps_per_second": 28.537,
"step": 888000
},
{
"epoch": 3.77,
"learning_rate": 2.5693333333333333e-07,
"loss": 2.7797,
"step": 896000
},
{
"epoch": 3.77,
"eval_loss": 2.617363929748535,
"eval_runtime": 441.0259,
"eval_samples_per_second": 453.484,
"eval_steps_per_second": 28.343,
"step": 896000
},
{
"epoch": 3.81,
"eval_loss": 2.6238625049591064,
"eval_runtime": 429.8157,
"eval_samples_per_second": 465.311,
"eval_steps_per_second": 29.082,
"step": 904000
},
{
"epoch": 3.84,
"learning_rate": 2.542e-07,
"loss": 2.7744,
"step": 912000
},
{
"epoch": 3.84,
"eval_loss": 2.616323709487915,
"eval_runtime": 431.7594,
"eval_samples_per_second": 463.216,
"eval_steps_per_second": 28.951,
"step": 912000
},
{
"epoch": 3.87,
"eval_loss": 2.6197702884674072,
"eval_runtime": 428.7684,
"eval_samples_per_second": 466.448,
"eval_steps_per_second": 29.153,
"step": 920000
},
{
"epoch": 3.91,
"learning_rate": 2.5146666666666664e-07,
"loss": 2.7713,
"step": 928000
},
{
"epoch": 3.91,
"eval_loss": 2.623600959777832,
"eval_runtime": 427.7444,
"eval_samples_per_second": 467.564,
"eval_steps_per_second": 29.223,
"step": 928000
},
{
"epoch": 3.94,
"eval_loss": 2.622565507888794,
"eval_runtime": 440.6765,
"eval_samples_per_second": 453.843,
"eval_steps_per_second": 28.365,
"step": 936000
},
{
"epoch": 3.97,
"learning_rate": 2.4873333333333335e-07,
"loss": 2.7853,
"step": 944000
},
{
"epoch": 3.97,
"eval_loss": 2.6175239086151123,
"eval_runtime": 438.6983,
"eval_samples_per_second": 455.89,
"eval_steps_per_second": 28.493,
"step": 944000
},
{
"epoch": 4.01,
"eval_loss": 2.618924140930176,
"eval_runtime": 435.2773,
"eval_samples_per_second": 459.473,
"eval_steps_per_second": 28.717,
"step": 952000
},
{
"epoch": 4.04,
"learning_rate": 2.46e-07,
"loss": 2.7766,
"step": 960000
},
{
"epoch": 4.04,
"eval_loss": 2.619227647781372,
"eval_runtime": 436.1097,
"eval_samples_per_second": 458.596,
"eval_steps_per_second": 28.663,
"step": 960000
},
{
"epoch": 4.08,
"eval_loss": 2.631781578063965,
"eval_runtime": 434.7218,
"eval_samples_per_second": 460.06,
"eval_steps_per_second": 28.754,
"step": 968000
},
{
"epoch": 4.11,
"learning_rate": 2.4326666666666666e-07,
"loss": 2.7851,
"step": 976000
},
{
"epoch": 4.11,
"eval_loss": 2.6210079193115234,
"eval_runtime": 434.5133,
"eval_samples_per_second": 460.28,
"eval_steps_per_second": 28.768,
"step": 976000
},
{
"epoch": 4.14,
"eval_loss": 2.6172115802764893,
"eval_runtime": 432.096,
"eval_samples_per_second": 462.855,
"eval_steps_per_second": 28.929,
"step": 984000
},
{
"epoch": 4.18,
"learning_rate": 2.405333333333333e-07,
"loss": 2.7804,
"step": 992000
},
{
"epoch": 4.18,
"eval_loss": 2.6199557781219482,
"eval_runtime": 431.3352,
"eval_samples_per_second": 463.672,
"eval_steps_per_second": 28.98,
"step": 992000
},
{
"epoch": 4.21,
"eval_loss": 2.6156609058380127,
"eval_runtime": 427.025,
"eval_samples_per_second": 468.352,
"eval_steps_per_second": 29.272,
"step": 1000000
},
{
"epoch": 4.24,
"learning_rate": 2.3779999999999997e-07,
"loss": 2.773,
"step": 1008000
},
{
"epoch": 4.24,
"eval_loss": 2.60978364944458,
"eval_runtime": 428.465,
"eval_samples_per_second": 466.778,
"eval_steps_per_second": 29.174,
"step": 1008000
},
{
"epoch": 4.28,
"eval_loss": 2.6156139373779297,
"eval_runtime": 427.2949,
"eval_samples_per_second": 468.056,
"eval_steps_per_second": 29.254,
"step": 1016000
},
{
"epoch": 4.31,
"learning_rate": 2.3506666666666668e-07,
"loss": 2.7818,
"step": 1024000
},
{
"epoch": 4.31,
"eval_loss": 2.6148924827575684,
"eval_runtime": 426.9113,
"eval_samples_per_second": 468.477,
"eval_steps_per_second": 29.28,
"step": 1024000
},
{
"epoch": 4.35,
"eval_loss": 2.612070083618164,
"eval_runtime": 441.0262,
"eval_samples_per_second": 453.483,
"eval_steps_per_second": 28.343,
"step": 1032000
},
{
"epoch": 4.38,
"learning_rate": 2.3233333333333334e-07,
"loss": 2.7736,
"step": 1040000
},
{
"epoch": 4.38,
"eval_loss": 2.615013599395752,
"eval_runtime": 435.5985,
"eval_samples_per_second": 459.134,
"eval_steps_per_second": 28.696,
"step": 1040000
},
{
"epoch": 4.41,
"eval_loss": 2.6156115531921387,
"eval_runtime": 436.5917,
"eval_samples_per_second": 458.089,
"eval_steps_per_second": 28.631,
"step": 1048000
},
{
"epoch": 4.45,
"learning_rate": 2.2960000000000002e-07,
"loss": 2.7761,
"step": 1056000
},
{
"epoch": 4.45,
"eval_loss": 2.6170592308044434,
"eval_runtime": 437.6326,
"eval_samples_per_second": 457.0,
"eval_steps_per_second": 28.563,
"step": 1056000
},
{
"epoch": 4.48,
"eval_loss": 2.6124441623687744,
"eval_runtime": 430.5486,
"eval_samples_per_second": 464.519,
"eval_steps_per_second": 29.033,
"step": 1064000
},
{
"epoch": 4.51,
"learning_rate": 2.2686666666666667e-07,
"loss": 2.7789,
"step": 1072000
},
{
"epoch": 4.51,
"eval_loss": 2.6276962757110596,
"eval_runtime": 428.8138,
"eval_samples_per_second": 466.398,
"eval_steps_per_second": 29.15,
"step": 1072000
},
{
"epoch": 4.55,
"eval_loss": 2.6138463020324707,
"eval_runtime": 431.2439,
"eval_samples_per_second": 463.77,
"eval_steps_per_second": 28.986,
"step": 1080000
},
{
"epoch": 4.58,
"learning_rate": 2.2413333333333333e-07,
"loss": 2.7744,
"step": 1088000
},
{
"epoch": 4.58,
"eval_loss": 2.6080663204193115,
"eval_runtime": 429.0451,
"eval_samples_per_second": 466.147,
"eval_steps_per_second": 29.134,
"step": 1088000
},
{
"epoch": 4.61,
"eval_loss": 2.620119571685791,
"eval_runtime": 435.3098,
"eval_samples_per_second": 459.438,
"eval_steps_per_second": 28.715,
"step": 1096000
},
{
"epoch": 4.65,
"learning_rate": 2.214e-07,
"loss": 2.77,
"step": 1104000
},
{
"epoch": 4.65,
"eval_loss": 2.6170663833618164,
"eval_runtime": 430.7548,
"eval_samples_per_second": 464.297,
"eval_steps_per_second": 29.019,
"step": 1104000
},
{
"epoch": 4.68,
"eval_loss": 2.609856128692627,
"eval_runtime": 430.7439,
"eval_samples_per_second": 464.308,
"eval_steps_per_second": 29.02,
"step": 1112000
},
{
"epoch": 4.72,
"learning_rate": 2.1866666666666667e-07,
"loss": 2.772,
"step": 1120000
},
{
"epoch": 4.72,
"eval_loss": 2.614119052886963,
"eval_runtime": 430.1555,
"eval_samples_per_second": 464.944,
"eval_steps_per_second": 29.059,
"step": 1120000
},
{
"epoch": 4.75,
"eval_loss": 2.6174395084381104,
"eval_runtime": 430.2468,
"eval_samples_per_second": 464.845,
"eval_steps_per_second": 29.053,
"step": 1128000
},
{
"epoch": 4.78,
"learning_rate": 2.1593333333333332e-07,
"loss": 2.7709,
"step": 1136000
},
{
"epoch": 4.78,
"eval_loss": 2.6200435161590576,
"eval_runtime": 429.4394,
"eval_samples_per_second": 465.719,
"eval_steps_per_second": 29.108,
"step": 1136000
},
{
"epoch": 4.82,
"eval_loss": 2.6149799823760986,
"eval_runtime": 429.9204,
"eval_samples_per_second": 465.198,
"eval_steps_per_second": 29.075,
"step": 1144000
},
{
"epoch": 4.85,
"learning_rate": 2.132e-07,
"loss": 2.7724,
"step": 1152000
},
{
"epoch": 4.85,
"eval_loss": 2.6041531562805176,
"eval_runtime": 433.3404,
"eval_samples_per_second": 461.526,
"eval_steps_per_second": 28.846,
"step": 1152000
},
{
"epoch": 4.88,
"eval_loss": 2.615821361541748,
"eval_runtime": 427.3883,
"eval_samples_per_second": 467.954,
"eval_steps_per_second": 29.247,
"step": 1160000
},
{
"epoch": 4.92,
"learning_rate": 2.1046666666666666e-07,
"loss": 2.7763,
"step": 1168000
},
{
"epoch": 4.92,
"eval_loss": 2.616694211959839,
"eval_runtime": 424.8183,
"eval_samples_per_second": 470.785,
"eval_steps_per_second": 29.424,
"step": 1168000
},
{
"epoch": 4.95,
"eval_loss": 2.617375612258911,
"eval_runtime": 425.4075,
"eval_samples_per_second": 470.133,
"eval_steps_per_second": 29.384,
"step": 1176000
},
{
"epoch": 4.99,
"learning_rate": 2.0773333333333334e-07,
"loss": 2.7736,
"step": 1184000
},
{
"epoch": 4.99,
"eval_loss": 2.609928846359253,
"eval_runtime": 429.8799,
"eval_samples_per_second": 465.242,
"eval_steps_per_second": 29.078,
"step": 1184000
},
{
"epoch": 5.02,
"eval_loss": 2.607574462890625,
"eval_runtime": 427.5829,
"eval_samples_per_second": 467.741,
"eval_steps_per_second": 29.234,
"step": 1192000
},
{
"epoch": 5.05,
"learning_rate": 2.05e-07,
"loss": 2.7692,
"step": 1200000
},
{
"epoch": 5.05,
"eval_loss": 2.6088333129882812,
"eval_runtime": 427.3086,
"eval_samples_per_second": 468.041,
"eval_steps_per_second": 29.253,
"step": 1200000
},
{
"epoch": 5.09,
"eval_loss": 2.6174449920654297,
"eval_runtime": 426.9323,
"eval_samples_per_second": 468.454,
"eval_steps_per_second": 29.279,
"step": 1208000
},
{
"epoch": 5.12,
"learning_rate": 2.0226666666666668e-07,
"loss": 2.7794,
"step": 1216000
},
{
"epoch": 5.12,
"eval_loss": 2.604072332382202,
"eval_runtime": 425.7765,
"eval_samples_per_second": 469.725,
"eval_steps_per_second": 29.358,
"step": 1216000
},
{
"epoch": 5.15,
"eval_loss": 2.60508394241333,
"eval_runtime": 429.4647,
"eval_samples_per_second": 465.691,
"eval_steps_per_second": 29.106,
"step": 1224000
},
{
"epoch": 5.19,
"learning_rate": 1.9953333333333333e-07,
"loss": 2.7709,
"step": 1232000
},
{
"epoch": 5.19,
"eval_loss": 2.6092872619628906,
"eval_runtime": 431.1236,
"eval_samples_per_second": 463.899,
"eval_steps_per_second": 28.994,
"step": 1232000
},
{
"epoch": 5.22,
"eval_loss": 2.6061949729919434,
"eval_runtime": 430.933,
"eval_samples_per_second": 464.105,
"eval_steps_per_second": 29.007,
"step": 1240000
},
{
"epoch": 5.25,
"learning_rate": 1.968e-07,
"loss": 2.7727,
"step": 1248000
},
{
"epoch": 5.25,
"eval_loss": 2.6051762104034424,
"eval_runtime": 428.4194,
"eval_samples_per_second": 466.828,
"eval_steps_per_second": 29.177,
"step": 1248000
},
{
"epoch": 5.29,
"eval_loss": 2.612610340118408,
"eval_runtime": 426.271,
"eval_samples_per_second": 469.18,
"eval_steps_per_second": 29.324,
"step": 1256000
},
{
"epoch": 5.32,
"learning_rate": 1.9406666666666667e-07,
"loss": 2.7686,
"step": 1264000
},
{
"epoch": 5.32,
"eval_loss": 2.609870433807373,
"eval_runtime": 424.9717,
"eval_samples_per_second": 470.615,
"eval_steps_per_second": 29.414,
"step": 1264000
},
{
"epoch": 5.36,
"eval_loss": 2.619239568710327,
"eval_runtime": 428.8052,
"eval_samples_per_second": 466.408,
"eval_steps_per_second": 29.151,
"step": 1272000
},
{
"epoch": 5.39,
"learning_rate": 1.9133333333333333e-07,
"loss": 2.7668,
"step": 1280000
},
{
"epoch": 5.39,
"eval_loss": 2.616584539413452,
"eval_runtime": 426.4046,
"eval_samples_per_second": 469.033,
"eval_steps_per_second": 29.315,
"step": 1280000
},
{
"epoch": 5.42,
"eval_loss": 2.6041982173919678,
"eval_runtime": 425.6206,
"eval_samples_per_second": 469.897,
"eval_steps_per_second": 29.369,
"step": 1288000
},
{
"epoch": 5.46,
"learning_rate": 1.886e-07,
"loss": 2.7777,
"step": 1296000
},
{
"epoch": 5.46,
"eval_loss": 2.603804349899292,
"eval_runtime": 425.5755,
"eval_samples_per_second": 469.947,
"eval_steps_per_second": 29.372,
"step": 1296000
},
{
"epoch": 5.49,
"eval_loss": 2.6119117736816406,
"eval_runtime": 426.6248,
"eval_samples_per_second": 468.791,
"eval_steps_per_second": 29.3,
"step": 1304000
},
{
"epoch": 5.52,
"learning_rate": 1.8586666666666666e-07,
"loss": 2.7737,
"step": 1312000
},
{
"epoch": 5.52,
"eval_loss": 2.615492343902588,
"eval_runtime": 426.0601,
"eval_samples_per_second": 469.413,
"eval_steps_per_second": 29.339,
"step": 1312000
},
{
"epoch": 5.56,
"eval_loss": 2.6235928535461426,
"eval_runtime": 427.3368,
"eval_samples_per_second": 468.01,
"eval_steps_per_second": 29.251,
"step": 1320000
},
{
"epoch": 5.59,
"learning_rate": 1.8313333333333332e-07,
"loss": 2.7757,
"step": 1328000
},
{
"epoch": 5.59,
"eval_loss": 2.6123950481414795,
"eval_runtime": 427.0533,
"eval_samples_per_second": 468.321,
"eval_steps_per_second": 29.27,
"step": 1328000
},
{
"epoch": 5.63,
"eval_loss": 2.5992510318756104,
"eval_runtime": 425.5905,
"eval_samples_per_second": 469.931,
"eval_steps_per_second": 29.371,
"step": 1336000
},
{
"epoch": 5.66,
"learning_rate": 1.804e-07,
"loss": 2.7757,
"step": 1344000
},
{
"epoch": 5.66,
"eval_loss": 2.613180160522461,
"eval_runtime": 424.8849,
"eval_samples_per_second": 470.711,
"eval_steps_per_second": 29.42,
"step": 1344000
},
{
"epoch": 5.69,
"eval_loss": 2.6062958240509033,
"eval_runtime": 424.2489,
"eval_samples_per_second": 471.417,
"eval_steps_per_second": 29.464,
"step": 1352000
},
{
"epoch": 5.73,
"learning_rate": 1.7766666666666666e-07,
"loss": 2.7748,
"step": 1360000
},
{
"epoch": 5.73,
"eval_loss": 2.612989902496338,
"eval_runtime": 424.5958,
"eval_samples_per_second": 471.032,
"eval_steps_per_second": 29.44,
"step": 1360000
},
{
"epoch": 5.76,
"eval_loss": 2.610032796859741,
"eval_runtime": 427.5632,
"eval_samples_per_second": 467.762,
"eval_steps_per_second": 29.235,
"step": 1368000
},
{
"epoch": 5.79,
"learning_rate": 1.7493333333333334e-07,
"loss": 2.769,
"step": 1376000
},
{
"epoch": 5.79,
"eval_loss": 2.602424383163452,
"eval_runtime": 424.7302,
"eval_samples_per_second": 470.882,
"eval_steps_per_second": 29.43,
"step": 1376000
},
{
"epoch": 5.83,
"eval_loss": 2.6061973571777344,
"eval_runtime": 428.6827,
"eval_samples_per_second": 466.541,
"eval_steps_per_second": 29.159,
"step": 1384000
},
{
"epoch": 5.86,
"learning_rate": 1.722e-07,
"loss": 2.7713,
"step": 1392000
},
{
"epoch": 5.86,
"eval_loss": 2.6137661933898926,
"eval_runtime": 430.6966,
"eval_samples_per_second": 464.359,
"eval_steps_per_second": 29.023,
"step": 1392000
},
{
"epoch": 5.89,
"eval_loss": 2.6025471687316895,
"eval_runtime": 430.8099,
"eval_samples_per_second": 464.237,
"eval_steps_per_second": 29.015,
"step": 1400000
},
{
"epoch": 5.93,
"learning_rate": 1.6946666666666668e-07,
"loss": 2.7766,
"step": 1408000
},
{
"epoch": 5.93,
"eval_loss": 2.6087875366210938,
"eval_runtime": 427.6043,
"eval_samples_per_second": 467.717,
"eval_steps_per_second": 29.233,
"step": 1408000
},
{
"epoch": 5.96,
"eval_loss": 2.6138193607330322,
"eval_runtime": 428.3638,
"eval_samples_per_second": 466.888,
"eval_steps_per_second": 29.181,
"step": 1416000
},
{
"epoch": 6.0,
"learning_rate": 1.6673333333333333e-07,
"loss": 2.7727,
"step": 1424000
},
{
"epoch": 6.0,
"eval_loss": 2.604793071746826,
"eval_runtime": 426.9803,
"eval_samples_per_second": 468.401,
"eval_steps_per_second": 29.275,
"step": 1424000
},
{
"epoch": 6.03,
"eval_loss": 2.606837511062622,
"eval_runtime": 429.9514,
"eval_samples_per_second": 465.164,
"eval_steps_per_second": 29.073,
"step": 1432000
},
{
"epoch": 6.06,
"learning_rate": 1.64e-07,
"loss": 2.7737,
"step": 1440000
},
{
"epoch": 6.06,
"eval_loss": 2.614352226257324,
"eval_runtime": 427.0139,
"eval_samples_per_second": 468.364,
"eval_steps_per_second": 29.273,
"step": 1440000
},
{
"epoch": 6.1,
"eval_loss": 2.6051464080810547,
"eval_runtime": 430.2406,
"eval_samples_per_second": 464.852,
"eval_steps_per_second": 29.054,
"step": 1448000
},
{
"epoch": 6.13,
"learning_rate": 1.6126666666666667e-07,
"loss": 2.778,
"step": 1456000
},
{
"epoch": 6.13,
"eval_loss": 2.6157853603363037,
"eval_runtime": 431.1381,
"eval_samples_per_second": 463.884,
"eval_steps_per_second": 28.993,
"step": 1456000
},
{
"epoch": 6.16,
"eval_loss": 2.615216016769409,
"eval_runtime": 425.9498,
"eval_samples_per_second": 469.534,
"eval_steps_per_second": 29.346,
"step": 1464000
},
{
"epoch": 6.2,
"learning_rate": 1.5853333333333332e-07,
"loss": 2.7767,
"step": 1472000
},
{
"epoch": 6.2,
"eval_loss": 2.6018834114074707,
"eval_runtime": 426.8729,
"eval_samples_per_second": 468.519,
"eval_steps_per_second": 29.283,
"step": 1472000
},
{
"epoch": 6.23,
"eval_loss": 2.611650228500366,
"eval_runtime": 425.3068,
"eval_samples_per_second": 470.244,
"eval_steps_per_second": 29.391,
"step": 1480000
},
{
"epoch": 6.27,
"learning_rate": 1.558e-07,
"loss": 2.7706,
"step": 1488000
},
{
"epoch": 6.27,
"eval_loss": 2.6065428256988525,
"eval_runtime": 427.2148,
"eval_samples_per_second": 468.144,
"eval_steps_per_second": 29.259,
"step": 1488000
},
{
"epoch": 6.3,
"eval_loss": 2.612178087234497,
"eval_runtime": 424.5636,
"eval_samples_per_second": 471.067,
"eval_steps_per_second": 29.442,
"step": 1496000
},
{
"epoch": 6.33,
"learning_rate": 1.5306666666666666e-07,
"loss": 2.7775,
"step": 1504000
},
{
"epoch": 6.33,
"eval_loss": 2.610006809234619,
"eval_runtime": 424.709,
"eval_samples_per_second": 470.906,
"eval_steps_per_second": 29.432,
"step": 1504000
},
{
"epoch": 6.37,
"eval_loss": 2.6100497245788574,
"eval_runtime": 424.7106,
"eval_samples_per_second": 470.904,
"eval_steps_per_second": 29.432,
"step": 1512000
},
{
"epoch": 6.4,
"learning_rate": 1.5033333333333332e-07,
"loss": 2.7753,
"step": 1520000
},
{
"epoch": 6.4,
"eval_loss": 2.605093479156494,
"eval_runtime": 427.9079,
"eval_samples_per_second": 467.386,
"eval_steps_per_second": 29.212,
"step": 1520000
},
{
"epoch": 6.43,
"eval_loss": 2.6036603450775146,
"eval_runtime": 425.7208,
"eval_samples_per_second": 469.787,
"eval_steps_per_second": 29.362,
"step": 1528000
},
{
"epoch": 6.47,
"learning_rate": 1.476e-07,
"loss": 2.7691,
"step": 1536000
},
{
"epoch": 6.47,
"eval_loss": 2.6037118434906006,
"eval_runtime": 425.0126,
"eval_samples_per_second": 470.57,
"eval_steps_per_second": 29.411,
"step": 1536000
},
{
"epoch": 6.5,
"eval_loss": 2.599247932434082,
"eval_runtime": 425.3525,
"eval_samples_per_second": 470.194,
"eval_steps_per_second": 29.387,
"step": 1544000
},
{
"epoch": 6.53,
"learning_rate": 1.4486666666666665e-07,
"loss": 2.758,
"step": 1552000
},
{
"epoch": 6.53,
"eval_loss": 2.6080071926116943,
"eval_runtime": 424.5489,
"eval_samples_per_second": 471.084,
"eval_steps_per_second": 29.443,
"step": 1552000
},
{
"epoch": 6.57,
"eval_loss": 2.6138691902160645,
"eval_runtime": 427.0402,
"eval_samples_per_second": 468.335,
"eval_steps_per_second": 29.271,
"step": 1560000
},
{
"epoch": 6.6,
"learning_rate": 1.4213333333333334e-07,
"loss": 2.7722,
"step": 1568000
},
{
"epoch": 6.6,
"eval_loss": 2.6000304222106934,
"eval_runtime": 426.0372,
"eval_samples_per_second": 469.438,
"eval_steps_per_second": 29.34,
"step": 1568000
},
{
"epoch": 6.64,
"eval_loss": 2.6107337474823,
"eval_runtime": 428.351,
"eval_samples_per_second": 466.902,
"eval_steps_per_second": 29.182,
"step": 1576000
},
{
"epoch": 6.67,
"learning_rate": 1.3940000000000002e-07,
"loss": 2.7737,
"step": 1584000
},
{
"epoch": 6.67,
"eval_loss": 2.6056902408599854,
"eval_runtime": 427.5761,
"eval_samples_per_second": 467.748,
"eval_steps_per_second": 29.235,
"step": 1584000
},
{
"epoch": 6.7,
"eval_loss": 2.6063265800476074,
"eval_runtime": 427.3328,
"eval_samples_per_second": 468.015,
"eval_steps_per_second": 29.251,
"step": 1592000
},
{
"epoch": 6.74,
"learning_rate": 1.3666666666666665e-07,
"loss": 2.7722,
"step": 1600000
},
{
"epoch": 6.74,
"eval_loss": 2.602761745452881,
"eval_runtime": 427.2826,
"eval_samples_per_second": 468.07,
"eval_steps_per_second": 29.255,
"step": 1600000
},
{
"epoch": 6.77,
"eval_loss": 2.5995423793792725,
"eval_runtime": 431.7911,
"eval_samples_per_second": 463.182,
"eval_steps_per_second": 28.949,
"step": 1608000
},
{
"epoch": 6.8,
"learning_rate": 1.3393333333333333e-07,
"loss": 2.7659,
"step": 1616000
},
{
"epoch": 6.8,
"eval_loss": 2.604205846786499,
"eval_runtime": 430.2172,
"eval_samples_per_second": 464.877,
"eval_steps_per_second": 29.055,
"step": 1616000
},
{
"epoch": 6.84,
"eval_loss": 2.601318597793579,
"eval_runtime": 426.9357,
"eval_samples_per_second": 468.45,
"eval_steps_per_second": 29.278,
"step": 1624000
},
{
"epoch": 6.87,
"learning_rate": 1.312e-07,
"loss": 2.7769,
"step": 1632000
},
{
"epoch": 6.87,
"eval_loss": 2.60282826423645,
"eval_runtime": 430.2987,
"eval_samples_per_second": 464.789,
"eval_steps_per_second": 29.05,
"step": 1632000
},
{
"epoch": 6.91,
"eval_loss": 2.608042001724243,
"eval_runtime": 428.8831,
"eval_samples_per_second": 466.323,
"eval_steps_per_second": 29.145,
"step": 1640000
},
{
"epoch": 6.94,
"learning_rate": 1.2846666666666667e-07,
"loss": 2.7732,
"step": 1648000
},
{
"epoch": 6.94,
"eval_loss": 2.5994017124176025,
"eval_runtime": 427.4593,
"eval_samples_per_second": 467.876,
"eval_steps_per_second": 29.243,
"step": 1648000
},
{
"epoch": 6.97,
"eval_loss": 2.6063311100006104,
"eval_runtime": 427.0875,
"eval_samples_per_second": 468.283,
"eval_steps_per_second": 29.268,
"step": 1656000
},
{
"epoch": 7.01,
"learning_rate": 1.2573333333333332e-07,
"loss": 2.7708,
"step": 1664000
},
{
"epoch": 7.01,
"eval_loss": 2.6120190620422363,
"eval_runtime": 426.6328,
"eval_samples_per_second": 468.783,
"eval_steps_per_second": 29.299,
"step": 1664000
},
{
"epoch": 7.04,
"eval_loss": 2.602278470993042,
"eval_runtime": 426.2122,
"eval_samples_per_second": 469.245,
"eval_steps_per_second": 29.328,
"step": 1672000
},
{
"epoch": 7.07,
"learning_rate": 1.23e-07,
"loss": 2.7614,
"step": 1680000
},
{
"epoch": 7.07,
"eval_loss": 2.6091384887695312,
"eval_runtime": 428.3432,
"eval_samples_per_second": 466.911,
"eval_steps_per_second": 29.182,
"step": 1680000
},
{
"epoch": 7.11,
"eval_loss": 2.600266218185425,
"eval_runtime": 426.5859,
"eval_samples_per_second": 468.834,
"eval_steps_per_second": 29.302,
"step": 1688000
},
{
"epoch": 7.14,
"learning_rate": 1.2026666666666666e-07,
"loss": 2.7655,
"step": 1696000
},
{
"epoch": 7.14,
"eval_loss": 2.6015985012054443,
"eval_runtime": 427.3154,
"eval_samples_per_second": 468.034,
"eval_steps_per_second": 29.252,
"step": 1696000
},
{
"epoch": 7.17,
"eval_loss": 2.605762481689453,
"eval_runtime": 431.3039,
"eval_samples_per_second": 463.706,
"eval_steps_per_second": 28.982,
"step": 1704000
},
{
"epoch": 7.21,
"learning_rate": 1.1753333333333334e-07,
"loss": 2.7747,
"step": 1712000
},
{
"epoch": 7.21,
"eval_loss": 2.6045358180999756,
"eval_runtime": 430.9287,
"eval_samples_per_second": 464.109,
"eval_steps_per_second": 29.007,
"step": 1712000
},
{
"epoch": 7.24,
"eval_loss": 2.609701633453369,
"eval_runtime": 430.8027,
"eval_samples_per_second": 464.245,
"eval_steps_per_second": 29.016,
"step": 1720000
},
{
"epoch": 7.28,
"learning_rate": 1.1480000000000001e-07,
"loss": 2.7685,
"step": 1728000
},
{
"epoch": 7.28,
"eval_loss": 2.606764316558838,
"eval_runtime": 430.5036,
"eval_samples_per_second": 464.568,
"eval_steps_per_second": 29.036,
"step": 1728000
},
{
"epoch": 7.31,
"eval_loss": 2.6037065982818604,
"eval_runtime": 428.2126,
"eval_samples_per_second": 467.053,
"eval_steps_per_second": 29.191,
"step": 1736000
},
{
"epoch": 7.34,
"learning_rate": 1.1206666666666666e-07,
"loss": 2.7736,
"step": 1744000
},
{
"epoch": 7.34,
"eval_loss": 2.612487554550171,
"eval_runtime": 426.9199,
"eval_samples_per_second": 468.467,
"eval_steps_per_second": 29.279,
"step": 1744000
},
{
"epoch": 7.38,
"eval_loss": 2.6112568378448486,
"eval_runtime": 424.0556,
"eval_samples_per_second": 471.632,
"eval_steps_per_second": 29.477,
"step": 1752000
},
{
"epoch": 7.41,
"learning_rate": 1.0933333333333333e-07,
"loss": 2.7666,
"step": 1760000
},
{
"epoch": 7.41,
"eval_loss": 2.597158193588257,
"eval_runtime": 424.796,
"eval_samples_per_second": 470.81,
"eval_steps_per_second": 29.426,
"step": 1760000
},
{
"epoch": 7.44,
"eval_loss": 2.6080923080444336,
"eval_runtime": 426.0498,
"eval_samples_per_second": 469.424,
"eval_steps_per_second": 29.339,
"step": 1768000
},
{
"epoch": 7.48,
"learning_rate": 1.066e-07,
"loss": 2.7658,
"step": 1776000
},
{
"epoch": 7.48,
"eval_loss": 2.6089766025543213,
"eval_runtime": 426.2325,
"eval_samples_per_second": 469.223,
"eval_steps_per_second": 29.327,
"step": 1776000
},
{
"epoch": 7.51,
"eval_loss": 2.6126182079315186,
"eval_runtime": 425.9411,
"eval_samples_per_second": 469.544,
"eval_steps_per_second": 29.347,
"step": 1784000
},
{
"epoch": 7.55,
"learning_rate": 1.0386666666666667e-07,
"loss": 2.7802,
"step": 1792000
},
{
"epoch": 7.55,
"eval_loss": 2.6020755767822266,
"eval_runtime": 425.4923,
"eval_samples_per_second": 470.039,
"eval_steps_per_second": 29.378,
"step": 1792000
},
{
"epoch": 7.58,
"eval_loss": 2.608738660812378,
"eval_runtime": 423.4207,
"eval_samples_per_second": 472.339,
"eval_steps_per_second": 29.521,
"step": 1800000
},
{
"epoch": 7.61,
"learning_rate": 1.0113333333333334e-07,
"loss": 2.7749,
"step": 1808000
},
{
"epoch": 7.61,
"eval_loss": 2.5985732078552246,
"eval_runtime": 427.6655,
"eval_samples_per_second": 467.651,
"eval_steps_per_second": 29.228,
"step": 1808000
},
{
"epoch": 7.65,
"eval_loss": 2.600249767303467,
"eval_runtime": 425.4151,
"eval_samples_per_second": 470.124,
"eval_steps_per_second": 29.383,
"step": 1816000
},
{
"epoch": 7.68,
"learning_rate": 9.84e-08,
"loss": 2.7689,
"step": 1824000
},
{
"epoch": 7.68,
"eval_loss": 2.6023120880126953,
"eval_runtime": 426.7002,
"eval_samples_per_second": 468.708,
"eval_steps_per_second": 29.295,
"step": 1824000
},
{
"epoch": 7.71,
"eval_loss": 2.5969133377075195,
"eval_runtime": 429.4646,
"eval_samples_per_second": 465.691,
"eval_steps_per_second": 29.106,
"step": 1832000
},
{
"epoch": 7.75,
"learning_rate": 9.566666666666666e-08,
"loss": 2.7699,
"step": 1840000
},
{
"epoch": 7.75,
"eval_loss": 2.5974552631378174,
"eval_runtime": 425.4534,
"eval_samples_per_second": 470.082,
"eval_steps_per_second": 29.38,
"step": 1840000
},
{
"epoch": 7.78,
"eval_loss": 2.606982946395874,
"eval_runtime": 427.6242,
"eval_samples_per_second": 467.696,
"eval_steps_per_second": 29.231,
"step": 1848000
},
{
"epoch": 7.81,
"learning_rate": 9.293333333333333e-08,
"loss": 2.7715,
"step": 1856000
},
{
"epoch": 7.81,
"eval_loss": 2.603482484817505,
"eval_runtime": 429.5475,
"eval_samples_per_second": 465.602,
"eval_steps_per_second": 29.1,
"step": 1856000
},
{
"epoch": 7.85,
"eval_loss": 2.6048595905303955,
"eval_runtime": 430.2797,
"eval_samples_per_second": 464.809,
"eval_steps_per_second": 29.051,
"step": 1864000
},
{
"epoch": 7.88,
"learning_rate": 9.02e-08,
"loss": 2.7653,
"step": 1872000
},
{
"epoch": 7.88,
"eval_loss": 2.6129438877105713,
"eval_runtime": 430.4774,
"eval_samples_per_second": 464.596,
"eval_steps_per_second": 29.038,
"step": 1872000
},
{
"epoch": 7.92,
"eval_loss": 2.602692127227783,
"eval_runtime": 426.0433,
"eval_samples_per_second": 469.431,
"eval_steps_per_second": 29.34,
"step": 1880000
},
{
"epoch": 7.95,
"learning_rate": 8.746666666666667e-08,
"loss": 2.7729,
"step": 1888000
},
{
"epoch": 7.95,
"eval_loss": 2.5999996662139893,
"eval_runtime": 427.4645,
"eval_samples_per_second": 467.87,
"eval_steps_per_second": 29.242,
"step": 1888000
},
{
"epoch": 7.98,
"eval_loss": 2.6137943267822266,
"eval_runtime": 427.597,
"eval_samples_per_second": 467.726,
"eval_steps_per_second": 29.233,
"step": 1896000
},
{
"epoch": 8.02,
"learning_rate": 8.473333333333334e-08,
"loss": 2.7693,
"step": 1904000
},
{
"epoch": 8.02,
"eval_loss": 2.6051719188690186,
"eval_runtime": 428.0224,
"eval_samples_per_second": 467.261,
"eval_steps_per_second": 29.204,
"step": 1904000
},
{
"epoch": 8.05,
"eval_loss": 2.6060233116149902,
"eval_runtime": 430.6968,
"eval_samples_per_second": 464.359,
"eval_steps_per_second": 29.023,
"step": 1912000
},
{
"epoch": 8.08,
"learning_rate": 8.2e-08,
"loss": 2.7585,
"step": 1920000
},
{
"epoch": 8.08,
"eval_loss": 2.6064672470092773,
"eval_runtime": 427.6059,
"eval_samples_per_second": 467.716,
"eval_steps_per_second": 29.233,
"step": 1920000
},
{
"epoch": 8.12,
"eval_loss": 2.6105079650878906,
"eval_runtime": 438.1262,
"eval_samples_per_second": 456.485,
"eval_steps_per_second": 28.531,
"step": 1928000
},
{
"epoch": 8.15,
"learning_rate": 7.926666666666666e-08,
"loss": 2.7652,
"step": 1936000
},
{
"epoch": 8.15,
"eval_loss": 2.607515335083008,
"eval_runtime": 438.8385,
"eval_samples_per_second": 455.744,
"eval_steps_per_second": 28.484,
"step": 1936000
},
{
"epoch": 8.19,
"eval_loss": 2.607562780380249,
"eval_runtime": 434.8048,
"eval_samples_per_second": 459.972,
"eval_steps_per_second": 28.749,
"step": 1944000
},
{
"epoch": 8.22,
"learning_rate": 7.653333333333333e-08,
"loss": 2.7508,
"step": 1952000
},
{
"epoch": 8.22,
"eval_loss": 2.6083250045776367,
"eval_runtime": 436.0024,
"eval_samples_per_second": 458.709,
"eval_steps_per_second": 28.67,
"step": 1952000
},
{
"epoch": 8.25,
"eval_loss": 2.6111507415771484,
"eval_runtime": 436.2282,
"eval_samples_per_second": 458.471,
"eval_steps_per_second": 28.655,
"step": 1960000
},
{
"epoch": 8.29,
"learning_rate": 7.38e-08,
"loss": 2.7678,
"step": 1968000
},
{
"epoch": 8.29,
"eval_loss": 2.6018521785736084,
"eval_runtime": 437.0443,
"eval_samples_per_second": 457.615,
"eval_steps_per_second": 28.601,
"step": 1968000
},
{
"epoch": 8.32,
"eval_loss": 2.602910280227661,
"eval_runtime": 435.5894,
"eval_samples_per_second": 459.143,
"eval_steps_per_second": 28.697,
"step": 1976000
},
{
"epoch": 8.35,
"learning_rate": 7.106666666666667e-08,
"loss": 2.7653,
"step": 1984000
},
{
"epoch": 8.35,
"eval_loss": 2.6087162494659424,
"eval_runtime": 434.8171,
"eval_samples_per_second": 459.959,
"eval_steps_per_second": 28.748,
"step": 1984000
},
{
"epoch": 8.39,
"eval_loss": 2.606381416320801,
"eval_runtime": 429.8668,
"eval_samples_per_second": 465.256,
"eval_steps_per_second": 29.079,
"step": 1992000
},
{
"epoch": 8.42,
"learning_rate": 6.833333333333332e-08,
"loss": 2.7661,
"step": 2000000
},
{
"epoch": 8.42,
"eval_loss": 2.603147506713867,
"eval_runtime": 424.748,
"eval_samples_per_second": 470.863,
"eval_steps_per_second": 29.429,
"step": 2000000
},
{
"epoch": 8.45,
"eval_loss": 2.6050961017608643,
"eval_runtime": 424.2051,
"eval_samples_per_second": 471.465,
"eval_steps_per_second": 29.467,
"step": 2008000
},
{
"epoch": 8.49,
"learning_rate": 6.56e-08,
"loss": 2.7742,
"step": 2016000
},
{
"epoch": 8.49,
"eval_loss": 2.6091232299804688,
"eval_runtime": 425.2363,
"eval_samples_per_second": 470.322,
"eval_steps_per_second": 29.395,
"step": 2016000
},
{
"epoch": 8.52,
"eval_loss": 2.5978386402130127,
"eval_runtime": 421.3464,
"eval_samples_per_second": 474.664,
"eval_steps_per_second": 29.667,
"step": 2024000
},
{
"epoch": 8.56,
"learning_rate": 6.286666666666666e-08,
"loss": 2.7748,
"step": 2032000
},
{
"epoch": 8.56,
"eval_loss": 2.6131348609924316,
"eval_runtime": 421.1531,
"eval_samples_per_second": 474.882,
"eval_steps_per_second": 29.68,
"step": 2032000
},
{
"epoch": 8.59,
"eval_loss": 2.6030309200286865,
"eval_runtime": 423.5077,
"eval_samples_per_second": 472.242,
"eval_steps_per_second": 29.515,
"step": 2040000
},
{
"epoch": 8.62,
"learning_rate": 6.013333333333333e-08,
"loss": 2.7706,
"step": 2048000
},
{
"epoch": 8.62,
"eval_loss": 2.6036195755004883,
"eval_runtime": 424.684,
"eval_samples_per_second": 470.934,
"eval_steps_per_second": 29.434,
"step": 2048000
},
{
"epoch": 8.66,
"eval_loss": 2.599808692932129,
"eval_runtime": 421.9265,
"eval_samples_per_second": 474.011,
"eval_steps_per_second": 29.626,
"step": 2056000
},
{
"epoch": 8.69,
"learning_rate": 5.7400000000000004e-08,
"loss": 2.769,
"step": 2064000
},
{
"epoch": 8.69,
"eval_loss": 2.6013376712799072,
"eval_runtime": 424.7682,
"eval_samples_per_second": 470.84,
"eval_steps_per_second": 29.428,
"step": 2064000
},
{
"epoch": 8.72,
"eval_loss": 2.6000382900238037,
"eval_runtime": 422.3044,
"eval_samples_per_second": 473.587,
"eval_steps_per_second": 29.6,
"step": 2072000
},
{
"epoch": 8.76,
"learning_rate": 5.4666666666666666e-08,
"loss": 2.7733,
"step": 2080000
},
{
"epoch": 8.76,
"eval_loss": 2.606200695037842,
"eval_runtime": 421.4091,
"eval_samples_per_second": 474.593,
"eval_steps_per_second": 29.662,
"step": 2080000
},
{
"epoch": 8.79,
"eval_loss": 2.605668783187866,
"eval_runtime": 425.3566,
"eval_samples_per_second": 470.189,
"eval_steps_per_second": 29.387,
"step": 2088000
},
{
"epoch": 8.83,
"learning_rate": 5.1933333333333335e-08,
"loss": 2.7714,
"step": 2096000
},
{
"epoch": 8.83,
"eval_loss": 2.602085828781128,
"eval_runtime": 423.0675,
"eval_samples_per_second": 472.733,
"eval_steps_per_second": 29.546,
"step": 2096000
},
{
"epoch": 8.86,
"eval_loss": 2.602846145629883,
"eval_runtime": 424.1211,
"eval_samples_per_second": 471.559,
"eval_steps_per_second": 29.473,
"step": 2104000
},
{
"epoch": 8.89,
"learning_rate": 4.92e-08,
"loss": 2.7754,
"step": 2112000
},
{
"epoch": 8.89,
"eval_loss": 2.596395254135132,
"eval_runtime": 421.9383,
"eval_samples_per_second": 473.998,
"eval_steps_per_second": 29.625,
"step": 2112000
},
{
"epoch": 8.93,
"eval_loss": 2.601470470428467,
"eval_runtime": 430.2725,
"eval_samples_per_second": 464.817,
"eval_steps_per_second": 29.051,
"step": 2120000
},
{
"epoch": 8.96,
"learning_rate": 4.6466666666666666e-08,
"loss": 2.7683,
"step": 2128000
},
{
"epoch": 8.96,
"eval_loss": 2.6060473918914795,
"eval_runtime": 428.5391,
"eval_samples_per_second": 466.697,
"eval_steps_per_second": 29.169,
"step": 2128000
},
{
"epoch": 8.99,
"eval_loss": 2.6081697940826416,
"eval_runtime": 430.2474,
"eval_samples_per_second": 464.844,
"eval_steps_per_second": 29.053,
"step": 2136000
},
{
"epoch": 9.03,
"learning_rate": 4.3733333333333335e-08,
"loss": 2.7758,
"step": 2144000
},
{
"epoch": 9.03,
"eval_loss": 2.6130032539367676,
"eval_runtime": 427.9829,
"eval_samples_per_second": 467.304,
"eval_steps_per_second": 29.207,
"step": 2144000
},
{
"epoch": 9.06,
"eval_loss": 2.607052803039551,
"eval_runtime": 426.9186,
"eval_samples_per_second": 468.469,
"eval_steps_per_second": 29.28,
"step": 2152000
},
{
"epoch": 9.09,
"learning_rate": 4.1e-08,
"loss": 2.768,
"step": 2160000
},
{
"epoch": 9.09,
"eval_loss": 2.6140778064727783,
"eval_runtime": 426.6857,
"eval_samples_per_second": 468.724,
"eval_steps_per_second": 29.296,
"step": 2160000
},
{
"epoch": 9.13,
"eval_loss": 2.600281000137329,
"eval_runtime": 427.0893,
"eval_samples_per_second": 468.281,
"eval_steps_per_second": 29.268,
"step": 2168000
},
{
"epoch": 9.16,
"learning_rate": 3.8266666666666665e-08,
"loss": 2.7653,
"step": 2176000
},
{
"epoch": 9.16,
"eval_loss": 2.5986554622650146,
"eval_runtime": 425.4388,
"eval_samples_per_second": 470.098,
"eval_steps_per_second": 29.381,
"step": 2176000
},
{
"epoch": 9.2,
"eval_loss": 2.6066486835479736,
"eval_runtime": 425.1112,
"eval_samples_per_second": 470.46,
"eval_steps_per_second": 29.404,
"step": 2184000
},
{
"epoch": 9.23,
"learning_rate": 3.5533333333333334e-08,
"loss": 2.7621,
"step": 2192000
},
{
"epoch": 9.23,
"eval_loss": 2.6040539741516113,
"eval_runtime": 424.0828,
"eval_samples_per_second": 471.601,
"eval_steps_per_second": 29.475,
"step": 2192000
},
{
"epoch": 9.26,
"eval_loss": 2.605970859527588,
"eval_runtime": 424.8928,
"eval_samples_per_second": 470.702,
"eval_steps_per_second": 29.419,
"step": 2200000
},
{
"epoch": 9.3,
"learning_rate": 3.28e-08,
"loss": 2.7712,
"step": 2208000
},
{
"epoch": 9.3,
"eval_loss": 2.6143710613250732,
"eval_runtime": 422.534,
"eval_samples_per_second": 473.33,
"eval_steps_per_second": 29.583,
"step": 2208000
},
{
"epoch": 9.33,
"eval_loss": 2.5990421772003174,
"eval_runtime": 423.7702,
"eval_samples_per_second": 471.949,
"eval_steps_per_second": 29.497,
"step": 2216000
},
{
"epoch": 9.36,
"learning_rate": 3.0066666666666665e-08,
"loss": 2.7718,
"step": 2224000
},
{
"epoch": 9.36,
"eval_loss": 2.6039345264434814,
"eval_runtime": 422.8971,
"eval_samples_per_second": 472.924,
"eval_steps_per_second": 29.558,
"step": 2224000
},
{
"epoch": 9.4,
"eval_loss": 2.593075752258301,
"eval_runtime": 423.1257,
"eval_samples_per_second": 472.668,
"eval_steps_per_second": 29.542,
"step": 2232000
},
{
"epoch": 9.43,
"learning_rate": 2.7333333333333333e-08,
"loss": 2.774,
"step": 2240000
},
{
"epoch": 9.43,
"eval_loss": 2.6128671169281006,
"eval_runtime": 423.6724,
"eval_samples_per_second": 472.058,
"eval_steps_per_second": 29.504,
"step": 2240000
},
{
"epoch": 9.47,
"eval_loss": 2.6095166206359863,
"eval_runtime": 427.8902,
"eval_samples_per_second": 467.405,
"eval_steps_per_second": 29.213,
"step": 2248000
},
{
"epoch": 9.5,
"learning_rate": 2.46e-08,
"loss": 2.765,
"step": 2256000
},
{
"epoch": 9.5,
"eval_loss": 2.5932390689849854,
"eval_runtime": 426.8467,
"eval_samples_per_second": 468.548,
"eval_steps_per_second": 29.285,
"step": 2256000
},
{
"epoch": 9.53,
"eval_loss": 2.6009600162506104,
"eval_runtime": 424.1412,
"eval_samples_per_second": 471.536,
"eval_steps_per_second": 29.471,
"step": 2264000
},
{
"epoch": 9.57,
"learning_rate": 2.1866666666666667e-08,
"loss": 2.7754,
"step": 2272000
},
{
"epoch": 9.57,
"eval_loss": 2.60778546333313,
"eval_runtime": 425.6089,
"eval_samples_per_second": 469.91,
"eval_steps_per_second": 29.37,
"step": 2272000
},
{
"epoch": 9.6,
"eval_loss": 2.5981459617614746,
"eval_runtime": 425.6519,
"eval_samples_per_second": 469.863,
"eval_steps_per_second": 29.367,
"step": 2280000
},
{
"epoch": 9.63,
"learning_rate": 1.9133333333333333e-08,
"loss": 2.771,
"step": 2288000
},
{
"epoch": 9.63,
"eval_loss": 2.6052143573760986,
"eval_runtime": 424.5141,
"eval_samples_per_second": 471.122,
"eval_steps_per_second": 29.445,
"step": 2288000
},
{
"epoch": 9.67,
"eval_loss": 2.5944042205810547,
"eval_runtime": 425.2866,
"eval_samples_per_second": 470.266,
"eval_steps_per_second": 29.392,
"step": 2296000
},
{
"epoch": 9.7,
"learning_rate": 1.64e-08,
"loss": 2.7757,
"step": 2304000
},
{
"epoch": 9.7,
"eval_loss": 2.6045000553131104,
"eval_runtime": 428.7498,
"eval_samples_per_second": 466.468,
"eval_steps_per_second": 29.155,
"step": 2304000
},
{
"epoch": 9.73,
"eval_loss": 2.5971217155456543,
"eval_runtime": 432.7726,
"eval_samples_per_second": 462.132,
"eval_steps_per_second": 28.884,
"step": 2312000
},
{
"epoch": 9.77,
"learning_rate": 1.3666666666666667e-08,
"loss": 2.7685,
"step": 2320000
},
{
"epoch": 9.77,
"eval_loss": 2.610078811645508,
"eval_runtime": 432.5525,
"eval_samples_per_second": 462.367,
"eval_steps_per_second": 28.898,
"step": 2320000
},
{
"epoch": 9.8,
"eval_loss": 2.596436023712158,
"eval_runtime": 432.023,
"eval_samples_per_second": 462.934,
"eval_steps_per_second": 28.934,
"step": 2328000
},
{
"epoch": 9.84,
"learning_rate": 1.0933333333333334e-08,
"loss": 2.7708,
"step": 2336000
},
{
"epoch": 9.84,
"eval_loss": 2.5973622798919678,
"eval_runtime": 432.5814,
"eval_samples_per_second": 462.336,
"eval_steps_per_second": 28.896,
"step": 2336000
},
{
"epoch": 9.87,
"eval_loss": 2.5952794551849365,
"eval_runtime": 427.2422,
"eval_samples_per_second": 468.114,
"eval_steps_per_second": 29.257,
"step": 2344000
},
{
"epoch": 9.9,
"learning_rate": 8.2e-09,
"loss": 2.7695,
"step": 2352000
},
{
"epoch": 9.9,
"eval_loss": 2.598102569580078,
"eval_runtime": 427.2188,
"eval_samples_per_second": 468.139,
"eval_steps_per_second": 29.259,
"step": 2352000
},
{
"epoch": 9.94,
"eval_loss": 2.6094541549682617,
"eval_runtime": 427.6756,
"eval_samples_per_second": 467.639,
"eval_steps_per_second": 29.228,
"step": 2360000
},
{
"epoch": 9.97,
"learning_rate": 5.466666666666667e-09,
"loss": 2.7702,
"step": 2368000
},
{
"epoch": 9.97,
"eval_loss": 2.6042184829711914,
"eval_runtime": 426.894,
"eval_samples_per_second": 468.496,
"eval_steps_per_second": 29.281,
"step": 2368000
},
{
"epoch": 10.0,
"eval_loss": 2.6094839572906494,
"eval_runtime": 429.3589,
"eval_samples_per_second": 465.806,
"eval_steps_per_second": 29.113,
"step": 2376000
},
{
"epoch": 10.04,
"learning_rate": 2.7333333333333334e-09,
"loss": 2.7614,
"step": 2384000
},
{
"epoch": 10.04,
"eval_loss": 2.6007468700408936,
"eval_runtime": 426.5863,
"eval_samples_per_second": 468.834,
"eval_steps_per_second": 29.302,
"step": 2384000
},
{
"epoch": 10.07,
"eval_loss": 2.601724863052368,
"eval_runtime": 428.327,
"eval_samples_per_second": 466.928,
"eval_steps_per_second": 29.183,
"step": 2392000
},
{
"epoch": 10.11,
"learning_rate": 0.0,
"loss": 2.7708,
"step": 2400000
},
{
"epoch": 10.11,
"eval_loss": 2.611358880996704,
"eval_runtime": 427.933,
"eval_samples_per_second": 467.358,
"eval_steps_per_second": 29.21,
"step": 2400000
},
{
"epoch": 10.11,
"step": 2400000,
"total_flos": 7.626726685368748e+17,
"train_loss": 2.7854965201822917,
"train_runtime": 398406.2107,
"train_samples_per_second": 96.384,
"train_steps_per_second": 6.024
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 11,
"save_steps": 32000,
"total_flos": 7.626726685368748e+17,
"trial_name": null,
"trial_params": null
}