2020-Q2-50p-filtered-prog_from_Q1 / trainer_state.json
DouglasPontes's picture
Training in progress, step 32000
03f4cbc verified
{
"best_metric": 2.555936574935913,
"best_model_checkpoint": "./model_tweets_2020_Q2_50/checkpoint-160000",
"epoch": 10.105263157894736,
"eval_steps": 8000,
"global_step": 2400000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03,
"eval_loss": 2.664475917816162,
"eval_runtime": 220.2299,
"eval_samples_per_second": 908.142,
"eval_steps_per_second": 56.759,
"step": 8000
},
{
"epoch": 0.07,
"learning_rate": 9.939131159843243e-06,
"loss": 2.8656,
"step": 16000
},
{
"epoch": 0.07,
"eval_loss": 2.6464931964874268,
"eval_runtime": 221.9882,
"eval_samples_per_second": 900.949,
"eval_steps_per_second": 56.309,
"step": 16000
},
{
"epoch": 0.1,
"eval_loss": 2.6185896396636963,
"eval_runtime": 220.2227,
"eval_samples_per_second": 908.172,
"eval_steps_per_second": 56.761,
"step": 24000
},
{
"epoch": 0.13,
"learning_rate": 9.872425581589261e-06,
"loss": 2.7946,
"step": 32000
},
{
"epoch": 0.13,
"eval_loss": 2.6234936714172363,
"eval_runtime": 220.6677,
"eval_samples_per_second": 906.34,
"eval_steps_per_second": 56.646,
"step": 32000
},
{
"epoch": 0.17,
"eval_loss": 2.615138292312622,
"eval_runtime": 221.7491,
"eval_samples_per_second": 901.92,
"eval_steps_per_second": 56.37,
"step": 40000
},
{
"epoch": 0.2,
"learning_rate": 9.80572000333528e-06,
"loss": 2.7911,
"step": 48000
},
{
"epoch": 0.2,
"eval_loss": 2.6128268241882324,
"eval_runtime": 221.3475,
"eval_samples_per_second": 903.557,
"eval_steps_per_second": 56.472,
"step": 48000
},
{
"epoch": 0.24,
"eval_loss": 2.6009654998779297,
"eval_runtime": 221.2039,
"eval_samples_per_second": 904.143,
"eval_steps_per_second": 56.509,
"step": 56000
},
{
"epoch": 0.27,
"learning_rate": 9.739014425081299e-06,
"loss": 2.7898,
"step": 64000
},
{
"epoch": 0.27,
"eval_loss": 2.614436149597168,
"eval_runtime": 221.9246,
"eval_samples_per_second": 901.207,
"eval_steps_per_second": 56.325,
"step": 64000
},
{
"epoch": 0.3,
"eval_loss": 2.597571611404419,
"eval_runtime": 222.8659,
"eval_samples_per_second": 897.401,
"eval_steps_per_second": 56.088,
"step": 72000
},
{
"epoch": 0.34,
"learning_rate": 9.672308846827316e-06,
"loss": 2.7791,
"step": 80000
},
{
"epoch": 0.34,
"eval_loss": 2.6006274223327637,
"eval_runtime": 221.9145,
"eval_samples_per_second": 901.248,
"eval_steps_per_second": 56.328,
"step": 80000
},
{
"epoch": 0.37,
"eval_loss": 2.5888915061950684,
"eval_runtime": 223.9796,
"eval_samples_per_second": 892.938,
"eval_steps_per_second": 55.809,
"step": 88000
},
{
"epoch": 0.4,
"learning_rate": 9.605603268573334e-06,
"loss": 2.7776,
"step": 96000
},
{
"epoch": 0.4,
"eval_loss": 2.5888023376464844,
"eval_runtime": 221.625,
"eval_samples_per_second": 902.425,
"eval_steps_per_second": 56.402,
"step": 96000
},
{
"epoch": 0.44,
"eval_loss": 2.584191083908081,
"eval_runtime": 222.5849,
"eval_samples_per_second": 898.533,
"eval_steps_per_second": 56.158,
"step": 104000
},
{
"epoch": 0.47,
"learning_rate": 9.538897690319354e-06,
"loss": 2.7702,
"step": 112000
},
{
"epoch": 0.47,
"eval_loss": 2.5760483741760254,
"eval_runtime": 222.9149,
"eval_samples_per_second": 897.203,
"eval_steps_per_second": 56.075,
"step": 112000
},
{
"epoch": 0.51,
"eval_loss": 2.5719943046569824,
"eval_runtime": 220.2346,
"eval_samples_per_second": 908.123,
"eval_steps_per_second": 56.758,
"step": 120000
},
{
"epoch": 0.54,
"learning_rate": 9.472192112065373e-06,
"loss": 2.7661,
"step": 128000
},
{
"epoch": 0.54,
"eval_loss": 2.5709779262542725,
"eval_runtime": 221.9288,
"eval_samples_per_second": 901.19,
"eval_steps_per_second": 56.324,
"step": 128000
},
{
"epoch": 0.57,
"eval_loss": 2.56732177734375,
"eval_runtime": 222.0037,
"eval_samples_per_second": 900.886,
"eval_steps_per_second": 56.305,
"step": 136000
},
{
"epoch": 0.61,
"learning_rate": 9.405486533811392e-06,
"loss": 2.7609,
"step": 144000
},
{
"epoch": 0.61,
"eval_loss": 2.5692689418792725,
"eval_runtime": 221.4862,
"eval_samples_per_second": 902.991,
"eval_steps_per_second": 56.437,
"step": 144000
},
{
"epoch": 0.64,
"eval_loss": 2.5623199939727783,
"eval_runtime": 222.634,
"eval_samples_per_second": 898.335,
"eval_steps_per_second": 56.146,
"step": 152000
},
{
"epoch": 0.67,
"learning_rate": 9.338780955557409e-06,
"loss": 2.7557,
"step": 160000
},
{
"epoch": 0.67,
"eval_loss": 2.555936574935913,
"eval_runtime": 222.9827,
"eval_samples_per_second": 896.93,
"eval_steps_per_second": 56.058,
"step": 160000
},
{
"epoch": 0.71,
"eval_loss": 2.564979314804077,
"eval_runtime": 223.4268,
"eval_samples_per_second": 895.148,
"eval_steps_per_second": 55.947,
"step": 168000
},
{
"epoch": 0.74,
"learning_rate": 9.272075377303427e-06,
"loss": 2.7584,
"step": 176000
},
{
"epoch": 0.74,
"eval_loss": 2.558361053466797,
"eval_runtime": 223.2855,
"eval_samples_per_second": 895.714,
"eval_steps_per_second": 55.982,
"step": 176000
},
{
"epoch": 0.77,
"eval_loss": 2.5590734481811523,
"eval_runtime": 224.766,
"eval_samples_per_second": 889.814,
"eval_steps_per_second": 55.613,
"step": 184000
},
{
"epoch": 0.81,
"learning_rate": 9.205369799049446e-06,
"loss": 2.7619,
"step": 192000
},
{
"epoch": 0.81,
"eval_loss": 2.5597450733184814,
"eval_runtime": 223.3408,
"eval_samples_per_second": 895.492,
"eval_steps_per_second": 55.968,
"step": 192000
},
{
"epoch": 0.84,
"eval_loss": 2.564985990524292,
"eval_runtime": 222.3245,
"eval_samples_per_second": 899.586,
"eval_steps_per_second": 56.224,
"step": 200000
},
{
"epoch": 0.88,
"learning_rate": 9.138664220795464e-06,
"loss": 2.7678,
"step": 208000
},
{
"epoch": 0.88,
"eval_loss": 2.572838306427002,
"eval_runtime": 222.9177,
"eval_samples_per_second": 897.192,
"eval_steps_per_second": 56.074,
"step": 208000
},
{
"epoch": 0.91,
"eval_loss": 2.571180820465088,
"eval_runtime": 222.4106,
"eval_samples_per_second": 899.238,
"eval_steps_per_second": 56.202,
"step": 216000
},
{
"epoch": 0.94,
"learning_rate": 9.071958642541483e-06,
"loss": 2.7735,
"step": 224000
},
{
"epoch": 0.94,
"eval_loss": 2.5728507041931152,
"eval_runtime": 221.0881,
"eval_samples_per_second": 904.617,
"eval_steps_per_second": 56.539,
"step": 224000
},
{
"epoch": 0.98,
"eval_loss": 2.5754916667938232,
"eval_runtime": 224.4187,
"eval_samples_per_second": 891.191,
"eval_steps_per_second": 55.699,
"step": 232000
},
{
"epoch": 1.01,
"learning_rate": 9.005253064287502e-06,
"loss": 2.777,
"step": 240000
},
{
"epoch": 1.01,
"eval_loss": 2.571467876434326,
"eval_runtime": 223.3627,
"eval_samples_per_second": 895.405,
"eval_steps_per_second": 55.963,
"step": 240000
},
{
"epoch": 1.04,
"eval_loss": 2.5747482776641846,
"eval_runtime": 223.2929,
"eval_samples_per_second": 895.685,
"eval_steps_per_second": 55.98,
"step": 248000
},
{
"epoch": 1.08,
"learning_rate": 8.93854748603352e-06,
"loss": 2.7692,
"step": 256000
},
{
"epoch": 1.08,
"eval_loss": 2.5781774520874023,
"eval_runtime": 225.2908,
"eval_samples_per_second": 887.742,
"eval_steps_per_second": 55.484,
"step": 256000
},
{
"epoch": 1.11,
"eval_loss": 2.58413028717041,
"eval_runtime": 223.7883,
"eval_samples_per_second": 893.702,
"eval_steps_per_second": 55.856,
"step": 264000
},
{
"epoch": 1.15,
"learning_rate": 8.871841907779539e-06,
"loss": 2.7826,
"step": 272000
},
{
"epoch": 1.15,
"eval_loss": 2.573080539703369,
"eval_runtime": 222.4765,
"eval_samples_per_second": 898.971,
"eval_steps_per_second": 56.186,
"step": 272000
},
{
"epoch": 1.18,
"eval_loss": 2.5836124420166016,
"eval_runtime": 222.4727,
"eval_samples_per_second": 898.987,
"eval_steps_per_second": 56.187,
"step": 280000
},
{
"epoch": 1.21,
"learning_rate": 8.805136329525557e-06,
"loss": 2.7845,
"step": 288000
},
{
"epoch": 1.21,
"eval_loss": 2.5840952396392822,
"eval_runtime": 223.8774,
"eval_samples_per_second": 893.346,
"eval_steps_per_second": 55.834,
"step": 288000
},
{
"epoch": 1.25,
"eval_loss": 2.5810587406158447,
"eval_runtime": 224.0492,
"eval_samples_per_second": 892.661,
"eval_steps_per_second": 55.791,
"step": 296000
},
{
"epoch": 1.28,
"learning_rate": 8.738430751271576e-06,
"loss": 2.7909,
"step": 304000
},
{
"epoch": 1.28,
"eval_loss": 2.592771530151367,
"eval_runtime": 224.5453,
"eval_samples_per_second": 890.689,
"eval_steps_per_second": 55.668,
"step": 304000
},
{
"epoch": 1.31,
"eval_loss": 2.597700834274292,
"eval_runtime": 222.8877,
"eval_samples_per_second": 897.313,
"eval_steps_per_second": 56.082,
"step": 312000
},
{
"epoch": 1.35,
"learning_rate": 8.671725173017595e-06,
"loss": 2.7993,
"step": 320000
},
{
"epoch": 1.35,
"eval_loss": 2.6025121212005615,
"eval_runtime": 223.4062,
"eval_samples_per_second": 895.23,
"eval_steps_per_second": 55.952,
"step": 320000
},
{
"epoch": 1.38,
"eval_loss": 2.6072068214416504,
"eval_runtime": 222.1596,
"eval_samples_per_second": 900.254,
"eval_steps_per_second": 56.266,
"step": 328000
},
{
"epoch": 1.41,
"learning_rate": 8.605019594763613e-06,
"loss": 2.8107,
"step": 336000
},
{
"epoch": 1.41,
"eval_loss": 2.6110291481018066,
"eval_runtime": 221.489,
"eval_samples_per_second": 902.979,
"eval_steps_per_second": 56.436,
"step": 336000
},
{
"epoch": 1.45,
"eval_loss": 2.6020007133483887,
"eval_runtime": 221.5356,
"eval_samples_per_second": 902.79,
"eval_steps_per_second": 56.424,
"step": 344000
},
{
"epoch": 1.48,
"learning_rate": 8.538314016509632e-06,
"loss": 2.8102,
"step": 352000
},
{
"epoch": 1.48,
"eval_loss": 2.606468677520752,
"eval_runtime": 221.9142,
"eval_samples_per_second": 901.249,
"eval_steps_per_second": 56.328,
"step": 352000
},
{
"epoch": 1.52,
"eval_loss": 2.620694637298584,
"eval_runtime": 223.9159,
"eval_samples_per_second": 893.193,
"eval_steps_per_second": 55.825,
"step": 360000
},
{
"epoch": 1.55,
"learning_rate": 8.471608438255649e-06,
"loss": 2.8247,
"step": 368000
},
{
"epoch": 1.55,
"eval_loss": 2.6191916465759277,
"eval_runtime": 224.0761,
"eval_samples_per_second": 892.554,
"eval_steps_per_second": 55.785,
"step": 368000
},
{
"epoch": 1.58,
"eval_loss": 2.6223857402801514,
"eval_runtime": 224.2988,
"eval_samples_per_second": 891.668,
"eval_steps_per_second": 55.729,
"step": 376000
},
{
"epoch": 1.62,
"learning_rate": 8.404902860001667e-06,
"loss": 2.8271,
"step": 384000
},
{
"epoch": 1.62,
"eval_loss": 2.6205480098724365,
"eval_runtime": 224.6631,
"eval_samples_per_second": 890.222,
"eval_steps_per_second": 55.639,
"step": 384000
},
{
"epoch": 1.65,
"eval_loss": 2.62916898727417,
"eval_runtime": 221.5526,
"eval_samples_per_second": 902.72,
"eval_steps_per_second": 56.42,
"step": 392000
},
{
"epoch": 1.68,
"learning_rate": 8.338197281747686e-06,
"loss": 2.8415,
"step": 400000
},
{
"epoch": 1.68,
"eval_loss": 2.6347849369049072,
"eval_runtime": 222.2459,
"eval_samples_per_second": 899.904,
"eval_steps_per_second": 56.244,
"step": 400000
},
{
"epoch": 1.72,
"eval_loss": 2.6518216133117676,
"eval_runtime": 222.7257,
"eval_samples_per_second": 897.966,
"eval_steps_per_second": 56.123,
"step": 408000
},
{
"epoch": 1.75,
"learning_rate": 8.271491703493705e-06,
"loss": 2.842,
"step": 416000
},
{
"epoch": 1.75,
"eval_loss": 2.6465137004852295,
"eval_runtime": 221.6934,
"eval_samples_per_second": 902.147,
"eval_steps_per_second": 56.384,
"step": 416000
},
{
"epoch": 1.79,
"eval_loss": 2.6434342861175537,
"eval_runtime": 222.3028,
"eval_samples_per_second": 899.674,
"eval_steps_per_second": 56.23,
"step": 424000
},
{
"epoch": 1.82,
"learning_rate": 8.204786125239725e-06,
"loss": 2.8431,
"step": 432000
},
{
"epoch": 1.82,
"eval_loss": 2.641423225402832,
"eval_runtime": 222.8849,
"eval_samples_per_second": 897.324,
"eval_steps_per_second": 56.083,
"step": 432000
},
{
"epoch": 1.85,
"eval_loss": 2.6531593799591064,
"eval_runtime": 223.101,
"eval_samples_per_second": 896.455,
"eval_steps_per_second": 56.028,
"step": 440000
},
{
"epoch": 1.89,
"learning_rate": 8.138080546985743e-06,
"loss": 2.8599,
"step": 448000
},
{
"epoch": 1.89,
"eval_loss": 2.6645281314849854,
"eval_runtime": 222.7835,
"eval_samples_per_second": 897.732,
"eval_steps_per_second": 56.108,
"step": 448000
},
{
"epoch": 1.92,
"eval_loss": 2.6651265621185303,
"eval_runtime": 222.493,
"eval_samples_per_second": 898.905,
"eval_steps_per_second": 56.182,
"step": 456000
},
{
"epoch": 1.95,
"learning_rate": 8.07137496873176e-06,
"loss": 2.8567,
"step": 464000
},
{
"epoch": 1.95,
"eval_loss": 2.6693992614746094,
"eval_runtime": 221.9941,
"eval_samples_per_second": 900.925,
"eval_steps_per_second": 56.308,
"step": 464000
},
{
"epoch": 1.99,
"eval_loss": 2.66097354888916,
"eval_runtime": 222.2278,
"eval_samples_per_second": 899.977,
"eval_steps_per_second": 56.249,
"step": 472000
},
{
"epoch": 2.02,
"learning_rate": 8.004669390477779e-06,
"loss": 2.8682,
"step": 480000
},
{
"epoch": 2.02,
"eval_loss": 2.687664747238159,
"eval_runtime": 222.351,
"eval_samples_per_second": 899.479,
"eval_steps_per_second": 56.217,
"step": 480000
},
{
"epoch": 2.05,
"eval_loss": 2.6723899841308594,
"eval_runtime": 224.037,
"eval_samples_per_second": 892.71,
"eval_steps_per_second": 55.794,
"step": 488000
},
{
"epoch": 2.09,
"learning_rate": 7.937963812223798e-06,
"loss": 2.8693,
"step": 496000
},
{
"epoch": 2.09,
"eval_loss": 2.683910608291626,
"eval_runtime": 223.5884,
"eval_samples_per_second": 894.501,
"eval_steps_per_second": 55.906,
"step": 496000
},
{
"epoch": 2.12,
"eval_loss": 2.692282199859619,
"eval_runtime": 222.8054,
"eval_samples_per_second": 897.644,
"eval_steps_per_second": 56.103,
"step": 504000
},
{
"epoch": 2.16,
"learning_rate": 7.871258233969816e-06,
"loss": 2.8881,
"step": 512000
},
{
"epoch": 2.16,
"eval_loss": 2.696408987045288,
"eval_runtime": 223.0143,
"eval_samples_per_second": 896.803,
"eval_steps_per_second": 56.05,
"step": 512000
},
{
"epoch": 2.19,
"eval_loss": 2.698155403137207,
"eval_runtime": 223.8418,
"eval_samples_per_second": 893.488,
"eval_steps_per_second": 55.843,
"step": 520000
},
{
"epoch": 2.22,
"learning_rate": 7.804552655715835e-06,
"loss": 2.8874,
"step": 528000
},
{
"epoch": 2.22,
"eval_loss": 2.6960911750793457,
"eval_runtime": 224.8442,
"eval_samples_per_second": 889.505,
"eval_steps_per_second": 55.594,
"step": 528000
},
{
"epoch": 2.26,
"eval_loss": 2.6883530616760254,
"eval_runtime": 223.4198,
"eval_samples_per_second": 895.176,
"eval_steps_per_second": 55.948,
"step": 536000
},
{
"epoch": 2.29,
"learning_rate": 7.737847077461853e-06,
"loss": 2.8899,
"step": 544000
},
{
"epoch": 2.29,
"eval_loss": 2.7055277824401855,
"eval_runtime": 222.7527,
"eval_samples_per_second": 897.857,
"eval_steps_per_second": 56.116,
"step": 544000
},
{
"epoch": 2.32,
"eval_loss": 2.6987791061401367,
"eval_runtime": 226.517,
"eval_samples_per_second": 882.936,
"eval_steps_per_second": 55.183,
"step": 552000
},
{
"epoch": 2.36,
"learning_rate": 7.671141499207872e-06,
"loss": 2.8966,
"step": 560000
},
{
"epoch": 2.36,
"eval_loss": 2.7103066444396973,
"eval_runtime": 226.9023,
"eval_samples_per_second": 881.437,
"eval_steps_per_second": 55.09,
"step": 560000
},
{
"epoch": 2.39,
"eval_loss": 2.709984302520752,
"eval_runtime": 226.5608,
"eval_samples_per_second": 882.765,
"eval_steps_per_second": 55.173,
"step": 568000
},
{
"epoch": 2.43,
"learning_rate": 7.604435920953891e-06,
"loss": 2.9,
"step": 576000
},
{
"epoch": 2.43,
"eval_loss": 2.716878652572632,
"eval_runtime": 227.2343,
"eval_samples_per_second": 880.149,
"eval_steps_per_second": 55.009,
"step": 576000
},
{
"epoch": 2.46,
"eval_loss": 2.718041181564331,
"eval_runtime": 224.0002,
"eval_samples_per_second": 892.856,
"eval_steps_per_second": 55.804,
"step": 584000
},
{
"epoch": 2.49,
"learning_rate": 7.537730342699909e-06,
"loss": 2.9237,
"step": 592000
},
{
"epoch": 2.49,
"eval_loss": 2.7270028591156006,
"eval_runtime": 223.2886,
"eval_samples_per_second": 895.702,
"eval_steps_per_second": 55.981,
"step": 592000
},
{
"epoch": 2.53,
"eval_loss": 2.726536989212036,
"eval_runtime": 222.779,
"eval_samples_per_second": 897.751,
"eval_steps_per_second": 56.109,
"step": 600000
},
{
"epoch": 2.56,
"learning_rate": 7.471024764445928e-06,
"loss": 2.9236,
"step": 608000
},
{
"epoch": 2.56,
"eval_loss": 2.732328176498413,
"eval_runtime": 223.8713,
"eval_samples_per_second": 893.37,
"eval_steps_per_second": 55.836,
"step": 608000
},
{
"epoch": 2.59,
"eval_loss": 2.73500394821167,
"eval_runtime": 225.493,
"eval_samples_per_second": 886.945,
"eval_steps_per_second": 55.434,
"step": 616000
},
{
"epoch": 2.63,
"learning_rate": 7.4043191861919465e-06,
"loss": 2.9276,
"step": 624000
},
{
"epoch": 2.63,
"eval_loss": 2.7333498001098633,
"eval_runtime": 224.8806,
"eval_samples_per_second": 889.361,
"eval_steps_per_second": 55.585,
"step": 624000
},
{
"epoch": 2.66,
"eval_loss": 2.734511613845825,
"eval_runtime": 225.2251,
"eval_samples_per_second": 888.001,
"eval_steps_per_second": 55.5,
"step": 632000
},
{
"epoch": 2.69,
"learning_rate": 7.337613607937964e-06,
"loss": 2.9252,
"step": 640000
},
{
"epoch": 2.69,
"eval_loss": 2.749704360961914,
"eval_runtime": 225.1054,
"eval_samples_per_second": 888.473,
"eval_steps_per_second": 55.53,
"step": 640000
},
{
"epoch": 2.73,
"eval_loss": 2.74284029006958,
"eval_runtime": 224.0229,
"eval_samples_per_second": 892.766,
"eval_steps_per_second": 55.798,
"step": 648000
},
{
"epoch": 2.76,
"learning_rate": 7.270908029683983e-06,
"loss": 2.9364,
"step": 656000
},
{
"epoch": 2.76,
"eval_loss": 2.7391881942749023,
"eval_runtime": 224.6028,
"eval_samples_per_second": 890.461,
"eval_steps_per_second": 55.654,
"step": 656000
},
{
"epoch": 2.8,
"eval_loss": 2.750549077987671,
"eval_runtime": 223.6418,
"eval_samples_per_second": 894.287,
"eval_steps_per_second": 55.893,
"step": 664000
},
{
"epoch": 2.83,
"learning_rate": 7.2042024514300015e-06,
"loss": 2.9366,
"step": 672000
},
{
"epoch": 2.83,
"eval_loss": 2.7392961978912354,
"eval_runtime": 223.5241,
"eval_samples_per_second": 894.758,
"eval_steps_per_second": 55.922,
"step": 672000
},
{
"epoch": 2.86,
"eval_loss": 2.7371537685394287,
"eval_runtime": 223.9923,
"eval_samples_per_second": 892.888,
"eval_steps_per_second": 55.805,
"step": 680000
},
{
"epoch": 2.9,
"learning_rate": 7.13749687317602e-06,
"loss": 2.9437,
"step": 688000
},
{
"epoch": 2.9,
"eval_loss": 2.7450687885284424,
"eval_runtime": 223.0769,
"eval_samples_per_second": 896.552,
"eval_steps_per_second": 56.034,
"step": 688000
},
{
"epoch": 2.93,
"eval_loss": 2.748831033706665,
"eval_runtime": 222.9228,
"eval_samples_per_second": 897.172,
"eval_steps_per_second": 56.073,
"step": 696000
},
{
"epoch": 2.96,
"learning_rate": 7.070791294922038e-06,
"loss": 2.9483,
"step": 704000
},
{
"epoch": 2.96,
"eval_loss": 2.7586183547973633,
"eval_runtime": 223.3142,
"eval_samples_per_second": 895.599,
"eval_steps_per_second": 55.975,
"step": 704000
},
{
"epoch": 3.0,
"eval_loss": 2.7612552642822266,
"eval_runtime": 222.226,
"eval_samples_per_second": 899.985,
"eval_steps_per_second": 56.249,
"step": 712000
},
{
"epoch": 3.03,
"learning_rate": 7.0040857166680564e-06,
"loss": 2.9588,
"step": 720000
},
{
"epoch": 3.03,
"eval_loss": 2.76190447807312,
"eval_runtime": 222.4583,
"eval_samples_per_second": 899.045,
"eval_steps_per_second": 56.19,
"step": 720000
},
{
"epoch": 3.07,
"eval_loss": 2.7680461406707764,
"eval_runtime": 221.9857,
"eval_samples_per_second": 900.959,
"eval_steps_per_second": 56.31,
"step": 728000
},
{
"epoch": 3.1,
"learning_rate": 6.937380138414076e-06,
"loss": 2.9422,
"step": 736000
},
{
"epoch": 3.1,
"eval_loss": 2.754580020904541,
"eval_runtime": 221.9355,
"eval_samples_per_second": 901.163,
"eval_steps_per_second": 56.323,
"step": 736000
},
{
"epoch": 3.13,
"eval_loss": 2.762883424758911,
"eval_runtime": 221.6295,
"eval_samples_per_second": 902.407,
"eval_steps_per_second": 56.4,
"step": 744000
},
{
"epoch": 3.17,
"learning_rate": 6.8706745601600945e-06,
"loss": 2.965,
"step": 752000
},
{
"epoch": 3.17,
"eval_loss": 2.759537696838379,
"eval_runtime": 221.2415,
"eval_samples_per_second": 903.99,
"eval_steps_per_second": 56.499,
"step": 752000
},
{
"epoch": 3.2,
"eval_loss": 2.776278018951416,
"eval_runtime": 221.0108,
"eval_samples_per_second": 904.933,
"eval_steps_per_second": 56.558,
"step": 760000
},
{
"epoch": 3.23,
"learning_rate": 6.803968981906113e-06,
"loss": 2.959,
"step": 768000
},
{
"epoch": 3.23,
"eval_loss": 2.7738993167877197,
"eval_runtime": 221.3449,
"eval_samples_per_second": 903.567,
"eval_steps_per_second": 56.473,
"step": 768000
},
{
"epoch": 3.27,
"eval_loss": 2.7838892936706543,
"eval_runtime": 223.8916,
"eval_samples_per_second": 893.29,
"eval_steps_per_second": 55.831,
"step": 776000
},
{
"epoch": 3.3,
"learning_rate": 6.737263403652131e-06,
"loss": 2.9604,
"step": 784000
},
{
"epoch": 3.3,
"eval_loss": 2.7680771350860596,
"eval_runtime": 223.8457,
"eval_samples_per_second": 893.473,
"eval_steps_per_second": 55.842,
"step": 784000
},
{
"epoch": 3.33,
"eval_loss": 2.7816002368927,
"eval_runtime": 224.025,
"eval_samples_per_second": 892.757,
"eval_steps_per_second": 55.797,
"step": 792000
},
{
"epoch": 3.37,
"learning_rate": 6.6705578253981495e-06,
"loss": 2.9638,
"step": 800000
},
{
"epoch": 3.37,
"eval_loss": 2.7812399864196777,
"eval_runtime": 224.4231,
"eval_samples_per_second": 891.174,
"eval_steps_per_second": 55.698,
"step": 800000
},
{
"epoch": 3.4,
"eval_loss": 2.7845778465270996,
"eval_runtime": 223.1998,
"eval_samples_per_second": 896.058,
"eval_steps_per_second": 56.004,
"step": 808000
},
{
"epoch": 3.44,
"learning_rate": 6.603852247144168e-06,
"loss": 2.9704,
"step": 816000
},
{
"epoch": 3.44,
"eval_loss": 2.7766318321228027,
"eval_runtime": 222.3046,
"eval_samples_per_second": 899.667,
"eval_steps_per_second": 56.229,
"step": 816000
},
{
"epoch": 3.47,
"eval_loss": 2.786909580230713,
"eval_runtime": 221.8638,
"eval_samples_per_second": 901.454,
"eval_steps_per_second": 56.341,
"step": 824000
},
{
"epoch": 3.5,
"learning_rate": 6.537146668890187e-06,
"loss": 2.9684,
"step": 832000
},
{
"epoch": 3.5,
"eval_loss": 2.7741353511810303,
"eval_runtime": 222.0395,
"eval_samples_per_second": 900.741,
"eval_steps_per_second": 56.296,
"step": 832000
},
{
"epoch": 3.54,
"eval_loss": 2.773477077484131,
"eval_runtime": 225.8502,
"eval_samples_per_second": 885.543,
"eval_steps_per_second": 55.346,
"step": 840000
},
{
"epoch": 3.57,
"learning_rate": 6.4704410906362044e-06,
"loss": 2.9723,
"step": 848000
},
{
"epoch": 3.57,
"eval_loss": 2.7700908184051514,
"eval_runtime": 226.2294,
"eval_samples_per_second": 884.058,
"eval_steps_per_second": 55.254,
"step": 848000
},
{
"epoch": 3.6,
"eval_loss": 2.7779886722564697,
"eval_runtime": 224.2673,
"eval_samples_per_second": 891.793,
"eval_steps_per_second": 55.737,
"step": 856000
},
{
"epoch": 3.64,
"learning_rate": 6.403735512382223e-06,
"loss": 2.9734,
"step": 864000
},
{
"epoch": 3.64,
"eval_loss": 2.7833447456359863,
"eval_runtime": 223.9605,
"eval_samples_per_second": 893.014,
"eval_steps_per_second": 55.813,
"step": 864000
},
{
"epoch": 3.67,
"eval_loss": 2.790961503982544,
"eval_runtime": 223.0622,
"eval_samples_per_second": 896.611,
"eval_steps_per_second": 56.038,
"step": 872000
},
{
"epoch": 3.71,
"learning_rate": 6.337029934128242e-06,
"loss": 2.9806,
"step": 880000
},
{
"epoch": 3.71,
"eval_loss": 2.794116258621216,
"eval_runtime": 222.8246,
"eval_samples_per_second": 897.567,
"eval_steps_per_second": 56.098,
"step": 880000
},
{
"epoch": 3.74,
"eval_loss": 2.7997074127197266,
"eval_runtime": 223.2842,
"eval_samples_per_second": 895.719,
"eval_steps_per_second": 55.982,
"step": 888000
},
{
"epoch": 3.77,
"learning_rate": 6.270324355874261e-06,
"loss": 2.9808,
"step": 896000
},
{
"epoch": 3.77,
"eval_loss": 2.802687406539917,
"eval_runtime": 223.8034,
"eval_samples_per_second": 893.641,
"eval_steps_per_second": 55.853,
"step": 896000
},
{
"epoch": 3.81,
"eval_loss": 2.797201156616211,
"eval_runtime": 221.8286,
"eval_samples_per_second": 901.597,
"eval_steps_per_second": 56.35,
"step": 904000
},
{
"epoch": 3.84,
"learning_rate": 6.20361877762028e-06,
"loss": 3.0008,
"step": 912000
},
{
"epoch": 3.84,
"eval_loss": 2.8025898933410645,
"eval_runtime": 222.2117,
"eval_samples_per_second": 900.042,
"eval_steps_per_second": 56.253,
"step": 912000
},
{
"epoch": 3.87,
"eval_loss": 2.7974584102630615,
"eval_runtime": 222.1337,
"eval_samples_per_second": 900.358,
"eval_steps_per_second": 56.272,
"step": 920000
},
{
"epoch": 3.91,
"learning_rate": 6.1369131993662975e-06,
"loss": 2.9934,
"step": 928000
},
{
"epoch": 3.91,
"eval_loss": 2.797086000442505,
"eval_runtime": 221.5435,
"eval_samples_per_second": 902.757,
"eval_steps_per_second": 56.422,
"step": 928000
},
{
"epoch": 3.94,
"eval_loss": 2.8030388355255127,
"eval_runtime": 226.5332,
"eval_samples_per_second": 882.873,
"eval_steps_per_second": 55.18,
"step": 936000
},
{
"epoch": 3.97,
"learning_rate": 6.070207621112316e-06,
"loss": 2.9927,
"step": 944000
},
{
"epoch": 3.97,
"eval_loss": 2.8082187175750732,
"eval_runtime": 224.5948,
"eval_samples_per_second": 890.492,
"eval_steps_per_second": 55.656,
"step": 944000
},
{
"epoch": 4.01,
"eval_loss": 2.820798397064209,
"eval_runtime": 224.7429,
"eval_samples_per_second": 889.906,
"eval_steps_per_second": 55.619,
"step": 952000
},
{
"epoch": 4.04,
"learning_rate": 6.003502042858335e-06,
"loss": 3.0013,
"step": 960000
},
{
"epoch": 4.04,
"eval_loss": 2.8129076957702637,
"eval_runtime": 224.0828,
"eval_samples_per_second": 892.527,
"eval_steps_per_second": 55.783,
"step": 960000
},
{
"epoch": 4.08,
"eval_loss": 2.823551893234253,
"eval_runtime": 222.6379,
"eval_samples_per_second": 898.32,
"eval_steps_per_second": 56.145,
"step": 968000
},
{
"epoch": 4.11,
"learning_rate": 5.936796464604353e-06,
"loss": 2.9996,
"step": 976000
},
{
"epoch": 4.11,
"eval_loss": 2.8225581645965576,
"eval_runtime": 223.2923,
"eval_samples_per_second": 895.687,
"eval_steps_per_second": 55.98,
"step": 976000
},
{
"epoch": 4.14,
"eval_loss": 2.827303409576416,
"eval_runtime": 223.5156,
"eval_samples_per_second": 894.792,
"eval_steps_per_second": 55.925,
"step": 984000
},
{
"epoch": 4.18,
"learning_rate": 5.870090886350371e-06,
"loss": 3.0125,
"step": 992000
},
{
"epoch": 4.18,
"eval_loss": 2.8161449432373047,
"eval_runtime": 222.4898,
"eval_samples_per_second": 898.917,
"eval_steps_per_second": 56.182,
"step": 992000
},
{
"epoch": 4.21,
"eval_loss": 2.8249175548553467,
"eval_runtime": 224.1746,
"eval_samples_per_second": 892.162,
"eval_steps_per_second": 55.76,
"step": 1000000
},
{
"epoch": 4.24,
"learning_rate": 5.80338530809639e-06,
"loss": 3.0086,
"step": 1008000
},
{
"epoch": 4.24,
"eval_loss": 2.832012414932251,
"eval_runtime": 224.9255,
"eval_samples_per_second": 889.184,
"eval_steps_per_second": 55.574,
"step": 1008000
},
{
"epoch": 4.28,
"eval_loss": 2.831321954727173,
"eval_runtime": 225.1137,
"eval_samples_per_second": 888.44,
"eval_steps_per_second": 55.528,
"step": 1016000
},
{
"epoch": 4.31,
"learning_rate": 5.736679729842408e-06,
"loss": 3.0077,
"step": 1024000
},
{
"epoch": 4.31,
"eval_loss": 2.83213210105896,
"eval_runtime": 224.7924,
"eval_samples_per_second": 889.71,
"eval_steps_per_second": 55.607,
"step": 1024000
},
{
"epoch": 4.35,
"eval_loss": 2.833178758621216,
"eval_runtime": 225.4632,
"eval_samples_per_second": 887.063,
"eval_steps_per_second": 55.441,
"step": 1032000
},
{
"epoch": 4.38,
"learning_rate": 5.669974151588427e-06,
"loss": 3.0186,
"step": 1040000
},
{
"epoch": 4.38,
"eval_loss": 2.8288471698760986,
"eval_runtime": 225.9333,
"eval_samples_per_second": 885.217,
"eval_steps_per_second": 55.326,
"step": 1040000
},
{
"epoch": 4.41,
"eval_loss": 2.839233160018921,
"eval_runtime": 225.2383,
"eval_samples_per_second": 887.949,
"eval_steps_per_second": 55.497,
"step": 1048000
},
{
"epoch": 4.45,
"learning_rate": 5.603268573334446e-06,
"loss": 3.0311,
"step": 1056000
},
{
"epoch": 4.45,
"eval_loss": 2.824310302734375,
"eval_runtime": 223.8873,
"eval_samples_per_second": 893.307,
"eval_steps_per_second": 55.832,
"step": 1056000
},
{
"epoch": 4.48,
"eval_loss": 2.852445602416992,
"eval_runtime": 226.2506,
"eval_samples_per_second": 883.976,
"eval_steps_per_second": 55.248,
"step": 1064000
},
{
"epoch": 4.51,
"learning_rate": 5.536562995080464e-06,
"loss": 3.0199,
"step": 1072000
},
{
"epoch": 4.51,
"eval_loss": 2.834698438644409,
"eval_runtime": 224.6576,
"eval_samples_per_second": 890.244,
"eval_steps_per_second": 55.64,
"step": 1072000
},
{
"epoch": 4.55,
"eval_loss": 2.8437862396240234,
"eval_runtime": 224.6897,
"eval_samples_per_second": 890.116,
"eval_steps_per_second": 55.632,
"step": 1080000
},
{
"epoch": 4.58,
"learning_rate": 5.469857416826483e-06,
"loss": 3.0198,
"step": 1088000
},
{
"epoch": 4.58,
"eval_loss": 2.8415181636810303,
"eval_runtime": 223.938,
"eval_samples_per_second": 893.104,
"eval_steps_per_second": 55.819,
"step": 1088000
},
{
"epoch": 4.61,
"eval_loss": 2.84600567817688,
"eval_runtime": 222.512,
"eval_samples_per_second": 898.828,
"eval_steps_per_second": 56.177,
"step": 1096000
},
{
"epoch": 4.65,
"learning_rate": 5.403151838572501e-06,
"loss": 3.0279,
"step": 1104000
},
{
"epoch": 4.65,
"eval_loss": 2.855103015899658,
"eval_runtime": 224.3844,
"eval_samples_per_second": 891.328,
"eval_steps_per_second": 55.708,
"step": 1104000
},
{
"epoch": 4.68,
"eval_loss": 2.8528149127960205,
"eval_runtime": 222.2925,
"eval_samples_per_second": 899.715,
"eval_steps_per_second": 56.232,
"step": 1112000
},
{
"epoch": 4.72,
"learning_rate": 5.33644626031852e-06,
"loss": 3.0319,
"step": 1120000
},
{
"epoch": 4.72,
"eval_loss": 2.8601133823394775,
"eval_runtime": 225.9192,
"eval_samples_per_second": 885.272,
"eval_steps_per_second": 55.33,
"step": 1120000
},
{
"epoch": 4.75,
"eval_loss": 2.8543853759765625,
"eval_runtime": 228.4752,
"eval_samples_per_second": 875.369,
"eval_steps_per_second": 54.711,
"step": 1128000
},
{
"epoch": 4.78,
"learning_rate": 5.269740682064538e-06,
"loss": 3.0371,
"step": 1136000
},
{
"epoch": 4.78,
"eval_loss": 2.855318069458008,
"eval_runtime": 229.1947,
"eval_samples_per_second": 872.621,
"eval_steps_per_second": 54.539,
"step": 1136000
},
{
"epoch": 4.82,
"eval_loss": 2.8596949577331543,
"eval_runtime": 228.9063,
"eval_samples_per_second": 873.72,
"eval_steps_per_second": 54.607,
"step": 1144000
},
{
"epoch": 4.85,
"learning_rate": 5.203035103810556e-06,
"loss": 3.038,
"step": 1152000
},
{
"epoch": 4.85,
"eval_loss": 2.865326166152954,
"eval_runtime": 228.6229,
"eval_samples_per_second": 874.803,
"eval_steps_per_second": 54.675,
"step": 1152000
},
{
"epoch": 4.88,
"eval_loss": 2.856044292449951,
"eval_runtime": 224.6889,
"eval_samples_per_second": 890.12,
"eval_steps_per_second": 55.632,
"step": 1160000
},
{
"epoch": 4.92,
"learning_rate": 5.136329525556575e-06,
"loss": 3.0318,
"step": 1168000
},
{
"epoch": 4.92,
"eval_loss": 2.860161542892456,
"eval_runtime": 223.8596,
"eval_samples_per_second": 893.417,
"eval_steps_per_second": 55.839,
"step": 1168000
},
{
"epoch": 4.95,
"eval_loss": 2.8483996391296387,
"eval_runtime": 223.3074,
"eval_samples_per_second": 895.627,
"eval_steps_per_second": 55.977,
"step": 1176000
},
{
"epoch": 4.99,
"learning_rate": 5.0696239473025935e-06,
"loss": 3.0449,
"step": 1184000
},
{
"epoch": 4.99,
"eval_loss": 2.861185073852539,
"eval_runtime": 223.8763,
"eval_samples_per_second": 893.35,
"eval_steps_per_second": 55.834,
"step": 1184000
},
{
"epoch": 5.02,
"eval_loss": 2.8597800731658936,
"eval_runtime": 223.5703,
"eval_samples_per_second": 894.573,
"eval_steps_per_second": 55.911,
"step": 1192000
},
{
"epoch": 5.05,
"learning_rate": 5.002918369048611e-06,
"loss": 3.0384,
"step": 1200000
},
{
"epoch": 5.05,
"eval_loss": 2.8580985069274902,
"eval_runtime": 223.5118,
"eval_samples_per_second": 894.807,
"eval_steps_per_second": 55.925,
"step": 1200000
},
{
"epoch": 5.09,
"eval_loss": 2.8481242656707764,
"eval_runtime": 222.9723,
"eval_samples_per_second": 896.972,
"eval_steps_per_second": 56.061,
"step": 1208000
},
{
"epoch": 5.12,
"learning_rate": 4.936212790794631e-06,
"loss": 3.0243,
"step": 1216000
},
{
"epoch": 5.12,
"eval_loss": 2.845810651779175,
"eval_runtime": 223.4767,
"eval_samples_per_second": 894.948,
"eval_steps_per_second": 55.934,
"step": 1216000
},
{
"epoch": 5.15,
"eval_loss": 2.849405527114868,
"eval_runtime": 224.1558,
"eval_samples_per_second": 892.237,
"eval_steps_per_second": 55.765,
"step": 1224000
},
{
"epoch": 5.19,
"learning_rate": 4.869507212540649e-06,
"loss": 3.0345,
"step": 1232000
},
{
"epoch": 5.19,
"eval_loss": 2.854433536529541,
"eval_runtime": 223.6399,
"eval_samples_per_second": 894.295,
"eval_steps_per_second": 55.893,
"step": 1232000
},
{
"epoch": 5.22,
"eval_loss": 2.8487536907196045,
"eval_runtime": 223.5008,
"eval_samples_per_second": 894.851,
"eval_steps_per_second": 55.928,
"step": 1240000
},
{
"epoch": 5.25,
"learning_rate": 4.802801634286667e-06,
"loss": 3.0251,
"step": 1248000
},
{
"epoch": 5.25,
"eval_loss": 2.845292568206787,
"eval_runtime": 224.0031,
"eval_samples_per_second": 892.845,
"eval_steps_per_second": 55.803,
"step": 1248000
},
{
"epoch": 5.29,
"eval_loss": 2.8464181423187256,
"eval_runtime": 225.2034,
"eval_samples_per_second": 888.086,
"eval_steps_per_second": 55.505,
"step": 1256000
},
{
"epoch": 5.32,
"learning_rate": 4.7360960560326865e-06,
"loss": 3.0234,
"step": 1264000
},
{
"epoch": 5.32,
"eval_loss": 2.848585367202759,
"eval_runtime": 223.6745,
"eval_samples_per_second": 894.156,
"eval_steps_per_second": 55.885,
"step": 1264000
},
{
"epoch": 5.36,
"eval_loss": 2.8435869216918945,
"eval_runtime": 223.3913,
"eval_samples_per_second": 895.29,
"eval_steps_per_second": 55.956,
"step": 1272000
},
{
"epoch": 5.39,
"learning_rate": 4.669390477778704e-06,
"loss": 3.0205,
"step": 1280000
},
{
"epoch": 5.39,
"eval_loss": 2.8476340770721436,
"eval_runtime": 223.5929,
"eval_samples_per_second": 894.483,
"eval_steps_per_second": 55.905,
"step": 1280000
},
{
"epoch": 5.42,
"eval_loss": 2.8326635360717773,
"eval_runtime": 224.8548,
"eval_samples_per_second": 889.463,
"eval_steps_per_second": 55.591,
"step": 1288000
},
{
"epoch": 5.46,
"learning_rate": 4.602684899524723e-06,
"loss": 3.0228,
"step": 1296000
},
{
"epoch": 5.46,
"eval_loss": 2.8452436923980713,
"eval_runtime": 223.4053,
"eval_samples_per_second": 895.234,
"eval_steps_per_second": 55.952,
"step": 1296000
},
{
"epoch": 5.49,
"eval_loss": 2.837240695953369,
"eval_runtime": 225.2471,
"eval_samples_per_second": 887.914,
"eval_steps_per_second": 55.495,
"step": 1304000
},
{
"epoch": 5.52,
"learning_rate": 4.5359793212707415e-06,
"loss": 3.0063,
"step": 1312000
},
{
"epoch": 5.52,
"eval_loss": 2.830629348754883,
"eval_runtime": 224.2293,
"eval_samples_per_second": 891.944,
"eval_steps_per_second": 55.746,
"step": 1312000
},
{
"epoch": 5.56,
"eval_loss": 2.841139078140259,
"eval_runtime": 226.2486,
"eval_samples_per_second": 883.983,
"eval_steps_per_second": 55.249,
"step": 1320000
},
{
"epoch": 5.59,
"learning_rate": 4.46927374301676e-06,
"loss": 3.0068,
"step": 1328000
},
{
"epoch": 5.59,
"eval_loss": 2.827270030975342,
"eval_runtime": 226.3948,
"eval_samples_per_second": 883.413,
"eval_steps_per_second": 55.213,
"step": 1328000
},
{
"epoch": 5.63,
"eval_loss": 2.834273338317871,
"eval_runtime": 226.3338,
"eval_samples_per_second": 883.651,
"eval_steps_per_second": 55.228,
"step": 1336000
},
{
"epoch": 5.66,
"learning_rate": 4.402568164762779e-06,
"loss": 3.0109,
"step": 1344000
},
{
"epoch": 5.66,
"eval_loss": 2.8328187465667725,
"eval_runtime": 225.4917,
"eval_samples_per_second": 886.95,
"eval_steps_per_second": 55.434,
"step": 1344000
},
{
"epoch": 5.69,
"eval_loss": 2.843144655227661,
"eval_runtime": 224.1529,
"eval_samples_per_second": 892.248,
"eval_steps_per_second": 55.766,
"step": 1352000
},
{
"epoch": 5.73,
"learning_rate": 4.335862586508797e-06,
"loss": 3.0068,
"step": 1360000
},
{
"epoch": 5.73,
"eval_loss": 2.8331680297851562,
"eval_runtime": 224.2839,
"eval_samples_per_second": 891.727,
"eval_steps_per_second": 55.733,
"step": 1360000
},
{
"epoch": 5.76,
"eval_loss": 2.827512502670288,
"eval_runtime": 223.7519,
"eval_samples_per_second": 893.847,
"eval_steps_per_second": 55.865,
"step": 1368000
},
{
"epoch": 5.79,
"learning_rate": 4.269157008254816e-06,
"loss": 3.002,
"step": 1376000
},
{
"epoch": 5.79,
"eval_loss": 2.8313817977905273,
"eval_runtime": 224.094,
"eval_samples_per_second": 892.483,
"eval_steps_per_second": 55.78,
"step": 1376000
},
{
"epoch": 5.83,
"eval_loss": 2.8324134349823,
"eval_runtime": 226.0373,
"eval_samples_per_second": 884.81,
"eval_steps_per_second": 55.301,
"step": 1384000
},
{
"epoch": 5.86,
"learning_rate": 4.202451430000834e-06,
"loss": 3.0037,
"step": 1392000
},
{
"epoch": 5.86,
"eval_loss": 2.839409351348877,
"eval_runtime": 223.5509,
"eval_samples_per_second": 894.651,
"eval_steps_per_second": 55.916,
"step": 1392000
},
{
"epoch": 5.89,
"eval_loss": 2.8337831497192383,
"eval_runtime": 223.4898,
"eval_samples_per_second": 894.895,
"eval_steps_per_second": 55.931,
"step": 1400000
},
{
"epoch": 5.93,
"learning_rate": 4.135745851746852e-06,
"loss": 3.0086,
"step": 1408000
},
{
"epoch": 5.93,
"eval_loss": 2.8447976112365723,
"eval_runtime": 223.3032,
"eval_samples_per_second": 895.643,
"eval_steps_per_second": 55.978,
"step": 1408000
},
{
"epoch": 5.96,
"eval_loss": 2.8326330184936523,
"eval_runtime": 225.1553,
"eval_samples_per_second": 888.276,
"eval_steps_per_second": 55.517,
"step": 1416000
},
{
"epoch": 6.0,
"learning_rate": 4.069040273492872e-06,
"loss": 2.9977,
"step": 1424000
},
{
"epoch": 6.0,
"eval_loss": 2.8310978412628174,
"eval_runtime": 224.0959,
"eval_samples_per_second": 892.475,
"eval_steps_per_second": 55.78,
"step": 1424000
},
{
"epoch": 6.03,
"eval_loss": 2.8410427570343018,
"eval_runtime": 223.4994,
"eval_samples_per_second": 894.857,
"eval_steps_per_second": 55.929,
"step": 1432000
},
{
"epoch": 6.06,
"learning_rate": 4.0023346952388895e-06,
"loss": 2.9984,
"step": 1440000
},
{
"epoch": 6.06,
"eval_loss": 2.8358559608459473,
"eval_runtime": 223.5492,
"eval_samples_per_second": 894.658,
"eval_steps_per_second": 55.916,
"step": 1440000
},
{
"epoch": 6.1,
"eval_loss": 2.839256763458252,
"eval_runtime": 228.1513,
"eval_samples_per_second": 876.611,
"eval_steps_per_second": 54.788,
"step": 1448000
},
{
"epoch": 6.13,
"learning_rate": 3.935629116984908e-06,
"loss": 3.0095,
"step": 1456000
},
{
"epoch": 6.13,
"eval_loss": 2.838825225830078,
"eval_runtime": 226.4727,
"eval_samples_per_second": 883.109,
"eval_steps_per_second": 55.194,
"step": 1456000
},
{
"epoch": 6.16,
"eval_loss": 2.844802141189575,
"eval_runtime": 226.1587,
"eval_samples_per_second": 884.335,
"eval_steps_per_second": 55.271,
"step": 1464000
},
{
"epoch": 6.2,
"learning_rate": 3.868923538730927e-06,
"loss": 3.0051,
"step": 1472000
},
{
"epoch": 6.2,
"eval_loss": 2.8472418785095215,
"eval_runtime": 228.0091,
"eval_samples_per_second": 877.158,
"eval_steps_per_second": 54.822,
"step": 1472000
},
{
"epoch": 6.23,
"eval_loss": 2.842092514038086,
"eval_runtime": 224.1053,
"eval_samples_per_second": 892.438,
"eval_steps_per_second": 55.777,
"step": 1480000
},
{
"epoch": 6.27,
"learning_rate": 3.8022179604769453e-06,
"loss": 3.0142,
"step": 1488000
},
{
"epoch": 6.27,
"eval_loss": 2.842365264892578,
"eval_runtime": 223.8921,
"eval_samples_per_second": 893.287,
"eval_steps_per_second": 55.83,
"step": 1488000
},
{
"epoch": 6.3,
"eval_loss": 2.847733974456787,
"eval_runtime": 223.983,
"eval_samples_per_second": 892.925,
"eval_steps_per_second": 55.808,
"step": 1496000
},
{
"epoch": 6.33,
"learning_rate": 3.735512382222964e-06,
"loss": 3.0149,
"step": 1504000
},
{
"epoch": 6.33,
"eval_loss": 2.842820405960083,
"eval_runtime": 224.541,
"eval_samples_per_second": 890.706,
"eval_steps_per_second": 55.669,
"step": 1504000
},
{
"epoch": 6.37,
"eval_loss": 2.8529434204101562,
"eval_runtime": 229.172,
"eval_samples_per_second": 872.707,
"eval_steps_per_second": 54.544,
"step": 1512000
},
{
"epoch": 6.4,
"learning_rate": 3.668806803968982e-06,
"loss": 3.0147,
"step": 1520000
},
{
"epoch": 6.4,
"eval_loss": 2.854137420654297,
"eval_runtime": 228.0077,
"eval_samples_per_second": 877.163,
"eval_steps_per_second": 54.823,
"step": 1520000
},
{
"epoch": 6.43,
"eval_loss": 2.8518521785736084,
"eval_runtime": 227.8943,
"eval_samples_per_second": 877.6,
"eval_steps_per_second": 54.85,
"step": 1528000
},
{
"epoch": 6.47,
"learning_rate": 3.6021012257150007e-06,
"loss": 3.0205,
"step": 1536000
},
{
"epoch": 6.47,
"eval_loss": 2.852667808532715,
"eval_runtime": 227.5511,
"eval_samples_per_second": 878.924,
"eval_steps_per_second": 54.933,
"step": 1536000
},
{
"epoch": 6.5,
"eval_loss": 2.8470675945281982,
"eval_runtime": 223.3626,
"eval_samples_per_second": 895.405,
"eval_steps_per_second": 55.963,
"step": 1544000
},
{
"epoch": 6.53,
"learning_rate": 3.535395647461019e-06,
"loss": 3.029,
"step": 1552000
},
{
"epoch": 6.53,
"eval_loss": 2.8583133220672607,
"eval_runtime": 224.8105,
"eval_samples_per_second": 889.638,
"eval_steps_per_second": 55.602,
"step": 1552000
},
{
"epoch": 6.57,
"eval_loss": 2.84967303276062,
"eval_runtime": 223.9635,
"eval_samples_per_second": 893.002,
"eval_steps_per_second": 55.813,
"step": 1560000
},
{
"epoch": 6.6,
"learning_rate": 3.468690069207038e-06,
"loss": 3.024,
"step": 1568000
},
{
"epoch": 6.6,
"eval_loss": 2.865325689315796,
"eval_runtime": 223.4985,
"eval_samples_per_second": 894.86,
"eval_steps_per_second": 55.929,
"step": 1568000
},
{
"epoch": 6.64,
"eval_loss": 2.855334997177124,
"eval_runtime": 225.852,
"eval_samples_per_second": 885.536,
"eval_steps_per_second": 55.346,
"step": 1576000
},
{
"epoch": 6.67,
"learning_rate": 3.4019844909530565e-06,
"loss": 3.0371,
"step": 1584000
},
{
"epoch": 6.67,
"eval_loss": 2.865299940109253,
"eval_runtime": 224.5284,
"eval_samples_per_second": 890.756,
"eval_steps_per_second": 55.672,
"step": 1584000
},
{
"epoch": 6.7,
"eval_loss": 2.860386848449707,
"eval_runtime": 223.6209,
"eval_samples_per_second": 894.371,
"eval_steps_per_second": 55.898,
"step": 1592000
},
{
"epoch": 6.74,
"learning_rate": 3.3352789126990747e-06,
"loss": 3.0319,
"step": 1600000
},
{
"epoch": 6.74,
"eval_loss": 2.862384080886841,
"eval_runtime": 223.9592,
"eval_samples_per_second": 893.02,
"eval_steps_per_second": 55.814,
"step": 1600000
},
{
"epoch": 6.77,
"eval_loss": 2.8657121658325195,
"eval_runtime": 226.5681,
"eval_samples_per_second": 882.737,
"eval_steps_per_second": 55.171,
"step": 1608000
},
{
"epoch": 6.8,
"learning_rate": 3.2685733344450933e-06,
"loss": 3.0369,
"step": 1616000
},
{
"epoch": 6.8,
"eval_loss": 2.861598491668701,
"eval_runtime": 224.0599,
"eval_samples_per_second": 892.618,
"eval_steps_per_second": 55.789,
"step": 1616000
},
{
"epoch": 6.84,
"eval_loss": 2.8666746616363525,
"eval_runtime": 224.0122,
"eval_samples_per_second": 892.808,
"eval_steps_per_second": 55.801,
"step": 1624000
},
{
"epoch": 6.87,
"learning_rate": 3.2018677561911115e-06,
"loss": 3.0357,
"step": 1632000
},
{
"epoch": 6.87,
"eval_loss": 2.86602783203125,
"eval_runtime": 223.9311,
"eval_samples_per_second": 893.132,
"eval_steps_per_second": 55.821,
"step": 1632000
},
{
"epoch": 6.91,
"eval_loss": 2.868190050125122,
"eval_runtime": 224.6108,
"eval_samples_per_second": 890.429,
"eval_steps_per_second": 55.652,
"step": 1640000
},
{
"epoch": 6.94,
"learning_rate": 3.1351621779371306e-06,
"loss": 3.0342,
"step": 1648000
},
{
"epoch": 6.94,
"eval_loss": 2.867553472518921,
"eval_runtime": 224.9283,
"eval_samples_per_second": 889.172,
"eval_steps_per_second": 55.573,
"step": 1648000
},
{
"epoch": 6.97,
"eval_loss": 2.881544589996338,
"eval_runtime": 225.5949,
"eval_samples_per_second": 886.545,
"eval_steps_per_second": 55.409,
"step": 1656000
},
{
"epoch": 7.01,
"learning_rate": 3.0684565996831487e-06,
"loss": 3.0375,
"step": 1664000
},
{
"epoch": 7.01,
"eval_loss": 2.8667211532592773,
"eval_runtime": 224.8671,
"eval_samples_per_second": 889.414,
"eval_steps_per_second": 55.588,
"step": 1664000
},
{
"epoch": 7.04,
"eval_loss": 2.8734593391418457,
"eval_runtime": 226.9159,
"eval_samples_per_second": 881.384,
"eval_steps_per_second": 55.086,
"step": 1672000
},
{
"epoch": 7.07,
"learning_rate": 3.0017510214291673e-06,
"loss": 3.0419,
"step": 1680000
},
{
"epoch": 7.07,
"eval_loss": 2.8788318634033203,
"eval_runtime": 226.3899,
"eval_samples_per_second": 883.432,
"eval_steps_per_second": 55.214,
"step": 1680000
},
{
"epoch": 7.11,
"eval_loss": 2.8766632080078125,
"eval_runtime": 225.7385,
"eval_samples_per_second": 885.981,
"eval_steps_per_second": 55.374,
"step": 1688000
},
{
"epoch": 7.14,
"learning_rate": 2.9350454431751855e-06,
"loss": 3.0403,
"step": 1696000
},
{
"epoch": 7.14,
"eval_loss": 2.8811895847320557,
"eval_runtime": 225.9242,
"eval_samples_per_second": 885.253,
"eval_steps_per_second": 55.328,
"step": 1696000
},
{
"epoch": 7.17,
"eval_loss": 2.879542827606201,
"eval_runtime": 225.7881,
"eval_samples_per_second": 885.786,
"eval_steps_per_second": 55.362,
"step": 1704000
},
{
"epoch": 7.21,
"learning_rate": 2.868339864921204e-06,
"loss": 3.0482,
"step": 1712000
},
{
"epoch": 7.21,
"eval_loss": 2.88046932220459,
"eval_runtime": 225.2755,
"eval_samples_per_second": 887.802,
"eval_steps_per_second": 55.488,
"step": 1712000
},
{
"epoch": 7.24,
"eval_loss": 2.8794021606445312,
"eval_runtime": 226.8559,
"eval_samples_per_second": 881.617,
"eval_steps_per_second": 55.101,
"step": 1720000
},
{
"epoch": 7.28,
"learning_rate": 2.801634286667223e-06,
"loss": 3.0533,
"step": 1728000
},
{
"epoch": 7.28,
"eval_loss": 2.8787782192230225,
"eval_runtime": 225.0025,
"eval_samples_per_second": 888.879,
"eval_steps_per_second": 55.555,
"step": 1728000
},
{
"epoch": 7.31,
"eval_loss": 2.884382724761963,
"eval_runtime": 225.9472,
"eval_samples_per_second": 885.163,
"eval_steps_per_second": 55.323,
"step": 1736000
},
{
"epoch": 7.34,
"learning_rate": 2.7349287084132413e-06,
"loss": 3.0453,
"step": 1744000
},
{
"epoch": 7.34,
"eval_loss": 2.870943069458008,
"eval_runtime": 225.043,
"eval_samples_per_second": 888.719,
"eval_steps_per_second": 55.545,
"step": 1744000
},
{
"epoch": 7.38,
"eval_loss": 2.8835349082946777,
"eval_runtime": 225.0959,
"eval_samples_per_second": 888.51,
"eval_steps_per_second": 55.532,
"step": 1752000
},
{
"epoch": 7.41,
"learning_rate": 2.66822313015926e-06,
"loss": 3.0562,
"step": 1760000
},
{
"epoch": 7.41,
"eval_loss": 2.8891103267669678,
"eval_runtime": 224.5495,
"eval_samples_per_second": 890.672,
"eval_steps_per_second": 55.667,
"step": 1760000
},
{
"epoch": 7.44,
"eval_loss": 2.8902649879455566,
"eval_runtime": 225.1215,
"eval_samples_per_second": 888.409,
"eval_steps_per_second": 55.526,
"step": 1768000
},
{
"epoch": 7.48,
"learning_rate": 2.601517551905278e-06,
"loss": 3.0617,
"step": 1776000
},
{
"epoch": 7.48,
"eval_loss": 2.884901762008667,
"eval_runtime": 225.0729,
"eval_samples_per_second": 888.601,
"eval_steps_per_second": 55.538,
"step": 1776000
},
{
"epoch": 7.51,
"eval_loss": 2.8766396045684814,
"eval_runtime": 224.6011,
"eval_samples_per_second": 890.468,
"eval_steps_per_second": 55.654,
"step": 1784000
},
{
"epoch": 7.55,
"learning_rate": 2.5348119736512967e-06,
"loss": 3.0539,
"step": 1792000
},
{
"epoch": 7.55,
"eval_loss": 2.8871917724609375,
"eval_runtime": 224.4825,
"eval_samples_per_second": 890.938,
"eval_steps_per_second": 55.684,
"step": 1792000
},
{
"epoch": 7.58,
"eval_loss": 2.898136615753174,
"eval_runtime": 225.2799,
"eval_samples_per_second": 887.784,
"eval_steps_per_second": 55.487,
"step": 1800000
},
{
"epoch": 7.61,
"learning_rate": 2.4681063953973154e-06,
"loss": 3.0561,
"step": 1808000
},
{
"epoch": 7.61,
"eval_loss": 2.886209487915039,
"eval_runtime": 226.1006,
"eval_samples_per_second": 884.562,
"eval_steps_per_second": 55.285,
"step": 1808000
},
{
"epoch": 7.65,
"eval_loss": 2.8940441608428955,
"eval_runtime": 227.2765,
"eval_samples_per_second": 879.986,
"eval_steps_per_second": 54.999,
"step": 1816000
},
{
"epoch": 7.68,
"learning_rate": 2.4014008171433335e-06,
"loss": 3.0529,
"step": 1824000
},
{
"epoch": 7.68,
"eval_loss": 2.887427568435669,
"eval_runtime": 225.5383,
"eval_samples_per_second": 886.767,
"eval_steps_per_second": 55.423,
"step": 1824000
},
{
"epoch": 7.71,
"eval_loss": 2.883918046951294,
"eval_runtime": 226.0624,
"eval_samples_per_second": 884.711,
"eval_steps_per_second": 55.294,
"step": 1832000
},
{
"epoch": 7.75,
"learning_rate": 2.334695238889352e-06,
"loss": 3.0484,
"step": 1840000
},
{
"epoch": 7.75,
"eval_loss": 2.883819103240967,
"eval_runtime": 225.1615,
"eval_samples_per_second": 888.251,
"eval_steps_per_second": 55.516,
"step": 1840000
},
{
"epoch": 7.78,
"eval_loss": 2.8856074810028076,
"eval_runtime": 226.1802,
"eval_samples_per_second": 884.251,
"eval_steps_per_second": 55.266,
"step": 1848000
},
{
"epoch": 7.81,
"learning_rate": 2.2679896606353707e-06,
"loss": 3.0562,
"step": 1856000
},
{
"epoch": 7.81,
"eval_loss": 2.8983583450317383,
"eval_runtime": 227.809,
"eval_samples_per_second": 877.929,
"eval_steps_per_second": 54.871,
"step": 1856000
},
{
"epoch": 7.85,
"eval_loss": 2.884408473968506,
"eval_runtime": 228.3309,
"eval_samples_per_second": 875.922,
"eval_steps_per_second": 54.745,
"step": 1864000
},
{
"epoch": 7.88,
"learning_rate": 2.2012840823813894e-06,
"loss": 3.0578,
"step": 1872000
},
{
"epoch": 7.88,
"eval_loss": 2.8873543739318848,
"eval_runtime": 226.4275,
"eval_samples_per_second": 883.285,
"eval_steps_per_second": 55.205,
"step": 1872000
},
{
"epoch": 7.92,
"eval_loss": 2.8886616230010986,
"eval_runtime": 226.5836,
"eval_samples_per_second": 882.676,
"eval_steps_per_second": 55.167,
"step": 1880000
},
{
"epoch": 7.95,
"learning_rate": 2.134578504127408e-06,
"loss": 3.0553,
"step": 1888000
},
{
"epoch": 7.95,
"eval_loss": 2.879803419113159,
"eval_runtime": 228.4861,
"eval_samples_per_second": 875.327,
"eval_steps_per_second": 54.708,
"step": 1888000
},
{
"epoch": 7.98,
"eval_loss": 2.8788740634918213,
"eval_runtime": 228.0679,
"eval_samples_per_second": 876.932,
"eval_steps_per_second": 54.808,
"step": 1896000
},
{
"epoch": 8.02,
"learning_rate": 2.067872925873426e-06,
"loss": 3.0623,
"step": 1904000
},
{
"epoch": 8.02,
"eval_loss": 2.8968303203582764,
"eval_runtime": 229.4287,
"eval_samples_per_second": 871.731,
"eval_steps_per_second": 54.483,
"step": 1904000
},
{
"epoch": 8.05,
"eval_loss": 2.8834211826324463,
"eval_runtime": 227.0353,
"eval_samples_per_second": 880.92,
"eval_steps_per_second": 55.058,
"step": 1912000
},
{
"epoch": 8.08,
"learning_rate": 2.0011673476194448e-06,
"loss": 3.0652,
"step": 1920000
},
{
"epoch": 8.08,
"eval_loss": 2.8902077674865723,
"eval_runtime": 227.3091,
"eval_samples_per_second": 879.859,
"eval_steps_per_second": 54.991,
"step": 1920000
},
{
"epoch": 8.12,
"eval_loss": 2.8821847438812256,
"eval_runtime": 226.7104,
"eval_samples_per_second": 882.183,
"eval_steps_per_second": 55.136,
"step": 1928000
},
{
"epoch": 8.15,
"learning_rate": 1.9344617693654634e-06,
"loss": 3.0487,
"step": 1936000
},
{
"epoch": 8.15,
"eval_loss": 2.8844268321990967,
"eval_runtime": 227.086,
"eval_samples_per_second": 880.724,
"eval_steps_per_second": 55.045,
"step": 1936000
},
{
"epoch": 8.19,
"eval_loss": 2.890925407409668,
"eval_runtime": 227.5282,
"eval_samples_per_second": 879.012,
"eval_steps_per_second": 54.938,
"step": 1944000
},
{
"epoch": 8.22,
"learning_rate": 1.867756191111482e-06,
"loss": 3.0546,
"step": 1952000
},
{
"epoch": 8.22,
"eval_loss": 2.8915293216705322,
"eval_runtime": 226.2178,
"eval_samples_per_second": 884.104,
"eval_steps_per_second": 55.256,
"step": 1952000
},
{
"epoch": 8.25,
"eval_loss": 2.8869712352752686,
"eval_runtime": 234.1736,
"eval_samples_per_second": 854.067,
"eval_steps_per_second": 53.379,
"step": 1960000
},
{
"epoch": 8.29,
"learning_rate": 1.8010506128575004e-06,
"loss": 3.0524,
"step": 1968000
},
{
"epoch": 8.29,
"eval_loss": 2.882768154144287,
"eval_runtime": 232.3633,
"eval_samples_per_second": 860.721,
"eval_steps_per_second": 53.795,
"step": 1968000
},
{
"epoch": 8.32,
"eval_loss": 2.878105401992798,
"eval_runtime": 232.7641,
"eval_samples_per_second": 859.239,
"eval_steps_per_second": 53.702,
"step": 1976000
},
{
"epoch": 8.35,
"learning_rate": 1.734345034603519e-06,
"loss": 3.0491,
"step": 1984000
},
{
"epoch": 8.35,
"eval_loss": 2.894814968109131,
"eval_runtime": 235.0584,
"eval_samples_per_second": 850.852,
"eval_steps_per_second": 53.178,
"step": 1984000
},
{
"epoch": 8.39,
"eval_loss": 2.8903963565826416,
"eval_runtime": 227.9139,
"eval_samples_per_second": 877.524,
"eval_steps_per_second": 54.845,
"step": 1992000
},
{
"epoch": 8.42,
"learning_rate": 1.6676394563495374e-06,
"loss": 3.0534,
"step": 2000000
},
{
"epoch": 8.42,
"eval_loss": 2.8839056491851807,
"eval_runtime": 229.0151,
"eval_samples_per_second": 873.305,
"eval_steps_per_second": 54.582,
"step": 2000000
},
{
"epoch": 8.45,
"eval_loss": 2.891777276992798,
"eval_runtime": 227.2746,
"eval_samples_per_second": 879.993,
"eval_steps_per_second": 55.0,
"step": 2008000
},
{
"epoch": 8.49,
"learning_rate": 1.6009338780955558e-06,
"loss": 3.0547,
"step": 2016000
},
{
"epoch": 8.49,
"eval_loss": 2.8738794326782227,
"eval_runtime": 227.7519,
"eval_samples_per_second": 878.149,
"eval_steps_per_second": 54.884,
"step": 2016000
},
{
"epoch": 8.52,
"eval_loss": 2.868389129638672,
"eval_runtime": 228.3511,
"eval_samples_per_second": 875.844,
"eval_steps_per_second": 54.74,
"step": 2024000
},
{
"epoch": 8.56,
"learning_rate": 1.5342282998415744e-06,
"loss": 3.0544,
"step": 2032000
},
{
"epoch": 8.56,
"eval_loss": 2.8739755153656006,
"eval_runtime": 229.9365,
"eval_samples_per_second": 869.806,
"eval_steps_per_second": 54.363,
"step": 2032000
},
{
"epoch": 8.59,
"eval_loss": 2.8784215450286865,
"eval_runtime": 228.6391,
"eval_samples_per_second": 874.741,
"eval_steps_per_second": 54.671,
"step": 2040000
},
{
"epoch": 8.62,
"learning_rate": 1.4675227215875928e-06,
"loss": 3.0448,
"step": 2048000
},
{
"epoch": 8.62,
"eval_loss": 2.8758127689361572,
"eval_runtime": 229.161,
"eval_samples_per_second": 872.749,
"eval_steps_per_second": 54.547,
"step": 2048000
},
{
"epoch": 8.66,
"eval_loss": 2.880105972290039,
"eval_runtime": 230.876,
"eval_samples_per_second": 866.266,
"eval_steps_per_second": 54.142,
"step": 2056000
},
{
"epoch": 8.69,
"learning_rate": 1.4008171433336116e-06,
"loss": 3.0499,
"step": 2064000
},
{
"epoch": 8.69,
"eval_loss": 2.8793435096740723,
"eval_runtime": 229.0938,
"eval_samples_per_second": 873.005,
"eval_steps_per_second": 54.563,
"step": 2064000
},
{
"epoch": 8.72,
"eval_loss": 2.8707237243652344,
"eval_runtime": 228.2778,
"eval_samples_per_second": 876.126,
"eval_steps_per_second": 54.758,
"step": 2072000
},
{
"epoch": 8.76,
"learning_rate": 1.33411156507963e-06,
"loss": 3.0368,
"step": 2080000
},
{
"epoch": 8.76,
"eval_loss": 2.872204065322876,
"eval_runtime": 229.9264,
"eval_samples_per_second": 869.844,
"eval_steps_per_second": 54.365,
"step": 2080000
},
{
"epoch": 8.79,
"eval_loss": 2.875173807144165,
"eval_runtime": 229.2278,
"eval_samples_per_second": 872.495,
"eval_steps_per_second": 54.531,
"step": 2088000
},
{
"epoch": 8.83,
"learning_rate": 1.2674059868256484e-06,
"loss": 3.0548,
"step": 2096000
},
{
"epoch": 8.83,
"eval_loss": 2.8879776000976562,
"eval_runtime": 228.5322,
"eval_samples_per_second": 875.15,
"eval_steps_per_second": 54.697,
"step": 2096000
},
{
"epoch": 8.86,
"eval_loss": 2.87813663482666,
"eval_runtime": 228.0244,
"eval_samples_per_second": 877.099,
"eval_steps_per_second": 54.819,
"step": 2104000
},
{
"epoch": 8.89,
"learning_rate": 1.2007004085716668e-06,
"loss": 3.0457,
"step": 2112000
},
{
"epoch": 8.89,
"eval_loss": 2.882504463195801,
"eval_runtime": 228.7296,
"eval_samples_per_second": 874.395,
"eval_steps_per_second": 54.65,
"step": 2112000
},
{
"epoch": 8.93,
"eval_loss": 2.8827481269836426,
"eval_runtime": 231.2951,
"eval_samples_per_second": 864.696,
"eval_steps_per_second": 54.044,
"step": 2120000
},
{
"epoch": 8.96,
"learning_rate": 1.1339948303176854e-06,
"loss": 3.0377,
"step": 2128000
},
{
"epoch": 8.96,
"eval_loss": 2.880984306335449,
"eval_runtime": 231.2589,
"eval_samples_per_second": 864.832,
"eval_steps_per_second": 54.052,
"step": 2128000
},
{
"epoch": 8.99,
"eval_loss": 2.872668981552124,
"eval_runtime": 231.1403,
"eval_samples_per_second": 865.275,
"eval_steps_per_second": 54.08,
"step": 2136000
},
{
"epoch": 9.03,
"learning_rate": 1.067289252063704e-06,
"loss": 3.0341,
"step": 2144000
},
{
"epoch": 9.03,
"eval_loss": 2.8749947547912598,
"eval_runtime": 232.6534,
"eval_samples_per_second": 859.648,
"eval_steps_per_second": 53.728,
"step": 2144000
},
{
"epoch": 9.06,
"eval_loss": 2.8637659549713135,
"eval_runtime": 232.1804,
"eval_samples_per_second": 861.399,
"eval_steps_per_second": 53.837,
"step": 2152000
},
{
"epoch": 9.09,
"learning_rate": 1.0005836738097224e-06,
"loss": 3.0275,
"step": 2160000
},
{
"epoch": 9.09,
"eval_loss": 2.8689756393432617,
"eval_runtime": 233.8191,
"eval_samples_per_second": 855.362,
"eval_steps_per_second": 53.46,
"step": 2160000
},
{
"epoch": 9.13,
"eval_loss": 2.866030693054199,
"eval_runtime": 231.4154,
"eval_samples_per_second": 864.247,
"eval_steps_per_second": 54.015,
"step": 2168000
},
{
"epoch": 9.16,
"learning_rate": 9.33878095555741e-07,
"loss": 3.0413,
"step": 2176000
},
{
"epoch": 9.16,
"eval_loss": 2.8578262329101562,
"eval_runtime": 233.839,
"eval_samples_per_second": 855.289,
"eval_steps_per_second": 53.456,
"step": 2176000
},
{
"epoch": 9.2,
"eval_loss": 2.8692455291748047,
"eval_runtime": 236.8158,
"eval_samples_per_second": 844.538,
"eval_steps_per_second": 52.784,
"step": 2184000
},
{
"epoch": 9.23,
"learning_rate": 8.671725173017595e-07,
"loss": 3.0272,
"step": 2192000
},
{
"epoch": 9.23,
"eval_loss": 2.8701837062835693,
"eval_runtime": 235.4116,
"eval_samples_per_second": 849.576,
"eval_steps_per_second": 53.098,
"step": 2192000
},
{
"epoch": 9.26,
"eval_loss": 2.870734453201294,
"eval_runtime": 236.6161,
"eval_samples_per_second": 845.251,
"eval_steps_per_second": 52.828,
"step": 2200000
},
{
"epoch": 9.3,
"learning_rate": 8.004669390477779e-07,
"loss": 3.034,
"step": 2208000
},
{
"epoch": 9.3,
"eval_loss": 2.866581916809082,
"eval_runtime": 233.8393,
"eval_samples_per_second": 855.288,
"eval_steps_per_second": 53.456,
"step": 2208000
},
{
"epoch": 9.33,
"eval_loss": 2.873441219329834,
"eval_runtime": 229.9559,
"eval_samples_per_second": 869.732,
"eval_steps_per_second": 54.358,
"step": 2216000
},
{
"epoch": 9.36,
"learning_rate": 7.337613607937964e-07,
"loss": 3.0346,
"step": 2224000
},
{
"epoch": 9.36,
"eval_loss": 2.8685038089752197,
"eval_runtime": 229.1295,
"eval_samples_per_second": 872.869,
"eval_steps_per_second": 54.554,
"step": 2224000
},
{
"epoch": 9.4,
"eval_loss": 2.867513656616211,
"eval_runtime": 228.6384,
"eval_samples_per_second": 874.744,
"eval_steps_per_second": 54.671,
"step": 2232000
},
{
"epoch": 9.43,
"learning_rate": 6.67055782539815e-07,
"loss": 3.0234,
"step": 2240000
},
{
"epoch": 9.43,
"eval_loss": 2.866205930709839,
"eval_runtime": 228.8031,
"eval_samples_per_second": 874.114,
"eval_steps_per_second": 54.632,
"step": 2240000
},
{
"epoch": 9.47,
"eval_loss": 2.8670058250427246,
"eval_runtime": 230.0362,
"eval_samples_per_second": 869.428,
"eval_steps_per_second": 54.339,
"step": 2248000
},
{
"epoch": 9.5,
"learning_rate": 6.003502042858334e-07,
"loss": 3.0256,
"step": 2256000
},
{
"epoch": 9.5,
"eval_loss": 2.8764402866363525,
"eval_runtime": 228.8894,
"eval_samples_per_second": 873.784,
"eval_steps_per_second": 54.612,
"step": 2256000
},
{
"epoch": 9.53,
"eval_loss": 2.8664441108703613,
"eval_runtime": 228.7947,
"eval_samples_per_second": 874.146,
"eval_steps_per_second": 54.634,
"step": 2264000
},
{
"epoch": 9.57,
"learning_rate": 5.33644626031852e-07,
"loss": 3.0232,
"step": 2272000
},
{
"epoch": 9.57,
"eval_loss": 2.8624887466430664,
"eval_runtime": 229.0315,
"eval_samples_per_second": 873.242,
"eval_steps_per_second": 54.578,
"step": 2272000
},
{
"epoch": 9.6,
"eval_loss": 2.8646833896636963,
"eval_runtime": 229.8068,
"eval_samples_per_second": 870.296,
"eval_steps_per_second": 54.394,
"step": 2280000
},
{
"epoch": 9.63,
"learning_rate": 4.669390477778705e-07,
"loss": 3.0309,
"step": 2288000
},
{
"epoch": 9.63,
"eval_loss": 2.8561413288116455,
"eval_runtime": 229.8225,
"eval_samples_per_second": 870.237,
"eval_steps_per_second": 54.39,
"step": 2288000
},
{
"epoch": 9.67,
"eval_loss": 2.8657453060150146,
"eval_runtime": 230.8107,
"eval_samples_per_second": 866.511,
"eval_steps_per_second": 54.157,
"step": 2296000
},
{
"epoch": 9.7,
"learning_rate": 4.0023346952388894e-07,
"loss": 3.0254,
"step": 2304000
},
{
"epoch": 9.7,
"eval_loss": 2.8666698932647705,
"eval_runtime": 230.9054,
"eval_samples_per_second": 866.156,
"eval_steps_per_second": 54.135,
"step": 2304000
},
{
"epoch": 9.73,
"eval_loss": 2.861841917037964,
"eval_runtime": 233.616,
"eval_samples_per_second": 856.106,
"eval_steps_per_second": 53.507,
"step": 2312000
},
{
"epoch": 9.77,
"learning_rate": 3.335278912699075e-07,
"loss": 3.0198,
"step": 2320000
},
{
"epoch": 9.77,
"eval_loss": 2.8649652004241943,
"eval_runtime": 232.9095,
"eval_samples_per_second": 858.702,
"eval_steps_per_second": 53.669,
"step": 2320000
},
{
"epoch": 9.8,
"eval_loss": 2.8629865646362305,
"eval_runtime": 233.276,
"eval_samples_per_second": 857.353,
"eval_steps_per_second": 53.585,
"step": 2328000
},
{
"epoch": 9.84,
"learning_rate": 2.66822313015926e-07,
"loss": 3.0109,
"step": 2336000
},
{
"epoch": 9.84,
"eval_loss": 2.8533174991607666,
"eval_runtime": 232.7296,
"eval_samples_per_second": 859.366,
"eval_steps_per_second": 53.71,
"step": 2336000
},
{
"epoch": 9.87,
"eval_loss": 2.8656232357025146,
"eval_runtime": 230.1435,
"eval_samples_per_second": 869.023,
"eval_steps_per_second": 54.314,
"step": 2344000
},
{
"epoch": 9.9,
"learning_rate": 2.0011673476194447e-07,
"loss": 3.0316,
"step": 2352000
},
{
"epoch": 9.9,
"eval_loss": 2.8606715202331543,
"eval_runtime": 229.4357,
"eval_samples_per_second": 871.704,
"eval_steps_per_second": 54.482,
"step": 2352000
},
{
"epoch": 9.94,
"eval_loss": 2.8572158813476562,
"eval_runtime": 229.9275,
"eval_samples_per_second": 869.839,
"eval_steps_per_second": 54.365,
"step": 2360000
},
{
"epoch": 9.97,
"learning_rate": 1.33411156507963e-07,
"loss": 3.0225,
"step": 2368000
},
{
"epoch": 9.97,
"eval_loss": 2.8617327213287354,
"eval_runtime": 229.6061,
"eval_samples_per_second": 871.057,
"eval_steps_per_second": 54.441,
"step": 2368000
},
{
"epoch": 10.0,
"eval_loss": 2.8604278564453125,
"eval_runtime": 229.8413,
"eval_samples_per_second": 870.166,
"eval_steps_per_second": 54.385,
"step": 2376000
},
{
"epoch": 10.04,
"learning_rate": 6.67055782539815e-08,
"loss": 3.0132,
"step": 2384000
},
{
"epoch": 10.04,
"eval_loss": 2.857710838317871,
"eval_runtime": 229.9337,
"eval_samples_per_second": 869.816,
"eval_steps_per_second": 54.364,
"step": 2384000
},
{
"epoch": 10.07,
"eval_loss": 2.8534834384918213,
"eval_runtime": 230.8863,
"eval_samples_per_second": 866.227,
"eval_steps_per_second": 54.139,
"step": 2392000
},
{
"epoch": 10.11,
"learning_rate": 0.0,
"loss": 3.0202,
"step": 2400000
},
{
"epoch": 10.11,
"eval_loss": 2.8565549850463867,
"eval_runtime": 230.1736,
"eval_samples_per_second": 868.909,
"eval_steps_per_second": 54.307,
"step": 2400000
},
{
"epoch": 10.11,
"step": 2400000,
"total_flos": 7.688849395607474e+17,
"train_loss": 2.9681437548828127,
"train_runtime": 221059.9809,
"train_samples_per_second": 173.709,
"train_steps_per_second": 10.857
}
],
"logging_steps": 16000,
"max_steps": 2400000,
"num_train_epochs": 11,
"save_steps": 32000,
"total_flos": 7.688849395607474e+17,
"trial_name": null,
"trial_params": null
}