beamaia's picture
Upload folder using huggingface_hub
b696258 verified
raw
history blame
79.3 kB
{
"best_metric": 0.14816446602344513,
"best_model_checkpoint": "./mistral7b/13-02-24-Weni-ZeroShot-3.3.0-Mistral-7b-Multilanguage-3.1.0_zeroshot-2_max_steps-4968_batch_128_2024-02-13_03/checkpoint-4840",
"epoch": 148.92307692307693,
"eval_steps": 20,
"global_step": 4840,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.62,
"learning_rate": 2.82258064516129e-06,
"loss": 1.6698,
"step": 20
},
{
"epoch": 0.62,
"eval_loss": 1.6584945917129517,
"eval_runtime": 26.1472,
"eval_samples_per_second": 17.669,
"eval_steps_per_second": 0.574,
"step": 20
},
{
"epoch": 1.23,
"learning_rate": 1.0483870967741936e-05,
"loss": 1.6069,
"step": 40
},
{
"epoch": 1.23,
"eval_loss": 1.5037014484405518,
"eval_runtime": 26.055,
"eval_samples_per_second": 17.732,
"eval_steps_per_second": 0.576,
"step": 40
},
{
"epoch": 1.85,
"learning_rate": 1.8548387096774193e-05,
"loss": 1.3817,
"step": 60
},
{
"epoch": 1.85,
"eval_loss": 1.246311902999878,
"eval_runtime": 26.0429,
"eval_samples_per_second": 17.74,
"eval_steps_per_second": 0.576,
"step": 60
},
{
"epoch": 2.46,
"learning_rate": 2.661290322580645e-05,
"loss": 1.1419,
"step": 80
},
{
"epoch": 2.46,
"eval_loss": 1.0472239255905151,
"eval_runtime": 26.0923,
"eval_samples_per_second": 17.706,
"eval_steps_per_second": 0.575,
"step": 80
},
{
"epoch": 3.08,
"learning_rate": 3.467741935483872e-05,
"loss": 0.9906,
"step": 100
},
{
"epoch": 3.08,
"eval_loss": 0.9301682710647583,
"eval_runtime": 26.1937,
"eval_samples_per_second": 17.638,
"eval_steps_per_second": 0.573,
"step": 100
},
{
"epoch": 3.69,
"learning_rate": 4.2741935483870973e-05,
"loss": 0.8634,
"step": 120
},
{
"epoch": 3.69,
"eval_loss": 0.7793559432029724,
"eval_runtime": 26.0879,
"eval_samples_per_second": 17.709,
"eval_steps_per_second": 0.575,
"step": 120
},
{
"epoch": 4.31,
"learning_rate": 5.080645161290323e-05,
"loss": 0.7382,
"step": 140
},
{
"epoch": 4.31,
"eval_loss": 0.7010539174079895,
"eval_runtime": 26.032,
"eval_samples_per_second": 17.747,
"eval_steps_per_second": 0.576,
"step": 140
},
{
"epoch": 4.92,
"learning_rate": 5.887096774193549e-05,
"loss": 0.6869,
"step": 160
},
{
"epoch": 4.92,
"eval_loss": 0.6714752316474915,
"eval_runtime": 44.7747,
"eval_samples_per_second": 10.318,
"eval_steps_per_second": 0.335,
"step": 160
},
{
"epoch": 5.54,
"learning_rate": 6.693548387096774e-05,
"loss": 0.6623,
"step": 180
},
{
"epoch": 5.54,
"eval_loss": 0.6569082140922546,
"eval_runtime": 44.9201,
"eval_samples_per_second": 10.285,
"eval_steps_per_second": 0.334,
"step": 180
},
{
"epoch": 6.15,
"learning_rate": 7.500000000000001e-05,
"loss": 0.6508,
"step": 200
},
{
"epoch": 6.15,
"eval_loss": 0.6456889510154724,
"eval_runtime": 44.7964,
"eval_samples_per_second": 10.313,
"eval_steps_per_second": 0.335,
"step": 200
},
{
"epoch": 6.77,
"learning_rate": 8.306451612903227e-05,
"loss": 0.6394,
"step": 220
},
{
"epoch": 6.77,
"eval_loss": 0.6361492276191711,
"eval_runtime": 26.7207,
"eval_samples_per_second": 17.29,
"eval_steps_per_second": 0.561,
"step": 220
},
{
"epoch": 7.38,
"learning_rate": 9.112903225806452e-05,
"loss": 0.6289,
"step": 240
},
{
"epoch": 7.38,
"eval_loss": 0.6280443072319031,
"eval_runtime": 26.6969,
"eval_samples_per_second": 17.305,
"eval_steps_per_second": 0.562,
"step": 240
},
{
"epoch": 8.0,
"learning_rate": 9.919354838709678e-05,
"loss": 0.6239,
"step": 260
},
{
"epoch": 8.0,
"eval_loss": 0.6212936043739319,
"eval_runtime": 26.7173,
"eval_samples_per_second": 17.292,
"eval_steps_per_second": 0.561,
"step": 260
},
{
"epoch": 8.62,
"learning_rate": 0.00010725806451612903,
"loss": 0.6171,
"step": 280
},
{
"epoch": 8.62,
"eval_loss": 0.614993155002594,
"eval_runtime": 26.7242,
"eval_samples_per_second": 17.288,
"eval_steps_per_second": 0.561,
"step": 280
},
{
"epoch": 9.23,
"learning_rate": 0.00011532258064516131,
"loss": 0.6096,
"step": 300
},
{
"epoch": 9.23,
"eval_loss": 0.608863353729248,
"eval_runtime": 26.7229,
"eval_samples_per_second": 17.289,
"eval_steps_per_second": 0.561,
"step": 300
},
{
"epoch": 9.85,
"learning_rate": 0.00012338709677419356,
"loss": 0.6048,
"step": 320
},
{
"epoch": 9.85,
"eval_loss": 0.6036637425422668,
"eval_runtime": 26.725,
"eval_samples_per_second": 17.287,
"eval_steps_per_second": 0.561,
"step": 320
},
{
"epoch": 10.46,
"learning_rate": 0.0001314516129032258,
"loss": 0.5986,
"step": 340
},
{
"epoch": 10.46,
"eval_loss": 0.5977433323860168,
"eval_runtime": 26.7485,
"eval_samples_per_second": 17.272,
"eval_steps_per_second": 0.561,
"step": 340
},
{
"epoch": 11.08,
"learning_rate": 0.0001395161290322581,
"loss": 0.5914,
"step": 360
},
{
"epoch": 11.08,
"eval_loss": 0.591983437538147,
"eval_runtime": 44.868,
"eval_samples_per_second": 10.297,
"eval_steps_per_second": 0.334,
"step": 360
},
{
"epoch": 11.69,
"learning_rate": 0.00014758064516129032,
"loss": 0.5871,
"step": 380
},
{
"epoch": 11.69,
"eval_loss": 0.5865333676338196,
"eval_runtime": 44.9887,
"eval_samples_per_second": 10.269,
"eval_steps_per_second": 0.333,
"step": 380
},
{
"epoch": 12.31,
"learning_rate": 0.0001556451612903226,
"loss": 0.5808,
"step": 400
},
{
"epoch": 12.31,
"eval_loss": 0.5812229514122009,
"eval_runtime": 26.7165,
"eval_samples_per_second": 17.293,
"eval_steps_per_second": 0.561,
"step": 400
},
{
"epoch": 12.92,
"learning_rate": 0.00016370967741935485,
"loss": 0.5746,
"step": 420
},
{
"epoch": 12.92,
"eval_loss": 0.5760770440101624,
"eval_runtime": 26.7316,
"eval_samples_per_second": 17.283,
"eval_steps_per_second": 0.561,
"step": 420
},
{
"epoch": 13.54,
"learning_rate": 0.00017177419354838711,
"loss": 0.5684,
"step": 440
},
{
"epoch": 13.54,
"eval_loss": 0.5711672306060791,
"eval_runtime": 26.708,
"eval_samples_per_second": 17.298,
"eval_steps_per_second": 0.562,
"step": 440
},
{
"epoch": 14.15,
"learning_rate": 0.00017983870967741935,
"loss": 0.5641,
"step": 460
},
{
"epoch": 14.15,
"eval_loss": 0.5648314356803894,
"eval_runtime": 26.7361,
"eval_samples_per_second": 17.28,
"eval_steps_per_second": 0.561,
"step": 460
},
{
"epoch": 14.77,
"learning_rate": 0.00018790322580645164,
"loss": 0.5573,
"step": 480
},
{
"epoch": 14.77,
"eval_loss": 0.5593515634536743,
"eval_runtime": 26.7412,
"eval_samples_per_second": 17.277,
"eval_steps_per_second": 0.561,
"step": 480
},
{
"epoch": 15.38,
"learning_rate": 0.00019596774193548388,
"loss": 0.5517,
"step": 500
},
{
"epoch": 15.38,
"eval_loss": 0.5539582967758179,
"eval_runtime": 26.732,
"eval_samples_per_second": 17.283,
"eval_steps_per_second": 0.561,
"step": 500
},
{
"epoch": 16.0,
"learning_rate": 0.00019999753245902063,
"loss": 0.5447,
"step": 520
},
{
"epoch": 16.0,
"eval_loss": 0.54853355884552,
"eval_runtime": 26.7332,
"eval_samples_per_second": 17.282,
"eval_steps_per_second": 0.561,
"step": 520
},
{
"epoch": 16.62,
"learning_rate": 0.00019997779286183058,
"loss": 0.5372,
"step": 540
},
{
"epoch": 16.62,
"eval_loss": 0.5420479774475098,
"eval_runtime": 26.733,
"eval_samples_per_second": 17.282,
"eval_steps_per_second": 0.561,
"step": 540
},
{
"epoch": 17.23,
"learning_rate": 0.00019993831756406357,
"loss": 0.5314,
"step": 560
},
{
"epoch": 17.23,
"eval_loss": 0.5360643267631531,
"eval_runtime": 26.736,
"eval_samples_per_second": 17.28,
"eval_steps_per_second": 0.561,
"step": 560
},
{
"epoch": 17.85,
"learning_rate": 0.0001998791143581767,
"loss": 0.5248,
"step": 580
},
{
"epoch": 17.85,
"eval_loss": 0.5307183265686035,
"eval_runtime": 26.7116,
"eval_samples_per_second": 17.296,
"eval_steps_per_second": 0.562,
"step": 580
},
{
"epoch": 18.46,
"learning_rate": 0.00019980019493093267,
"loss": 0.5195,
"step": 600
},
{
"epoch": 18.46,
"eval_loss": 0.5241357684135437,
"eval_runtime": 26.7372,
"eval_samples_per_second": 17.279,
"eval_steps_per_second": 0.561,
"step": 600
},
{
"epoch": 19.08,
"learning_rate": 0.00019970157486109296,
"loss": 0.5136,
"step": 620
},
{
"epoch": 19.08,
"eval_loss": 0.5182597637176514,
"eval_runtime": 26.7038,
"eval_samples_per_second": 17.301,
"eval_steps_per_second": 0.562,
"step": 620
},
{
"epoch": 19.69,
"learning_rate": 0.00019958327361634248,
"loss": 0.5036,
"step": 640
},
{
"epoch": 19.69,
"eval_loss": 0.5129547715187073,
"eval_runtime": 26.7294,
"eval_samples_per_second": 17.284,
"eval_steps_per_second": 0.561,
"step": 640
},
{
"epoch": 20.31,
"learning_rate": 0.00019944531454944663,
"loss": 0.4996,
"step": 660
},
{
"epoch": 20.31,
"eval_loss": 0.5069959163665771,
"eval_runtime": 26.7503,
"eval_samples_per_second": 17.271,
"eval_steps_per_second": 0.561,
"step": 660
},
{
"epoch": 20.92,
"learning_rate": 0.0001992877248936415,
"loss": 0.4941,
"step": 680
},
{
"epoch": 20.92,
"eval_loss": 0.5006260871887207,
"eval_runtime": 26.7128,
"eval_samples_per_second": 17.295,
"eval_steps_per_second": 0.562,
"step": 680
},
{
"epoch": 21.54,
"learning_rate": 0.000199110535757258,
"loss": 0.4838,
"step": 700
},
{
"epoch": 21.54,
"eval_loss": 0.4946294128894806,
"eval_runtime": 26.7236,
"eval_samples_per_second": 17.288,
"eval_steps_per_second": 0.561,
"step": 700
},
{
"epoch": 22.15,
"learning_rate": 0.00019891378211758096,
"loss": 0.4795,
"step": 720
},
{
"epoch": 22.15,
"eval_loss": 0.4879631996154785,
"eval_runtime": 26.733,
"eval_samples_per_second": 17.282,
"eval_steps_per_second": 0.561,
"step": 720
},
{
"epoch": 22.77,
"learning_rate": 0.0001986975028139447,
"loss": 0.4722,
"step": 740
},
{
"epoch": 22.77,
"eval_loss": 0.48206356167793274,
"eval_runtime": 26.7335,
"eval_samples_per_second": 17.282,
"eval_steps_per_second": 0.561,
"step": 740
},
{
"epoch": 23.38,
"learning_rate": 0.00019846174054006607,
"loss": 0.464,
"step": 760
},
{
"epoch": 23.38,
"eval_loss": 0.4757327735424042,
"eval_runtime": 26.722,
"eval_samples_per_second": 17.289,
"eval_steps_per_second": 0.561,
"step": 760
},
{
"epoch": 24.0,
"learning_rate": 0.00019820654183561658,
"loss": 0.4605,
"step": 780
},
{
"epoch": 24.0,
"eval_loss": 0.47157037258148193,
"eval_runtime": 26.6987,
"eval_samples_per_second": 17.304,
"eval_steps_per_second": 0.562,
"step": 780
},
{
"epoch": 24.62,
"learning_rate": 0.00019793195707703567,
"loss": 0.4524,
"step": 800
},
{
"epoch": 24.62,
"eval_loss": 0.46418875455856323,
"eval_runtime": 26.7165,
"eval_samples_per_second": 17.293,
"eval_steps_per_second": 0.561,
"step": 800
},
{
"epoch": 25.23,
"learning_rate": 0.00019763804046758602,
"loss": 0.4461,
"step": 820
},
{
"epoch": 25.23,
"eval_loss": 0.45834338665008545,
"eval_runtime": 26.7468,
"eval_samples_per_second": 17.273,
"eval_steps_per_second": 0.561,
"step": 820
},
{
"epoch": 25.85,
"learning_rate": 0.00019732485002665415,
"loss": 0.4393,
"step": 840
},
{
"epoch": 25.85,
"eval_loss": 0.4538223147392273,
"eval_runtime": 26.746,
"eval_samples_per_second": 17.274,
"eval_steps_per_second": 0.561,
"step": 840
},
{
"epoch": 26.46,
"learning_rate": 0.00019699244757829702,
"loss": 0.4337,
"step": 860
},
{
"epoch": 26.46,
"eval_loss": 0.4459408223628998,
"eval_runtime": 26.705,
"eval_samples_per_second": 17.3,
"eval_steps_per_second": 0.562,
"step": 860
},
{
"epoch": 27.08,
"learning_rate": 0.0001966408987390381,
"loss": 0.4269,
"step": 880
},
{
"epoch": 27.08,
"eval_loss": 0.43985316157341003,
"eval_runtime": 26.7756,
"eval_samples_per_second": 17.255,
"eval_steps_per_second": 0.56,
"step": 880
},
{
"epoch": 27.69,
"learning_rate": 0.00019627027290491458,
"loss": 0.4191,
"step": 900
},
{
"epoch": 27.69,
"eval_loss": 0.43426012992858887,
"eval_runtime": 26.7408,
"eval_samples_per_second": 17.277,
"eval_steps_per_second": 0.561,
"step": 900
},
{
"epoch": 28.31,
"learning_rate": 0.00019588064323777853,
"loss": 0.4138,
"step": 920
},
{
"epoch": 28.31,
"eval_loss": 0.4298732876777649,
"eval_runtime": 26.7628,
"eval_samples_per_second": 17.263,
"eval_steps_per_second": 0.56,
"step": 920
},
{
"epoch": 28.92,
"learning_rate": 0.00019549296276462325,
"loss": 0.408,
"step": 940
},
{
"epoch": 28.92,
"eval_loss": 0.42369845509529114,
"eval_runtime": 26.7274,
"eval_samples_per_second": 17.286,
"eval_steps_per_second": 0.561,
"step": 940
},
{
"epoch": 29.54,
"learning_rate": 0.00019506650024792317,
"loss": 0.4001,
"step": 960
},
{
"epoch": 29.54,
"eval_loss": 0.4199902415275574,
"eval_runtime": 26.7231,
"eval_samples_per_second": 17.288,
"eval_steps_per_second": 0.561,
"step": 960
},
{
"epoch": 30.15,
"learning_rate": 0.0001946212715239476,
"loss": 0.3978,
"step": 980
},
{
"epoch": 30.15,
"eval_loss": 0.41251733899116516,
"eval_runtime": 26.7147,
"eval_samples_per_second": 17.294,
"eval_steps_per_second": 0.561,
"step": 980
},
{
"epoch": 30.77,
"learning_rate": 0.00019415736448122193,
"loss": 0.3891,
"step": 1000
},
{
"epoch": 30.77,
"eval_loss": 0.40809690952301025,
"eval_runtime": 26.7253,
"eval_samples_per_second": 17.287,
"eval_steps_per_second": 0.561,
"step": 1000
},
{
"epoch": 31.38,
"learning_rate": 0.0001936748706953874,
"loss": 0.3861,
"step": 1020
},
{
"epoch": 31.38,
"eval_loss": 0.40387141704559326,
"eval_runtime": 26.7175,
"eval_samples_per_second": 17.292,
"eval_steps_per_second": 0.561,
"step": 1020
},
{
"epoch": 32.0,
"learning_rate": 0.00019317388541112396,
"loss": 0.3806,
"step": 1040
},
{
"epoch": 32.0,
"eval_loss": 0.3994995355606079,
"eval_runtime": 26.6886,
"eval_samples_per_second": 17.311,
"eval_steps_per_second": 0.562,
"step": 1040
},
{
"epoch": 32.62,
"learning_rate": 0.000192654507523349,
"loss": 0.3744,
"step": 1060
},
{
"epoch": 32.62,
"eval_loss": 0.3944377303123474,
"eval_runtime": 43.1554,
"eval_samples_per_second": 10.705,
"eval_steps_per_second": 0.348,
"step": 1060
},
{
"epoch": 33.23,
"learning_rate": 0.00019211683955769538,
"loss": 0.3704,
"step": 1080
},
{
"epoch": 33.23,
"eval_loss": 0.3890739977359772,
"eval_runtime": 44.8442,
"eval_samples_per_second": 10.302,
"eval_steps_per_second": 0.334,
"step": 1080
},
{
"epoch": 33.85,
"learning_rate": 0.00019156098765027262,
"loss": 0.3642,
"step": 1100
},
{
"epoch": 33.85,
"eval_loss": 0.3840695321559906,
"eval_runtime": 44.6934,
"eval_samples_per_second": 10.337,
"eval_steps_per_second": 0.336,
"step": 1100
},
{
"epoch": 34.46,
"learning_rate": 0.00019098706152671576,
"loss": 0.3578,
"step": 1120
},
{
"epoch": 34.46,
"eval_loss": 0.37998247146606445,
"eval_runtime": 44.349,
"eval_samples_per_second": 10.417,
"eval_steps_per_second": 0.338,
"step": 1120
},
{
"epoch": 35.08,
"learning_rate": 0.00019039517448052535,
"loss": 0.3547,
"step": 1140
},
{
"epoch": 35.08,
"eval_loss": 0.3763927221298218,
"eval_runtime": 44.8048,
"eval_samples_per_second": 10.311,
"eval_steps_per_second": 0.335,
"step": 1140
},
{
"epoch": 35.69,
"learning_rate": 0.00018978544335070314,
"loss": 0.3494,
"step": 1160
},
{
"epoch": 35.69,
"eval_loss": 0.37159162759780884,
"eval_runtime": 44.97,
"eval_samples_per_second": 10.274,
"eval_steps_per_second": 0.334,
"step": 1160
},
{
"epoch": 36.31,
"learning_rate": 0.0001891579884986881,
"loss": 0.3449,
"step": 1180
},
{
"epoch": 36.31,
"eval_loss": 0.36737060546875,
"eval_runtime": 44.3891,
"eval_samples_per_second": 10.408,
"eval_steps_per_second": 0.338,
"step": 1180
},
{
"epoch": 36.92,
"learning_rate": 0.00018851293378459685,
"loss": 0.3409,
"step": 1200
},
{
"epoch": 36.92,
"eval_loss": 0.3632607161998749,
"eval_runtime": 44.9198,
"eval_samples_per_second": 10.285,
"eval_steps_per_second": 0.334,
"step": 1200
},
{
"epoch": 37.54,
"learning_rate": 0.0001878504065427736,
"loss": 0.3339,
"step": 1220
},
{
"epoch": 37.54,
"eval_loss": 0.3598220944404602,
"eval_runtime": 44.8761,
"eval_samples_per_second": 10.295,
"eval_steps_per_second": 0.334,
"step": 1220
},
{
"epoch": 38.15,
"learning_rate": 0.00018717053755665437,
"loss": 0.3301,
"step": 1240
},
{
"epoch": 38.15,
"eval_loss": 0.35608917474746704,
"eval_runtime": 44.9887,
"eval_samples_per_second": 10.269,
"eval_steps_per_second": 0.333,
"step": 1240
},
{
"epoch": 38.77,
"learning_rate": 0.00018647346103295003,
"loss": 0.3267,
"step": 1260
},
{
"epoch": 38.77,
"eval_loss": 0.3520090579986572,
"eval_runtime": 40.8773,
"eval_samples_per_second": 11.302,
"eval_steps_per_second": 0.367,
"step": 1260
},
{
"epoch": 39.38,
"learning_rate": 0.00018575931457515382,
"loss": 0.3247,
"step": 1280
},
{
"epoch": 39.38,
"eval_loss": 0.34774109721183777,
"eval_runtime": 45.7461,
"eval_samples_per_second": 10.099,
"eval_steps_per_second": 0.328,
"step": 1280
},
{
"epoch": 40.0,
"learning_rate": 0.00018502823915637846,
"loss": 0.3196,
"step": 1300
},
{
"epoch": 40.0,
"eval_loss": 0.34475430846214294,
"eval_runtime": 44.903,
"eval_samples_per_second": 10.289,
"eval_steps_per_second": 0.334,
"step": 1300
},
{
"epoch": 40.62,
"learning_rate": 0.00018428037909152785,
"loss": 0.3155,
"step": 1320
},
{
"epoch": 40.62,
"eval_loss": 0.3413088619709015,
"eval_runtime": 44.6571,
"eval_samples_per_second": 10.345,
"eval_steps_per_second": 0.336,
"step": 1320
},
{
"epoch": 41.23,
"learning_rate": 0.00018351588200880907,
"loss": 0.311,
"step": 1340
},
{
"epoch": 41.23,
"eval_loss": 0.3366176187992096,
"eval_runtime": 45.0459,
"eval_samples_per_second": 10.256,
"eval_steps_per_second": 0.333,
"step": 1340
},
{
"epoch": 41.85,
"learning_rate": 0.00018273489882059062,
"loss": 0.3059,
"step": 1360
},
{
"epoch": 41.85,
"eval_loss": 0.3341914713382721,
"eval_runtime": 45.038,
"eval_samples_per_second": 10.258,
"eval_steps_per_second": 0.333,
"step": 1360
},
{
"epoch": 42.46,
"learning_rate": 0.0001819375836936121,
"loss": 0.3047,
"step": 1380
},
{
"epoch": 42.46,
"eval_loss": 0.33061912655830383,
"eval_runtime": 26.7201,
"eval_samples_per_second": 17.29,
"eval_steps_per_second": 0.561,
"step": 1380
},
{
"epoch": 43.08,
"learning_rate": 0.00018112409401855158,
"loss": 0.3006,
"step": 1400
},
{
"epoch": 43.08,
"eval_loss": 0.32672399282455444,
"eval_runtime": 45.0029,
"eval_samples_per_second": 10.266,
"eval_steps_per_second": 0.333,
"step": 1400
},
{
"epoch": 43.69,
"learning_rate": 0.00018029459037895658,
"loss": 0.2967,
"step": 1420
},
{
"epoch": 43.69,
"eval_loss": 0.32409900426864624,
"eval_runtime": 45.8902,
"eval_samples_per_second": 10.067,
"eval_steps_per_second": 0.327,
"step": 1420
},
{
"epoch": 44.31,
"learning_rate": 0.00017944923651954474,
"loss": 0.2924,
"step": 1440
},
{
"epoch": 44.31,
"eval_loss": 0.3199877142906189,
"eval_runtime": 44.6863,
"eval_samples_per_second": 10.339,
"eval_steps_per_second": 0.336,
"step": 1440
},
{
"epoch": 44.92,
"learning_rate": 0.00017858819931388032,
"loss": 0.2876,
"step": 1460
},
{
"epoch": 44.92,
"eval_loss": 0.3171309530735016,
"eval_runtime": 45.7535,
"eval_samples_per_second": 10.098,
"eval_steps_per_second": 0.328,
"step": 1460
},
{
"epoch": 45.54,
"learning_rate": 0.0001777116487314335,
"loss": 0.2848,
"step": 1480
},
{
"epoch": 45.54,
"eval_loss": 0.3142802119255066,
"eval_runtime": 45.7609,
"eval_samples_per_second": 10.096,
"eval_steps_per_second": 0.328,
"step": 1480
},
{
"epoch": 46.15,
"learning_rate": 0.00017681975780402807,
"loss": 0.2836,
"step": 1500
},
{
"epoch": 46.15,
"eval_loss": 0.31119367480278015,
"eval_runtime": 45.7107,
"eval_samples_per_second": 10.107,
"eval_steps_per_second": 0.328,
"step": 1500
},
{
"epoch": 46.77,
"learning_rate": 0.00017591270259168477,
"loss": 0.2786,
"step": 1520
},
{
"epoch": 46.77,
"eval_loss": 0.3083397448062897,
"eval_runtime": 45.8269,
"eval_samples_per_second": 10.081,
"eval_steps_per_second": 0.327,
"step": 1520
},
{
"epoch": 47.38,
"learning_rate": 0.00017499066214786708,
"loss": 0.2766,
"step": 1540
},
{
"epoch": 47.38,
"eval_loss": 0.307574063539505,
"eval_runtime": 45.6083,
"eval_samples_per_second": 10.13,
"eval_steps_per_second": 0.329,
"step": 1540
},
{
"epoch": 48.0,
"learning_rate": 0.00017405381848413571,
"loss": 0.273,
"step": 1560
},
{
"epoch": 48.0,
"eval_loss": 0.3024856150150299,
"eval_runtime": 45.6794,
"eval_samples_per_second": 10.114,
"eval_steps_per_second": 0.328,
"step": 1560
},
{
"epoch": 48.62,
"learning_rate": 0.0001731023565342195,
"loss": 0.2691,
"step": 1580
},
{
"epoch": 48.62,
"eval_loss": 0.3002566397190094,
"eval_runtime": 45.8955,
"eval_samples_per_second": 10.066,
"eval_steps_per_second": 0.327,
"step": 1580
},
{
"epoch": 49.23,
"learning_rate": 0.00017213646411750935,
"loss": 0.2657,
"step": 1600
},
{
"epoch": 49.23,
"eval_loss": 0.29747503995895386,
"eval_runtime": 45.7917,
"eval_samples_per_second": 10.089,
"eval_steps_per_second": 0.328,
"step": 1600
},
{
"epoch": 49.85,
"learning_rate": 0.00017115633190198238,
"loss": 0.2615,
"step": 1620
},
{
"epoch": 49.85,
"eval_loss": 0.2955474853515625,
"eval_runtime": 45.8407,
"eval_samples_per_second": 10.078,
"eval_steps_per_second": 0.327,
"step": 1620
},
{
"epoch": 50.46,
"learning_rate": 0.000170162153366564,
"loss": 0.2614,
"step": 1640
},
{
"epoch": 50.46,
"eval_loss": 0.2920401394367218,
"eval_runtime": 45.835,
"eval_samples_per_second": 10.08,
"eval_steps_per_second": 0.327,
"step": 1640
},
{
"epoch": 51.08,
"learning_rate": 0.00016915412476293512,
"loss": 0.2587,
"step": 1660
},
{
"epoch": 51.08,
"eval_loss": 0.28886348009109497,
"eval_runtime": 45.8928,
"eval_samples_per_second": 10.067,
"eval_steps_per_second": 0.327,
"step": 1660
},
{
"epoch": 51.69,
"learning_rate": 0.00016813244507679165,
"loss": 0.2543,
"step": 1680
},
{
"epoch": 51.69,
"eval_loss": 0.28654780983924866,
"eval_runtime": 45.8541,
"eval_samples_per_second": 10.075,
"eval_steps_per_second": 0.327,
"step": 1680
},
{
"epoch": 52.31,
"learning_rate": 0.0001670973159885648,
"loss": 0.2507,
"step": 1700
},
{
"epoch": 52.31,
"eval_loss": 0.28392159938812256,
"eval_runtime": 46.0394,
"eval_samples_per_second": 10.035,
"eval_steps_per_second": 0.326,
"step": 1700
},
{
"epoch": 52.92,
"learning_rate": 0.000166048941833609,
"loss": 0.2512,
"step": 1720
},
{
"epoch": 52.92,
"eval_loss": 0.28163596987724304,
"eval_runtime": 45.9037,
"eval_samples_per_second": 10.065,
"eval_steps_per_second": 0.327,
"step": 1720
},
{
"epoch": 53.54,
"learning_rate": 0.00016498752956186605,
"loss": 0.2446,
"step": 1740
},
{
"epoch": 53.54,
"eval_loss": 0.27993378043174744,
"eval_runtime": 45.8612,
"eval_samples_per_second": 10.074,
"eval_steps_per_second": 0.327,
"step": 1740
},
{
"epoch": 54.15,
"learning_rate": 0.00016391328869701306,
"loss": 0.2428,
"step": 1760
},
{
"epoch": 54.15,
"eval_loss": 0.27705731987953186,
"eval_runtime": 45.8648,
"eval_samples_per_second": 10.073,
"eval_steps_per_second": 0.327,
"step": 1760
},
{
"epoch": 54.77,
"learning_rate": 0.00016282643129510212,
"loss": 0.2421,
"step": 1780
},
{
"epoch": 54.77,
"eval_loss": 0.27512410283088684,
"eval_runtime": 45.742,
"eval_samples_per_second": 10.1,
"eval_steps_per_second": 0.328,
"step": 1780
},
{
"epoch": 55.38,
"learning_rate": 0.00016172717190270045,
"loss": 0.24,
"step": 1800
},
{
"epoch": 55.38,
"eval_loss": 0.2725893259048462,
"eval_runtime": 26.75,
"eval_samples_per_second": 17.271,
"eval_steps_per_second": 0.561,
"step": 1800
},
{
"epoch": 56.0,
"learning_rate": 0.00016061572751453862,
"loss": 0.2379,
"step": 1820
},
{
"epoch": 56.0,
"eval_loss": 0.2703319787979126,
"eval_runtime": 26.7376,
"eval_samples_per_second": 17.279,
"eval_steps_per_second": 0.561,
"step": 1820
},
{
"epoch": 56.62,
"learning_rate": 0.0001594923175306756,
"loss": 0.2348,
"step": 1840
},
{
"epoch": 56.62,
"eval_loss": 0.2680712342262268,
"eval_runtime": 26.7571,
"eval_samples_per_second": 17.266,
"eval_steps_per_second": 0.561,
"step": 1840
},
{
"epoch": 57.23,
"learning_rate": 0.00015835716371318908,
"loss": 0.2318,
"step": 1860
},
{
"epoch": 57.23,
"eval_loss": 0.2659159302711487,
"eval_runtime": 26.7348,
"eval_samples_per_second": 17.281,
"eval_steps_per_second": 0.561,
"step": 1860
},
{
"epoch": 57.85,
"learning_rate": 0.00015721049014239943,
"loss": 0.2288,
"step": 1880
},
{
"epoch": 57.85,
"eval_loss": 0.263480007648468,
"eval_runtime": 26.7538,
"eval_samples_per_second": 17.269,
"eval_steps_per_second": 0.561,
"step": 1880
},
{
"epoch": 58.46,
"learning_rate": 0.0001560525231726359,
"loss": 0.2288,
"step": 1900
},
{
"epoch": 58.46,
"eval_loss": 0.2611861228942871,
"eval_runtime": 26.7314,
"eval_samples_per_second": 17.283,
"eval_steps_per_second": 0.561,
"step": 1900
},
{
"epoch": 59.08,
"learning_rate": 0.00015488349138755448,
"loss": 0.2239,
"step": 1920
},
{
"epoch": 59.08,
"eval_loss": 0.25924989581108093,
"eval_runtime": 26.7587,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 1920
},
{
"epoch": 59.69,
"learning_rate": 0.0001537036255550147,
"loss": 0.2233,
"step": 1940
},
{
"epoch": 59.69,
"eval_loss": 0.2566944360733032,
"eval_runtime": 26.7636,
"eval_samples_per_second": 17.262,
"eval_steps_per_second": 0.56,
"step": 1940
},
{
"epoch": 60.31,
"learning_rate": 0.0001525131585815264,
"loss": 0.2199,
"step": 1960
},
{
"epoch": 60.31,
"eval_loss": 0.25529178977012634,
"eval_runtime": 26.7593,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 1960
},
{
"epoch": 60.92,
"learning_rate": 0.00015131232546627355,
"loss": 0.219,
"step": 1980
},
{
"epoch": 60.92,
"eval_loss": 0.2528415322303772,
"eval_runtime": 26.7591,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 1980
},
{
"epoch": 61.54,
"learning_rate": 0.0001501013632547252,
"loss": 0.217,
"step": 2000
},
{
"epoch": 61.54,
"eval_loss": 0.25113263726234436,
"eval_runtime": 26.7433,
"eval_samples_per_second": 17.275,
"eval_steps_per_second": 0.561,
"step": 2000
},
{
"epoch": 62.15,
"learning_rate": 0.00014888051099184256,
"loss": 0.2154,
"step": 2020
},
{
"epoch": 62.15,
"eval_loss": 0.24899105727672577,
"eval_runtime": 26.7472,
"eval_samples_per_second": 17.273,
"eval_steps_per_second": 0.561,
"step": 2020
},
{
"epoch": 62.77,
"learning_rate": 0.0001476500096748913,
"loss": 0.2126,
"step": 2040
},
{
"epoch": 62.77,
"eval_loss": 0.24699197709560394,
"eval_runtime": 26.7455,
"eval_samples_per_second": 17.274,
"eval_steps_per_second": 0.561,
"step": 2040
},
{
"epoch": 63.38,
"learning_rate": 0.00014641010220586858,
"loss": 0.2085,
"step": 2060
},
{
"epoch": 63.38,
"eval_loss": 0.24530422687530518,
"eval_runtime": 26.7332,
"eval_samples_per_second": 17.282,
"eval_steps_per_second": 0.561,
"step": 2060
},
{
"epoch": 64.0,
"learning_rate": 0.0001451610333435538,
"loss": 0.2088,
"step": 2080
},
{
"epoch": 64.0,
"eval_loss": 0.24252080917358398,
"eval_runtime": 26.7344,
"eval_samples_per_second": 17.281,
"eval_steps_per_second": 0.561,
"step": 2080
},
{
"epoch": 64.62,
"learning_rate": 0.00014390304965519312,
"loss": 0.207,
"step": 2100
},
{
"epoch": 64.62,
"eval_loss": 0.24115830659866333,
"eval_runtime": 26.7241,
"eval_samples_per_second": 17.288,
"eval_steps_per_second": 0.561,
"step": 2100
},
{
"epoch": 65.23,
"learning_rate": 0.00014263639946782695,
"loss": 0.2066,
"step": 2120
},
{
"epoch": 65.23,
"eval_loss": 0.23876874148845673,
"eval_runtime": 26.7538,
"eval_samples_per_second": 17.269,
"eval_steps_per_second": 0.561,
"step": 2120
},
{
"epoch": 65.85,
"learning_rate": 0.00014136133281926987,
"loss": 0.2021,
"step": 2140
},
{
"epoch": 65.85,
"eval_loss": 0.2371101826429367,
"eval_runtime": 26.7047,
"eval_samples_per_second": 17.3,
"eval_steps_per_second": 0.562,
"step": 2140
},
{
"epoch": 66.46,
"learning_rate": 0.00014007810140875295,
"loss": 0.2016,
"step": 2160
},
{
"epoch": 66.46,
"eval_loss": 0.2353435754776001,
"eval_runtime": 26.7324,
"eval_samples_per_second": 17.282,
"eval_steps_per_second": 0.561,
"step": 2160
},
{
"epoch": 67.08,
"learning_rate": 0.00013878695854723826,
"loss": 0.1986,
"step": 2180
},
{
"epoch": 67.08,
"eval_loss": 0.23351863026618958,
"eval_runtime": 45.6122,
"eval_samples_per_second": 10.129,
"eval_steps_per_second": 0.329,
"step": 2180
},
{
"epoch": 67.69,
"learning_rate": 0.0001374881591074148,
"loss": 0.1965,
"step": 2200
},
{
"epoch": 67.69,
"eval_loss": 0.23166298866271973,
"eval_runtime": 45.6494,
"eval_samples_per_second": 10.121,
"eval_steps_per_second": 0.329,
"step": 2200
},
{
"epoch": 68.31,
"learning_rate": 0.0001361819594733868,
"loss": 0.1969,
"step": 2220
},
{
"epoch": 68.31,
"eval_loss": 0.23032891750335693,
"eval_runtime": 26.7108,
"eval_samples_per_second": 17.296,
"eval_steps_per_second": 0.562,
"step": 2220
},
{
"epoch": 68.92,
"learning_rate": 0.00013486861749006286,
"loss": 0.1957,
"step": 2240
},
{
"epoch": 68.92,
"eval_loss": 0.22889761626720428,
"eval_runtime": 26.7424,
"eval_samples_per_second": 17.276,
"eval_steps_per_second": 0.561,
"step": 2240
},
{
"epoch": 69.54,
"learning_rate": 0.0001335483924122575,
"loss": 0.1918,
"step": 2260
},
{
"epoch": 69.54,
"eval_loss": 0.2264855057001114,
"eval_runtime": 26.7477,
"eval_samples_per_second": 17.273,
"eval_steps_per_second": 0.561,
"step": 2260
},
{
"epoch": 70.15,
"learning_rate": 0.00013222154485351375,
"loss": 0.1913,
"step": 2280
},
{
"epoch": 70.15,
"eval_loss": 0.22507672011852264,
"eval_runtime": 45.5727,
"eval_samples_per_second": 10.138,
"eval_steps_per_second": 0.329,
"step": 2280
},
{
"epoch": 70.77,
"learning_rate": 0.0001308883367346581,
"loss": 0.1892,
"step": 2300
},
{
"epoch": 70.77,
"eval_loss": 0.22298868000507355,
"eval_runtime": 26.731,
"eval_samples_per_second": 17.283,
"eval_steps_per_second": 0.561,
"step": 2300
},
{
"epoch": 71.38,
"learning_rate": 0.00012954903123209687,
"loss": 0.1885,
"step": 2320
},
{
"epoch": 71.38,
"eval_loss": 0.2219810038805008,
"eval_runtime": 26.7121,
"eval_samples_per_second": 17.296,
"eval_steps_per_second": 0.562,
"step": 2320
},
{
"epoch": 72.0,
"learning_rate": 0.0001282038927258651,
"loss": 0.1876,
"step": 2340
},
{
"epoch": 72.0,
"eval_loss": 0.2204855978488922,
"eval_runtime": 45.041,
"eval_samples_per_second": 10.257,
"eval_steps_per_second": 0.333,
"step": 2340
},
{
"epoch": 72.62,
"learning_rate": 0.0001268531867474377,
"loss": 0.1855,
"step": 2360
},
{
"epoch": 72.62,
"eval_loss": 0.21908599138259888,
"eval_runtime": 45.1286,
"eval_samples_per_second": 10.237,
"eval_steps_per_second": 0.332,
"step": 2360
},
{
"epoch": 73.23,
"learning_rate": 0.00012549717992731317,
"loss": 0.1841,
"step": 2380
},
{
"epoch": 73.23,
"eval_loss": 0.21735349297523499,
"eval_runtime": 26.5835,
"eval_samples_per_second": 17.379,
"eval_steps_per_second": 0.564,
"step": 2380
},
{
"epoch": 73.85,
"learning_rate": 0.0001241361399423808,
"loss": 0.1819,
"step": 2400
},
{
"epoch": 73.85,
"eval_loss": 0.21623647212982178,
"eval_runtime": 26.7905,
"eval_samples_per_second": 17.245,
"eval_steps_per_second": 0.56,
"step": 2400
},
{
"epoch": 74.46,
"learning_rate": 0.0001227703354630807,
"loss": 0.1812,
"step": 2420
},
{
"epoch": 74.46,
"eval_loss": 0.21434533596038818,
"eval_runtime": 26.7405,
"eval_samples_per_second": 17.277,
"eval_steps_per_second": 0.561,
"step": 2420
},
{
"epoch": 75.08,
"learning_rate": 0.0001214000361003683,
"loss": 0.1801,
"step": 2440
},
{
"epoch": 75.08,
"eval_loss": 0.21285748481750488,
"eval_runtime": 26.7466,
"eval_samples_per_second": 17.273,
"eval_steps_per_second": 0.561,
"step": 2440
},
{
"epoch": 75.69,
"learning_rate": 0.00012002551235249268,
"loss": 0.1773,
"step": 2460
},
{
"epoch": 75.69,
"eval_loss": 0.21103879809379578,
"eval_runtime": 26.7635,
"eval_samples_per_second": 17.262,
"eval_steps_per_second": 0.56,
"step": 2460
},
{
"epoch": 76.31,
"learning_rate": 0.00011864703555160028,
"loss": 0.1771,
"step": 2480
},
{
"epoch": 76.31,
"eval_loss": 0.21016329526901245,
"eval_runtime": 26.7904,
"eval_samples_per_second": 17.245,
"eval_steps_per_second": 0.56,
"step": 2480
},
{
"epoch": 76.92,
"learning_rate": 0.00011726487781017337,
"loss": 0.1752,
"step": 2500
},
{
"epoch": 76.92,
"eval_loss": 0.2086782455444336,
"eval_runtime": 44.9559,
"eval_samples_per_second": 10.277,
"eval_steps_per_second": 0.334,
"step": 2500
},
{
"epoch": 77.54,
"learning_rate": 0.00011587931196731505,
"loss": 0.1742,
"step": 2520
},
{
"epoch": 77.54,
"eval_loss": 0.2075587958097458,
"eval_runtime": 44.9658,
"eval_samples_per_second": 10.274,
"eval_steps_per_second": 0.334,
"step": 2520
},
{
"epoch": 78.15,
"learning_rate": 0.00011449061153489055,
"loss": 0.1735,
"step": 2540
},
{
"epoch": 78.15,
"eval_loss": 0.20602142810821533,
"eval_runtime": 26.6678,
"eval_samples_per_second": 17.324,
"eval_steps_per_second": 0.562,
"step": 2540
},
{
"epoch": 78.77,
"learning_rate": 0.00011309905064353575,
"loss": 0.1715,
"step": 2560
},
{
"epoch": 78.77,
"eval_loss": 0.20492884516716003,
"eval_runtime": 26.7865,
"eval_samples_per_second": 17.248,
"eval_steps_per_second": 0.56,
"step": 2560
},
{
"epoch": 79.38,
"learning_rate": 0.00011170490398854336,
"loss": 0.1707,
"step": 2580
},
{
"epoch": 79.38,
"eval_loss": 0.2027878314256668,
"eval_runtime": 42.4616,
"eval_samples_per_second": 10.88,
"eval_steps_per_second": 0.353,
"step": 2580
},
{
"epoch": 80.0,
"learning_rate": 0.0001103084467756382,
"loss": 0.1701,
"step": 2600
},
{
"epoch": 80.0,
"eval_loss": 0.201686292886734,
"eval_runtime": 45.0476,
"eval_samples_per_second": 10.256,
"eval_steps_per_second": 0.333,
"step": 2600
},
{
"epoch": 80.62,
"learning_rate": 0.00010890995466665108,
"loss": 0.1675,
"step": 2620
},
{
"epoch": 80.62,
"eval_loss": 0.2003440409898758,
"eval_runtime": 45.0995,
"eval_samples_per_second": 10.244,
"eval_steps_per_second": 0.333,
"step": 2620
},
{
"epoch": 81.23,
"learning_rate": 0.00010750970372510307,
"loss": 0.1663,
"step": 2640
},
{
"epoch": 81.23,
"eval_loss": 0.19951596856117249,
"eval_runtime": 45.0074,
"eval_samples_per_second": 10.265,
"eval_steps_per_second": 0.333,
"step": 2640
},
{
"epoch": 81.85,
"learning_rate": 0.00010610797036171014,
"loss": 0.1653,
"step": 2660
},
{
"epoch": 81.85,
"eval_loss": 0.19835925102233887,
"eval_runtime": 45.0031,
"eval_samples_per_second": 10.266,
"eval_steps_per_second": 0.333,
"step": 2660
},
{
"epoch": 82.46,
"learning_rate": 0.00010470503127981977,
"loss": 0.165,
"step": 2680
},
{
"epoch": 82.46,
"eval_loss": 0.19678974151611328,
"eval_runtime": 45.1089,
"eval_samples_per_second": 10.242,
"eval_steps_per_second": 0.333,
"step": 2680
},
{
"epoch": 83.08,
"learning_rate": 0.0001033011634207891,
"loss": 0.1644,
"step": 2700
},
{
"epoch": 83.08,
"eval_loss": 0.19566014409065247,
"eval_runtime": 26.7641,
"eval_samples_per_second": 17.262,
"eval_steps_per_second": 0.56,
"step": 2700
},
{
"epoch": 83.69,
"learning_rate": 0.00010189664390931682,
"loss": 0.1631,
"step": 2720
},
{
"epoch": 83.69,
"eval_loss": 0.19463180005550385,
"eval_runtime": 26.7406,
"eval_samples_per_second": 17.277,
"eval_steps_per_second": 0.561,
"step": 2720
},
{
"epoch": 84.31,
"learning_rate": 0.00010049174999873823,
"loss": 0.162,
"step": 2740
},
{
"epoch": 84.31,
"eval_loss": 0.1935625970363617,
"eval_runtime": 26.7779,
"eval_samples_per_second": 17.253,
"eval_steps_per_second": 0.56,
"step": 2740
},
{
"epoch": 84.92,
"learning_rate": 9.908675901629543e-05,
"loss": 0.1604,
"step": 2760
},
{
"epoch": 84.92,
"eval_loss": 0.19222331047058105,
"eval_runtime": 26.7706,
"eval_samples_per_second": 17.258,
"eval_steps_per_second": 0.56,
"step": 2760
},
{
"epoch": 85.54,
"learning_rate": 9.768194830839252e-05,
"loss": 0.1598,
"step": 2780
},
{
"epoch": 85.54,
"eval_loss": 0.19124871492385864,
"eval_runtime": 26.778,
"eval_samples_per_second": 17.253,
"eval_steps_per_second": 0.56,
"step": 2780
},
{
"epoch": 86.15,
"learning_rate": 9.627759518584733e-05,
"loss": 0.1583,
"step": 2800
},
{
"epoch": 86.15,
"eval_loss": 0.19053253531455994,
"eval_runtime": 26.7797,
"eval_samples_per_second": 17.252,
"eval_steps_per_second": 0.56,
"step": 2800
},
{
"epoch": 86.77,
"learning_rate": 9.487397686914985e-05,
"loss": 0.1581,
"step": 2820
},
{
"epoch": 86.77,
"eval_loss": 0.18871891498565674,
"eval_runtime": 26.767,
"eval_samples_per_second": 17.26,
"eval_steps_per_second": 0.56,
"step": 2820
},
{
"epoch": 87.38,
"learning_rate": 9.347137043373885e-05,
"loss": 0.1569,
"step": 2840
},
{
"epoch": 87.38,
"eval_loss": 0.18785762786865234,
"eval_runtime": 26.7805,
"eval_samples_per_second": 17.251,
"eval_steps_per_second": 0.56,
"step": 2840
},
{
"epoch": 88.0,
"learning_rate": 9.20700527553069e-05,
"loss": 0.1553,
"step": 2860
},
{
"epoch": 88.0,
"eval_loss": 0.18669484555721283,
"eval_runtime": 26.7742,
"eval_samples_per_second": 17.255,
"eval_steps_per_second": 0.56,
"step": 2860
},
{
"epoch": 88.62,
"learning_rate": 9.067030045514476e-05,
"loss": 0.154,
"step": 2880
},
{
"epoch": 88.62,
"eval_loss": 0.1860661506652832,
"eval_runtime": 26.7794,
"eval_samples_per_second": 17.252,
"eval_steps_per_second": 0.56,
"step": 2880
},
{
"epoch": 89.23,
"learning_rate": 8.927238984553626e-05,
"loss": 0.1549,
"step": 2900
},
{
"epoch": 89.23,
"eval_loss": 0.18506208062171936,
"eval_runtime": 26.7725,
"eval_samples_per_second": 17.257,
"eval_steps_per_second": 0.56,
"step": 2900
},
{
"epoch": 89.85,
"learning_rate": 8.787659687521403e-05,
"loss": 0.1528,
"step": 2920
},
{
"epoch": 89.85,
"eval_loss": 0.18385158479213715,
"eval_runtime": 26.763,
"eval_samples_per_second": 17.263,
"eval_steps_per_second": 0.56,
"step": 2920
},
{
"epoch": 90.46,
"learning_rate": 8.648319707488682e-05,
"loss": 0.1523,
"step": 2940
},
{
"epoch": 90.46,
"eval_loss": 0.18269173800945282,
"eval_runtime": 26.7762,
"eval_samples_per_second": 17.254,
"eval_steps_per_second": 0.56,
"step": 2940
},
{
"epoch": 91.08,
"learning_rate": 8.509246550284961e-05,
"loss": 0.1513,
"step": 2960
},
{
"epoch": 91.08,
"eval_loss": 0.18222320079803467,
"eval_runtime": 26.788,
"eval_samples_per_second": 17.247,
"eval_steps_per_second": 0.56,
"step": 2960
},
{
"epoch": 91.69,
"learning_rate": 8.37046766906869e-05,
"loss": 0.1503,
"step": 2980
},
{
"epoch": 91.69,
"eval_loss": 0.18125151097774506,
"eval_runtime": 26.7603,
"eval_samples_per_second": 17.264,
"eval_steps_per_second": 0.561,
"step": 2980
},
{
"epoch": 92.31,
"learning_rate": 8.232010458907992e-05,
"loss": 0.1502,
"step": 3000
},
{
"epoch": 92.31,
"eval_loss": 0.1806618869304657,
"eval_runtime": 26.7921,
"eval_samples_per_second": 17.244,
"eval_steps_per_second": 0.56,
"step": 3000
},
{
"epoch": 92.92,
"learning_rate": 8.093902251372853e-05,
"loss": 0.1481,
"step": 3020
},
{
"epoch": 92.92,
"eval_loss": 0.17959125339984894,
"eval_runtime": 26.7772,
"eval_samples_per_second": 17.254,
"eval_steps_per_second": 0.56,
"step": 3020
},
{
"epoch": 93.54,
"learning_rate": 7.956170309139842e-05,
"loss": 0.1475,
"step": 3040
},
{
"epoch": 93.54,
"eval_loss": 0.1786828190088272,
"eval_runtime": 26.773,
"eval_samples_per_second": 17.256,
"eval_steps_per_second": 0.56,
"step": 3040
},
{
"epoch": 94.15,
"learning_rate": 7.825698244184431e-05,
"loss": 0.1469,
"step": 3060
},
{
"epoch": 94.15,
"eval_loss": 0.17820928990840912,
"eval_runtime": 26.7735,
"eval_samples_per_second": 17.256,
"eval_steps_per_second": 0.56,
"step": 3060
},
{
"epoch": 94.77,
"learning_rate": 7.68877814745228e-05,
"loss": 0.1472,
"step": 3080
},
{
"epoch": 94.77,
"eval_loss": 0.17709802091121674,
"eval_runtime": 26.7597,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 3080
},
{
"epoch": 95.38,
"learning_rate": 7.552314287861831e-05,
"loss": 0.1461,
"step": 3100
},
{
"epoch": 95.38,
"eval_loss": 0.1761600822210312,
"eval_runtime": 26.7646,
"eval_samples_per_second": 17.262,
"eval_steps_per_second": 0.56,
"step": 3100
},
{
"epoch": 96.0,
"learning_rate": 7.416333603493977e-05,
"loss": 0.145,
"step": 3120
},
{
"epoch": 96.0,
"eval_loss": 0.17534850537776947,
"eval_runtime": 26.7471,
"eval_samples_per_second": 17.273,
"eval_steps_per_second": 0.561,
"step": 3120
},
{
"epoch": 96.62,
"learning_rate": 7.280862937050435e-05,
"loss": 0.143,
"step": 3140
},
{
"epoch": 96.62,
"eval_loss": 0.1751878708600998,
"eval_runtime": 26.7568,
"eval_samples_per_second": 17.267,
"eval_steps_per_second": 0.561,
"step": 3140
},
{
"epoch": 97.23,
"learning_rate": 7.152662566194701e-05,
"loss": 0.1436,
"step": 3160
},
{
"epoch": 97.23,
"eval_loss": 0.17516781389713287,
"eval_runtime": 26.8034,
"eval_samples_per_second": 17.237,
"eval_steps_per_second": 0.56,
"step": 3160
},
{
"epoch": 97.85,
"learning_rate": 7.018263255002402e-05,
"loss": 0.1426,
"step": 3180
},
{
"epoch": 97.85,
"eval_loss": 0.17369630932807922,
"eval_runtime": 26.781,
"eval_samples_per_second": 17.251,
"eval_steps_per_second": 0.56,
"step": 3180
},
{
"epoch": 98.46,
"learning_rate": 6.884452541156719e-05,
"loss": 0.1427,
"step": 3200
},
{
"epoch": 98.46,
"eval_loss": 0.17294321954250336,
"eval_runtime": 26.7722,
"eval_samples_per_second": 17.257,
"eval_steps_per_second": 0.56,
"step": 3200
},
{
"epoch": 99.08,
"learning_rate": 6.751256839005342e-05,
"loss": 0.142,
"step": 3220
},
{
"epoch": 99.08,
"eval_loss": 0.17208707332611084,
"eval_runtime": 26.7495,
"eval_samples_per_second": 17.271,
"eval_steps_per_second": 0.561,
"step": 3220
},
{
"epoch": 99.69,
"learning_rate": 6.625314525914243e-05,
"loss": 0.1411,
"step": 3240
},
{
"epoch": 99.69,
"eval_loss": 0.17149858176708221,
"eval_runtime": 26.756,
"eval_samples_per_second": 17.267,
"eval_steps_per_second": 0.561,
"step": 3240
},
{
"epoch": 100.31,
"learning_rate": 6.493393606401967e-05,
"loss": 0.1406,
"step": 3260
},
{
"epoch": 100.31,
"eval_loss": 0.1708817183971405,
"eval_runtime": 26.7681,
"eval_samples_per_second": 17.259,
"eval_steps_per_second": 0.56,
"step": 3260
},
{
"epoch": 100.92,
"learning_rate": 6.36216489394732e-05,
"loss": 0.1403,
"step": 3280
},
{
"epoch": 100.92,
"eval_loss": 0.16994836926460266,
"eval_runtime": 26.7644,
"eval_samples_per_second": 17.262,
"eval_steps_per_second": 0.56,
"step": 3280
},
{
"epoch": 101.54,
"learning_rate": 6.231654293208744e-05,
"loss": 0.1401,
"step": 3300
},
{
"epoch": 101.54,
"eval_loss": 0.16944177448749542,
"eval_runtime": 26.7612,
"eval_samples_per_second": 17.264,
"eval_steps_per_second": 0.561,
"step": 3300
},
{
"epoch": 102.15,
"learning_rate": 6.101887567088831e-05,
"loss": 0.1377,
"step": 3320
},
{
"epoch": 102.15,
"eval_loss": 0.16865964233875275,
"eval_runtime": 26.7843,
"eval_samples_per_second": 17.249,
"eval_steps_per_second": 0.56,
"step": 3320
},
{
"epoch": 102.77,
"learning_rate": 5.972890331648686e-05,
"loss": 0.1383,
"step": 3340
},
{
"epoch": 102.77,
"eval_loss": 0.16790008544921875,
"eval_runtime": 26.7761,
"eval_samples_per_second": 17.254,
"eval_steps_per_second": 0.56,
"step": 3340
},
{
"epoch": 103.38,
"learning_rate": 5.8446880510513144e-05,
"loss": 0.1378,
"step": 3360
},
{
"epoch": 103.38,
"eval_loss": 0.1674834042787552,
"eval_runtime": 26.7598,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 3360
},
{
"epoch": 104.0,
"learning_rate": 5.717306032534962e-05,
"loss": 0.1372,
"step": 3380
},
{
"epoch": 104.0,
"eval_loss": 0.16623561084270477,
"eval_runtime": 26.77,
"eval_samples_per_second": 17.258,
"eval_steps_per_second": 0.56,
"step": 3380
},
{
"epoch": 104.62,
"learning_rate": 5.5907694214174344e-05,
"loss": 0.1362,
"step": 3400
},
{
"epoch": 104.62,
"eval_loss": 0.16605305671691895,
"eval_runtime": 26.7686,
"eval_samples_per_second": 17.259,
"eval_steps_per_second": 0.56,
"step": 3400
},
{
"epoch": 105.23,
"learning_rate": 5.4651031961324364e-05,
"loss": 0.1343,
"step": 3420
},
{
"epoch": 105.23,
"eval_loss": 0.16553008556365967,
"eval_runtime": 26.7736,
"eval_samples_per_second": 17.256,
"eval_steps_per_second": 0.56,
"step": 3420
},
{
"epoch": 105.85,
"learning_rate": 5.3403321632987425e-05,
"loss": 0.1357,
"step": 3440
},
{
"epoch": 105.85,
"eval_loss": 0.16530947387218475,
"eval_runtime": 26.7838,
"eval_samples_per_second": 17.249,
"eval_steps_per_second": 0.56,
"step": 3440
},
{
"epoch": 106.46,
"learning_rate": 5.2164809528234015e-05,
"loss": 0.1344,
"step": 3460
},
{
"epoch": 106.46,
"eval_loss": 0.1647026091814041,
"eval_runtime": 26.7756,
"eval_samples_per_second": 17.254,
"eval_steps_per_second": 0.56,
"step": 3460
},
{
"epoch": 107.08,
"learning_rate": 5.0935740130397494e-05,
"loss": 0.1339,
"step": 3480
},
{
"epoch": 107.08,
"eval_loss": 0.16388827562332153,
"eval_runtime": 26.7918,
"eval_samples_per_second": 17.244,
"eval_steps_per_second": 0.56,
"step": 3480
},
{
"epoch": 107.69,
"learning_rate": 4.971635605881291e-05,
"loss": 0.1336,
"step": 3500
},
{
"epoch": 107.69,
"eval_loss": 0.16345228254795074,
"eval_runtime": 26.7727,
"eval_samples_per_second": 17.256,
"eval_steps_per_second": 0.56,
"step": 3500
},
{
"epoch": 108.31,
"learning_rate": 4.850689802092378e-05,
"loss": 0.1333,
"step": 3520
},
{
"epoch": 108.31,
"eval_loss": 0.16290676593780518,
"eval_runtime": 26.7927,
"eval_samples_per_second": 17.243,
"eval_steps_per_second": 0.56,
"step": 3520
},
{
"epoch": 108.92,
"learning_rate": 4.730760476476611e-05,
"loss": 0.1332,
"step": 3540
},
{
"epoch": 108.92,
"eval_loss": 0.1624392867088318,
"eval_runtime": 26.7947,
"eval_samples_per_second": 17.242,
"eval_steps_per_second": 0.56,
"step": 3540
},
{
"epoch": 109.54,
"learning_rate": 4.611871303183952e-05,
"loss": 0.1322,
"step": 3560
},
{
"epoch": 109.54,
"eval_loss": 0.1618904024362564,
"eval_runtime": 26.7757,
"eval_samples_per_second": 17.254,
"eval_steps_per_second": 0.56,
"step": 3560
},
{
"epoch": 110.15,
"learning_rate": 4.4940457510374136e-05,
"loss": 0.1327,
"step": 3580
},
{
"epoch": 110.15,
"eval_loss": 0.16122287511825562,
"eval_runtime": 26.8152,
"eval_samples_per_second": 17.229,
"eval_steps_per_second": 0.559,
"step": 3580
},
{
"epoch": 110.77,
"learning_rate": 4.3773070789003026e-05,
"loss": 0.1311,
"step": 3600
},
{
"epoch": 110.77,
"eval_loss": 0.16065527498722076,
"eval_runtime": 26.7937,
"eval_samples_per_second": 17.243,
"eval_steps_per_second": 0.56,
"step": 3600
},
{
"epoch": 111.38,
"learning_rate": 4.261678331084884e-05,
"loss": 0.132,
"step": 3620
},
{
"epoch": 111.38,
"eval_loss": 0.16051311790943146,
"eval_runtime": 26.7716,
"eval_samples_per_second": 17.257,
"eval_steps_per_second": 0.56,
"step": 3620
},
{
"epoch": 112.0,
"learning_rate": 4.147182332803439e-05,
"loss": 0.131,
"step": 3640
},
{
"epoch": 112.0,
"eval_loss": 0.16002397239208221,
"eval_runtime": 26.809,
"eval_samples_per_second": 17.233,
"eval_steps_per_second": 0.56,
"step": 3640
},
{
"epoch": 112.62,
"learning_rate": 4.0338416856625294e-05,
"loss": 0.1298,
"step": 3660
},
{
"epoch": 112.62,
"eval_loss": 0.15938027203083038,
"eval_runtime": 26.7668,
"eval_samples_per_second": 17.26,
"eval_steps_per_second": 0.56,
"step": 3660
},
{
"epoch": 113.23,
"learning_rate": 3.921678763201434e-05,
"loss": 0.13,
"step": 3680
},
{
"epoch": 113.23,
"eval_loss": 0.1591978669166565,
"eval_runtime": 26.7683,
"eval_samples_per_second": 17.259,
"eval_steps_per_second": 0.56,
"step": 3680
},
{
"epoch": 113.85,
"learning_rate": 3.810715706475575e-05,
"loss": 0.1302,
"step": 3700
},
{
"epoch": 113.85,
"eval_loss": 0.15868616104125977,
"eval_runtime": 26.7596,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 3700
},
{
"epoch": 114.46,
"learning_rate": 3.70097441968588e-05,
"loss": 0.1292,
"step": 3720
},
{
"epoch": 114.46,
"eval_loss": 0.1582469791173935,
"eval_runtime": 26.8259,
"eval_samples_per_second": 17.222,
"eval_steps_per_second": 0.559,
"step": 3720
},
{
"epoch": 115.08,
"learning_rate": 3.592476565854854e-05,
"loss": 0.1284,
"step": 3740
},
{
"epoch": 115.08,
"eval_loss": 0.15772594511508942,
"eval_runtime": 26.7663,
"eval_samples_per_second": 17.261,
"eval_steps_per_second": 0.56,
"step": 3740
},
{
"epoch": 115.69,
"learning_rate": 3.485243562550297e-05,
"loss": 0.1278,
"step": 3760
},
{
"epoch": 115.69,
"eval_loss": 0.1572510004043579,
"eval_runtime": 26.8195,
"eval_samples_per_second": 17.226,
"eval_steps_per_second": 0.559,
"step": 3760
},
{
"epoch": 116.31,
"learning_rate": 3.379296577657434e-05,
"loss": 0.1281,
"step": 3780
},
{
"epoch": 116.31,
"eval_loss": 0.156888946890831,
"eval_runtime": 26.7809,
"eval_samples_per_second": 17.251,
"eval_steps_per_second": 0.56,
"step": 3780
},
{
"epoch": 116.92,
"learning_rate": 3.2746565252003815e-05,
"loss": 0.1277,
"step": 3800
},
{
"epoch": 116.92,
"eval_loss": 0.15669023990631104,
"eval_runtime": 26.7251,
"eval_samples_per_second": 17.287,
"eval_steps_per_second": 0.561,
"step": 3800
},
{
"epoch": 117.54,
"learning_rate": 3.1713440612136924e-05,
"loss": 0.1266,
"step": 3820
},
{
"epoch": 117.54,
"eval_loss": 0.1565510779619217,
"eval_runtime": 26.7847,
"eval_samples_per_second": 17.249,
"eval_steps_per_second": 0.56,
"step": 3820
},
{
"epoch": 118.15,
"learning_rate": 3.069379579664835e-05,
"loss": 0.1279,
"step": 3840
},
{
"epoch": 118.15,
"eval_loss": 0.15575794875621796,
"eval_runtime": 26.785,
"eval_samples_per_second": 17.248,
"eval_steps_per_second": 0.56,
"step": 3840
},
{
"epoch": 118.77,
"learning_rate": 2.9737802267115754e-05,
"loss": 0.1261,
"step": 3860
},
{
"epoch": 118.77,
"eval_loss": 0.15544316172599792,
"eval_runtime": 26.805,
"eval_samples_per_second": 17.236,
"eval_steps_per_second": 0.56,
"step": 3860
},
{
"epoch": 119.38,
"learning_rate": 2.8745019577809483e-05,
"loss": 0.1271,
"step": 3880
},
{
"epoch": 119.38,
"eval_loss": 0.15522195398807526,
"eval_runtime": 26.7839,
"eval_samples_per_second": 17.249,
"eval_steps_per_second": 0.56,
"step": 3880
},
{
"epoch": 120.0,
"learning_rate": 2.7766302681695688e-05,
"loss": 0.1263,
"step": 3900
},
{
"epoch": 120.0,
"eval_loss": 0.1547752469778061,
"eval_runtime": 26.8002,
"eval_samples_per_second": 17.239,
"eval_steps_per_second": 0.56,
"step": 3900
},
{
"epoch": 120.62,
"learning_rate": 2.6801844778314467e-05,
"loss": 0.1254,
"step": 3920
},
{
"epoch": 120.62,
"eval_loss": 0.15464647114276886,
"eval_runtime": 26.8029,
"eval_samples_per_second": 17.237,
"eval_steps_per_second": 0.56,
"step": 3920
},
{
"epoch": 121.23,
"learning_rate": 2.5851836252468897e-05,
"loss": 0.1255,
"step": 3940
},
{
"epoch": 121.23,
"eval_loss": 0.15429826080799103,
"eval_runtime": 26.8105,
"eval_samples_per_second": 17.232,
"eval_steps_per_second": 0.559,
"step": 3940
},
{
"epoch": 121.85,
"learning_rate": 2.491646463664261e-05,
"loss": 0.1261,
"step": 3960
},
{
"epoch": 121.85,
"eval_loss": 0.1540435552597046,
"eval_runtime": 26.8082,
"eval_samples_per_second": 17.234,
"eval_steps_per_second": 0.56,
"step": 3960
},
{
"epoch": 122.46,
"learning_rate": 2.399591457398106e-05,
"loss": 0.1257,
"step": 3980
},
{
"epoch": 122.46,
"eval_loss": 0.15359282493591309,
"eval_runtime": 26.7751,
"eval_samples_per_second": 17.255,
"eval_steps_per_second": 0.56,
"step": 3980
},
{
"epoch": 123.08,
"learning_rate": 2.3090367781842413e-05,
"loss": 0.1246,
"step": 4000
},
{
"epoch": 123.08,
"eval_loss": 0.15332242846488953,
"eval_runtime": 26.7777,
"eval_samples_per_second": 17.253,
"eval_steps_per_second": 0.56,
"step": 4000
},
{
"epoch": 123.69,
"learning_rate": 2.2200003015926705e-05,
"loss": 0.1247,
"step": 4020
},
{
"epoch": 123.69,
"eval_loss": 0.15318149328231812,
"eval_runtime": 26.7776,
"eval_samples_per_second": 17.253,
"eval_steps_per_second": 0.56,
"step": 4020
},
{
"epoch": 124.31,
"learning_rate": 2.1324996034989165e-05,
"loss": 0.1252,
"step": 4040
},
{
"epoch": 124.31,
"eval_loss": 0.15291614830493927,
"eval_runtime": 26.7677,
"eval_samples_per_second": 17.26,
"eval_steps_per_second": 0.56,
"step": 4040
},
{
"epoch": 124.92,
"learning_rate": 2.046551956614534e-05,
"loss": 0.1249,
"step": 4060
},
{
"epoch": 124.92,
"eval_loss": 0.15260463953018188,
"eval_runtime": 26.7848,
"eval_samples_per_second": 17.249,
"eval_steps_per_second": 0.56,
"step": 4060
},
{
"epoch": 125.54,
"learning_rate": 1.9621743270774597e-05,
"loss": 0.1242,
"step": 4080
},
{
"epoch": 125.54,
"eval_loss": 0.15242013335227966,
"eval_runtime": 26.7819,
"eval_samples_per_second": 17.25,
"eval_steps_per_second": 0.56,
"step": 4080
},
{
"epoch": 126.15,
"learning_rate": 1.8793833711028773e-05,
"loss": 0.1239,
"step": 4100
},
{
"epoch": 126.15,
"eval_loss": 0.1519923359155655,
"eval_runtime": 26.789,
"eval_samples_per_second": 17.246,
"eval_steps_per_second": 0.56,
"step": 4100
},
{
"epoch": 126.77,
"learning_rate": 1.7981954316952786e-05,
"loss": 0.1231,
"step": 4120
},
{
"epoch": 126.77,
"eval_loss": 0.15172038972377777,
"eval_runtime": 26.788,
"eval_samples_per_second": 17.246,
"eval_steps_per_second": 0.56,
"step": 4120
},
{
"epoch": 127.38,
"learning_rate": 1.718626535422332e-05,
"loss": 0.1235,
"step": 4140
},
{
"epoch": 127.38,
"eval_loss": 0.15152348577976227,
"eval_runtime": 26.7862,
"eval_samples_per_second": 17.248,
"eval_steps_per_second": 0.56,
"step": 4140
},
{
"epoch": 128.0,
"learning_rate": 1.6406923892512284e-05,
"loss": 0.123,
"step": 4160
},
{
"epoch": 128.0,
"eval_loss": 0.151360422372818,
"eval_runtime": 26.8008,
"eval_samples_per_second": 17.238,
"eval_steps_per_second": 0.56,
"step": 4160
},
{
"epoch": 128.62,
"learning_rate": 1.5644083774481043e-05,
"loss": 0.123,
"step": 4180
},
{
"epoch": 128.62,
"eval_loss": 0.1512284278869629,
"eval_runtime": 26.7984,
"eval_samples_per_second": 17.24,
"eval_steps_per_second": 0.56,
"step": 4180
},
{
"epoch": 129.23,
"learning_rate": 1.489789558541187e-05,
"loss": 0.1235,
"step": 4200
},
{
"epoch": 129.23,
"eval_loss": 0.15105997025966644,
"eval_runtime": 26.7894,
"eval_samples_per_second": 17.246,
"eval_steps_per_second": 0.56,
"step": 4200
},
{
"epoch": 129.85,
"learning_rate": 1.4168506623482202e-05,
"loss": 0.1222,
"step": 4220
},
{
"epoch": 129.85,
"eval_loss": 0.15094506740570068,
"eval_runtime": 26.7743,
"eval_samples_per_second": 17.255,
"eval_steps_per_second": 0.56,
"step": 4220
},
{
"epoch": 130.46,
"learning_rate": 1.3456060870687937e-05,
"loss": 0.1221,
"step": 4240
},
{
"epoch": 130.46,
"eval_loss": 0.1505899429321289,
"eval_runtime": 26.7534,
"eval_samples_per_second": 17.269,
"eval_steps_per_second": 0.561,
"step": 4240
},
{
"epoch": 131.08,
"learning_rate": 1.2760698964421091e-05,
"loss": 0.1212,
"step": 4260
},
{
"epoch": 131.08,
"eval_loss": 0.15049409866333008,
"eval_runtime": 26.7596,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 4260
},
{
"epoch": 131.69,
"learning_rate": 1.2082558169708081e-05,
"loss": 0.122,
"step": 4280
},
{
"epoch": 131.69,
"eval_loss": 0.15041232109069824,
"eval_runtime": 26.7564,
"eval_samples_per_second": 17.267,
"eval_steps_per_second": 0.561,
"step": 4280
},
{
"epoch": 132.31,
"learning_rate": 1.1421772352113336e-05,
"loss": 0.1225,
"step": 4300
},
{
"epoch": 132.31,
"eval_loss": 0.1501646488904953,
"eval_runtime": 26.7249,
"eval_samples_per_second": 17.287,
"eval_steps_per_second": 0.561,
"step": 4300
},
{
"epoch": 132.92,
"learning_rate": 1.0778471951314229e-05,
"loss": 0.1213,
"step": 4320
},
{
"epoch": 132.92,
"eval_loss": 0.15006287395954132,
"eval_runtime": 26.7285,
"eval_samples_per_second": 17.285,
"eval_steps_per_second": 0.561,
"step": 4320
},
{
"epoch": 133.54,
"learning_rate": 1.015278395535203e-05,
"loss": 0.1225,
"step": 4340
},
{
"epoch": 133.54,
"eval_loss": 0.14982885122299194,
"eval_runtime": 26.7639,
"eval_samples_per_second": 17.262,
"eval_steps_per_second": 0.56,
"step": 4340
},
{
"epoch": 134.15,
"learning_rate": 9.54483187556453e-06,
"loss": 0.1219,
"step": 4360
},
{
"epoch": 134.15,
"eval_loss": 0.14970383048057556,
"eval_runtime": 26.7778,
"eval_samples_per_second": 17.253,
"eval_steps_per_second": 0.56,
"step": 4360
},
{
"epoch": 134.77,
"learning_rate": 8.954735722204689e-06,
"loss": 0.1213,
"step": 4380
},
{
"epoch": 134.77,
"eval_loss": 0.14960302412509918,
"eval_runtime": 26.7677,
"eval_samples_per_second": 17.26,
"eval_steps_per_second": 0.56,
"step": 4380
},
{
"epoch": 135.38,
"learning_rate": 8.382611980750532e-06,
"loss": 0.1216,
"step": 4400
},
{
"epoch": 135.38,
"eval_loss": 0.14945241808891296,
"eval_runtime": 26.7712,
"eval_samples_per_second": 17.257,
"eval_steps_per_second": 0.56,
"step": 4400
},
{
"epoch": 136.0,
"learning_rate": 7.828573588910859e-06,
"loss": 0.1211,
"step": 4420
},
{
"epoch": 136.0,
"eval_loss": 0.1492658108472824,
"eval_runtime": 26.7652,
"eval_samples_per_second": 17.261,
"eval_steps_per_second": 0.56,
"step": 4420
},
{
"epoch": 136.62,
"learning_rate": 7.292729914331142e-06,
"loss": 0.1216,
"step": 4440
},
{
"epoch": 136.62,
"eval_loss": 0.14914917945861816,
"eval_runtime": 26.7705,
"eval_samples_per_second": 17.258,
"eval_steps_per_second": 0.56,
"step": 4440
},
{
"epoch": 137.23,
"learning_rate": 6.775186733004424e-06,
"loss": 0.1197,
"step": 4460
},
{
"epoch": 137.23,
"eval_loss": 0.14917601644992828,
"eval_runtime": 26.7525,
"eval_samples_per_second": 17.269,
"eval_steps_per_second": 0.561,
"step": 4460
},
{
"epoch": 137.85,
"learning_rate": 6.276046208390873e-06,
"loss": 0.1203,
"step": 4480
},
{
"epoch": 137.85,
"eval_loss": 0.14903923869132996,
"eval_runtime": 26.762,
"eval_samples_per_second": 17.263,
"eval_steps_per_second": 0.56,
"step": 4480
},
{
"epoch": 138.46,
"learning_rate": 5.795406871250797e-06,
"loss": 0.1209,
"step": 4500
},
{
"epoch": 138.46,
"eval_loss": 0.14884509146213531,
"eval_runtime": 26.758,
"eval_samples_per_second": 17.266,
"eval_steps_per_second": 0.561,
"step": 4500
},
{
"epoch": 139.08,
"learning_rate": 5.333363600194396e-06,
"loss": 0.1197,
"step": 4520
},
{
"epoch": 139.08,
"eval_loss": 0.14882220327854156,
"eval_runtime": 26.765,
"eval_samples_per_second": 17.261,
"eval_steps_per_second": 0.56,
"step": 4520
},
{
"epoch": 139.69,
"learning_rate": 4.890007602952828e-06,
"loss": 0.1202,
"step": 4540
},
{
"epoch": 139.69,
"eval_loss": 0.1487365961074829,
"eval_runtime": 26.7652,
"eval_samples_per_second": 17.261,
"eval_steps_per_second": 0.56,
"step": 4540
},
{
"epoch": 140.31,
"learning_rate": 4.46542639837364e-06,
"loss": 0.121,
"step": 4560
},
{
"epoch": 140.31,
"eval_loss": 0.1486121267080307,
"eval_runtime": 26.748,
"eval_samples_per_second": 17.272,
"eval_steps_per_second": 0.561,
"step": 4560
},
{
"epoch": 140.92,
"learning_rate": 4.059703799144476e-06,
"loss": 0.1202,
"step": 4580
},
{
"epoch": 140.92,
"eval_loss": 0.1485925018787384,
"eval_runtime": 26.769,
"eval_samples_per_second": 17.259,
"eval_steps_per_second": 0.56,
"step": 4580
},
{
"epoch": 141.54,
"learning_rate": 3.6729198952483724e-06,
"loss": 0.1194,
"step": 4600
},
{
"epoch": 141.54,
"eval_loss": 0.14861957728862762,
"eval_runtime": 26.7652,
"eval_samples_per_second": 17.261,
"eval_steps_per_second": 0.56,
"step": 4600
},
{
"epoch": 142.15,
"learning_rate": 3.305151038153964e-06,
"loss": 0.1199,
"step": 4620
},
{
"epoch": 142.15,
"eval_loss": 0.1484871208667755,
"eval_runtime": 26.7519,
"eval_samples_per_second": 17.27,
"eval_steps_per_second": 0.561,
"step": 4620
},
{
"epoch": 142.77,
"learning_rate": 2.956469825743613e-06,
"loss": 0.1201,
"step": 4640
},
{
"epoch": 142.77,
"eval_loss": 0.14845435321331024,
"eval_runtime": 26.7438,
"eval_samples_per_second": 17.275,
"eval_steps_per_second": 0.561,
"step": 4640
},
{
"epoch": 143.38,
"learning_rate": 2.6269450879825243e-06,
"loss": 0.1198,
"step": 4660
},
{
"epoch": 143.38,
"eval_loss": 0.1484329104423523,
"eval_runtime": 26.7558,
"eval_samples_per_second": 17.267,
"eval_steps_per_second": 0.561,
"step": 4660
},
{
"epoch": 144.0,
"learning_rate": 2.316641873331704e-06,
"loss": 0.12,
"step": 4680
},
{
"epoch": 144.0,
"eval_loss": 0.14837703108787537,
"eval_runtime": 26.7449,
"eval_samples_per_second": 17.274,
"eval_steps_per_second": 0.561,
"step": 4680
},
{
"epoch": 144.62,
"learning_rate": 2.025621435907221e-06,
"loss": 0.1197,
"step": 4700
},
{
"epoch": 144.62,
"eval_loss": 0.14830969274044037,
"eval_runtime": 26.7686,
"eval_samples_per_second": 17.259,
"eval_steps_per_second": 0.56,
"step": 4700
},
{
"epoch": 145.23,
"learning_rate": 1.753941223388733e-06,
"loss": 0.1195,
"step": 4720
},
{
"epoch": 145.23,
"eval_loss": 0.14829565584659576,
"eval_runtime": 26.7574,
"eval_samples_per_second": 17.266,
"eval_steps_per_second": 0.561,
"step": 4720
},
{
"epoch": 145.85,
"learning_rate": 1.5016548656791697e-06,
"loss": 0.1206,
"step": 4740
},
{
"epoch": 145.85,
"eval_loss": 0.1482698619365692,
"eval_runtime": 26.744,
"eval_samples_per_second": 17.275,
"eval_steps_per_second": 0.561,
"step": 4740
},
{
"epoch": 146.46,
"learning_rate": 1.2688121643181893e-06,
"loss": 0.1211,
"step": 4760
},
{
"epoch": 146.46,
"eval_loss": 0.1482834368944168,
"eval_runtime": 26.7646,
"eval_samples_per_second": 17.262,
"eval_steps_per_second": 0.56,
"step": 4760
},
{
"epoch": 147.08,
"learning_rate": 1.0554590826512778e-06,
"loss": 0.1196,
"step": 4780
},
{
"epoch": 147.08,
"eval_loss": 0.14823544025421143,
"eval_runtime": 26.7343,
"eval_samples_per_second": 17.281,
"eval_steps_per_second": 0.561,
"step": 4780
},
{
"epoch": 147.69,
"learning_rate": 8.61637736756582e-07,
"loss": 0.1197,
"step": 4800
},
{
"epoch": 147.69,
"eval_loss": 0.1482395976781845,
"eval_runtime": 26.7411,
"eval_samples_per_second": 17.277,
"eval_steps_per_second": 0.561,
"step": 4800
},
{
"epoch": 148.31,
"learning_rate": 6.873863871311614e-07,
"loss": 0.1212,
"step": 4820
},
{
"epoch": 148.31,
"eval_loss": 0.14818722009658813,
"eval_runtime": 26.7764,
"eval_samples_per_second": 17.254,
"eval_steps_per_second": 0.56,
"step": 4820
},
{
"epoch": 148.92,
"learning_rate": 5.32739431138285e-07,
"loss": 0.1205,
"step": 4840
},
{
"epoch": 148.92,
"eval_loss": 0.14816446602344513,
"eval_runtime": 26.7592,
"eval_samples_per_second": 17.265,
"eval_steps_per_second": 0.561,
"step": 4840
}
],
"logging_steps": 20,
"max_steps": 4968,
"num_input_tokens_seen": 0,
"num_train_epochs": 156,
"save_steps": 20,
"total_flos": 9998909350871040.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}