pythia-1b-deduped-sft / trainer_state.json
theblackcat102's picture
Upload 7 files
97c3f4f
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3287175905000616,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.4084967333570947e-06,
"loss": 2.5231,
"step": 10
},
{
"epoch": 0.0,
"learning_rate": 2.0507482022971233e-06,
"loss": 2.3436,
"step": 20
},
{
"epoch": 0.0,
"learning_rate": 2.385606273598312e-06,
"loss": 2.22,
"step": 30
},
{
"epoch": 0.01,
"learning_rate": 2.6136695401116585e-06,
"loss": 2.1055,
"step": 40
},
{
"epoch": 0.01,
"learning_rate": 2.7868297632261957e-06,
"loss": 2.1275,
"step": 50
},
{
"epoch": 0.01,
"learning_rate": 2.926458092787486e-06,
"loss": 2.0425,
"step": 60
},
{
"epoch": 0.01,
"learning_rate": 3.0434580045013773e-06,
"loss": 2.0407,
"step": 70
},
{
"epoch": 0.01,
"learning_rate": 3.1441512086208035e-06,
"loss": 2.0558,
"step": 80
},
{
"epoch": 0.01,
"learning_rate": 3.232532087697698e-06,
"loss": 1.9887,
"step": 90
},
{
"epoch": 0.02,
"learning_rate": 3.3112862237770753e-06,
"loss": 1.9845,
"step": 100
},
{
"epoch": 0.02,
"learning_rate": 3.3823062961420163e-06,
"loss": 1.9856,
"step": 110
},
{
"epoch": 0.02,
"learning_rate": 3.446976436243603e-06,
"loss": 1.9968,
"step": 120
},
{
"epoch": 0.02,
"learning_rate": 3.506339534926595e-06,
"loss": 1.9247,
"step": 130
},
{
"epoch": 0.02,
"learning_rate": 3.5612009452606784e-06,
"loss": 1.9817,
"step": 140
},
{
"epoch": 0.02,
"learning_rate": 3.612195557913627e-06,
"loss": 1.9644,
"step": 150
},
{
"epoch": 0.03,
"learning_rate": 3.65983275401539e-06,
"loss": 1.9639,
"step": 160
},
{
"epoch": 0.03,
"learning_rate": 3.7045274519126395e-06,
"loss": 1.9587,
"step": 170
},
{
"epoch": 0.03,
"learning_rate": 3.7466221106030114e-06,
"loss": 1.9849,
"step": 180
},
{
"epoch": 0.03,
"learning_rate": 3.786402677560832e-06,
"loss": 1.9745,
"step": 190
},
{
"epoch": 0.03,
"learning_rate": 3.824110376935989e-06,
"loss": 1.9429,
"step": 200
},
{
"epoch": 0.03,
"learning_rate": 3.8599505757615295e-06,
"loss": 1.9484,
"step": 210
},
{
"epoch": 0.04,
"learning_rate": 3.894099556414216e-06,
"loss": 1.9214,
"step": 220
},
{
"epoch": 0.04,
"learning_rate": 3.9267097619885385e-06,
"loss": 1.9274,
"step": 230
},
{
"epoch": 0.04,
"learning_rate": 3.95791391001684e-06,
"loss": 1.9185,
"step": 240
},
{
"epoch": 0.04,
"learning_rate": 3.987828255432777e-06,
"loss": 1.9578,
"step": 250
},
{
"epoch": 0.04,
"learning_rate": 4.016555205552159e-06,
"loss": 1.907,
"step": 260
},
{
"epoch": 0.04,
"learning_rate": 4.044185435607626e-06,
"loss": 1.9448,
"step": 270
},
{
"epoch": 0.05,
"learning_rate": 4.070799615107415e-06,
"loss": 1.8884,
"step": 280
},
{
"epoch": 0.05,
"learning_rate": 4.096469827889988e-06,
"loss": 1.9402,
"step": 290
},
{
"epoch": 0.05,
"learning_rate": 4.121260748862021e-06,
"loss": 1.9153,
"step": 300
},
{
"epoch": 0.05,
"learning_rate": 4.145230625795312e-06,
"loss": 1.9106,
"step": 310
},
{
"epoch": 0.05,
"learning_rate": 4.1684321036962525e-06,
"loss": 1.958,
"step": 320
},
{
"epoch": 0.05,
"learning_rate": 4.190912921100477e-06,
"loss": 1.9117,
"step": 330
},
{
"epoch": 0.06,
"learning_rate": 4.212716501452232e-06,
"loss": 1.9097,
"step": 340
},
{
"epoch": 0.06,
"learning_rate": 4.233882457984791e-06,
"loss": 1.9279,
"step": 350
},
{
"epoch": 0.06,
"learning_rate": 4.2544470268536555e-06,
"loss": 1.9164,
"step": 360
},
{
"epoch": 0.06,
"learning_rate": 4.27444344042015e-06,
"loss": 1.9323,
"step": 370
},
{
"epoch": 0.06,
"learning_rate": 4.293902250342989e-06,
"loss": 1.9134,
"step": 380
},
{
"epoch": 0.06,
"learning_rate": 4.312851608364853e-06,
"loss": 1.8835,
"step": 390
},
{
"epoch": 0.07,
"learning_rate": 4.3313175112718595e-06,
"loss": 1.8969,
"step": 400
},
{
"epoch": 0.07,
"learning_rate": 4.3493240153753665e-06,
"loss": 1.9238,
"step": 410
},
{
"epoch": 0.07,
"learning_rate": 4.366893424956263e-06,
"loss": 1.8946,
"step": 420
},
{
"epoch": 0.07,
"learning_rate": 4.38404645837504e-06,
"loss": 1.8781,
"step": 430
},
{
"epoch": 0.07,
"learning_rate": 4.400802394950703e-06,
"loss": 1.8955,
"step": 440
},
{
"epoch": 0.07,
"learning_rate": 4.4171792052198945e-06,
"loss": 1.8515,
"step": 450
},
{
"epoch": 0.08,
"learning_rate": 4.433193666783084e-06,
"loss": 1.8978,
"step": 460
},
{
"epoch": 0.08,
"learning_rate": 4.448861467610187e-06,
"loss": 1.889,
"step": 470
},
{
"epoch": 0.08,
"learning_rate": 4.4641972984001906e-06,
"loss": 1.8667,
"step": 480
},
{
"epoch": 0.08,
"learning_rate": 4.479214935357724e-06,
"loss": 1.9304,
"step": 490
},
{
"epoch": 0.08,
"learning_rate": 4.493927314555554e-06,
"loss": 1.9042,
"step": 500
},
{
"epoch": 0.08,
"eval_gsm8k_hard_accuracy": 0.8928993119172672,
"eval_gsm8k_hard_loss": 0.4951171875,
"eval_gsm8k_hard_runtime": 2.1138,
"eval_gsm8k_hard_samples_per_second": 124.893,
"eval_gsm8k_hard_steps_per_second": 8.042,
"step": 500
},
{
"epoch": 0.08,
"eval_webgpt_accuracy": 0.4654303097674511,
"eval_webgpt_loss": 2.478515625,
"eval_webgpt_runtime": 13.6298,
"eval_webgpt_samples_per_second": 287.312,
"eval_webgpt_steps_per_second": 17.975,
"step": 500
},
{
"epoch": 0.08,
"eval_squad_v2_accuracy": 0.8681394546082154,
"eval_squad_v2_loss": 0.51806640625,
"eval_squad_v2_runtime": 80.2931,
"eval_squad_v2_samples_per_second": 324.611,
"eval_squad_v2_steps_per_second": 20.288,
"step": 500
},
{
"epoch": 0.08,
"eval_adversarial_qa_accuracy": 0.7833800465144016,
"eval_adversarial_qa_loss": 1.310546875,
"eval_adversarial_qa_runtime": 19.1554,
"eval_adversarial_qa_samples_per_second": 313.228,
"eval_adversarial_qa_steps_per_second": 19.577,
"step": 500
},
{
"epoch": 0.08,
"eval_private_tuning_accuracy": 0.6404945703123248,
"eval_private_tuning_loss": 1.3779296875,
"eval_private_tuning_runtime": 68.286,
"eval_private_tuning_samples_per_second": 310.137,
"eval_private_tuning_steps_per_second": 19.389,
"step": 500
},
{
"epoch": 0.08,
"eval_oa_translated_accuracy": 0.6488286598439107,
"eval_oa_translated_loss": 1.5576171875,
"eval_oa_translated_runtime": 524.5762,
"eval_oa_translated_samples_per_second": 254.918,
"eval_oa_translated_steps_per_second": 15.933,
"step": 500
},
{
"epoch": 0.08,
"eval_prosocial_dialogue_accuracy": 0.52144098641849,
"eval_prosocial_dialogue_loss": 1.90625,
"eval_prosocial_dialogue_runtime": 90.5414,
"eval_prosocial_dialogue_samples_per_second": 298.018,
"eval_prosocial_dialogue_steps_per_second": 18.632,
"step": 500
},
{
"epoch": 0.08,
"eval_math_qa_accuracy": 0.5165153098127461,
"eval_math_qa_loss": 2.20703125,
"eval_math_qa_runtime": 17.7049,
"eval_math_qa_samples_per_second": 337.082,
"eval_math_qa_steps_per_second": 21.068,
"step": 500
},
{
"epoch": 0.08,
"eval_wikihow_accuracy": 0.5831415499792042,
"eval_wikihow_loss": 2.140625,
"eval_wikihow_runtime": 9.4415,
"eval_wikihow_samples_per_second": 242.863,
"eval_wikihow_steps_per_second": 15.252,
"step": 500
},
{
"epoch": 0.08,
"eval_joke_accuracy": 0.45545868081880214,
"eval_joke_loss": 2.529296875,
"eval_joke_runtime": 1.3918,
"eval_joke_samples_per_second": 54.606,
"eval_joke_steps_per_second": 3.593,
"step": 500
},
{
"epoch": 0.08,
"eval_gsm8k_accuracy": 0.7111711283077483,
"eval_gsm8k_loss": 1.1416015625,
"eval_gsm8k_runtime": 6.0874,
"eval_gsm8k_samples_per_second": 245.588,
"eval_gsm8k_steps_per_second": 15.442,
"step": 500
},
{
"epoch": 0.08,
"eval_ted_trans_en-hi_accuracy": 0.5343350158469304,
"eval_ted_trans_en-hi_loss": 2.244140625,
"eval_ted_trans_en-hi_runtime": 1.2611,
"eval_ted_trans_en-hi_samples_per_second": 81.672,
"eval_ted_trans_en-hi_steps_per_second": 5.551,
"step": 500
},
{
"epoch": 0.08,
"eval_ted_trans_de-ja_accuracy": 0.5195722742027878,
"eval_ted_trans_de-ja_loss": 2.314453125,
"eval_ted_trans_de-ja_runtime": 3.7052,
"eval_ted_trans_de-ja_samples_per_second": 193.783,
"eval_ted_trans_de-ja_steps_per_second": 12.145,
"step": 500
},
{
"epoch": 0.08,
"eval_ted_trans_nl-en_accuracy": 0.6433630400125447,
"eval_ted_trans_nl-en_loss": 1.7353515625,
"eval_ted_trans_nl-en_runtime": 3.5777,
"eval_ted_trans_nl-en_samples_per_second": 215.5,
"eval_ted_trans_nl-en_steps_per_second": 13.696,
"step": 500
},
{
"epoch": 0.08,
"eval_ted_trans_en-ja_accuracy": 0.5440905817396176,
"eval_ted_trans_en-ja_loss": 2.13671875,
"eval_ted_trans_en-ja_runtime": 3.7065,
"eval_ted_trans_en-ja_samples_per_second": 216.109,
"eval_ted_trans_en-ja_steps_per_second": 13.76,
"step": 500
},
{
"epoch": 0.08,
"eval_ted_trans_en-es_accuracy": 0.7055326931870142,
"eval_ted_trans_en-es_loss": 1.3369140625,
"eval_ted_trans_en-es_runtime": 3.3519,
"eval_ted_trans_en-es_samples_per_second": 246.427,
"eval_ted_trans_en-es_steps_per_second": 15.514,
"step": 500
},
{
"epoch": 0.08,
"eval_ted_trans_en-ms_accuracy": 0.5517070757050965,
"eval_ted_trans_en-ms_loss": 2.32421875,
"eval_ted_trans_en-ms_runtime": 0.8373,
"eval_ted_trans_en-ms_samples_per_second": 50.159,
"eval_ted_trans_en-ms_steps_per_second": 3.583,
"step": 500
},
{
"epoch": 0.08,
"eval_xsum_accuracy": 0.5663549372439451,
"eval_xsum_loss": NaN,
"eval_xsum_runtime": 140.8411,
"eval_xsum_samples_per_second": 289.752,
"eval_xsum_steps_per_second": 18.113,
"step": 500
},
{
"epoch": 0.08,
"eval_cnn_dailymail_accuracy": 0.6501961820900207,
"eval_cnn_dailymail_loss": NaN,
"eval_cnn_dailymail_runtime": 207.5377,
"eval_cnn_dailymail_samples_per_second": 276.687,
"eval_cnn_dailymail_steps_per_second": 17.293,
"step": 500
},
{
"epoch": 0.08,
"eval_multi_news_accuracy": 0.5168385769568282,
"eval_multi_news_loss": NaN,
"eval_multi_news_runtime": 36.1577,
"eval_multi_news_samples_per_second": 248.771,
"eval_multi_news_steps_per_second": 15.571,
"step": 500
},
{
"epoch": 0.08,
"eval_tldr_news_accuracy": 0.5048904354368475,
"eval_tldr_news_loss": 2.384765625,
"eval_tldr_news_runtime": 5.9032,
"eval_tldr_news_samples_per_second": 241.901,
"eval_tldr_news_steps_per_second": 15.246,
"step": 500
},
{
"epoch": 0.08,
"eval_scitldr_accuracy": 0.5,
"eval_scitldr_loss": NaN,
"eval_scitldr_runtime": 2.3974,
"eval_scitldr_samples_per_second": 166.428,
"eval_scitldr_steps_per_second": 10.428,
"step": 500
},
{
"epoch": 0.08,
"eval_samsum_accuracy": 0.5789020336200051,
"eval_samsum_loss": 1.619140625,
"eval_samsum_runtime": 10.1073,
"eval_samsum_samples_per_second": 291.572,
"eval_samsum_steps_per_second": 18.304,
"step": 500
},
{
"epoch": 0.08,
"eval_debate_sum_accuracy": 0.9321960723793357,
"eval_debate_sum_loss": NaN,
"eval_debate_sum_runtime": 188.2954,
"eval_debate_sum_samples_per_second": 255.524,
"eval_debate_sum_steps_per_second": 15.975,
"step": 500
},
{
"epoch": 0.08,
"eval_billsum_accuracy": 0.6453599014888616,
"eval_billsum_loss": NaN,
"eval_billsum_runtime": 22.3683,
"eval_billsum_samples_per_second": 169.436,
"eval_billsum_steps_per_second": 10.595,
"step": 500
},
{
"epoch": 0.08,
"eval_wmt2019_zh-en_accuracy": 0.5524590644131345,
"eval_wmt2019_zh-en_loss": 2.146484375,
"eval_wmt2019_zh-en_runtime": 13.8635,
"eval_wmt2019_zh-en_samples_per_second": 287.158,
"eval_wmt2019_zh-en_steps_per_second": 17.961,
"step": 500
},
{
"epoch": 0.08,
"eval_wmt2019_ru-en_accuracy": 0.6308636370293347,
"eval_wmt2019_ru-en_loss": 1.580078125,
"eval_wmt2019_ru-en_runtime": 11.2038,
"eval_wmt2019_ru-en_samples_per_second": 267.766,
"eval_wmt2019_ru-en_steps_per_second": 16.78,
"step": 500
},
{
"epoch": 0.08,
"eval_wmt2019_de-en_accuracy": 0.657534107930853,
"eval_wmt2019_de-en_loss": 1.501953125,
"eval_wmt2019_de-en_runtime": 9.454,
"eval_wmt2019_de-en_samples_per_second": 317.115,
"eval_wmt2019_de-en_steps_per_second": 19.886,
"step": 500
},
{
"epoch": 0.08,
"eval_wmt2019_fr-de_accuracy": 0.6479142094481346,
"eval_wmt2019_fr-de_loss": 1.5439453125,
"eval_wmt2019_fr-de_runtime": 5.2625,
"eval_wmt2019_fr-de_samples_per_second": 287.315,
"eval_wmt2019_fr-de_steps_per_second": 18.052,
"step": 500
},
{
"epoch": 0.08,
"eval_essay_instruction_accuracy": 0.5775154438520775,
"eval_essay_instruction_loss": 2.09765625,
"eval_essay_instruction_runtime": 3.0793,
"eval_essay_instruction_samples_per_second": 134.122,
"eval_essay_instruction_steps_per_second": 8.443,
"step": 500
},
{
"epoch": 0.08,
"eval_reddit_eli5_accuracy": 0.42260973997710893,
"eval_reddit_eli5_loss": 2.76171875,
"eval_reddit_eli5_runtime": 203.2121,
"eval_reddit_eli5_samples_per_second": 268.326,
"eval_reddit_eli5_steps_per_second": 16.771,
"step": 500
},
{
"epoch": 0.08,
"eval_reddit_askh_accuracy": 0.42568767737796204,
"eval_reddit_askh_loss": 2.84375,
"eval_reddit_askh_runtime": 111.1784,
"eval_reddit_askh_samples_per_second": 177.238,
"eval_reddit_askh_steps_per_second": 11.081,
"step": 500
},
{
"epoch": 0.08,
"eval_reddit_asks_accuracy": 0.43555163138333913,
"eval_reddit_asks_loss": 2.689453125,
"eval_reddit_asks_runtime": 119.9403,
"eval_reddit_asks_samples_per_second": 219.743,
"eval_reddit_asks_steps_per_second": 13.74,
"step": 500
},
{
"epoch": 0.08,
"learning_rate": 4.5083465988888945e-06,
"loss": 1.8966,
"step": 510
},
{
"epoch": 0.09,
"learning_rate": 4.5224842384899045e-06,
"loss": 1.9039,
"step": 520
},
{
"epoch": 0.09,
"learning_rate": 4.5363510253542444e-06,
"loss": 1.9029,
"step": 530
},
{
"epoch": 0.09,
"learning_rate": 4.549957142832593e-06,
"loss": 1.8759,
"step": 540
},
{
"epoch": 0.09,
"learning_rate": 4.563312210555719e-06,
"loss": 1.9042,
"step": 550
},
{
"epoch": 0.09,
"learning_rate": 4.576425325289549e-06,
"loss": 1.9205,
"step": 560
},
{
"epoch": 0.09,
"learning_rate": 4.589305098154845e-06,
"loss": 1.9324,
"step": 570
},
{
"epoch": 0.1,
"learning_rate": 4.601959688592886e-06,
"loss": 1.8757,
"step": 580
},
{
"epoch": 0.1,
"learning_rate": 4.614396835412691e-06,
"loss": 1.895,
"step": 590
},
{
"epoch": 0.1,
"learning_rate": 4.626623885215616e-06,
"loss": 1.9004,
"step": 600
},
{
"epoch": 0.1,
"learning_rate": 4.638647818458763e-06,
"loss": 1.8705,
"step": 610
},
{
"epoch": 0.1,
"learning_rate": 4.650475273388737e-06,
"loss": 1.8929,
"step": 620
},
{
"epoch": 0.1,
"learning_rate": 4.662112568051194e-06,
"loss": 1.8745,
"step": 630
},
{
"epoch": 0.11,
"learning_rate": 4.673565720558918e-06,
"loss": 1.8768,
"step": 640
},
{
"epoch": 0.11,
"learning_rate": 4.6848404677811685e-06,
"loss": 1.885,
"step": 650
},
{
"epoch": 0.11,
"learning_rate": 4.695942282599635e-06,
"loss": 1.8496,
"step": 660
},
{
"epoch": 0.11,
"learning_rate": 4.706876389860915e-06,
"loss": 1.9061,
"step": 670
},
{
"epoch": 0.11,
"learning_rate": 4.717647781141908e-06,
"loss": 1.8839,
"step": 680
},
{
"epoch": 0.11,
"learning_rate": 4.7282612284325845e-06,
"loss": 1.921,
"step": 690
},
{
"epoch": 0.12,
"learning_rate": 4.738721296830016e-06,
"loss": 1.8519,
"step": 700
},
{
"epoch": 0.12,
"learning_rate": 4.749032356328167e-06,
"loss": 1.8901,
"step": 710
},
{
"epoch": 0.12,
"learning_rate": 4.759198592779668e-06,
"loss": 1.8678,
"step": 720
},
{
"epoch": 0.12,
"learning_rate": 4.769224018098397e-06,
"loss": 1.859,
"step": 730
},
{
"epoch": 0.12,
"learning_rate": 4.7791124797650865e-06,
"loss": 1.8315,
"step": 740
},
{
"epoch": 0.12,
"learning_rate": 4.788867669692332e-06,
"loss": 1.8915,
"step": 750
},
{
"epoch": 0.12,
"learning_rate": 4.798493132500121e-06,
"loss": 1.8936,
"step": 760
},
{
"epoch": 0.13,
"learning_rate": 4.8079922732483016e-06,
"loss": 1.8869,
"step": 770
},
{
"epoch": 0.13,
"learning_rate": 4.817368364668191e-06,
"loss": 1.8556,
"step": 780
},
{
"epoch": 0.13,
"learning_rate": 4.8266245539317745e-06,
"loss": 1.8594,
"step": 790
},
{
"epoch": 0.13,
"learning_rate": 4.835763868993521e-06,
"loss": 1.8646,
"step": 800
},
{
"epoch": 0.13,
"learning_rate": 4.844789224536785e-06,
"loss": 1.8758,
"step": 810
},
{
"epoch": 0.13,
"learning_rate": 4.853703427554027e-06,
"loss": 1.8349,
"step": 820
},
{
"epoch": 0.14,
"learning_rate": 4.862509182587578e-06,
"loss": 1.8517,
"step": 830
},
{
"epoch": 0.14,
"learning_rate": 4.871209096655434e-06,
"loss": 1.8563,
"step": 840
},
{
"epoch": 0.14,
"learning_rate": 4.879805683884512e-06,
"loss": 1.8749,
"step": 850
},
{
"epoch": 0.14,
"learning_rate": 4.888301369871998e-06,
"loss": 1.8276,
"step": 860
},
{
"epoch": 0.14,
"learning_rate": 4.8966984957936845e-06,
"loss": 1.7967,
"step": 870
},
{
"epoch": 0.14,
"learning_rate": 4.904999322276735e-06,
"loss": 1.9041,
"step": 880
},
{
"epoch": 0.15,
"learning_rate": 4.913206033052878e-06,
"loss": 1.8417,
"step": 890
},
{
"epoch": 0.15,
"learning_rate": 4.921320738406821e-06,
"loss": 1.8611,
"step": 900
},
{
"epoch": 0.15,
"learning_rate": 4.929345478433492e-06,
"loss": 1.8924,
"step": 910
},
{
"epoch": 0.15,
"learning_rate": 4.937282226116702e-06,
"loss": 1.8684,
"step": 920
},
{
"epoch": 0.15,
"learning_rate": 4.945132890240829e-06,
"loss": 1.8292,
"step": 930
},
{
"epoch": 0.15,
"learning_rate": 4.952899318146298e-06,
"loss": 1.8498,
"step": 940
},
{
"epoch": 0.16,
"learning_rate": 4.96058329833879e-06,
"loss": 1.8944,
"step": 950
},
{
"epoch": 0.16,
"learning_rate": 4.968186562961406e-06,
"loss": 1.885,
"step": 960
},
{
"epoch": 0.16,
"learning_rate": 4.975710790138337e-06,
"loss": 1.8469,
"step": 970
},
{
"epoch": 0.16,
"learning_rate": 4.9831576061979556e-06,
"loss": 1.8536,
"step": 980
},
{
"epoch": 0.16,
"learning_rate": 4.990528587782728e-06,
"loss": 1.8514,
"step": 990
},
{
"epoch": 0.16,
"learning_rate": 4.99782526385276e-06,
"loss": 1.8168,
"step": 1000
},
{
"epoch": 0.16,
"eval_gsm8k_hard_accuracy": 0.9009201579740238,
"eval_gsm8k_hard_loss": 0.44189453125,
"eval_gsm8k_hard_runtime": 1.5125,
"eval_gsm8k_hard_samples_per_second": 174.543,
"eval_gsm8k_hard_steps_per_second": 11.24,
"step": 1000
},
{
"epoch": 0.16,
"eval_webgpt_accuracy": 0.4676998865211623,
"eval_webgpt_loss": 2.44921875,
"eval_webgpt_runtime": 15.1394,
"eval_webgpt_samples_per_second": 258.664,
"eval_webgpt_steps_per_second": 16.183,
"step": 1000
},
{
"epoch": 0.16,
"eval_squad_v2_accuracy": 0.8740146386480032,
"eval_squad_v2_loss": 0.456298828125,
"eval_squad_v2_runtime": 77.6449,
"eval_squad_v2_samples_per_second": 335.682,
"eval_squad_v2_steps_per_second": 20.98,
"step": 1000
},
{
"epoch": 0.16,
"eval_adversarial_qa_accuracy": 0.7841552865406405,
"eval_adversarial_qa_loss": 1.16796875,
"eval_adversarial_qa_runtime": 20.3196,
"eval_adversarial_qa_samples_per_second": 295.282,
"eval_adversarial_qa_steps_per_second": 18.455,
"step": 1000
},
{
"epoch": 0.16,
"eval_private_tuning_accuracy": 0.6468452789452516,
"eval_private_tuning_loss": 1.33203125,
"eval_private_tuning_runtime": 62.217,
"eval_private_tuning_samples_per_second": 340.389,
"eval_private_tuning_steps_per_second": 21.28,
"step": 1000
},
{
"epoch": 0.16,
"eval_oa_translated_accuracy": 0.6605713712776647,
"eval_oa_translated_loss": 1.4912109375,
"eval_oa_translated_runtime": 498.0305,
"eval_oa_translated_samples_per_second": 268.506,
"eval_oa_translated_steps_per_second": 16.782,
"step": 1000
},
{
"epoch": 0.16,
"eval_prosocial_dialogue_accuracy": 0.5267998067934173,
"eval_prosocial_dialogue_loss": 1.9033203125,
"eval_prosocial_dialogue_runtime": 126.2272,
"eval_prosocial_dialogue_samples_per_second": 213.765,
"eval_prosocial_dialogue_steps_per_second": 13.365,
"step": 1000
},
{
"epoch": 0.16,
"eval_math_qa_accuracy": 0.5343074095293895,
"eval_math_qa_loss": 2.080078125,
"eval_math_qa_runtime": 19.3631,
"eval_math_qa_samples_per_second": 308.215,
"eval_math_qa_steps_per_second": 19.263,
"step": 1000
},
{
"epoch": 0.16,
"eval_wikihow_accuracy": 0.5909261056425897,
"eval_wikihow_loss": 2.078125,
"eval_wikihow_runtime": 7.7313,
"eval_wikihow_samples_per_second": 296.588,
"eval_wikihow_steps_per_second": 18.626,
"step": 1000
},
{
"epoch": 0.16,
"eval_joke_accuracy": 0.45830174374526156,
"eval_joke_loss": 2.498046875,
"eval_joke_runtime": 2.2389,
"eval_joke_samples_per_second": 33.945,
"eval_joke_steps_per_second": 2.233,
"step": 1000
},
{
"epoch": 0.16,
"eval_gsm8k_accuracy": 0.7258224765956256,
"eval_gsm8k_loss": 1.0576171875,
"eval_gsm8k_runtime": 6.1435,
"eval_gsm8k_samples_per_second": 243.346,
"eval_gsm8k_steps_per_second": 15.301,
"step": 1000
},
{
"epoch": 0.16,
"eval_ted_trans_en-hi_accuracy": 0.5460148777895856,
"eval_ted_trans_en-hi_loss": 2.138671875,
"eval_ted_trans_en-hi_runtime": 0.5653,
"eval_ted_trans_en-hi_samples_per_second": 182.218,
"eval_ted_trans_en-hi_steps_per_second": 12.384,
"step": 1000
},
{
"epoch": 0.16,
"eval_ted_trans_de-ja_accuracy": 0.5330968145857443,
"eval_ted_trans_de-ja_loss": 2.193359375,
"eval_ted_trans_de-ja_runtime": 3.858,
"eval_ted_trans_de-ja_samples_per_second": 186.106,
"eval_ted_trans_de-ja_steps_per_second": 11.664,
"step": 1000
},
{
"epoch": 0.16,
"eval_ted_trans_nl-en_accuracy": 0.6414471117584333,
"eval_ted_trans_nl-en_loss": 1.689453125,
"eval_ted_trans_nl-en_runtime": 3.1044,
"eval_ted_trans_nl-en_samples_per_second": 248.36,
"eval_ted_trans_nl-en_steps_per_second": 15.784,
"step": 1000
},
{
"epoch": 0.16,
"eval_ted_trans_en-ja_accuracy": 0.5530009680542111,
"eval_ted_trans_en-ja_loss": 2.05859375,
"eval_ted_trans_en-ja_runtime": 3.8822,
"eval_ted_trans_en-ja_samples_per_second": 206.324,
"eval_ted_trans_en-ja_steps_per_second": 13.137,
"step": 1000
},
{
"epoch": 0.16,
"eval_ted_trans_en-es_accuracy": 0.7106248418922337,
"eval_ted_trans_en-es_loss": 1.2880859375,
"eval_ted_trans_en-es_runtime": 2.9435,
"eval_ted_trans_en-es_samples_per_second": 280.621,
"eval_ted_trans_en-es_steps_per_second": 17.666,
"step": 1000
},
{
"epoch": 0.16,
"eval_ted_trans_en-ms_accuracy": 0.5987135081642752,
"eval_ted_trans_en-ms_loss": 1.984375,
"eval_ted_trans_en-ms_runtime": 1.4559,
"eval_ted_trans_en-ms_samples_per_second": 28.848,
"eval_ted_trans_en-ms_steps_per_second": 2.061,
"step": 1000
},
{
"epoch": 0.16,
"eval_xsum_accuracy": 0.5722584941442159,
"eval_xsum_loss": NaN,
"eval_xsum_runtime": 141.7203,
"eval_xsum_samples_per_second": 287.955,
"eval_xsum_steps_per_second": 18.0,
"step": 1000
},
{
"epoch": 0.16,
"eval_cnn_dailymail_accuracy": 0.6576155822271417,
"eval_cnn_dailymail_loss": NaN,
"eval_cnn_dailymail_runtime": 209.5351,
"eval_cnn_dailymail_samples_per_second": 274.05,
"eval_cnn_dailymail_steps_per_second": 17.128,
"step": 1000
},
{
"epoch": 0.16,
"eval_multi_news_accuracy": 0.5226291863275405,
"eval_multi_news_loss": NaN,
"eval_multi_news_runtime": 36.8798,
"eval_multi_news_samples_per_second": 243.9,
"eval_multi_news_steps_per_second": 15.266,
"step": 1000
},
{
"epoch": 0.16,
"eval_tldr_news_accuracy": 0.5359729145114267,
"eval_tldr_news_loss": 2.20703125,
"eval_tldr_news_runtime": 4.9335,
"eval_tldr_news_samples_per_second": 289.451,
"eval_tldr_news_steps_per_second": 18.243,
"step": 1000
},
{
"epoch": 0.16,
"eval_scitldr_accuracy": 0.49700598802395207,
"eval_scitldr_loss": NaN,
"eval_scitldr_runtime": 1.5917,
"eval_scitldr_samples_per_second": 250.67,
"eval_scitldr_steps_per_second": 15.706,
"step": 1000
},
{
"epoch": 0.16,
"eval_samsum_accuracy": 0.590799585469913,
"eval_samsum_loss": 1.5537109375,
"eval_samsum_runtime": 10.6642,
"eval_samsum_samples_per_second": 276.345,
"eval_samsum_steps_per_second": 17.348,
"step": 1000
},
{
"epoch": 0.16,
"eval_debate_sum_accuracy": 0.9329163674973446,
"eval_debate_sum_loss": NaN,
"eval_debate_sum_runtime": 196.1179,
"eval_debate_sum_samples_per_second": 245.332,
"eval_debate_sum_steps_per_second": 15.338,
"step": 1000
},
{
"epoch": 0.16,
"eval_billsum_accuracy": 0.6510711811280909,
"eval_billsum_loss": NaN,
"eval_billsum_runtime": 16.6536,
"eval_billsum_samples_per_second": 227.579,
"eval_billsum_steps_per_second": 14.231,
"step": 1000
},
{
"epoch": 0.16,
"eval_wmt2019_zh-en_accuracy": 0.5587294145226513,
"eval_wmt2019_zh-en_loss": 2.111328125,
"eval_wmt2019_zh-en_runtime": 12.8767,
"eval_wmt2019_zh-en_samples_per_second": 309.164,
"eval_wmt2019_zh-en_steps_per_second": 19.337,
"step": 1000
},
{
"epoch": 0.16,
"eval_wmt2019_ru-en_accuracy": 0.6366095054310483,
"eval_wmt2019_ru-en_loss": 1.552734375,
"eval_wmt2019_ru-en_runtime": 10.1123,
"eval_wmt2019_ru-en_samples_per_second": 296.667,
"eval_wmt2019_ru-en_steps_per_second": 18.591,
"step": 1000
},
{
"epoch": 0.16,
"eval_wmt2019_de-en_accuracy": 0.6681106028096029,
"eval_wmt2019_de-en_loss": 1.4462890625,
"eval_wmt2019_de-en_runtime": 9.913,
"eval_wmt2019_de-en_samples_per_second": 302.432,
"eval_wmt2019_de-en_steps_per_second": 18.965,
"step": 1000
},
{
"epoch": 0.16,
"eval_wmt2019_fr-de_accuracy": 0.6540822971254923,
"eval_wmt2019_fr-de_loss": 1.51171875,
"eval_wmt2019_fr-de_runtime": 5.7364,
"eval_wmt2019_fr-de_samples_per_second": 263.579,
"eval_wmt2019_fr-de_steps_per_second": 16.561,
"step": 1000
},
{
"epoch": 0.16,
"eval_essay_instruction_accuracy": 0.5807311500380807,
"eval_essay_instruction_loss": 2.072265625,
"eval_essay_instruction_runtime": 4.2906,
"eval_essay_instruction_samples_per_second": 96.257,
"eval_essay_instruction_steps_per_second": 6.06,
"step": 1000
},
{
"epoch": 0.16,
"eval_reddit_eli5_accuracy": 0.42394149731958486,
"eval_reddit_eli5_loss": 2.748046875,
"eval_reddit_eli5_runtime": 220.7861,
"eval_reddit_eli5_samples_per_second": 246.968,
"eval_reddit_eli5_steps_per_second": 15.436,
"step": 1000
},
{
"epoch": 0.16,
"eval_reddit_askh_accuracy": 0.42705794870139985,
"eval_reddit_askh_loss": 2.826171875,
"eval_reddit_askh_runtime": 106.0159,
"eval_reddit_askh_samples_per_second": 185.868,
"eval_reddit_askh_steps_per_second": 11.621,
"step": 1000
},
{
"epoch": 0.16,
"eval_reddit_asks_accuracy": 0.43686793419350517,
"eval_reddit_asks_loss": 2.67578125,
"eval_reddit_asks_runtime": 110.0537,
"eval_reddit_asks_samples_per_second": 239.483,
"eval_reddit_asks_steps_per_second": 14.975,
"step": 1000
},
{
"epoch": 0.17,
"learning_rate": 4.997313753581662e-06,
"loss": 1.8558,
"step": 1010
},
{
"epoch": 0.17,
"learning_rate": 4.992836676217766e-06,
"loss": 1.862,
"step": 1020
},
{
"epoch": 0.17,
"learning_rate": 4.988359598853868e-06,
"loss": 1.8667,
"step": 1030
},
{
"epoch": 0.17,
"learning_rate": 4.9838825214899716e-06,
"loss": 1.8597,
"step": 1040
},
{
"epoch": 0.17,
"learning_rate": 4.979405444126075e-06,
"loss": 1.8472,
"step": 1050
},
{
"epoch": 0.17,
"learning_rate": 4.974928366762178e-06,
"loss": 1.8552,
"step": 1060
},
{
"epoch": 0.18,
"learning_rate": 4.9704512893982816e-06,
"loss": 1.8532,
"step": 1070
},
{
"epoch": 0.18,
"learning_rate": 4.965974212034385e-06,
"loss": 1.8235,
"step": 1080
},
{
"epoch": 0.18,
"learning_rate": 4.961497134670487e-06,
"loss": 1.861,
"step": 1090
},
{
"epoch": 0.18,
"learning_rate": 4.957020057306591e-06,
"loss": 1.7964,
"step": 1100
},
{
"epoch": 0.18,
"learning_rate": 4.952542979942694e-06,
"loss": 1.8203,
"step": 1110
},
{
"epoch": 0.18,
"learning_rate": 4.9480659025787965e-06,
"loss": 1.8759,
"step": 1120
},
{
"epoch": 0.19,
"learning_rate": 4.9435888252149e-06,
"loss": 1.8592,
"step": 1130
},
{
"epoch": 0.19,
"learning_rate": 4.939111747851003e-06,
"loss": 1.8487,
"step": 1140
},
{
"epoch": 0.19,
"learning_rate": 4.9346346704871065e-06,
"loss": 1.8112,
"step": 1150
},
{
"epoch": 0.19,
"learning_rate": 4.930157593123209e-06,
"loss": 1.8028,
"step": 1160
},
{
"epoch": 0.19,
"learning_rate": 4.925680515759313e-06,
"loss": 1.8361,
"step": 1170
},
{
"epoch": 0.19,
"learning_rate": 4.921203438395416e-06,
"loss": 1.8625,
"step": 1180
},
{
"epoch": 0.2,
"learning_rate": 4.916726361031519e-06,
"loss": 1.8345,
"step": 1190
},
{
"epoch": 0.2,
"learning_rate": 4.912249283667622e-06,
"loss": 1.8506,
"step": 1200
},
{
"epoch": 0.2,
"learning_rate": 4.907772206303726e-06,
"loss": 1.8326,
"step": 1210
},
{
"epoch": 0.2,
"learning_rate": 4.903295128939828e-06,
"loss": 1.8399,
"step": 1220
},
{
"epoch": 0.2,
"learning_rate": 4.8988180515759315e-06,
"loss": 1.8706,
"step": 1230
},
{
"epoch": 0.2,
"learning_rate": 4.894340974212035e-06,
"loss": 1.8227,
"step": 1240
},
{
"epoch": 0.21,
"learning_rate": 4.889863896848137e-06,
"loss": 1.8461,
"step": 1250
},
{
"epoch": 0.21,
"learning_rate": 4.8853868194842415e-06,
"loss": 1.874,
"step": 1260
},
{
"epoch": 0.21,
"learning_rate": 4.880909742120344e-06,
"loss": 1.8178,
"step": 1270
},
{
"epoch": 0.21,
"learning_rate": 4.876432664756447e-06,
"loss": 1.8141,
"step": 1280
},
{
"epoch": 0.21,
"learning_rate": 4.871955587392551e-06,
"loss": 1.8258,
"step": 1290
},
{
"epoch": 0.21,
"learning_rate": 4.867478510028654e-06,
"loss": 1.8595,
"step": 1300
},
{
"epoch": 0.22,
"learning_rate": 4.8630014326647565e-06,
"loss": 1.8495,
"step": 1310
},
{
"epoch": 0.22,
"learning_rate": 4.85852435530086e-06,
"loss": 1.8492,
"step": 1320
},
{
"epoch": 0.22,
"learning_rate": 4.854047277936963e-06,
"loss": 1.8339,
"step": 1330
},
{
"epoch": 0.22,
"learning_rate": 4.849570200573066e-06,
"loss": 1.8218,
"step": 1340
},
{
"epoch": 0.22,
"learning_rate": 4.84509312320917e-06,
"loss": 1.8411,
"step": 1350
},
{
"epoch": 0.22,
"learning_rate": 4.840616045845273e-06,
"loss": 1.8275,
"step": 1360
},
{
"epoch": 0.23,
"learning_rate": 4.836138968481376e-06,
"loss": 1.8358,
"step": 1370
},
{
"epoch": 0.23,
"learning_rate": 4.831661891117479e-06,
"loss": 1.8395,
"step": 1380
},
{
"epoch": 0.23,
"learning_rate": 4.827184813753582e-06,
"loss": 1.8114,
"step": 1390
},
{
"epoch": 0.23,
"learning_rate": 4.822707736389685e-06,
"loss": 1.8167,
"step": 1400
},
{
"epoch": 0.23,
"learning_rate": 4.818230659025788e-06,
"loss": 1.8502,
"step": 1410
},
{
"epoch": 0.23,
"learning_rate": 4.8137535816618915e-06,
"loss": 1.8494,
"step": 1420
},
{
"epoch": 0.24,
"learning_rate": 4.809276504297995e-06,
"loss": 1.8546,
"step": 1430
},
{
"epoch": 0.24,
"learning_rate": 4.804799426934098e-06,
"loss": 1.8219,
"step": 1440
},
{
"epoch": 0.24,
"learning_rate": 4.8003223495702015e-06,
"loss": 1.856,
"step": 1450
},
{
"epoch": 0.24,
"learning_rate": 4.795845272206304e-06,
"loss": 1.8235,
"step": 1460
},
{
"epoch": 0.24,
"learning_rate": 4.791368194842407e-06,
"loss": 1.8084,
"step": 1470
},
{
"epoch": 0.24,
"learning_rate": 4.786891117478511e-06,
"loss": 1.8036,
"step": 1480
},
{
"epoch": 0.24,
"learning_rate": 4.782414040114613e-06,
"loss": 1.807,
"step": 1490
},
{
"epoch": 0.25,
"learning_rate": 4.7779369627507165e-06,
"loss": 1.8223,
"step": 1500
},
{
"epoch": 0.25,
"eval_gsm8k_hard_accuracy": 0.9040755669557429,
"eval_gsm8k_hard_loss": 0.418212890625,
"eval_gsm8k_hard_runtime": 1.5226,
"eval_gsm8k_hard_samples_per_second": 173.393,
"eval_gsm8k_hard_steps_per_second": 11.165,
"step": 1500
},
{
"epoch": 0.25,
"eval_webgpt_accuracy": 0.4685187206997972,
"eval_webgpt_loss": 2.43359375,
"eval_webgpt_runtime": 16.9148,
"eval_webgpt_samples_per_second": 231.513,
"eval_webgpt_steps_per_second": 14.484,
"step": 1500
},
{
"epoch": 0.25,
"eval_squad_v2_accuracy": 0.8753178869062296,
"eval_squad_v2_loss": 0.424560546875,
"eval_squad_v2_runtime": 78.195,
"eval_squad_v2_samples_per_second": 333.32,
"eval_squad_v2_steps_per_second": 20.833,
"step": 1500
},
{
"epoch": 0.25,
"eval_adversarial_qa_accuracy": 0.7873755143419405,
"eval_adversarial_qa_loss": 1.076171875,
"eval_adversarial_qa_runtime": 18.1293,
"eval_adversarial_qa_samples_per_second": 330.956,
"eval_adversarial_qa_steps_per_second": 20.685,
"step": 1500
},
{
"epoch": 0.25,
"eval_private_tuning_accuracy": 0.6490207424086791,
"eval_private_tuning_loss": 1.30859375,
"eval_private_tuning_runtime": 65.2643,
"eval_private_tuning_samples_per_second": 324.496,
"eval_private_tuning_steps_per_second": 20.287,
"step": 1500
},
{
"epoch": 0.25,
"eval_oa_translated_accuracy": 0.6676994033045951,
"eval_oa_translated_loss": 1.447265625,
"eval_oa_translated_runtime": 495.9674,
"eval_oa_translated_samples_per_second": 269.623,
"eval_oa_translated_steps_per_second": 16.852,
"step": 1500
},
{
"epoch": 0.25,
"eval_prosocial_dialogue_accuracy": 0.52370774081413,
"eval_prosocial_dialogue_loss": 1.802734375,
"eval_prosocial_dialogue_runtime": 117.5764,
"eval_prosocial_dialogue_samples_per_second": 229.493,
"eval_prosocial_dialogue_steps_per_second": 14.348,
"step": 1500
},
{
"epoch": 0.25,
"eval_math_qa_accuracy": 0.5447965302424216,
"eval_math_qa_loss": 2.015625,
"eval_math_qa_runtime": 19.5318,
"eval_math_qa_samples_per_second": 305.553,
"eval_math_qa_steps_per_second": 19.097,
"step": 1500
},
{
"epoch": 0.25,
"eval_wikihow_accuracy": 0.5924303341189519,
"eval_wikihow_loss": 2.048828125,
"eval_wikihow_runtime": 8.7505,
"eval_wikihow_samples_per_second": 262.042,
"eval_wikihow_steps_per_second": 16.456,
"step": 1500
},
{
"epoch": 0.25,
"eval_joke_accuracy": 0.46341925701288855,
"eval_joke_loss": 2.439453125,
"eval_joke_runtime": 0.9638,
"eval_joke_samples_per_second": 78.857,
"eval_joke_steps_per_second": 5.188,
"step": 1500
},
{
"epoch": 0.25,
"eval_gsm8k_accuracy": 0.7337848616728005,
"eval_gsm8k_loss": 1.01171875,
"eval_gsm8k_runtime": 6.3369,
"eval_gsm8k_samples_per_second": 235.921,
"eval_gsm8k_steps_per_second": 14.834,
"step": 1500
},
{
"epoch": 0.25,
"eval_ted_trans_en-hi_accuracy": 0.5467649647887324,
"eval_ted_trans_en-hi_loss": 2.080078125,
"eval_ted_trans_en-hi_runtime": 0.6116,
"eval_ted_trans_en-hi_samples_per_second": 168.402,
"eval_ted_trans_en-hi_steps_per_second": 11.445,
"step": 1500
},
{
"epoch": 0.25,
"eval_ted_trans_de-ja_accuracy": 0.5412865271482303,
"eval_ted_trans_de-ja_loss": 2.130859375,
"eval_ted_trans_de-ja_runtime": 3.781,
"eval_ted_trans_de-ja_samples_per_second": 189.896,
"eval_ted_trans_de-ja_steps_per_second": 11.902,
"step": 1500
},
{
"epoch": 0.25,
"eval_ted_trans_nl-en_accuracy": 0.6488075112486671,
"eval_ted_trans_nl-en_loss": 1.65625,
"eval_ted_trans_nl-en_runtime": 3.2481,
"eval_ted_trans_nl-en_samples_per_second": 237.369,
"eval_ted_trans_nl-en_steps_per_second": 15.086,
"step": 1500
},
{
"epoch": 0.25,
"eval_ted_trans_en-ja_accuracy": 0.5640618403329245,
"eval_ted_trans_en-ja_loss": 1.990234375,
"eval_ted_trans_en-ja_runtime": 3.5175,
"eval_ted_trans_en-ja_samples_per_second": 227.717,
"eval_ted_trans_en-ja_steps_per_second": 14.499,
"step": 1500
},
{
"epoch": 0.25,
"eval_ted_trans_en-es_accuracy": 0.7159835441109249,
"eval_ted_trans_en-es_loss": 1.25,
"eval_ted_trans_en-es_runtime": 3.1192,
"eval_ted_trans_en-es_samples_per_second": 264.814,
"eval_ted_trans_en-es_steps_per_second": 16.671,
"step": 1500
},
{
"epoch": 0.25,
"eval_ted_trans_en-ms_accuracy": 0.5680356259277586,
"eval_ted_trans_en-ms_loss": 2.125,
"eval_ted_trans_en-ms_runtime": 1.4378,
"eval_ted_trans_en-ms_samples_per_second": 29.212,
"eval_ted_trans_en-ms_steps_per_second": 2.087,
"step": 1500
},
{
"epoch": 0.25,
"eval_xsum_accuracy": 0.575791398307109,
"eval_xsum_loss": NaN,
"eval_xsum_runtime": 142.8893,
"eval_xsum_samples_per_second": 285.599,
"eval_xsum_steps_per_second": 17.853,
"step": 1500
},
{
"epoch": 0.25,
"eval_cnn_dailymail_accuracy": 0.6578154814514997,
"eval_cnn_dailymail_loss": NaN,
"eval_cnn_dailymail_runtime": 210.3199,
"eval_cnn_dailymail_samples_per_second": 273.027,
"eval_cnn_dailymail_steps_per_second": 17.064,
"step": 1500
},
{
"epoch": 0.25,
"eval_multi_news_accuracy": 0.5236211410651092,
"eval_multi_news_loss": NaN,
"eval_multi_news_runtime": 35.5484,
"eval_multi_news_samples_per_second": 253.035,
"eval_multi_news_steps_per_second": 15.838,
"step": 1500
},
{
"epoch": 0.25,
"eval_tldr_news_accuracy": 0.5471644879149816,
"eval_tldr_news_loss": 2.119140625,
"eval_tldr_news_runtime": 4.1143,
"eval_tldr_news_samples_per_second": 347.078,
"eval_tldr_news_steps_per_second": 21.875,
"step": 1500
},
{
"epoch": 0.25,
"eval_scitldr_accuracy": 0.49550898203592814,
"eval_scitldr_loss": NaN,
"eval_scitldr_runtime": 2.4172,
"eval_scitldr_samples_per_second": 165.069,
"eval_scitldr_steps_per_second": 10.343,
"step": 1500
},
{
"epoch": 0.25,
"eval_samsum_accuracy": 0.5926165192931454,
"eval_samsum_loss": 1.5224609375,
"eval_samsum_runtime": 10.6814,
"eval_samsum_samples_per_second": 275.901,
"eval_samsum_steps_per_second": 17.32,
"step": 1500
},
{
"epoch": 0.25,
"eval_debate_sum_accuracy": 0.9358983757089394,
"eval_debate_sum_loss": NaN,
"eval_debate_sum_runtime": 196.0638,
"eval_debate_sum_samples_per_second": 245.4,
"eval_debate_sum_steps_per_second": 15.342,
"step": 1500
},
{
"epoch": 0.25,
"eval_billsum_accuracy": 0.653463309552768,
"eval_billsum_loss": NaN,
"eval_billsum_runtime": 16.6514,
"eval_billsum_samples_per_second": 227.609,
"eval_billsum_steps_per_second": 14.233,
"step": 1500
},
{
"epoch": 0.25,
"eval_wmt2019_zh-en_accuracy": 0.5612179149240267,
"eval_wmt2019_zh-en_loss": 2.091796875,
"eval_wmt2019_zh-en_runtime": 13.0415,
"eval_wmt2019_zh-en_samples_per_second": 305.255,
"eval_wmt2019_zh-en_steps_per_second": 19.093,
"step": 1500
},
{
"epoch": 0.25,
"eval_wmt2019_ru-en_accuracy": 0.6424937502741108,
"eval_wmt2019_ru-en_loss": 1.5146484375,
"eval_wmt2019_ru-en_runtime": 9.2157,
"eval_wmt2019_ru-en_samples_per_second": 325.531,
"eval_wmt2019_ru-en_steps_per_second": 20.4,
"step": 1500
},
{
"epoch": 0.25,
"eval_wmt2019_de-en_accuracy": 0.6710659487470143,
"eval_wmt2019_de-en_loss": 1.4248046875,
"eval_wmt2019_de-en_runtime": 10.6387,
"eval_wmt2019_de-en_samples_per_second": 281.801,
"eval_wmt2019_de-en_steps_per_second": 17.671,
"step": 1500
},
{
"epoch": 0.25,
"eval_wmt2019_fr-de_accuracy": 0.6660826692300537,
"eval_wmt2019_fr-de_loss": 1.4521484375,
"eval_wmt2019_fr-de_runtime": 5.6356,
"eval_wmt2019_fr-de_samples_per_second": 268.295,
"eval_wmt2019_fr-de_steps_per_second": 16.857,
"step": 1500
},
{
"epoch": 0.25,
"eval_essay_instruction_accuracy": 0.5827727003469578,
"eval_essay_instruction_loss": 2.05859375,
"eval_essay_instruction_runtime": 4.1866,
"eval_essay_instruction_samples_per_second": 98.649,
"eval_essay_instruction_steps_per_second": 6.21,
"step": 1500
},
{
"epoch": 0.25,
"eval_reddit_eli5_accuracy": 0.42421905510230506,
"eval_reddit_eli5_loss": 2.73828125,
"eval_reddit_eli5_runtime": 199.4145,
"eval_reddit_eli5_samples_per_second": 273.436,
"eval_reddit_eli5_steps_per_second": 17.09,
"step": 1500
},
{
"epoch": 0.25,
"eval_reddit_askh_accuracy": 0.4276221576585814,
"eval_reddit_askh_loss": 2.81640625,
"eval_reddit_askh_runtime": 110.2213,
"eval_reddit_askh_samples_per_second": 178.777,
"eval_reddit_askh_steps_per_second": 11.178,
"step": 1500
},
{
"epoch": 0.25,
"eval_reddit_asks_accuracy": 0.4372012246587393,
"eval_reddit_asks_loss": 2.66796875,
"eval_reddit_asks_runtime": 131.6507,
"eval_reddit_asks_samples_per_second": 200.196,
"eval_reddit_asks_steps_per_second": 12.518,
"step": 1500
},
{
"epoch": 0.25,
"learning_rate": 4.77345988538682e-06,
"loss": 1.8033,
"step": 1510
},
{
"epoch": 0.25,
"learning_rate": 4.768982808022923e-06,
"loss": 1.8477,
"step": 1520
},
{
"epoch": 0.25,
"learning_rate": 4.7645057306590265e-06,
"loss": 1.8417,
"step": 1530
},
{
"epoch": 0.25,
"learning_rate": 4.76002865329513e-06,
"loss": 1.7781,
"step": 1540
},
{
"epoch": 0.25,
"learning_rate": 4.755551575931232e-06,
"loss": 1.808,
"step": 1550
},
{
"epoch": 0.26,
"learning_rate": 4.751074498567336e-06,
"loss": 1.8719,
"step": 1560
},
{
"epoch": 0.26,
"learning_rate": 4.746597421203439e-06,
"loss": 1.8382,
"step": 1570
},
{
"epoch": 0.26,
"learning_rate": 4.742120343839542e-06,
"loss": 1.7991,
"step": 1580
},
{
"epoch": 0.26,
"learning_rate": 4.737643266475645e-06,
"loss": 1.809,
"step": 1590
},
{
"epoch": 0.26,
"learning_rate": 4.733166189111748e-06,
"loss": 1.8206,
"step": 1600
},
{
"epoch": 0.26,
"learning_rate": 4.7286891117478515e-06,
"loss": 1.8475,
"step": 1610
},
{
"epoch": 0.27,
"learning_rate": 4.724212034383955e-06,
"loss": 1.8342,
"step": 1620
},
{
"epoch": 0.27,
"learning_rate": 4.719734957020058e-06,
"loss": 1.8436,
"step": 1630
},
{
"epoch": 0.27,
"learning_rate": 4.715257879656161e-06,
"loss": 1.8198,
"step": 1640
},
{
"epoch": 0.27,
"learning_rate": 4.710780802292264e-06,
"loss": 1.8271,
"step": 1650
},
{
"epoch": 0.27,
"learning_rate": 4.706303724928367e-06,
"loss": 1.8584,
"step": 1660
},
{
"epoch": 0.27,
"learning_rate": 4.701826647564471e-06,
"loss": 1.8485,
"step": 1670
},
{
"epoch": 0.28,
"learning_rate": 4.697349570200573e-06,
"loss": 1.7872,
"step": 1680
},
{
"epoch": 0.28,
"learning_rate": 4.6928724928366764e-06,
"loss": 1.8026,
"step": 1690
},
{
"epoch": 0.28,
"learning_rate": 4.68839541547278e-06,
"loss": 1.8138,
"step": 1700
},
{
"epoch": 0.28,
"learning_rate": 4.683918338108882e-06,
"loss": 1.7871,
"step": 1710
},
{
"epoch": 0.28,
"learning_rate": 4.6794412607449864e-06,
"loss": 1.8459,
"step": 1720
},
{
"epoch": 0.28,
"learning_rate": 4.67496418338109e-06,
"loss": 1.8081,
"step": 1730
},
{
"epoch": 0.29,
"learning_rate": 4.670487106017192e-06,
"loss": 1.7956,
"step": 1740
},
{
"epoch": 0.29,
"learning_rate": 4.666010028653296e-06,
"loss": 1.779,
"step": 1750
},
{
"epoch": 0.29,
"learning_rate": 4.661532951289399e-06,
"loss": 1.8106,
"step": 1760
},
{
"epoch": 0.29,
"learning_rate": 4.657055873925501e-06,
"loss": 1.8225,
"step": 1770
},
{
"epoch": 0.29,
"learning_rate": 4.652578796561605e-06,
"loss": 1.8263,
"step": 1780
},
{
"epoch": 0.29,
"learning_rate": 4.648101719197708e-06,
"loss": 1.8232,
"step": 1790
},
{
"epoch": 0.3,
"learning_rate": 4.643624641833811e-06,
"loss": 1.751,
"step": 1800
},
{
"epoch": 0.3,
"learning_rate": 4.639147564469915e-06,
"loss": 1.8275,
"step": 1810
},
{
"epoch": 0.3,
"learning_rate": 4.634670487106018e-06,
"loss": 1.7738,
"step": 1820
},
{
"epoch": 0.3,
"learning_rate": 4.6301934097421206e-06,
"loss": 1.8064,
"step": 1830
},
{
"epoch": 0.3,
"learning_rate": 4.625716332378224e-06,
"loss": 1.7775,
"step": 1840
},
{
"epoch": 0.3,
"learning_rate": 4.621239255014327e-06,
"loss": 1.799,
"step": 1850
},
{
"epoch": 0.31,
"learning_rate": 4.61676217765043e-06,
"loss": 1.8349,
"step": 1860
},
{
"epoch": 0.31,
"learning_rate": 4.612285100286533e-06,
"loss": 1.7826,
"step": 1870
},
{
"epoch": 0.31,
"learning_rate": 4.607808022922636e-06,
"loss": 1.8322,
"step": 1880
},
{
"epoch": 0.31,
"learning_rate": 4.60333094555874e-06,
"loss": 1.8115,
"step": 1890
},
{
"epoch": 0.31,
"learning_rate": 4.598853868194843e-06,
"loss": 1.7811,
"step": 1900
},
{
"epoch": 0.31,
"learning_rate": 4.594376790830946e-06,
"loss": 1.8113,
"step": 1910
},
{
"epoch": 0.32,
"learning_rate": 4.589899713467049e-06,
"loss": 1.82,
"step": 1920
},
{
"epoch": 0.32,
"learning_rate": 4.585422636103152e-06,
"loss": 1.8562,
"step": 1930
},
{
"epoch": 0.32,
"learning_rate": 4.5809455587392556e-06,
"loss": 1.8197,
"step": 1940
},
{
"epoch": 0.32,
"learning_rate": 4.576468481375359e-06,
"loss": 1.8255,
"step": 1950
},
{
"epoch": 0.32,
"learning_rate": 4.571991404011461e-06,
"loss": 1.7827,
"step": 1960
},
{
"epoch": 0.32,
"learning_rate": 4.567514326647565e-06,
"loss": 1.7943,
"step": 1970
},
{
"epoch": 0.33,
"learning_rate": 4.563037249283668e-06,
"loss": 1.7536,
"step": 1980
},
{
"epoch": 0.33,
"learning_rate": 4.558560171919771e-06,
"loss": 1.783,
"step": 1990
},
{
"epoch": 0.33,
"learning_rate": 4.554083094555875e-06,
"loss": 1.7924,
"step": 2000
},
{
"epoch": 0.33,
"eval_gsm8k_hard_accuracy": 0.9076584829607915,
"eval_gsm8k_hard_loss": 0.39697265625,
"eval_gsm8k_hard_runtime": 2.2124,
"eval_gsm8k_hard_samples_per_second": 119.327,
"eval_gsm8k_hard_steps_per_second": 7.684,
"step": 2000
},
{
"epoch": 0.33,
"eval_webgpt_accuracy": 0.4696865932858906,
"eval_webgpt_loss": 2.423828125,
"eval_webgpt_runtime": 14.9073,
"eval_webgpt_samples_per_second": 262.691,
"eval_webgpt_steps_per_second": 16.435,
"step": 2000
},
{
"epoch": 0.33,
"eval_squad_v2_accuracy": 0.8851732615724923,
"eval_squad_v2_loss": 0.40966796875,
"eval_squad_v2_runtime": 80.4777,
"eval_squad_v2_samples_per_second": 323.866,
"eval_squad_v2_steps_per_second": 20.242,
"step": 2000
},
{
"epoch": 0.33,
"eval_adversarial_qa_accuracy": 0.7928618283737849,
"eval_adversarial_qa_loss": 1.1015625,
"eval_adversarial_qa_runtime": 18.1784,
"eval_adversarial_qa_samples_per_second": 330.062,
"eval_adversarial_qa_steps_per_second": 20.629,
"step": 2000
},
{
"epoch": 0.33,
"eval_private_tuning_accuracy": 0.6519236031057545,
"eval_private_tuning_loss": 1.2939453125,
"eval_private_tuning_runtime": 67.7066,
"eval_private_tuning_samples_per_second": 312.791,
"eval_private_tuning_steps_per_second": 19.555,
"step": 2000
},
{
"epoch": 0.33,
"eval_oa_translated_accuracy": 0.6739938039772579,
"eval_oa_translated_loss": 1.4169921875,
"eval_oa_translated_runtime": 489.9008,
"eval_oa_translated_samples_per_second": 272.961,
"eval_oa_translated_steps_per_second": 17.061,
"step": 2000
},
{
"epoch": 0.33,
"eval_prosocial_dialogue_accuracy": 0.5339765946198812,
"eval_prosocial_dialogue_loss": 1.8515625,
"eval_prosocial_dialogue_runtime": 112.602,
"eval_prosocial_dialogue_samples_per_second": 239.632,
"eval_prosocial_dialogue_steps_per_second": 14.982,
"step": 2000
},
{
"epoch": 0.33,
"eval_math_qa_accuracy": 0.5540153422185813,
"eval_math_qa_loss": 1.9658203125,
"eval_math_qa_runtime": 19.4551,
"eval_math_qa_samples_per_second": 306.758,
"eval_math_qa_steps_per_second": 19.172,
"step": 2000
},
{
"epoch": 0.33,
"eval_wikihow_accuracy": 0.5962636905587134,
"eval_wikihow_loss": 2.03515625,
"eval_wikihow_runtime": 8.6232,
"eval_wikihow_samples_per_second": 265.91,
"eval_wikihow_steps_per_second": 16.699,
"step": 2000
},
{
"epoch": 0.33,
"eval_joke_accuracy": 0.4670204700530705,
"eval_joke_loss": 2.40234375,
"eval_joke_runtime": 0.908,
"eval_joke_samples_per_second": 83.698,
"eval_joke_steps_per_second": 5.506,
"step": 2000
},
{
"epoch": 0.33,
"eval_gsm8k_accuracy": 0.7402429297099117,
"eval_gsm8k_loss": 0.97998046875,
"eval_gsm8k_runtime": 5.491,
"eval_gsm8k_samples_per_second": 272.266,
"eval_gsm8k_steps_per_second": 17.119,
"step": 2000
},
{
"epoch": 0.33,
"eval_ted_trans_en-hi_accuracy": 0.5490196078431373,
"eval_ted_trans_en-hi_loss": 2.10546875,
"eval_ted_trans_en-hi_runtime": 1.4695,
"eval_ted_trans_en-hi_samples_per_second": 70.093,
"eval_ted_trans_en-hi_steps_per_second": 4.764,
"step": 2000
},
{
"epoch": 0.33,
"eval_ted_trans_de-ja_accuracy": 0.5422406826169489,
"eval_ted_trans_de-ja_loss": 2.119140625,
"eval_ted_trans_de-ja_runtime": 2.8137,
"eval_ted_trans_de-ja_samples_per_second": 255.176,
"eval_ted_trans_de-ja_steps_per_second": 15.993,
"step": 2000
},
{
"epoch": 0.33,
"eval_ted_trans_nl-en_accuracy": 0.6510702489011417,
"eval_ted_trans_nl-en_loss": 1.6396484375,
"eval_ted_trans_nl-en_runtime": 4.3357,
"eval_ted_trans_nl-en_samples_per_second": 177.827,
"eval_ted_trans_nl-en_steps_per_second": 11.302,
"step": 2000
},
{
"epoch": 0.33,
"eval_ted_trans_en-ja_accuracy": 0.556157479064968,
"eval_ted_trans_en-ja_loss": 2.017578125,
"eval_ted_trans_en-ja_runtime": 3.2862,
"eval_ted_trans_en-ja_samples_per_second": 243.744,
"eval_ted_trans_en-ja_steps_per_second": 15.519,
"step": 2000
},
{
"epoch": 0.33,
"eval_ted_trans_en-es_accuracy": 0.7188412420341738,
"eval_ted_trans_en-es_loss": 1.2294921875,
"eval_ted_trans_en-es_runtime": 4.2374,
"eval_ted_trans_en-es_samples_per_second": 194.932,
"eval_ted_trans_en-es_steps_per_second": 12.272,
"step": 2000
},
{
"epoch": 0.33,
"eval_ted_trans_en-ms_accuracy": 0.5734784760019792,
"eval_ted_trans_en-ms_loss": 2.06640625,
"eval_ted_trans_en-ms_runtime": 0.629,
"eval_ted_trans_en-ms_samples_per_second": 66.768,
"eval_ted_trans_en-ms_steps_per_second": 4.769,
"step": 2000
},
{
"epoch": 0.33,
"eval_xsum_accuracy": 0.5781070399698887,
"eval_xsum_loss": NaN,
"eval_xsum_runtime": 144.895,
"eval_xsum_samples_per_second": 281.645,
"eval_xsum_steps_per_second": 17.606,
"step": 2000
},
{
"epoch": 0.33,
"eval_cnn_dailymail_accuracy": 0.661322804206144,
"eval_cnn_dailymail_loss": NaN,
"eval_cnn_dailymail_runtime": 208.0253,
"eval_cnn_dailymail_samples_per_second": 276.039,
"eval_cnn_dailymail_steps_per_second": 17.253,
"step": 2000
},
{
"epoch": 0.33,
"eval_multi_news_accuracy": 0.5257933653652501,
"eval_multi_news_loss": NaN,
"eval_multi_news_runtime": 34.8299,
"eval_multi_news_samples_per_second": 258.255,
"eval_multi_news_steps_per_second": 16.164,
"step": 2000
},
{
"epoch": 0.33,
"eval_tldr_news_accuracy": 0.554453117652591,
"eval_tldr_news_loss": 2.056640625,
"eval_tldr_news_runtime": 4.219,
"eval_tldr_news_samples_per_second": 338.467,
"eval_tldr_news_steps_per_second": 21.332,
"step": 2000
},
{
"epoch": 0.33,
"eval_scitldr_accuracy": 0.5014970059880239,
"eval_scitldr_loss": NaN,
"eval_scitldr_runtime": 2.5157,
"eval_scitldr_samples_per_second": 158.605,
"eval_scitldr_steps_per_second": 9.938,
"step": 2000
},
{
"epoch": 0.33,
"eval_samsum_accuracy": 0.5984307075274895,
"eval_samsum_loss": 1.5048828125,
"eval_samsum_runtime": 9.7229,
"eval_samsum_samples_per_second": 303.099,
"eval_samsum_steps_per_second": 19.027,
"step": 2000
},
{
"epoch": 0.33,
"eval_debate_sum_accuracy": 0.9366294751454436,
"eval_debate_sum_loss": NaN,
"eval_debate_sum_runtime": 191.1458,
"eval_debate_sum_samples_per_second": 251.714,
"eval_debate_sum_steps_per_second": 15.737,
"step": 2000
},
{
"epoch": 0.33,
"eval_billsum_accuracy": 0.6557828398195577,
"eval_billsum_loss": NaN,
"eval_billsum_runtime": 21.6214,
"eval_billsum_samples_per_second": 175.289,
"eval_billsum_steps_per_second": 10.961,
"step": 2000
},
{
"epoch": 0.33,
"eval_wmt2019_zh-en_accuracy": 0.5675863966606366,
"eval_wmt2019_zh-en_loss": 2.0625,
"eval_wmt2019_zh-en_runtime": 12.8505,
"eval_wmt2019_zh-en_samples_per_second": 309.793,
"eval_wmt2019_zh-en_steps_per_second": 19.377,
"step": 2000
},
{
"epoch": 0.33,
"eval_wmt2019_ru-en_accuracy": 0.6426618715553412,
"eval_wmt2019_ru-en_loss": 1.517578125,
"eval_wmt2019_ru-en_runtime": 9.7092,
"eval_wmt2019_ru-en_samples_per_second": 308.986,
"eval_wmt2019_ru-en_steps_per_second": 19.363,
"step": 2000
},
{
"epoch": 0.33,
"eval_wmt2019_de-en_accuracy": 0.6722804744747176,
"eval_wmt2019_de-en_loss": 1.4169921875,
"eval_wmt2019_de-en_runtime": 10.2232,
"eval_wmt2019_de-en_samples_per_second": 293.255,
"eval_wmt2019_de-en_steps_per_second": 18.39,
"step": 2000
},
{
"epoch": 0.33,
"eval_wmt2019_fr-de_accuracy": 0.6611832925051939,
"eval_wmt2019_fr-de_loss": 1.466796875,
"eval_wmt2019_fr-de_runtime": 5.2327,
"eval_wmt2019_fr-de_samples_per_second": 288.95,
"eval_wmt2019_fr-de_steps_per_second": 18.155,
"step": 2000
},
{
"epoch": 0.33,
"eval_essay_instruction_accuracy": 0.5831323516967082,
"eval_essay_instruction_loss": 2.0546875,
"eval_essay_instruction_runtime": 4.364,
"eval_essay_instruction_samples_per_second": 94.638,
"eval_essay_instruction_steps_per_second": 5.958,
"step": 2000
},
{
"epoch": 0.33,
"eval_reddit_eli5_accuracy": 0.4256390849199412,
"eval_reddit_eli5_loss": 2.732421875,
"eval_reddit_eli5_runtime": 199.1709,
"eval_reddit_eli5_samples_per_second": 273.77,
"eval_reddit_eli5_steps_per_second": 17.111,
"step": 2000
},
{
"epoch": 0.33,
"eval_reddit_askh_accuracy": 0.428839058527484,
"eval_reddit_askh_loss": 2.80859375,
"eval_reddit_askh_runtime": 129.3643,
"eval_reddit_askh_samples_per_second": 152.322,
"eval_reddit_askh_steps_per_second": 9.523,
"step": 2000
},
{
"epoch": 0.33,
"eval_reddit_asks_accuracy": 0.43882877148313176,
"eval_reddit_asks_loss": 2.662109375,
"eval_reddit_asks_runtime": 99.4788,
"eval_reddit_asks_samples_per_second": 264.941,
"eval_reddit_asks_steps_per_second": 16.566,
"step": 2000
}
],
"max_steps": 12168,
"num_train_epochs": 2,
"total_flos": 1.6560121131357438e+19,
"trial_name": null,
"trial_params": null
}