{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3287175905000616, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.4084967333570947e-06, "loss": 2.5231, "step": 10 }, { "epoch": 0.0, "learning_rate": 2.0507482022971233e-06, "loss": 2.3436, "step": 20 }, { "epoch": 0.0, "learning_rate": 2.385606273598312e-06, "loss": 2.22, "step": 30 }, { "epoch": 0.01, "learning_rate": 2.6136695401116585e-06, "loss": 2.1055, "step": 40 }, { "epoch": 0.01, "learning_rate": 2.7868297632261957e-06, "loss": 2.1275, "step": 50 }, { "epoch": 0.01, "learning_rate": 2.926458092787486e-06, "loss": 2.0425, "step": 60 }, { "epoch": 0.01, "learning_rate": 3.0434580045013773e-06, "loss": 2.0407, "step": 70 }, { "epoch": 0.01, "learning_rate": 3.1441512086208035e-06, "loss": 2.0558, "step": 80 }, { "epoch": 0.01, "learning_rate": 3.232532087697698e-06, "loss": 1.9887, "step": 90 }, { "epoch": 0.02, "learning_rate": 3.3112862237770753e-06, "loss": 1.9845, "step": 100 }, { "epoch": 0.02, "learning_rate": 3.3823062961420163e-06, "loss": 1.9856, "step": 110 }, { "epoch": 0.02, "learning_rate": 3.446976436243603e-06, "loss": 1.9968, "step": 120 }, { "epoch": 0.02, "learning_rate": 3.506339534926595e-06, "loss": 1.9247, "step": 130 }, { "epoch": 0.02, "learning_rate": 3.5612009452606784e-06, "loss": 1.9817, "step": 140 }, { "epoch": 0.02, "learning_rate": 3.612195557913627e-06, "loss": 1.9644, "step": 150 }, { "epoch": 0.03, "learning_rate": 3.65983275401539e-06, "loss": 1.9639, "step": 160 }, { "epoch": 0.03, "learning_rate": 3.7045274519126395e-06, "loss": 1.9587, "step": 170 }, { "epoch": 0.03, "learning_rate": 3.7466221106030114e-06, "loss": 1.9849, "step": 180 }, { "epoch": 0.03, "learning_rate": 3.786402677560832e-06, "loss": 1.9745, "step": 190 }, { "epoch": 0.03, "learning_rate": 3.824110376935989e-06, "loss": 1.9429, "step": 200 }, { "epoch": 0.03, "learning_rate": 3.8599505757615295e-06, "loss": 1.9484, "step": 210 }, { "epoch": 0.04, "learning_rate": 3.894099556414216e-06, "loss": 1.9214, "step": 220 }, { "epoch": 0.04, "learning_rate": 3.9267097619885385e-06, "loss": 1.9274, "step": 230 }, { "epoch": 0.04, "learning_rate": 3.95791391001684e-06, "loss": 1.9185, "step": 240 }, { "epoch": 0.04, "learning_rate": 3.987828255432777e-06, "loss": 1.9578, "step": 250 }, { "epoch": 0.04, "learning_rate": 4.016555205552159e-06, "loss": 1.907, "step": 260 }, { "epoch": 0.04, "learning_rate": 4.044185435607626e-06, "loss": 1.9448, "step": 270 }, { "epoch": 0.05, "learning_rate": 4.070799615107415e-06, "loss": 1.8884, "step": 280 }, { "epoch": 0.05, "learning_rate": 4.096469827889988e-06, "loss": 1.9402, "step": 290 }, { "epoch": 0.05, "learning_rate": 4.121260748862021e-06, "loss": 1.9153, "step": 300 }, { "epoch": 0.05, "learning_rate": 4.145230625795312e-06, "loss": 1.9106, "step": 310 }, { "epoch": 0.05, "learning_rate": 4.1684321036962525e-06, "loss": 1.958, "step": 320 }, { "epoch": 0.05, "learning_rate": 4.190912921100477e-06, "loss": 1.9117, "step": 330 }, { "epoch": 0.06, "learning_rate": 4.212716501452232e-06, "loss": 1.9097, "step": 340 }, { "epoch": 0.06, "learning_rate": 4.233882457984791e-06, "loss": 1.9279, "step": 350 }, { "epoch": 0.06, "learning_rate": 4.2544470268536555e-06, "loss": 1.9164, "step": 360 }, { "epoch": 0.06, "learning_rate": 4.27444344042015e-06, "loss": 1.9323, "step": 370 }, { "epoch": 0.06, "learning_rate": 4.293902250342989e-06, "loss": 1.9134, "step": 380 }, { "epoch": 0.06, "learning_rate": 4.312851608364853e-06, "loss": 1.8835, "step": 390 }, { "epoch": 0.07, "learning_rate": 4.3313175112718595e-06, "loss": 1.8969, "step": 400 }, { "epoch": 0.07, "learning_rate": 4.3493240153753665e-06, "loss": 1.9238, "step": 410 }, { "epoch": 0.07, "learning_rate": 4.366893424956263e-06, "loss": 1.8946, "step": 420 }, { "epoch": 0.07, "learning_rate": 4.38404645837504e-06, "loss": 1.8781, "step": 430 }, { "epoch": 0.07, "learning_rate": 4.400802394950703e-06, "loss": 1.8955, "step": 440 }, { "epoch": 0.07, "learning_rate": 4.4171792052198945e-06, "loss": 1.8515, "step": 450 }, { "epoch": 0.08, "learning_rate": 4.433193666783084e-06, "loss": 1.8978, "step": 460 }, { "epoch": 0.08, "learning_rate": 4.448861467610187e-06, "loss": 1.889, "step": 470 }, { "epoch": 0.08, "learning_rate": 4.4641972984001906e-06, "loss": 1.8667, "step": 480 }, { "epoch": 0.08, "learning_rate": 4.479214935357724e-06, "loss": 1.9304, "step": 490 }, { "epoch": 0.08, "learning_rate": 4.493927314555554e-06, "loss": 1.9042, "step": 500 }, { "epoch": 0.08, "eval_gsm8k_hard_accuracy": 0.8928993119172672, "eval_gsm8k_hard_loss": 0.4951171875, "eval_gsm8k_hard_runtime": 2.1138, "eval_gsm8k_hard_samples_per_second": 124.893, "eval_gsm8k_hard_steps_per_second": 8.042, "step": 500 }, { "epoch": 0.08, "eval_webgpt_accuracy": 0.4654303097674511, "eval_webgpt_loss": 2.478515625, "eval_webgpt_runtime": 13.6298, "eval_webgpt_samples_per_second": 287.312, "eval_webgpt_steps_per_second": 17.975, "step": 500 }, { "epoch": 0.08, "eval_squad_v2_accuracy": 0.8681394546082154, "eval_squad_v2_loss": 0.51806640625, "eval_squad_v2_runtime": 80.2931, "eval_squad_v2_samples_per_second": 324.611, "eval_squad_v2_steps_per_second": 20.288, "step": 500 }, { "epoch": 0.08, "eval_adversarial_qa_accuracy": 0.7833800465144016, "eval_adversarial_qa_loss": 1.310546875, "eval_adversarial_qa_runtime": 19.1554, "eval_adversarial_qa_samples_per_second": 313.228, "eval_adversarial_qa_steps_per_second": 19.577, "step": 500 }, { "epoch": 0.08, "eval_private_tuning_accuracy": 0.6404945703123248, "eval_private_tuning_loss": 1.3779296875, "eval_private_tuning_runtime": 68.286, "eval_private_tuning_samples_per_second": 310.137, "eval_private_tuning_steps_per_second": 19.389, "step": 500 }, { "epoch": 0.08, "eval_oa_translated_accuracy": 0.6488286598439107, "eval_oa_translated_loss": 1.5576171875, "eval_oa_translated_runtime": 524.5762, "eval_oa_translated_samples_per_second": 254.918, "eval_oa_translated_steps_per_second": 15.933, "step": 500 }, { "epoch": 0.08, "eval_prosocial_dialogue_accuracy": 0.52144098641849, "eval_prosocial_dialogue_loss": 1.90625, "eval_prosocial_dialogue_runtime": 90.5414, "eval_prosocial_dialogue_samples_per_second": 298.018, "eval_prosocial_dialogue_steps_per_second": 18.632, "step": 500 }, { "epoch": 0.08, "eval_math_qa_accuracy": 0.5165153098127461, "eval_math_qa_loss": 2.20703125, "eval_math_qa_runtime": 17.7049, "eval_math_qa_samples_per_second": 337.082, "eval_math_qa_steps_per_second": 21.068, "step": 500 }, { "epoch": 0.08, "eval_wikihow_accuracy": 0.5831415499792042, "eval_wikihow_loss": 2.140625, "eval_wikihow_runtime": 9.4415, "eval_wikihow_samples_per_second": 242.863, "eval_wikihow_steps_per_second": 15.252, "step": 500 }, { "epoch": 0.08, "eval_joke_accuracy": 0.45545868081880214, "eval_joke_loss": 2.529296875, "eval_joke_runtime": 1.3918, "eval_joke_samples_per_second": 54.606, "eval_joke_steps_per_second": 3.593, "step": 500 }, { "epoch": 0.08, "eval_gsm8k_accuracy": 0.7111711283077483, "eval_gsm8k_loss": 1.1416015625, "eval_gsm8k_runtime": 6.0874, "eval_gsm8k_samples_per_second": 245.588, "eval_gsm8k_steps_per_second": 15.442, "step": 500 }, { "epoch": 0.08, "eval_ted_trans_en-hi_accuracy": 0.5343350158469304, "eval_ted_trans_en-hi_loss": 2.244140625, "eval_ted_trans_en-hi_runtime": 1.2611, "eval_ted_trans_en-hi_samples_per_second": 81.672, "eval_ted_trans_en-hi_steps_per_second": 5.551, "step": 500 }, { "epoch": 0.08, "eval_ted_trans_de-ja_accuracy": 0.5195722742027878, "eval_ted_trans_de-ja_loss": 2.314453125, "eval_ted_trans_de-ja_runtime": 3.7052, "eval_ted_trans_de-ja_samples_per_second": 193.783, "eval_ted_trans_de-ja_steps_per_second": 12.145, "step": 500 }, { "epoch": 0.08, "eval_ted_trans_nl-en_accuracy": 0.6433630400125447, "eval_ted_trans_nl-en_loss": 1.7353515625, "eval_ted_trans_nl-en_runtime": 3.5777, "eval_ted_trans_nl-en_samples_per_second": 215.5, "eval_ted_trans_nl-en_steps_per_second": 13.696, "step": 500 }, { "epoch": 0.08, "eval_ted_trans_en-ja_accuracy": 0.5440905817396176, "eval_ted_trans_en-ja_loss": 2.13671875, "eval_ted_trans_en-ja_runtime": 3.7065, "eval_ted_trans_en-ja_samples_per_second": 216.109, "eval_ted_trans_en-ja_steps_per_second": 13.76, "step": 500 }, { "epoch": 0.08, "eval_ted_trans_en-es_accuracy": 0.7055326931870142, "eval_ted_trans_en-es_loss": 1.3369140625, "eval_ted_trans_en-es_runtime": 3.3519, "eval_ted_trans_en-es_samples_per_second": 246.427, "eval_ted_trans_en-es_steps_per_second": 15.514, "step": 500 }, { "epoch": 0.08, "eval_ted_trans_en-ms_accuracy": 0.5517070757050965, "eval_ted_trans_en-ms_loss": 2.32421875, "eval_ted_trans_en-ms_runtime": 0.8373, "eval_ted_trans_en-ms_samples_per_second": 50.159, "eval_ted_trans_en-ms_steps_per_second": 3.583, "step": 500 }, { "epoch": 0.08, "eval_xsum_accuracy": 0.5663549372439451, "eval_xsum_loss": NaN, "eval_xsum_runtime": 140.8411, "eval_xsum_samples_per_second": 289.752, "eval_xsum_steps_per_second": 18.113, "step": 500 }, { "epoch": 0.08, "eval_cnn_dailymail_accuracy": 0.6501961820900207, "eval_cnn_dailymail_loss": NaN, "eval_cnn_dailymail_runtime": 207.5377, "eval_cnn_dailymail_samples_per_second": 276.687, "eval_cnn_dailymail_steps_per_second": 17.293, "step": 500 }, { "epoch": 0.08, "eval_multi_news_accuracy": 0.5168385769568282, "eval_multi_news_loss": NaN, "eval_multi_news_runtime": 36.1577, "eval_multi_news_samples_per_second": 248.771, "eval_multi_news_steps_per_second": 15.571, "step": 500 }, { "epoch": 0.08, "eval_tldr_news_accuracy": 0.5048904354368475, "eval_tldr_news_loss": 2.384765625, "eval_tldr_news_runtime": 5.9032, "eval_tldr_news_samples_per_second": 241.901, "eval_tldr_news_steps_per_second": 15.246, "step": 500 }, { "epoch": 0.08, "eval_scitldr_accuracy": 0.5, "eval_scitldr_loss": NaN, "eval_scitldr_runtime": 2.3974, "eval_scitldr_samples_per_second": 166.428, "eval_scitldr_steps_per_second": 10.428, "step": 500 }, { "epoch": 0.08, "eval_samsum_accuracy": 0.5789020336200051, "eval_samsum_loss": 1.619140625, "eval_samsum_runtime": 10.1073, "eval_samsum_samples_per_second": 291.572, "eval_samsum_steps_per_second": 18.304, "step": 500 }, { "epoch": 0.08, "eval_debate_sum_accuracy": 0.9321960723793357, "eval_debate_sum_loss": NaN, "eval_debate_sum_runtime": 188.2954, "eval_debate_sum_samples_per_second": 255.524, "eval_debate_sum_steps_per_second": 15.975, "step": 500 }, { "epoch": 0.08, "eval_billsum_accuracy": 0.6453599014888616, "eval_billsum_loss": NaN, "eval_billsum_runtime": 22.3683, "eval_billsum_samples_per_second": 169.436, "eval_billsum_steps_per_second": 10.595, "step": 500 }, { "epoch": 0.08, "eval_wmt2019_zh-en_accuracy": 0.5524590644131345, "eval_wmt2019_zh-en_loss": 2.146484375, "eval_wmt2019_zh-en_runtime": 13.8635, "eval_wmt2019_zh-en_samples_per_second": 287.158, "eval_wmt2019_zh-en_steps_per_second": 17.961, "step": 500 }, { "epoch": 0.08, "eval_wmt2019_ru-en_accuracy": 0.6308636370293347, "eval_wmt2019_ru-en_loss": 1.580078125, "eval_wmt2019_ru-en_runtime": 11.2038, "eval_wmt2019_ru-en_samples_per_second": 267.766, "eval_wmt2019_ru-en_steps_per_second": 16.78, "step": 500 }, { "epoch": 0.08, "eval_wmt2019_de-en_accuracy": 0.657534107930853, "eval_wmt2019_de-en_loss": 1.501953125, "eval_wmt2019_de-en_runtime": 9.454, "eval_wmt2019_de-en_samples_per_second": 317.115, "eval_wmt2019_de-en_steps_per_second": 19.886, "step": 500 }, { "epoch": 0.08, "eval_wmt2019_fr-de_accuracy": 0.6479142094481346, "eval_wmt2019_fr-de_loss": 1.5439453125, "eval_wmt2019_fr-de_runtime": 5.2625, "eval_wmt2019_fr-de_samples_per_second": 287.315, "eval_wmt2019_fr-de_steps_per_second": 18.052, "step": 500 }, { "epoch": 0.08, "eval_essay_instruction_accuracy": 0.5775154438520775, "eval_essay_instruction_loss": 2.09765625, "eval_essay_instruction_runtime": 3.0793, "eval_essay_instruction_samples_per_second": 134.122, "eval_essay_instruction_steps_per_second": 8.443, "step": 500 }, { "epoch": 0.08, "eval_reddit_eli5_accuracy": 0.42260973997710893, "eval_reddit_eli5_loss": 2.76171875, "eval_reddit_eli5_runtime": 203.2121, "eval_reddit_eli5_samples_per_second": 268.326, "eval_reddit_eli5_steps_per_second": 16.771, "step": 500 }, { "epoch": 0.08, "eval_reddit_askh_accuracy": 0.42568767737796204, "eval_reddit_askh_loss": 2.84375, "eval_reddit_askh_runtime": 111.1784, "eval_reddit_askh_samples_per_second": 177.238, "eval_reddit_askh_steps_per_second": 11.081, "step": 500 }, { "epoch": 0.08, "eval_reddit_asks_accuracy": 0.43555163138333913, "eval_reddit_asks_loss": 2.689453125, "eval_reddit_asks_runtime": 119.9403, "eval_reddit_asks_samples_per_second": 219.743, "eval_reddit_asks_steps_per_second": 13.74, "step": 500 }, { "epoch": 0.08, "learning_rate": 4.5083465988888945e-06, "loss": 1.8966, "step": 510 }, { "epoch": 0.09, "learning_rate": 4.5224842384899045e-06, "loss": 1.9039, "step": 520 }, { "epoch": 0.09, "learning_rate": 4.5363510253542444e-06, "loss": 1.9029, "step": 530 }, { "epoch": 0.09, "learning_rate": 4.549957142832593e-06, "loss": 1.8759, "step": 540 }, { "epoch": 0.09, "learning_rate": 4.563312210555719e-06, "loss": 1.9042, "step": 550 }, { "epoch": 0.09, "learning_rate": 4.576425325289549e-06, "loss": 1.9205, "step": 560 }, { "epoch": 0.09, "learning_rate": 4.589305098154845e-06, "loss": 1.9324, "step": 570 }, { "epoch": 0.1, "learning_rate": 4.601959688592886e-06, "loss": 1.8757, "step": 580 }, { "epoch": 0.1, "learning_rate": 4.614396835412691e-06, "loss": 1.895, "step": 590 }, { "epoch": 0.1, "learning_rate": 4.626623885215616e-06, "loss": 1.9004, "step": 600 }, { "epoch": 0.1, "learning_rate": 4.638647818458763e-06, "loss": 1.8705, "step": 610 }, { "epoch": 0.1, "learning_rate": 4.650475273388737e-06, "loss": 1.8929, "step": 620 }, { "epoch": 0.1, "learning_rate": 4.662112568051194e-06, "loss": 1.8745, "step": 630 }, { "epoch": 0.11, "learning_rate": 4.673565720558918e-06, "loss": 1.8768, "step": 640 }, { "epoch": 0.11, "learning_rate": 4.6848404677811685e-06, "loss": 1.885, "step": 650 }, { "epoch": 0.11, "learning_rate": 4.695942282599635e-06, "loss": 1.8496, "step": 660 }, { "epoch": 0.11, "learning_rate": 4.706876389860915e-06, "loss": 1.9061, "step": 670 }, { "epoch": 0.11, "learning_rate": 4.717647781141908e-06, "loss": 1.8839, "step": 680 }, { "epoch": 0.11, "learning_rate": 4.7282612284325845e-06, "loss": 1.921, "step": 690 }, { "epoch": 0.12, "learning_rate": 4.738721296830016e-06, "loss": 1.8519, "step": 700 }, { "epoch": 0.12, "learning_rate": 4.749032356328167e-06, "loss": 1.8901, "step": 710 }, { "epoch": 0.12, "learning_rate": 4.759198592779668e-06, "loss": 1.8678, "step": 720 }, { "epoch": 0.12, "learning_rate": 4.769224018098397e-06, "loss": 1.859, "step": 730 }, { "epoch": 0.12, "learning_rate": 4.7791124797650865e-06, "loss": 1.8315, "step": 740 }, { "epoch": 0.12, "learning_rate": 4.788867669692332e-06, "loss": 1.8915, "step": 750 }, { "epoch": 0.12, "learning_rate": 4.798493132500121e-06, "loss": 1.8936, "step": 760 }, { "epoch": 0.13, "learning_rate": 4.8079922732483016e-06, "loss": 1.8869, "step": 770 }, { "epoch": 0.13, "learning_rate": 4.817368364668191e-06, "loss": 1.8556, "step": 780 }, { "epoch": 0.13, "learning_rate": 4.8266245539317745e-06, "loss": 1.8594, "step": 790 }, { "epoch": 0.13, "learning_rate": 4.835763868993521e-06, "loss": 1.8646, "step": 800 }, { "epoch": 0.13, "learning_rate": 4.844789224536785e-06, "loss": 1.8758, "step": 810 }, { "epoch": 0.13, "learning_rate": 4.853703427554027e-06, "loss": 1.8349, "step": 820 }, { "epoch": 0.14, "learning_rate": 4.862509182587578e-06, "loss": 1.8517, "step": 830 }, { "epoch": 0.14, "learning_rate": 4.871209096655434e-06, "loss": 1.8563, "step": 840 }, { "epoch": 0.14, "learning_rate": 4.879805683884512e-06, "loss": 1.8749, "step": 850 }, { "epoch": 0.14, "learning_rate": 4.888301369871998e-06, "loss": 1.8276, "step": 860 }, { "epoch": 0.14, "learning_rate": 4.8966984957936845e-06, "loss": 1.7967, "step": 870 }, { "epoch": 0.14, "learning_rate": 4.904999322276735e-06, "loss": 1.9041, "step": 880 }, { "epoch": 0.15, "learning_rate": 4.913206033052878e-06, "loss": 1.8417, "step": 890 }, { "epoch": 0.15, "learning_rate": 4.921320738406821e-06, "loss": 1.8611, "step": 900 }, { "epoch": 0.15, "learning_rate": 4.929345478433492e-06, "loss": 1.8924, "step": 910 }, { "epoch": 0.15, "learning_rate": 4.937282226116702e-06, "loss": 1.8684, "step": 920 }, { "epoch": 0.15, "learning_rate": 4.945132890240829e-06, "loss": 1.8292, "step": 930 }, { "epoch": 0.15, "learning_rate": 4.952899318146298e-06, "loss": 1.8498, "step": 940 }, { "epoch": 0.16, "learning_rate": 4.96058329833879e-06, "loss": 1.8944, "step": 950 }, { "epoch": 0.16, "learning_rate": 4.968186562961406e-06, "loss": 1.885, "step": 960 }, { "epoch": 0.16, "learning_rate": 4.975710790138337e-06, "loss": 1.8469, "step": 970 }, { "epoch": 0.16, "learning_rate": 4.9831576061979556e-06, "loss": 1.8536, "step": 980 }, { "epoch": 0.16, "learning_rate": 4.990528587782728e-06, "loss": 1.8514, "step": 990 }, { "epoch": 0.16, "learning_rate": 4.99782526385276e-06, "loss": 1.8168, "step": 1000 }, { "epoch": 0.16, "eval_gsm8k_hard_accuracy": 0.9009201579740238, "eval_gsm8k_hard_loss": 0.44189453125, "eval_gsm8k_hard_runtime": 1.5125, "eval_gsm8k_hard_samples_per_second": 174.543, "eval_gsm8k_hard_steps_per_second": 11.24, "step": 1000 }, { "epoch": 0.16, "eval_webgpt_accuracy": 0.4676998865211623, "eval_webgpt_loss": 2.44921875, "eval_webgpt_runtime": 15.1394, "eval_webgpt_samples_per_second": 258.664, "eval_webgpt_steps_per_second": 16.183, "step": 1000 }, { "epoch": 0.16, "eval_squad_v2_accuracy": 0.8740146386480032, "eval_squad_v2_loss": 0.456298828125, "eval_squad_v2_runtime": 77.6449, "eval_squad_v2_samples_per_second": 335.682, "eval_squad_v2_steps_per_second": 20.98, "step": 1000 }, { "epoch": 0.16, "eval_adversarial_qa_accuracy": 0.7841552865406405, "eval_adversarial_qa_loss": 1.16796875, "eval_adversarial_qa_runtime": 20.3196, "eval_adversarial_qa_samples_per_second": 295.282, "eval_adversarial_qa_steps_per_second": 18.455, "step": 1000 }, { "epoch": 0.16, "eval_private_tuning_accuracy": 0.6468452789452516, "eval_private_tuning_loss": 1.33203125, "eval_private_tuning_runtime": 62.217, "eval_private_tuning_samples_per_second": 340.389, "eval_private_tuning_steps_per_second": 21.28, "step": 1000 }, { "epoch": 0.16, "eval_oa_translated_accuracy": 0.6605713712776647, "eval_oa_translated_loss": 1.4912109375, "eval_oa_translated_runtime": 498.0305, "eval_oa_translated_samples_per_second": 268.506, "eval_oa_translated_steps_per_second": 16.782, "step": 1000 }, { "epoch": 0.16, "eval_prosocial_dialogue_accuracy": 0.5267998067934173, "eval_prosocial_dialogue_loss": 1.9033203125, "eval_prosocial_dialogue_runtime": 126.2272, "eval_prosocial_dialogue_samples_per_second": 213.765, "eval_prosocial_dialogue_steps_per_second": 13.365, "step": 1000 }, { "epoch": 0.16, "eval_math_qa_accuracy": 0.5343074095293895, "eval_math_qa_loss": 2.080078125, "eval_math_qa_runtime": 19.3631, "eval_math_qa_samples_per_second": 308.215, "eval_math_qa_steps_per_second": 19.263, "step": 1000 }, { "epoch": 0.16, "eval_wikihow_accuracy": 0.5909261056425897, "eval_wikihow_loss": 2.078125, "eval_wikihow_runtime": 7.7313, "eval_wikihow_samples_per_second": 296.588, "eval_wikihow_steps_per_second": 18.626, "step": 1000 }, { "epoch": 0.16, "eval_joke_accuracy": 0.45830174374526156, "eval_joke_loss": 2.498046875, "eval_joke_runtime": 2.2389, "eval_joke_samples_per_second": 33.945, "eval_joke_steps_per_second": 2.233, "step": 1000 }, { "epoch": 0.16, "eval_gsm8k_accuracy": 0.7258224765956256, "eval_gsm8k_loss": 1.0576171875, "eval_gsm8k_runtime": 6.1435, "eval_gsm8k_samples_per_second": 243.346, "eval_gsm8k_steps_per_second": 15.301, "step": 1000 }, { "epoch": 0.16, "eval_ted_trans_en-hi_accuracy": 0.5460148777895856, "eval_ted_trans_en-hi_loss": 2.138671875, "eval_ted_trans_en-hi_runtime": 0.5653, "eval_ted_trans_en-hi_samples_per_second": 182.218, "eval_ted_trans_en-hi_steps_per_second": 12.384, "step": 1000 }, { "epoch": 0.16, "eval_ted_trans_de-ja_accuracy": 0.5330968145857443, "eval_ted_trans_de-ja_loss": 2.193359375, "eval_ted_trans_de-ja_runtime": 3.858, "eval_ted_trans_de-ja_samples_per_second": 186.106, "eval_ted_trans_de-ja_steps_per_second": 11.664, "step": 1000 }, { "epoch": 0.16, "eval_ted_trans_nl-en_accuracy": 0.6414471117584333, "eval_ted_trans_nl-en_loss": 1.689453125, "eval_ted_trans_nl-en_runtime": 3.1044, "eval_ted_trans_nl-en_samples_per_second": 248.36, "eval_ted_trans_nl-en_steps_per_second": 15.784, "step": 1000 }, { "epoch": 0.16, "eval_ted_trans_en-ja_accuracy": 0.5530009680542111, "eval_ted_trans_en-ja_loss": 2.05859375, "eval_ted_trans_en-ja_runtime": 3.8822, "eval_ted_trans_en-ja_samples_per_second": 206.324, "eval_ted_trans_en-ja_steps_per_second": 13.137, "step": 1000 }, { "epoch": 0.16, "eval_ted_trans_en-es_accuracy": 0.7106248418922337, "eval_ted_trans_en-es_loss": 1.2880859375, "eval_ted_trans_en-es_runtime": 2.9435, "eval_ted_trans_en-es_samples_per_second": 280.621, "eval_ted_trans_en-es_steps_per_second": 17.666, "step": 1000 }, { "epoch": 0.16, "eval_ted_trans_en-ms_accuracy": 0.5987135081642752, "eval_ted_trans_en-ms_loss": 1.984375, "eval_ted_trans_en-ms_runtime": 1.4559, "eval_ted_trans_en-ms_samples_per_second": 28.848, "eval_ted_trans_en-ms_steps_per_second": 2.061, "step": 1000 }, { "epoch": 0.16, "eval_xsum_accuracy": 0.5722584941442159, "eval_xsum_loss": NaN, "eval_xsum_runtime": 141.7203, "eval_xsum_samples_per_second": 287.955, "eval_xsum_steps_per_second": 18.0, "step": 1000 }, { "epoch": 0.16, "eval_cnn_dailymail_accuracy": 0.6576155822271417, "eval_cnn_dailymail_loss": NaN, "eval_cnn_dailymail_runtime": 209.5351, "eval_cnn_dailymail_samples_per_second": 274.05, "eval_cnn_dailymail_steps_per_second": 17.128, "step": 1000 }, { "epoch": 0.16, "eval_multi_news_accuracy": 0.5226291863275405, "eval_multi_news_loss": NaN, "eval_multi_news_runtime": 36.8798, "eval_multi_news_samples_per_second": 243.9, "eval_multi_news_steps_per_second": 15.266, "step": 1000 }, { "epoch": 0.16, "eval_tldr_news_accuracy": 0.5359729145114267, "eval_tldr_news_loss": 2.20703125, "eval_tldr_news_runtime": 4.9335, "eval_tldr_news_samples_per_second": 289.451, "eval_tldr_news_steps_per_second": 18.243, "step": 1000 }, { "epoch": 0.16, "eval_scitldr_accuracy": 0.49700598802395207, "eval_scitldr_loss": NaN, "eval_scitldr_runtime": 1.5917, "eval_scitldr_samples_per_second": 250.67, "eval_scitldr_steps_per_second": 15.706, "step": 1000 }, { "epoch": 0.16, "eval_samsum_accuracy": 0.590799585469913, "eval_samsum_loss": 1.5537109375, "eval_samsum_runtime": 10.6642, "eval_samsum_samples_per_second": 276.345, "eval_samsum_steps_per_second": 17.348, "step": 1000 }, { "epoch": 0.16, "eval_debate_sum_accuracy": 0.9329163674973446, "eval_debate_sum_loss": NaN, "eval_debate_sum_runtime": 196.1179, "eval_debate_sum_samples_per_second": 245.332, "eval_debate_sum_steps_per_second": 15.338, "step": 1000 }, { "epoch": 0.16, "eval_billsum_accuracy": 0.6510711811280909, "eval_billsum_loss": NaN, "eval_billsum_runtime": 16.6536, "eval_billsum_samples_per_second": 227.579, "eval_billsum_steps_per_second": 14.231, "step": 1000 }, { "epoch": 0.16, "eval_wmt2019_zh-en_accuracy": 0.5587294145226513, "eval_wmt2019_zh-en_loss": 2.111328125, "eval_wmt2019_zh-en_runtime": 12.8767, "eval_wmt2019_zh-en_samples_per_second": 309.164, "eval_wmt2019_zh-en_steps_per_second": 19.337, "step": 1000 }, { "epoch": 0.16, "eval_wmt2019_ru-en_accuracy": 0.6366095054310483, "eval_wmt2019_ru-en_loss": 1.552734375, "eval_wmt2019_ru-en_runtime": 10.1123, "eval_wmt2019_ru-en_samples_per_second": 296.667, "eval_wmt2019_ru-en_steps_per_second": 18.591, "step": 1000 }, { "epoch": 0.16, "eval_wmt2019_de-en_accuracy": 0.6681106028096029, "eval_wmt2019_de-en_loss": 1.4462890625, "eval_wmt2019_de-en_runtime": 9.913, "eval_wmt2019_de-en_samples_per_second": 302.432, "eval_wmt2019_de-en_steps_per_second": 18.965, "step": 1000 }, { "epoch": 0.16, "eval_wmt2019_fr-de_accuracy": 0.6540822971254923, "eval_wmt2019_fr-de_loss": 1.51171875, "eval_wmt2019_fr-de_runtime": 5.7364, "eval_wmt2019_fr-de_samples_per_second": 263.579, "eval_wmt2019_fr-de_steps_per_second": 16.561, "step": 1000 }, { "epoch": 0.16, "eval_essay_instruction_accuracy": 0.5807311500380807, "eval_essay_instruction_loss": 2.072265625, "eval_essay_instruction_runtime": 4.2906, "eval_essay_instruction_samples_per_second": 96.257, "eval_essay_instruction_steps_per_second": 6.06, "step": 1000 }, { "epoch": 0.16, "eval_reddit_eli5_accuracy": 0.42394149731958486, "eval_reddit_eli5_loss": 2.748046875, "eval_reddit_eli5_runtime": 220.7861, "eval_reddit_eli5_samples_per_second": 246.968, "eval_reddit_eli5_steps_per_second": 15.436, "step": 1000 }, { "epoch": 0.16, "eval_reddit_askh_accuracy": 0.42705794870139985, "eval_reddit_askh_loss": 2.826171875, "eval_reddit_askh_runtime": 106.0159, "eval_reddit_askh_samples_per_second": 185.868, "eval_reddit_askh_steps_per_second": 11.621, "step": 1000 }, { "epoch": 0.16, "eval_reddit_asks_accuracy": 0.43686793419350517, "eval_reddit_asks_loss": 2.67578125, "eval_reddit_asks_runtime": 110.0537, "eval_reddit_asks_samples_per_second": 239.483, "eval_reddit_asks_steps_per_second": 14.975, "step": 1000 }, { "epoch": 0.17, "learning_rate": 4.997313753581662e-06, "loss": 1.8558, "step": 1010 }, { "epoch": 0.17, "learning_rate": 4.992836676217766e-06, "loss": 1.862, "step": 1020 }, { "epoch": 0.17, "learning_rate": 4.988359598853868e-06, "loss": 1.8667, "step": 1030 }, { "epoch": 0.17, "learning_rate": 4.9838825214899716e-06, "loss": 1.8597, "step": 1040 }, { "epoch": 0.17, "learning_rate": 4.979405444126075e-06, "loss": 1.8472, "step": 1050 }, { "epoch": 0.17, "learning_rate": 4.974928366762178e-06, "loss": 1.8552, "step": 1060 }, { "epoch": 0.18, "learning_rate": 4.9704512893982816e-06, "loss": 1.8532, "step": 1070 }, { "epoch": 0.18, "learning_rate": 4.965974212034385e-06, "loss": 1.8235, "step": 1080 }, { "epoch": 0.18, "learning_rate": 4.961497134670487e-06, "loss": 1.861, "step": 1090 }, { "epoch": 0.18, "learning_rate": 4.957020057306591e-06, "loss": 1.7964, "step": 1100 }, { "epoch": 0.18, "learning_rate": 4.952542979942694e-06, "loss": 1.8203, "step": 1110 }, { "epoch": 0.18, "learning_rate": 4.9480659025787965e-06, "loss": 1.8759, "step": 1120 }, { "epoch": 0.19, "learning_rate": 4.9435888252149e-06, "loss": 1.8592, "step": 1130 }, { "epoch": 0.19, "learning_rate": 4.939111747851003e-06, "loss": 1.8487, "step": 1140 }, { "epoch": 0.19, "learning_rate": 4.9346346704871065e-06, "loss": 1.8112, "step": 1150 }, { "epoch": 0.19, "learning_rate": 4.930157593123209e-06, "loss": 1.8028, "step": 1160 }, { "epoch": 0.19, "learning_rate": 4.925680515759313e-06, "loss": 1.8361, "step": 1170 }, { "epoch": 0.19, "learning_rate": 4.921203438395416e-06, "loss": 1.8625, "step": 1180 }, { "epoch": 0.2, "learning_rate": 4.916726361031519e-06, "loss": 1.8345, "step": 1190 }, { "epoch": 0.2, "learning_rate": 4.912249283667622e-06, "loss": 1.8506, "step": 1200 }, { "epoch": 0.2, "learning_rate": 4.907772206303726e-06, "loss": 1.8326, "step": 1210 }, { "epoch": 0.2, "learning_rate": 4.903295128939828e-06, "loss": 1.8399, "step": 1220 }, { "epoch": 0.2, "learning_rate": 4.8988180515759315e-06, "loss": 1.8706, "step": 1230 }, { "epoch": 0.2, "learning_rate": 4.894340974212035e-06, "loss": 1.8227, "step": 1240 }, { "epoch": 0.21, "learning_rate": 4.889863896848137e-06, "loss": 1.8461, "step": 1250 }, { "epoch": 0.21, "learning_rate": 4.8853868194842415e-06, "loss": 1.874, "step": 1260 }, { "epoch": 0.21, "learning_rate": 4.880909742120344e-06, "loss": 1.8178, "step": 1270 }, { "epoch": 0.21, "learning_rate": 4.876432664756447e-06, "loss": 1.8141, "step": 1280 }, { "epoch": 0.21, "learning_rate": 4.871955587392551e-06, "loss": 1.8258, "step": 1290 }, { "epoch": 0.21, "learning_rate": 4.867478510028654e-06, "loss": 1.8595, "step": 1300 }, { "epoch": 0.22, "learning_rate": 4.8630014326647565e-06, "loss": 1.8495, "step": 1310 }, { "epoch": 0.22, "learning_rate": 4.85852435530086e-06, "loss": 1.8492, "step": 1320 }, { "epoch": 0.22, "learning_rate": 4.854047277936963e-06, "loss": 1.8339, "step": 1330 }, { "epoch": 0.22, "learning_rate": 4.849570200573066e-06, "loss": 1.8218, "step": 1340 }, { "epoch": 0.22, "learning_rate": 4.84509312320917e-06, "loss": 1.8411, "step": 1350 }, { "epoch": 0.22, "learning_rate": 4.840616045845273e-06, "loss": 1.8275, "step": 1360 }, { "epoch": 0.23, "learning_rate": 4.836138968481376e-06, "loss": 1.8358, "step": 1370 }, { "epoch": 0.23, "learning_rate": 4.831661891117479e-06, "loss": 1.8395, "step": 1380 }, { "epoch": 0.23, "learning_rate": 4.827184813753582e-06, "loss": 1.8114, "step": 1390 }, { "epoch": 0.23, "learning_rate": 4.822707736389685e-06, "loss": 1.8167, "step": 1400 }, { "epoch": 0.23, "learning_rate": 4.818230659025788e-06, "loss": 1.8502, "step": 1410 }, { "epoch": 0.23, "learning_rate": 4.8137535816618915e-06, "loss": 1.8494, "step": 1420 }, { "epoch": 0.24, "learning_rate": 4.809276504297995e-06, "loss": 1.8546, "step": 1430 }, { "epoch": 0.24, "learning_rate": 4.804799426934098e-06, "loss": 1.8219, "step": 1440 }, { "epoch": 0.24, "learning_rate": 4.8003223495702015e-06, "loss": 1.856, "step": 1450 }, { "epoch": 0.24, "learning_rate": 4.795845272206304e-06, "loss": 1.8235, "step": 1460 }, { "epoch": 0.24, "learning_rate": 4.791368194842407e-06, "loss": 1.8084, "step": 1470 }, { "epoch": 0.24, "learning_rate": 4.786891117478511e-06, "loss": 1.8036, "step": 1480 }, { "epoch": 0.24, "learning_rate": 4.782414040114613e-06, "loss": 1.807, "step": 1490 }, { "epoch": 0.25, "learning_rate": 4.7779369627507165e-06, "loss": 1.8223, "step": 1500 }, { "epoch": 0.25, "eval_gsm8k_hard_accuracy": 0.9040755669557429, "eval_gsm8k_hard_loss": 0.418212890625, "eval_gsm8k_hard_runtime": 1.5226, "eval_gsm8k_hard_samples_per_second": 173.393, "eval_gsm8k_hard_steps_per_second": 11.165, "step": 1500 }, { "epoch": 0.25, "eval_webgpt_accuracy": 0.4685187206997972, "eval_webgpt_loss": 2.43359375, "eval_webgpt_runtime": 16.9148, "eval_webgpt_samples_per_second": 231.513, "eval_webgpt_steps_per_second": 14.484, "step": 1500 }, { "epoch": 0.25, "eval_squad_v2_accuracy": 0.8753178869062296, "eval_squad_v2_loss": 0.424560546875, "eval_squad_v2_runtime": 78.195, "eval_squad_v2_samples_per_second": 333.32, "eval_squad_v2_steps_per_second": 20.833, "step": 1500 }, { "epoch": 0.25, "eval_adversarial_qa_accuracy": 0.7873755143419405, "eval_adversarial_qa_loss": 1.076171875, "eval_adversarial_qa_runtime": 18.1293, "eval_adversarial_qa_samples_per_second": 330.956, "eval_adversarial_qa_steps_per_second": 20.685, "step": 1500 }, { "epoch": 0.25, "eval_private_tuning_accuracy": 0.6490207424086791, "eval_private_tuning_loss": 1.30859375, "eval_private_tuning_runtime": 65.2643, "eval_private_tuning_samples_per_second": 324.496, "eval_private_tuning_steps_per_second": 20.287, "step": 1500 }, { "epoch": 0.25, "eval_oa_translated_accuracy": 0.6676994033045951, "eval_oa_translated_loss": 1.447265625, "eval_oa_translated_runtime": 495.9674, "eval_oa_translated_samples_per_second": 269.623, "eval_oa_translated_steps_per_second": 16.852, "step": 1500 }, { "epoch": 0.25, "eval_prosocial_dialogue_accuracy": 0.52370774081413, "eval_prosocial_dialogue_loss": 1.802734375, "eval_prosocial_dialogue_runtime": 117.5764, "eval_prosocial_dialogue_samples_per_second": 229.493, "eval_prosocial_dialogue_steps_per_second": 14.348, "step": 1500 }, { "epoch": 0.25, "eval_math_qa_accuracy": 0.5447965302424216, "eval_math_qa_loss": 2.015625, "eval_math_qa_runtime": 19.5318, "eval_math_qa_samples_per_second": 305.553, "eval_math_qa_steps_per_second": 19.097, "step": 1500 }, { "epoch": 0.25, "eval_wikihow_accuracy": 0.5924303341189519, "eval_wikihow_loss": 2.048828125, "eval_wikihow_runtime": 8.7505, "eval_wikihow_samples_per_second": 262.042, "eval_wikihow_steps_per_second": 16.456, "step": 1500 }, { "epoch": 0.25, "eval_joke_accuracy": 0.46341925701288855, "eval_joke_loss": 2.439453125, "eval_joke_runtime": 0.9638, "eval_joke_samples_per_second": 78.857, "eval_joke_steps_per_second": 5.188, "step": 1500 }, { "epoch": 0.25, "eval_gsm8k_accuracy": 0.7337848616728005, "eval_gsm8k_loss": 1.01171875, "eval_gsm8k_runtime": 6.3369, "eval_gsm8k_samples_per_second": 235.921, "eval_gsm8k_steps_per_second": 14.834, "step": 1500 }, { "epoch": 0.25, "eval_ted_trans_en-hi_accuracy": 0.5467649647887324, "eval_ted_trans_en-hi_loss": 2.080078125, "eval_ted_trans_en-hi_runtime": 0.6116, "eval_ted_trans_en-hi_samples_per_second": 168.402, "eval_ted_trans_en-hi_steps_per_second": 11.445, "step": 1500 }, { "epoch": 0.25, "eval_ted_trans_de-ja_accuracy": 0.5412865271482303, "eval_ted_trans_de-ja_loss": 2.130859375, "eval_ted_trans_de-ja_runtime": 3.781, "eval_ted_trans_de-ja_samples_per_second": 189.896, "eval_ted_trans_de-ja_steps_per_second": 11.902, "step": 1500 }, { "epoch": 0.25, "eval_ted_trans_nl-en_accuracy": 0.6488075112486671, "eval_ted_trans_nl-en_loss": 1.65625, "eval_ted_trans_nl-en_runtime": 3.2481, "eval_ted_trans_nl-en_samples_per_second": 237.369, "eval_ted_trans_nl-en_steps_per_second": 15.086, "step": 1500 }, { "epoch": 0.25, "eval_ted_trans_en-ja_accuracy": 0.5640618403329245, "eval_ted_trans_en-ja_loss": 1.990234375, "eval_ted_trans_en-ja_runtime": 3.5175, "eval_ted_trans_en-ja_samples_per_second": 227.717, "eval_ted_trans_en-ja_steps_per_second": 14.499, "step": 1500 }, { "epoch": 0.25, "eval_ted_trans_en-es_accuracy": 0.7159835441109249, "eval_ted_trans_en-es_loss": 1.25, "eval_ted_trans_en-es_runtime": 3.1192, "eval_ted_trans_en-es_samples_per_second": 264.814, "eval_ted_trans_en-es_steps_per_second": 16.671, "step": 1500 }, { "epoch": 0.25, "eval_ted_trans_en-ms_accuracy": 0.5680356259277586, "eval_ted_trans_en-ms_loss": 2.125, "eval_ted_trans_en-ms_runtime": 1.4378, "eval_ted_trans_en-ms_samples_per_second": 29.212, "eval_ted_trans_en-ms_steps_per_second": 2.087, "step": 1500 }, { "epoch": 0.25, "eval_xsum_accuracy": 0.575791398307109, "eval_xsum_loss": NaN, "eval_xsum_runtime": 142.8893, "eval_xsum_samples_per_second": 285.599, "eval_xsum_steps_per_second": 17.853, "step": 1500 }, { "epoch": 0.25, "eval_cnn_dailymail_accuracy": 0.6578154814514997, "eval_cnn_dailymail_loss": NaN, "eval_cnn_dailymail_runtime": 210.3199, "eval_cnn_dailymail_samples_per_second": 273.027, "eval_cnn_dailymail_steps_per_second": 17.064, "step": 1500 }, { "epoch": 0.25, "eval_multi_news_accuracy": 0.5236211410651092, "eval_multi_news_loss": NaN, "eval_multi_news_runtime": 35.5484, "eval_multi_news_samples_per_second": 253.035, "eval_multi_news_steps_per_second": 15.838, "step": 1500 }, { "epoch": 0.25, "eval_tldr_news_accuracy": 0.5471644879149816, "eval_tldr_news_loss": 2.119140625, "eval_tldr_news_runtime": 4.1143, "eval_tldr_news_samples_per_second": 347.078, "eval_tldr_news_steps_per_second": 21.875, "step": 1500 }, { "epoch": 0.25, "eval_scitldr_accuracy": 0.49550898203592814, "eval_scitldr_loss": NaN, "eval_scitldr_runtime": 2.4172, "eval_scitldr_samples_per_second": 165.069, "eval_scitldr_steps_per_second": 10.343, "step": 1500 }, { "epoch": 0.25, "eval_samsum_accuracy": 0.5926165192931454, "eval_samsum_loss": 1.5224609375, "eval_samsum_runtime": 10.6814, "eval_samsum_samples_per_second": 275.901, "eval_samsum_steps_per_second": 17.32, "step": 1500 }, { "epoch": 0.25, "eval_debate_sum_accuracy": 0.9358983757089394, "eval_debate_sum_loss": NaN, "eval_debate_sum_runtime": 196.0638, "eval_debate_sum_samples_per_second": 245.4, "eval_debate_sum_steps_per_second": 15.342, "step": 1500 }, { "epoch": 0.25, "eval_billsum_accuracy": 0.653463309552768, "eval_billsum_loss": NaN, "eval_billsum_runtime": 16.6514, "eval_billsum_samples_per_second": 227.609, "eval_billsum_steps_per_second": 14.233, "step": 1500 }, { "epoch": 0.25, "eval_wmt2019_zh-en_accuracy": 0.5612179149240267, "eval_wmt2019_zh-en_loss": 2.091796875, "eval_wmt2019_zh-en_runtime": 13.0415, "eval_wmt2019_zh-en_samples_per_second": 305.255, "eval_wmt2019_zh-en_steps_per_second": 19.093, "step": 1500 }, { "epoch": 0.25, "eval_wmt2019_ru-en_accuracy": 0.6424937502741108, "eval_wmt2019_ru-en_loss": 1.5146484375, "eval_wmt2019_ru-en_runtime": 9.2157, "eval_wmt2019_ru-en_samples_per_second": 325.531, "eval_wmt2019_ru-en_steps_per_second": 20.4, "step": 1500 }, { "epoch": 0.25, "eval_wmt2019_de-en_accuracy": 0.6710659487470143, "eval_wmt2019_de-en_loss": 1.4248046875, "eval_wmt2019_de-en_runtime": 10.6387, "eval_wmt2019_de-en_samples_per_second": 281.801, "eval_wmt2019_de-en_steps_per_second": 17.671, "step": 1500 }, { "epoch": 0.25, "eval_wmt2019_fr-de_accuracy": 0.6660826692300537, "eval_wmt2019_fr-de_loss": 1.4521484375, "eval_wmt2019_fr-de_runtime": 5.6356, "eval_wmt2019_fr-de_samples_per_second": 268.295, "eval_wmt2019_fr-de_steps_per_second": 16.857, "step": 1500 }, { "epoch": 0.25, "eval_essay_instruction_accuracy": 0.5827727003469578, "eval_essay_instruction_loss": 2.05859375, "eval_essay_instruction_runtime": 4.1866, "eval_essay_instruction_samples_per_second": 98.649, "eval_essay_instruction_steps_per_second": 6.21, "step": 1500 }, { "epoch": 0.25, "eval_reddit_eli5_accuracy": 0.42421905510230506, "eval_reddit_eli5_loss": 2.73828125, "eval_reddit_eli5_runtime": 199.4145, "eval_reddit_eli5_samples_per_second": 273.436, "eval_reddit_eli5_steps_per_second": 17.09, "step": 1500 }, { "epoch": 0.25, "eval_reddit_askh_accuracy": 0.4276221576585814, "eval_reddit_askh_loss": 2.81640625, "eval_reddit_askh_runtime": 110.2213, "eval_reddit_askh_samples_per_second": 178.777, "eval_reddit_askh_steps_per_second": 11.178, "step": 1500 }, { "epoch": 0.25, "eval_reddit_asks_accuracy": 0.4372012246587393, "eval_reddit_asks_loss": 2.66796875, "eval_reddit_asks_runtime": 131.6507, "eval_reddit_asks_samples_per_second": 200.196, "eval_reddit_asks_steps_per_second": 12.518, "step": 1500 }, { "epoch": 0.25, "learning_rate": 4.77345988538682e-06, "loss": 1.8033, "step": 1510 }, { "epoch": 0.25, "learning_rate": 4.768982808022923e-06, "loss": 1.8477, "step": 1520 }, { "epoch": 0.25, "learning_rate": 4.7645057306590265e-06, "loss": 1.8417, "step": 1530 }, { "epoch": 0.25, "learning_rate": 4.76002865329513e-06, "loss": 1.7781, "step": 1540 }, { "epoch": 0.25, "learning_rate": 4.755551575931232e-06, "loss": 1.808, "step": 1550 }, { "epoch": 0.26, "learning_rate": 4.751074498567336e-06, "loss": 1.8719, "step": 1560 }, { "epoch": 0.26, "learning_rate": 4.746597421203439e-06, "loss": 1.8382, "step": 1570 }, { "epoch": 0.26, "learning_rate": 4.742120343839542e-06, "loss": 1.7991, "step": 1580 }, { "epoch": 0.26, "learning_rate": 4.737643266475645e-06, "loss": 1.809, "step": 1590 }, { "epoch": 0.26, "learning_rate": 4.733166189111748e-06, "loss": 1.8206, "step": 1600 }, { "epoch": 0.26, "learning_rate": 4.7286891117478515e-06, "loss": 1.8475, "step": 1610 }, { "epoch": 0.27, "learning_rate": 4.724212034383955e-06, "loss": 1.8342, "step": 1620 }, { "epoch": 0.27, "learning_rate": 4.719734957020058e-06, "loss": 1.8436, "step": 1630 }, { "epoch": 0.27, "learning_rate": 4.715257879656161e-06, "loss": 1.8198, "step": 1640 }, { "epoch": 0.27, "learning_rate": 4.710780802292264e-06, "loss": 1.8271, "step": 1650 }, { "epoch": 0.27, "learning_rate": 4.706303724928367e-06, "loss": 1.8584, "step": 1660 }, { "epoch": 0.27, "learning_rate": 4.701826647564471e-06, "loss": 1.8485, "step": 1670 }, { "epoch": 0.28, "learning_rate": 4.697349570200573e-06, "loss": 1.7872, "step": 1680 }, { "epoch": 0.28, "learning_rate": 4.6928724928366764e-06, "loss": 1.8026, "step": 1690 }, { "epoch": 0.28, "learning_rate": 4.68839541547278e-06, "loss": 1.8138, "step": 1700 }, { "epoch": 0.28, "learning_rate": 4.683918338108882e-06, "loss": 1.7871, "step": 1710 }, { "epoch": 0.28, "learning_rate": 4.6794412607449864e-06, "loss": 1.8459, "step": 1720 }, { "epoch": 0.28, "learning_rate": 4.67496418338109e-06, "loss": 1.8081, "step": 1730 }, { "epoch": 0.29, "learning_rate": 4.670487106017192e-06, "loss": 1.7956, "step": 1740 }, { "epoch": 0.29, "learning_rate": 4.666010028653296e-06, "loss": 1.779, "step": 1750 }, { "epoch": 0.29, "learning_rate": 4.661532951289399e-06, "loss": 1.8106, "step": 1760 }, { "epoch": 0.29, "learning_rate": 4.657055873925501e-06, "loss": 1.8225, "step": 1770 }, { "epoch": 0.29, "learning_rate": 4.652578796561605e-06, "loss": 1.8263, "step": 1780 }, { "epoch": 0.29, "learning_rate": 4.648101719197708e-06, "loss": 1.8232, "step": 1790 }, { "epoch": 0.3, "learning_rate": 4.643624641833811e-06, "loss": 1.751, "step": 1800 }, { "epoch": 0.3, "learning_rate": 4.639147564469915e-06, "loss": 1.8275, "step": 1810 }, { "epoch": 0.3, "learning_rate": 4.634670487106018e-06, "loss": 1.7738, "step": 1820 }, { "epoch": 0.3, "learning_rate": 4.6301934097421206e-06, "loss": 1.8064, "step": 1830 }, { "epoch": 0.3, "learning_rate": 4.625716332378224e-06, "loss": 1.7775, "step": 1840 }, { "epoch": 0.3, "learning_rate": 4.621239255014327e-06, "loss": 1.799, "step": 1850 }, { "epoch": 0.31, "learning_rate": 4.61676217765043e-06, "loss": 1.8349, "step": 1860 }, { "epoch": 0.31, "learning_rate": 4.612285100286533e-06, "loss": 1.7826, "step": 1870 }, { "epoch": 0.31, "learning_rate": 4.607808022922636e-06, "loss": 1.8322, "step": 1880 }, { "epoch": 0.31, "learning_rate": 4.60333094555874e-06, "loss": 1.8115, "step": 1890 }, { "epoch": 0.31, "learning_rate": 4.598853868194843e-06, "loss": 1.7811, "step": 1900 }, { "epoch": 0.31, "learning_rate": 4.594376790830946e-06, "loss": 1.8113, "step": 1910 }, { "epoch": 0.32, "learning_rate": 4.589899713467049e-06, "loss": 1.82, "step": 1920 }, { "epoch": 0.32, "learning_rate": 4.585422636103152e-06, "loss": 1.8562, "step": 1930 }, { "epoch": 0.32, "learning_rate": 4.5809455587392556e-06, "loss": 1.8197, "step": 1940 }, { "epoch": 0.32, "learning_rate": 4.576468481375359e-06, "loss": 1.8255, "step": 1950 }, { "epoch": 0.32, "learning_rate": 4.571991404011461e-06, "loss": 1.7827, "step": 1960 }, { "epoch": 0.32, "learning_rate": 4.567514326647565e-06, "loss": 1.7943, "step": 1970 }, { "epoch": 0.33, "learning_rate": 4.563037249283668e-06, "loss": 1.7536, "step": 1980 }, { "epoch": 0.33, "learning_rate": 4.558560171919771e-06, "loss": 1.783, "step": 1990 }, { "epoch": 0.33, "learning_rate": 4.554083094555875e-06, "loss": 1.7924, "step": 2000 }, { "epoch": 0.33, "eval_gsm8k_hard_accuracy": 0.9076584829607915, "eval_gsm8k_hard_loss": 0.39697265625, "eval_gsm8k_hard_runtime": 2.2124, "eval_gsm8k_hard_samples_per_second": 119.327, "eval_gsm8k_hard_steps_per_second": 7.684, "step": 2000 }, { "epoch": 0.33, "eval_webgpt_accuracy": 0.4696865932858906, "eval_webgpt_loss": 2.423828125, "eval_webgpt_runtime": 14.9073, "eval_webgpt_samples_per_second": 262.691, "eval_webgpt_steps_per_second": 16.435, "step": 2000 }, { "epoch": 0.33, "eval_squad_v2_accuracy": 0.8851732615724923, "eval_squad_v2_loss": 0.40966796875, "eval_squad_v2_runtime": 80.4777, "eval_squad_v2_samples_per_second": 323.866, "eval_squad_v2_steps_per_second": 20.242, "step": 2000 }, { "epoch": 0.33, "eval_adversarial_qa_accuracy": 0.7928618283737849, "eval_adversarial_qa_loss": 1.1015625, "eval_adversarial_qa_runtime": 18.1784, "eval_adversarial_qa_samples_per_second": 330.062, "eval_adversarial_qa_steps_per_second": 20.629, "step": 2000 }, { "epoch": 0.33, "eval_private_tuning_accuracy": 0.6519236031057545, "eval_private_tuning_loss": 1.2939453125, "eval_private_tuning_runtime": 67.7066, "eval_private_tuning_samples_per_second": 312.791, "eval_private_tuning_steps_per_second": 19.555, "step": 2000 }, { "epoch": 0.33, "eval_oa_translated_accuracy": 0.6739938039772579, "eval_oa_translated_loss": 1.4169921875, "eval_oa_translated_runtime": 489.9008, "eval_oa_translated_samples_per_second": 272.961, "eval_oa_translated_steps_per_second": 17.061, "step": 2000 }, { "epoch": 0.33, "eval_prosocial_dialogue_accuracy": 0.5339765946198812, "eval_prosocial_dialogue_loss": 1.8515625, "eval_prosocial_dialogue_runtime": 112.602, "eval_prosocial_dialogue_samples_per_second": 239.632, "eval_prosocial_dialogue_steps_per_second": 14.982, "step": 2000 }, { "epoch": 0.33, "eval_math_qa_accuracy": 0.5540153422185813, "eval_math_qa_loss": 1.9658203125, "eval_math_qa_runtime": 19.4551, "eval_math_qa_samples_per_second": 306.758, "eval_math_qa_steps_per_second": 19.172, "step": 2000 }, { "epoch": 0.33, "eval_wikihow_accuracy": 0.5962636905587134, "eval_wikihow_loss": 2.03515625, "eval_wikihow_runtime": 8.6232, "eval_wikihow_samples_per_second": 265.91, "eval_wikihow_steps_per_second": 16.699, "step": 2000 }, { "epoch": 0.33, "eval_joke_accuracy": 0.4670204700530705, "eval_joke_loss": 2.40234375, "eval_joke_runtime": 0.908, "eval_joke_samples_per_second": 83.698, "eval_joke_steps_per_second": 5.506, "step": 2000 }, { "epoch": 0.33, "eval_gsm8k_accuracy": 0.7402429297099117, "eval_gsm8k_loss": 0.97998046875, "eval_gsm8k_runtime": 5.491, "eval_gsm8k_samples_per_second": 272.266, "eval_gsm8k_steps_per_second": 17.119, "step": 2000 }, { "epoch": 0.33, "eval_ted_trans_en-hi_accuracy": 0.5490196078431373, "eval_ted_trans_en-hi_loss": 2.10546875, "eval_ted_trans_en-hi_runtime": 1.4695, "eval_ted_trans_en-hi_samples_per_second": 70.093, "eval_ted_trans_en-hi_steps_per_second": 4.764, "step": 2000 }, { "epoch": 0.33, "eval_ted_trans_de-ja_accuracy": 0.5422406826169489, "eval_ted_trans_de-ja_loss": 2.119140625, "eval_ted_trans_de-ja_runtime": 2.8137, "eval_ted_trans_de-ja_samples_per_second": 255.176, "eval_ted_trans_de-ja_steps_per_second": 15.993, "step": 2000 }, { "epoch": 0.33, "eval_ted_trans_nl-en_accuracy": 0.6510702489011417, "eval_ted_trans_nl-en_loss": 1.6396484375, "eval_ted_trans_nl-en_runtime": 4.3357, "eval_ted_trans_nl-en_samples_per_second": 177.827, "eval_ted_trans_nl-en_steps_per_second": 11.302, "step": 2000 }, { "epoch": 0.33, "eval_ted_trans_en-ja_accuracy": 0.556157479064968, "eval_ted_trans_en-ja_loss": 2.017578125, "eval_ted_trans_en-ja_runtime": 3.2862, "eval_ted_trans_en-ja_samples_per_second": 243.744, "eval_ted_trans_en-ja_steps_per_second": 15.519, "step": 2000 }, { "epoch": 0.33, "eval_ted_trans_en-es_accuracy": 0.7188412420341738, "eval_ted_trans_en-es_loss": 1.2294921875, "eval_ted_trans_en-es_runtime": 4.2374, "eval_ted_trans_en-es_samples_per_second": 194.932, "eval_ted_trans_en-es_steps_per_second": 12.272, "step": 2000 }, { "epoch": 0.33, "eval_ted_trans_en-ms_accuracy": 0.5734784760019792, "eval_ted_trans_en-ms_loss": 2.06640625, "eval_ted_trans_en-ms_runtime": 0.629, "eval_ted_trans_en-ms_samples_per_second": 66.768, "eval_ted_trans_en-ms_steps_per_second": 4.769, "step": 2000 }, { "epoch": 0.33, "eval_xsum_accuracy": 0.5781070399698887, "eval_xsum_loss": NaN, "eval_xsum_runtime": 144.895, "eval_xsum_samples_per_second": 281.645, "eval_xsum_steps_per_second": 17.606, "step": 2000 }, { "epoch": 0.33, "eval_cnn_dailymail_accuracy": 0.661322804206144, "eval_cnn_dailymail_loss": NaN, "eval_cnn_dailymail_runtime": 208.0253, "eval_cnn_dailymail_samples_per_second": 276.039, "eval_cnn_dailymail_steps_per_second": 17.253, "step": 2000 }, { "epoch": 0.33, "eval_multi_news_accuracy": 0.5257933653652501, "eval_multi_news_loss": NaN, "eval_multi_news_runtime": 34.8299, "eval_multi_news_samples_per_second": 258.255, "eval_multi_news_steps_per_second": 16.164, "step": 2000 }, { "epoch": 0.33, "eval_tldr_news_accuracy": 0.554453117652591, "eval_tldr_news_loss": 2.056640625, "eval_tldr_news_runtime": 4.219, "eval_tldr_news_samples_per_second": 338.467, "eval_tldr_news_steps_per_second": 21.332, "step": 2000 }, { "epoch": 0.33, "eval_scitldr_accuracy": 0.5014970059880239, "eval_scitldr_loss": NaN, "eval_scitldr_runtime": 2.5157, "eval_scitldr_samples_per_second": 158.605, "eval_scitldr_steps_per_second": 9.938, "step": 2000 }, { "epoch": 0.33, "eval_samsum_accuracy": 0.5984307075274895, "eval_samsum_loss": 1.5048828125, "eval_samsum_runtime": 9.7229, "eval_samsum_samples_per_second": 303.099, "eval_samsum_steps_per_second": 19.027, "step": 2000 }, { "epoch": 0.33, "eval_debate_sum_accuracy": 0.9366294751454436, "eval_debate_sum_loss": NaN, "eval_debate_sum_runtime": 191.1458, "eval_debate_sum_samples_per_second": 251.714, "eval_debate_sum_steps_per_second": 15.737, "step": 2000 }, { "epoch": 0.33, "eval_billsum_accuracy": 0.6557828398195577, "eval_billsum_loss": NaN, "eval_billsum_runtime": 21.6214, "eval_billsum_samples_per_second": 175.289, "eval_billsum_steps_per_second": 10.961, "step": 2000 }, { "epoch": 0.33, "eval_wmt2019_zh-en_accuracy": 0.5675863966606366, "eval_wmt2019_zh-en_loss": 2.0625, "eval_wmt2019_zh-en_runtime": 12.8505, "eval_wmt2019_zh-en_samples_per_second": 309.793, "eval_wmt2019_zh-en_steps_per_second": 19.377, "step": 2000 }, { "epoch": 0.33, "eval_wmt2019_ru-en_accuracy": 0.6426618715553412, "eval_wmt2019_ru-en_loss": 1.517578125, "eval_wmt2019_ru-en_runtime": 9.7092, "eval_wmt2019_ru-en_samples_per_second": 308.986, "eval_wmt2019_ru-en_steps_per_second": 19.363, "step": 2000 }, { "epoch": 0.33, "eval_wmt2019_de-en_accuracy": 0.6722804744747176, "eval_wmt2019_de-en_loss": 1.4169921875, "eval_wmt2019_de-en_runtime": 10.2232, "eval_wmt2019_de-en_samples_per_second": 293.255, "eval_wmt2019_de-en_steps_per_second": 18.39, "step": 2000 }, { "epoch": 0.33, "eval_wmt2019_fr-de_accuracy": 0.6611832925051939, "eval_wmt2019_fr-de_loss": 1.466796875, "eval_wmt2019_fr-de_runtime": 5.2327, "eval_wmt2019_fr-de_samples_per_second": 288.95, "eval_wmt2019_fr-de_steps_per_second": 18.155, "step": 2000 }, { "epoch": 0.33, "eval_essay_instruction_accuracy": 0.5831323516967082, "eval_essay_instruction_loss": 2.0546875, "eval_essay_instruction_runtime": 4.364, "eval_essay_instruction_samples_per_second": 94.638, "eval_essay_instruction_steps_per_second": 5.958, "step": 2000 }, { "epoch": 0.33, "eval_reddit_eli5_accuracy": 0.4256390849199412, "eval_reddit_eli5_loss": 2.732421875, "eval_reddit_eli5_runtime": 199.1709, "eval_reddit_eli5_samples_per_second": 273.77, "eval_reddit_eli5_steps_per_second": 17.111, "step": 2000 }, { "epoch": 0.33, "eval_reddit_askh_accuracy": 0.428839058527484, "eval_reddit_askh_loss": 2.80859375, "eval_reddit_askh_runtime": 129.3643, "eval_reddit_askh_samples_per_second": 152.322, "eval_reddit_askh_steps_per_second": 9.523, "step": 2000 }, { "epoch": 0.33, "eval_reddit_asks_accuracy": 0.43882877148313176, "eval_reddit_asks_loss": 2.662109375, "eval_reddit_asks_runtime": 99.4788, "eval_reddit_asks_samples_per_second": 264.941, "eval_reddit_asks_steps_per_second": 16.566, "step": 2000 } ], "max_steps": 12168, "num_train_epochs": 2, "total_flos": 1.6560121131357438e+19, "trial_name": null, "trial_params": null }