diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4039 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.13174555541730404, + "global_step": 2250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.666666666666667e-06, + "loss": 2.1425, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2.1683833261066357e-06, + "loss": 2.0497, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 2.4618687578661045e-06, + "loss": 1.8724, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 2.6700999855466042e-06, + "loss": 1.8389, + "step": 40 + }, + { + "epoch": 0.0, + "learning_rate": 2.8316166738933647e-06, + "loss": 1.774, + "step": 50 + }, + { + "epoch": 0.0, + "learning_rate": 2.963585417306073e-06, + "loss": 1.7948, + "step": 60 + }, + { + "epoch": 0.0, + "learning_rate": 3.075163400023762e-06, + "loss": 1.7367, + "step": 70 + }, + { + "epoch": 0.0, + "learning_rate": 3.171816644986573e-06, + "loss": 1.7413, + "step": 80 + }, + { + "epoch": 0.0, + "learning_rate": 3.257070849065542e-06, + "loss": 1.7729, + "step": 90 + }, + { + "epoch": 0.0, + "learning_rate": 3.333333333333334e-06, + "loss": 1.7465, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 3.402321141930376e-06, + "loss": 1.7541, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 3.4653020767460416e-06, + "loss": 1.7026, + "step": 120 + }, + { + "epoch": 0.01, + "learning_rate": 3.523238920511395e-06, + "loss": 1.7366, + "step": 130 + }, + { + "epoch": 0.01, + "learning_rate": 3.5768800594637304e-06, + "loss": 1.6725, + "step": 140 + }, + { + "epoch": 0.01, + "learning_rate": 3.626818765092802e-06, + "loss": 1.6984, + "step": 150 + }, + { + "epoch": 0.01, + "learning_rate": 3.6735333044265414e-06, + "loss": 1.6792, + "step": 160 + }, + { + "epoch": 0.01, + "learning_rate": 3.717414868963791e-06, + "loss": 1.6731, + "step": 170 + }, + { + "epoch": 0.01, + "learning_rate": 3.7587875085055104e-06, + "loss": 1.7055, + "step": 180 + }, + { + "epoch": 0.01, + "learning_rate": 3.7979226682547152e-06, + "loss": 1.6812, + "step": 190 + }, + { + "epoch": 0.01, + "learning_rate": 3.835049992773302e-06, + "loss": 1.624, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 3.870365491223199e-06, + "loss": 1.6899, + "step": 210 + }, + { + "epoch": 0.01, + "learning_rate": 3.9040378013703444e-06, + "loss": 1.7062, + "step": 220 + }, + { + "epoch": 0.01, + "learning_rate": 3.936213060029322e-06, + "loss": 1.6366, + "step": 230 + }, + { + "epoch": 0.01, + "learning_rate": 3.96701873618601e-06, + "loss": 1.6621, + "step": 240 + }, + { + "epoch": 0.01, + "learning_rate": 3.996566681120062e-06, + "loss": 1.7629, + "step": 250 + }, + { + "epoch": 0.01, + "eval_gsm8k_hard_accuracy": 0.8147876715117462, + "eval_gsm8k_hard_loss": 0.76171875, + "eval_gsm8k_hard_runtime": 2.971, + "eval_gsm8k_hard_samples_per_second": 88.859, + "eval_gsm8k_hard_steps_per_second": 2.02, + "step": 250 + }, + { + "epoch": 0.01, + "eval_webgpt_accuracy": 0.48539789112821163, + "eval_webgpt_loss": 2.314453125, + "eval_webgpt_runtime": 16.0854, + "eval_webgpt_samples_per_second": 243.451, + "eval_webgpt_steps_per_second": 5.098, + "step": 250 + }, + { + "epoch": 0.01, + "eval_squad_v2_accuracy": 0.8767175411152919, + "eval_squad_v2_loss": 0.428466796875, + "eval_squad_v2_runtime": 89.9878, + "eval_squad_v2_samples_per_second": 289.639, + "eval_squad_v2_steps_per_second": 6.034, + "step": 250 + }, + { + "epoch": 0.01, + "eval_adversarial_qa_accuracy": 0.7944240470433719, + "eval_adversarial_qa_loss": 0.986328125, + "eval_adversarial_qa_runtime": 22.0494, + "eval_adversarial_qa_samples_per_second": 272.116, + "eval_adversarial_qa_steps_per_second": 5.669, + "step": 250 + }, + { + "epoch": 0.01, + "eval_private_tuning_accuracy": 0.6531239810829162, + "eval_private_tuning_loss": 1.337890625, + "eval_private_tuning_runtime": 61.5659, + "eval_private_tuning_samples_per_second": 343.989, + "eval_private_tuning_steps_per_second": 7.179, + "step": 250 + }, + { + "epoch": 0.01, + "eval_oa_translated_accuracy": 0.6721688366960878, + "eval_oa_translated_loss": 1.3828125, + "eval_oa_translated_runtime": 712.3024, + "eval_oa_translated_samples_per_second": 196.356, + "eval_oa_translated_steps_per_second": 4.091, + "step": 250 + }, + { + "epoch": 0.01, + "eval_prosocial_dialogue_accuracy": 0.4919427530416725, + "eval_prosocial_dialogue_loss": 1.9140625, + "eval_prosocial_dialogue_runtime": 91.8198, + "eval_prosocial_dialogue_samples_per_second": 293.869, + "eval_prosocial_dialogue_steps_per_second": 6.132, + "step": 250 + }, + { + "epoch": 0.01, + "eval_math_qa_accuracy": 0.540921279895434, + "eval_math_qa_loss": 2.076171875, + "eval_math_qa_runtime": 19.0288, + "eval_math_qa_samples_per_second": 313.63, + "eval_math_qa_steps_per_second": 6.569, + "step": 250 + }, + { + "epoch": 0.01, + "eval_wikihow_accuracy": 0.5926709194286507, + "eval_wikihow_loss": 2.01171875, + "eval_wikihow_runtime": 7.434, + "eval_wikihow_samples_per_second": 308.448, + "eval_wikihow_steps_per_second": 6.457, + "step": 250 + }, + { + "epoch": 0.01, + "eval_joke_accuracy": 0.4775398028809704, + "eval_joke_loss": 2.380859375, + "eval_joke_runtime": 0.5994, + "eval_joke_samples_per_second": 126.794, + "eval_joke_steps_per_second": 3.337, + "step": 250 + }, + { + "epoch": 0.01, + "eval_gsm8k_accuracy": 0.7411105358167392, + "eval_gsm8k_loss": 1.0078125, + "eval_gsm8k_runtime": 6.2732, + "eval_gsm8k_samples_per_second": 238.315, + "eval_gsm8k_steps_per_second": 5.101, + "step": 250 + }, + { + "epoch": 0.01, + "eval_ted_trans_en-hi_accuracy": 0.6262273619899629, + "eval_ted_trans_en-hi_loss": 1.46875, + "eval_ted_trans_en-hi_runtime": 1.1364, + "eval_ted_trans_en-hi_samples_per_second": 90.639, + "eval_ted_trans_en-hi_steps_per_second": 2.64, + "step": 250 + }, + { + "epoch": 0.01, + "eval_ted_trans_de-ja_accuracy": 0.6054150683400991, + "eval_ted_trans_de-ja_loss": 1.76953125, + "eval_ted_trans_de-ja_runtime": 3.475, + "eval_ted_trans_de-ja_samples_per_second": 206.617, + "eval_ted_trans_de-ja_steps_per_second": 4.317, + "step": 250 + }, + { + "epoch": 0.01, + "eval_ted_trans_nl-en_accuracy": 0.7055143773327025, + "eval_ted_trans_nl-en_loss": 1.3232421875, + "eval_ted_trans_nl-en_runtime": 4.3836, + "eval_ted_trans_nl-en_samples_per_second": 175.884, + "eval_ted_trans_nl-en_steps_per_second": 3.878, + "step": 250 + }, + { + "epoch": 0.01, + "eval_ted_trans_en-ja_accuracy": 0.6165139119558755, + "eval_ted_trans_en-ja_loss": 1.685546875, + "eval_ted_trans_en-ja_runtime": 3.9868, + "eval_ted_trans_en-ja_samples_per_second": 200.912, + "eval_ted_trans_en-ja_steps_per_second": 4.264, + "step": 250 + }, + { + "epoch": 0.01, + "eval_ted_trans_en-es_accuracy": 0.7508869040130834, + "eval_ted_trans_en-es_loss": 1.0810546875, + "eval_ted_trans_en-es_runtime": 4.6601, + "eval_ted_trans_en-es_samples_per_second": 177.248, + "eval_ted_trans_en-es_steps_per_second": 3.863, + "step": 250 + }, + { + "epoch": 0.01, + "eval_ted_trans_en-ms_accuracy": 0.6461456102783726, + "eval_ted_trans_en-ms_loss": 1.6650390625, + "eval_ted_trans_en-ms_runtime": 0.4654, + "eval_ted_trans_en-ms_samples_per_second": 90.238, + "eval_ted_trans_en-ms_steps_per_second": 2.149, + "step": 250 + }, + { + "epoch": 0.01, + "eval_xsum_accuracy": 0.5982287223480477, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 192.75, + "eval_xsum_samples_per_second": 211.72, + "eval_xsum_steps_per_second": 4.415, + "step": 250 + }, + { + "epoch": 0.01, + "eval_cnn_dailymail_accuracy": 0.6676366328844463, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 274.4306, + "eval_cnn_dailymail_samples_per_second": 209.244, + "eval_cnn_dailymail_steps_per_second": 4.362, + "step": 250 + }, + { + "epoch": 0.01, + "eval_multi_news_accuracy": 0.5282928997840721, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 46.6919, + "eval_multi_news_samples_per_second": 192.646, + "eval_multi_news_steps_per_second": 4.026, + "step": 250 + }, + { + "epoch": 0.01, + "eval_tldr_news_accuracy": 0.5294836828740713, + "eval_tldr_news_loss": 2.2265625, + "eval_tldr_news_runtime": 3.3342, + "eval_tldr_news_samples_per_second": 428.294, + "eval_tldr_news_steps_per_second": 8.998, + "step": 250 + }, + { + "epoch": 0.01, + "eval_scitldr_accuracy": 0.49432739059967584, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.6633, + "eval_scitldr_samples_per_second": 149.816, + "eval_scitldr_steps_per_second": 3.379, + "step": 250 + }, + { + "epoch": 0.01, + "eval_samsum_accuracy": 0.6079303492003407, + "eval_samsum_loss": 1.4892578125, + "eval_samsum_runtime": 14.0655, + "eval_samsum_samples_per_second": 209.52, + "eval_samsum_steps_per_second": 4.408, + "step": 250 + }, + { + "epoch": 0.01, + "eval_debate_sum_accuracy": 0.9296994127333449, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 250.1193, + "eval_debate_sum_samples_per_second": 192.364, + "eval_debate_sum_steps_per_second": 4.01, + "step": 250 + }, + { + "epoch": 0.01, + "eval_billsum_accuracy": 0.6637741667488066, + "eval_billsum_loss": 1.4560546875, + "eval_billsum_runtime": 20.9773, + "eval_billsum_samples_per_second": 180.672, + "eval_billsum_steps_per_second": 3.766, + "step": 250 + }, + { + "epoch": 0.01, + "eval_wmt2019_zh-en_accuracy": 0.6277540662129427, + "eval_wmt2019_zh-en_loss": 1.6943359375, + "eval_wmt2019_zh-en_runtime": 10.9759, + "eval_wmt2019_zh-en_samples_per_second": 362.702, + "eval_wmt2019_zh-en_steps_per_second": 7.562, + "step": 250 + }, + { + "epoch": 0.01, + "eval_wmt2019_ru-en_accuracy": 0.721860857670872, + "eval_wmt2019_ru-en_loss": 1.11328125, + "eval_wmt2019_ru-en_runtime": 10.5639, + "eval_wmt2019_ru-en_samples_per_second": 283.986, + "eval_wmt2019_ru-en_steps_per_second": 5.964, + "step": 250 + }, + { + "epoch": 0.01, + "eval_wmt2019_de-en_accuracy": 0.7348780028355294, + "eval_wmt2019_de-en_loss": 1.083984375, + "eval_wmt2019_de-en_runtime": 7.6263, + "eval_wmt2019_de-en_samples_per_second": 393.113, + "eval_wmt2019_de-en_steps_per_second": 8.261, + "step": 250 + }, + { + "epoch": 0.01, + "eval_wmt2019_fr-de_accuracy": 0.7198844756374025, + "eval_wmt2019_fr-de_loss": 1.1572265625, + "eval_wmt2019_fr-de_runtime": 5.8183, + "eval_wmt2019_fr-de_samples_per_second": 259.868, + "eval_wmt2019_fr-de_steps_per_second": 5.5, + "step": 250 + }, + { + "epoch": 0.01, + "eval_essay_instruction_accuracy": 0.5889370453088031, + "eval_essay_instruction_loss": 2.01171875, + "eval_essay_instruction_runtime": 4.9645, + "eval_essay_instruction_samples_per_second": 83.191, + "eval_essay_instruction_steps_per_second": 1.813, + "step": 250 + }, + { + "epoch": 0.01, + "eval_reddit_eli5_accuracy": 0.44436461029042645, + "eval_reddit_eli5_loss": 2.56640625, + "eval_reddit_eli5_runtime": 290.2795, + "eval_reddit_eli5_samples_per_second": 187.843, + "eval_reddit_eli5_steps_per_second": 3.913, + "step": 250 + }, + { + "epoch": 0.01, + "eval_reddit_askh_accuracy": 0.44699574235962536, + "eval_reddit_askh_loss": 2.666015625, + "eval_reddit_askh_runtime": 127.4754, + "eval_reddit_askh_samples_per_second": 154.579, + "eval_reddit_askh_steps_per_second": 3.224, + "step": 250 + }, + { + "epoch": 0.01, + "eval_reddit_asks_accuracy": 0.4559295270939454, + "eval_reddit_asks_loss": 2.515625, + "eval_reddit_asks_runtime": 148.9443, + "eval_reddit_asks_samples_per_second": 176.952, + "eval_reddit_asks_steps_per_second": 3.693, + "step": 250 + }, + { + "epoch": 0.01, + "learning_rate": 4.024955579951363e-06, + "loss": 1.6457, + "step": 260 + }, + { + "epoch": 0.01, + "learning_rate": 4.05227294026498e-06, + "loss": 1.6765, + "step": 270 + }, + { + "epoch": 0.01, + "learning_rate": 4.078596718903699e-06, + "loss": 1.7331, + "step": 280 + }, + { + "epoch": 0.01, + "learning_rate": 4.103996663164927e-06, + "loss": 1.7341, + "step": 290 + }, + { + "epoch": 0.01, + "learning_rate": 4.128535424532771e-06, + "loss": 1.6712, + "step": 300 + }, + { + "epoch": 0.02, + "learning_rate": 4.152269489723789e-06, + "loss": 1.7221, + "step": 310 + }, + { + "epoch": 0.02, + "learning_rate": 4.17524996386651e-06, + "loss": 1.6535, + "step": 320 + }, + { + "epoch": 0.02, + "learning_rate": 4.197523233129813e-06, + "loss": 1.7102, + "step": 330 + }, + { + "epoch": 0.02, + "learning_rate": 4.219131528403759e-06, + "loss": 1.7204, + "step": 340 + }, + { + "epoch": 0.02, + "learning_rate": 4.2401134072504595e-06, + "loss": 1.6645, + "step": 350 + }, + { + "epoch": 0.02, + "learning_rate": 4.26050416794548e-06, + "loss": 1.6375, + "step": 360 + }, + { + "epoch": 0.02, + "learning_rate": 4.280336206778326e-06, + "loss": 1.6983, + "step": 370 + }, + { + "epoch": 0.02, + "learning_rate": 4.299639327694684e-06, + "loss": 1.7058, + "step": 380 + }, + { + "epoch": 0.02, + "learning_rate": 4.318441011710832e-06, + "loss": 1.6323, + "step": 390 + }, + { + "epoch": 0.02, + "learning_rate": 4.336766652213271e-06, + "loss": 1.6884, + "step": 400 + }, + { + "epoch": 0.02, + "learning_rate": 4.35463976119956e-06, + "loss": 1.72, + "step": 410 + }, + { + "epoch": 0.02, + "learning_rate": 4.372082150663167e-06, + "loss": 1.6697, + "step": 420 + }, + { + "epoch": 0.02, + "learning_rate": 4.389114092632645e-06, + "loss": 1.6286, + "step": 430 + }, + { + "epoch": 0.02, + "learning_rate": 4.405754460810312e-06, + "loss": 1.7086, + "step": 440 + }, + { + "epoch": 0.02, + "learning_rate": 4.42202085629224e-06, + "loss": 1.6386, + "step": 450 + }, + { + "epoch": 0.02, + "learning_rate": 4.437929719469291e-06, + "loss": 1.6138, + "step": 460 + }, + { + "epoch": 0.02, + "learning_rate": 4.453496429892863e-06, + "loss": 1.6216, + "step": 470 + }, + { + "epoch": 0.02, + "learning_rate": 4.468735395625979e-06, + "loss": 1.7481, + "step": 480 + }, + { + "epoch": 0.02, + "learning_rate": 4.4836601333808566e-06, + "loss": 1.6641, + "step": 490 + }, + { + "epoch": 0.02, + "learning_rate": 4.498283340560032e-06, + "loss": 1.6732, + "step": 500 + }, + { + "epoch": 0.02, + "eval_gsm8k_hard_accuracy": 0.8998615691543503, + "eval_gsm8k_hard_loss": 0.46142578125, + "eval_gsm8k_hard_runtime": 2.1347, + "eval_gsm8k_hard_samples_per_second": 123.669, + "eval_gsm8k_hard_steps_per_second": 2.811, + "step": 500 + }, + { + "epoch": 0.02, + "eval_webgpt_accuracy": 0.4871765174992017, + "eval_webgpt_loss": 2.30078125, + "eval_webgpt_runtime": 17.9636, + "eval_webgpt_samples_per_second": 217.996, + "eval_webgpt_steps_per_second": 4.565, + "step": 500 + }, + { + "epoch": 0.02, + "eval_squad_v2_accuracy": 0.8868008286475288, + "eval_squad_v2_loss": 0.397216796875, + "eval_squad_v2_runtime": 88.1593, + "eval_squad_v2_samples_per_second": 295.647, + "eval_squad_v2_steps_per_second": 6.159, + "step": 500 + }, + { + "epoch": 0.02, + "eval_adversarial_qa_accuracy": 0.8145726993224083, + "eval_adversarial_qa_loss": 0.9189453125, + "eval_adversarial_qa_runtime": 21.7596, + "eval_adversarial_qa_samples_per_second": 275.74, + "eval_adversarial_qa_steps_per_second": 5.745, + "step": 500 + }, + { + "epoch": 0.02, + "eval_private_tuning_accuracy": 0.6609470207986216, + "eval_private_tuning_loss": 1.2919921875, + "eval_private_tuning_runtime": 64.7973, + "eval_private_tuning_samples_per_second": 326.834, + "eval_private_tuning_steps_per_second": 6.821, + "step": 500 + }, + { + "epoch": 0.02, + "eval_oa_translated_accuracy": 0.6798870038090434, + "eval_oa_translated_loss": 1.341796875, + "eval_oa_translated_runtime": 733.6761, + "eval_oa_translated_samples_per_second": 190.636, + "eval_oa_translated_steps_per_second": 3.972, + "step": 500 + }, + { + "epoch": 0.02, + "eval_prosocial_dialogue_accuracy": 0.5295571842042209, + "eval_prosocial_dialogue_loss": 1.8427734375, + "eval_prosocial_dialogue_runtime": 58.5686, + "eval_prosocial_dialogue_samples_per_second": 460.707, + "eval_prosocial_dialogue_steps_per_second": 9.613, + "step": 500 + }, + { + "epoch": 0.02, + "eval_math_qa_accuracy": 0.5540361105203919, + "eval_math_qa_loss": 1.9853515625, + "eval_math_qa_runtime": 19.0275, + "eval_math_qa_samples_per_second": 313.651, + "eval_math_qa_steps_per_second": 6.569, + "step": 500 + }, + { + "epoch": 0.02, + "eval_wikihow_accuracy": 0.6014838441270282, + "eval_wikihow_loss": 1.9541015625, + "eval_wikihow_runtime": 7.3976, + "eval_wikihow_samples_per_second": 309.967, + "eval_wikihow_steps_per_second": 6.489, + "step": 500 + }, + { + "epoch": 0.02, + "eval_joke_accuracy": 0.4797194844579227, + "eval_joke_loss": 2.34375, + "eval_joke_runtime": 0.5281, + "eval_joke_samples_per_second": 143.924, + "eval_joke_steps_per_second": 3.787, + "step": 500 + }, + { + "epoch": 0.02, + "eval_gsm8k_accuracy": 0.7496816445333818, + "eval_gsm8k_loss": 0.9697265625, + "eval_gsm8k_runtime": 6.3519, + "eval_gsm8k_samples_per_second": 235.363, + "eval_gsm8k_steps_per_second": 5.038, + "step": 500 + }, + { + "epoch": 0.02, + "eval_ted_trans_en-hi_accuracy": 0.6267826086956522, + "eval_ted_trans_en-hi_loss": 1.455078125, + "eval_ted_trans_en-hi_runtime": 1.042, + "eval_ted_trans_en-hi_samples_per_second": 98.852, + "eval_ted_trans_en-hi_steps_per_second": 2.879, + "step": 500 + }, + { + "epoch": 0.02, + "eval_ted_trans_de-ja_accuracy": 0.6095081533548741, + "eval_ted_trans_de-ja_loss": 1.736328125, + "eval_ted_trans_de-ja_runtime": 3.4864, + "eval_ted_trans_de-ja_samples_per_second": 205.945, + "eval_ted_trans_de-ja_steps_per_second": 4.302, + "step": 500 + }, + { + "epoch": 0.02, + "eval_ted_trans_nl-en_accuracy": 0.7156337065657362, + "eval_ted_trans_nl-en_loss": 1.2724609375, + "eval_ted_trans_nl-en_runtime": 4.8663, + "eval_ted_trans_nl-en_samples_per_second": 158.438, + "eval_ted_trans_nl-en_steps_per_second": 3.493, + "step": 500 + }, + { + "epoch": 0.02, + "eval_ted_trans_en-ja_accuracy": 0.624158725585033, + "eval_ted_trans_en-ja_loss": 1.63671875, + "eval_ted_trans_en-ja_runtime": 4.5137, + "eval_ted_trans_en-ja_samples_per_second": 177.461, + "eval_ted_trans_en-ja_steps_per_second": 3.766, + "step": 500 + }, + { + "epoch": 0.02, + "eval_ted_trans_en-es_accuracy": 0.7583100576231097, + "eval_ted_trans_en-es_loss": 1.048828125, + "eval_ted_trans_en-es_runtime": 3.4017, + "eval_ted_trans_en-es_samples_per_second": 242.822, + "eval_ted_trans_en-es_steps_per_second": 5.292, + "step": 500 + }, + { + "epoch": 0.02, + "eval_ted_trans_en-ms_accuracy": 0.7228320526893524, + "eval_ted_trans_en-ms_loss": 1.3466796875, + "eval_ted_trans_en-ms_runtime": 0.9049, + "eval_ted_trans_en-ms_samples_per_second": 46.416, + "eval_ted_trans_en-ms_steps_per_second": 1.105, + "step": 500 + }, + { + "epoch": 0.02, + "eval_xsum_accuracy": 0.6011633358116925, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 194.0576, + "eval_xsum_samples_per_second": 210.293, + "eval_xsum_steps_per_second": 4.385, + "step": 500 + }, + { + "epoch": 0.02, + "eval_cnn_dailymail_accuracy": 0.6701096765236707, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 276.9166, + "eval_cnn_dailymail_samples_per_second": 207.366, + "eval_cnn_dailymail_steps_per_second": 4.323, + "step": 500 + }, + { + "epoch": 0.02, + "eval_multi_news_accuracy": 0.5313963642137016, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 45.9972, + "eval_multi_news_samples_per_second": 195.555, + "eval_multi_news_steps_per_second": 4.087, + "step": 500 + }, + { + "epoch": 0.02, + "eval_tldr_news_accuracy": 0.5344681651462428, + "eval_tldr_news_loss": 2.201171875, + "eval_tldr_news_runtime": 3.1785, + "eval_tldr_news_samples_per_second": 449.262, + "eval_tldr_news_steps_per_second": 9.438, + "step": 500 + }, + { + "epoch": 0.02, + "eval_scitldr_accuracy": 0.49756888168557534, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.9905, + "eval_scitldr_samples_per_second": 133.421, + "eval_scitldr_steps_per_second": 3.009, + "step": 500 + }, + { + "epoch": 0.02, + "eval_samsum_accuracy": 0.6183671538076762, + "eval_samsum_loss": 1.4326171875, + "eval_samsum_runtime": 13.5218, + "eval_samsum_samples_per_second": 217.944, + "eval_samsum_steps_per_second": 4.585, + "step": 500 + }, + { + "epoch": 0.02, + "eval_debate_sum_accuracy": 0.9346053252084863, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 244.8422, + "eval_debate_sum_samples_per_second": 196.51, + "eval_debate_sum_steps_per_second": 4.097, + "step": 500 + }, + { + "epoch": 0.02, + "eval_billsum_accuracy": 0.6686931696172409, + "eval_billsum_loss": 1.427734375, + "eval_billsum_runtime": 27.101, + "eval_billsum_samples_per_second": 139.847, + "eval_billsum_steps_per_second": 2.915, + "step": 500 + }, + { + "epoch": 0.02, + "eval_wmt2019_zh-en_accuracy": 0.6249207026609304, + "eval_wmt2019_zh-en_loss": 1.7001953125, + "eval_wmt2019_zh-en_runtime": 12.5289, + "eval_wmt2019_zh-en_samples_per_second": 317.745, + "eval_wmt2019_zh-en_steps_per_second": 6.625, + "step": 500 + }, + { + "epoch": 0.02, + "eval_wmt2019_ru-en_accuracy": 0.7221410449334468, + "eval_wmt2019_ru-en_loss": 1.1083984375, + "eval_wmt2019_ru-en_runtime": 10.0702, + "eval_wmt2019_ru-en_samples_per_second": 297.91, + "eval_wmt2019_ru-en_steps_per_second": 6.256, + "step": 500 + }, + { + "epoch": 0.02, + "eval_wmt2019_de-en_accuracy": 0.7360578210047292, + "eval_wmt2019_de-en_loss": 1.072265625, + "eval_wmt2019_de-en_runtime": 7.6459, + "eval_wmt2019_de-en_samples_per_second": 392.106, + "eval_wmt2019_de-en_steps_per_second": 8.24, + "step": 500 + }, + { + "epoch": 0.02, + "eval_wmt2019_fr-de_accuracy": 0.7223688705319711, + "eval_wmt2019_fr-de_loss": 1.15234375, + "eval_wmt2019_fr-de_runtime": 5.1746, + "eval_wmt2019_fr-de_samples_per_second": 292.195, + "eval_wmt2019_fr-de_steps_per_second": 6.184, + "step": 500 + }, + { + "epoch": 0.02, + "eval_essay_instruction_accuracy": 0.5920659841231232, + "eval_essay_instruction_loss": 2.001953125, + "eval_essay_instruction_runtime": 4.5291, + "eval_essay_instruction_samples_per_second": 91.188, + "eval_essay_instruction_steps_per_second": 1.987, + "step": 500 + }, + { + "epoch": 0.02, + "eval_reddit_eli5_accuracy": 0.4464491832743919, + "eval_reddit_eli5_loss": 2.560546875, + "eval_reddit_eli5_runtime": 282.6054, + "eval_reddit_eli5_samples_per_second": 192.944, + "eval_reddit_eli5_steps_per_second": 4.02, + "step": 500 + }, + { + "epoch": 0.02, + "eval_reddit_askh_accuracy": 0.4485475495544, + "eval_reddit_askh_loss": 2.65625, + "eval_reddit_askh_runtime": 129.6151, + "eval_reddit_askh_samples_per_second": 152.027, + "eval_reddit_askh_steps_per_second": 3.171, + "step": 500 + }, + { + "epoch": 0.02, + "eval_reddit_asks_accuracy": 0.4581163067460401, + "eval_reddit_asks_loss": 2.5078125, + "eval_reddit_asks_runtime": 148.6855, + "eval_reddit_asks_samples_per_second": 177.26, + "eval_reddit_asks_steps_per_second": 3.699, + "step": 500 + }, + { + "epoch": 0.02, + "learning_rate": 4.512616960163228e-06, + "loss": 1.6485, + "step": 510 + }, + { + "epoch": 0.03, + "learning_rate": 4.526672239391333e-06, + "loss": 1.6097, + "step": 520 + }, + { + "epoch": 0.03, + "learning_rate": 4.540459782667983e-06, + "loss": 1.6802, + "step": 530 + }, + { + "epoch": 0.03, + "learning_rate": 4.553989599704948e-06, + "loss": 1.624, + "step": 540 + }, + { + "epoch": 0.03, + "learning_rate": 4.5672711491570735e-06, + "loss": 1.6027, + "step": 550 + }, + { + "epoch": 0.03, + "learning_rate": 4.5803133783436676e-06, + "loss": 1.6412, + "step": 560 + }, + { + "epoch": 0.03, + "learning_rate": 4.5931247594541535e-06, + "loss": 1.6312, + "step": 570 + }, + { + "epoch": 0.03, + "learning_rate": 4.605713322604896e-06, + "loss": 1.6101, + "step": 580 + }, + { + "epoch": 0.03, + "learning_rate": 4.61808668607024e-06, + "loss": 1.6172, + "step": 590 + }, + { + "epoch": 0.03, + "learning_rate": 4.63025208397274e-06, + "loss": 1.6527, + "step": 600 + }, + { + "epoch": 0.03, + "learning_rate": 4.642216391684613e-06, + "loss": 1.6598, + "step": 610 + }, + { + "epoch": 0.03, + "learning_rate": 4.653986149163757e-06, + "loss": 1.6289, + "step": 620 + }, + { + "epoch": 0.03, + "learning_rate": 4.6655675824226375e-06, + "loss": 1.6069, + "step": 630 + }, + { + "epoch": 0.03, + "learning_rate": 4.676966623306479e-06, + "loss": 1.5908, + "step": 640 + }, + { + "epoch": 0.03, + "learning_rate": 4.688188927738093e-06, + "loss": 1.6184, + "step": 650 + }, + { + "epoch": 0.03, + "learning_rate": 4.699239892569782e-06, + "loss": 1.6511, + "step": 660 + }, + { + "epoch": 0.03, + "learning_rate": 4.710124671168044e-06, + "loss": 1.6089, + "step": 670 + }, + { + "epoch": 0.03, + "learning_rate": 4.720848187843727e-06, + "loss": 1.74, + "step": 680 + }, + { + "epoch": 0.03, + "learning_rate": 4.73141515122876e-06, + "loss": 1.5827, + "step": 690 + }, + { + "epoch": 0.03, + "learning_rate": 4.741830066690428e-06, + "loss": 1.6384, + "step": 700 + }, + { + "epoch": 0.03, + "learning_rate": 4.752097247865126e-06, + "loss": 1.6331, + "step": 710 + }, + { + "epoch": 0.04, + "learning_rate": 4.7622208273854484e-06, + "loss": 1.6444, + "step": 720 + }, + { + "epoch": 0.04, + "learning_rate": 4.772204766867427e-06, + "loss": 1.597, + "step": 730 + }, + { + "epoch": 0.04, + "learning_rate": 4.782052866218294e-06, + "loss": 1.6041, + "step": 740 + }, + { + "epoch": 0.04, + "learning_rate": 4.7917687723195e-06, + "loss": 1.584, + "step": 750 + }, + { + "epoch": 0.04, + "eval_gsm8k_hard_accuracy": 0.9101217377142624, + "eval_gsm8k_hard_loss": 0.4091796875, + "eval_gsm8k_hard_runtime": 2.6619, + "eval_gsm8k_hard_samples_per_second": 99.177, + "eval_gsm8k_hard_steps_per_second": 2.254, + "step": 750 + }, + { + "epoch": 0.04, + "eval_webgpt_accuracy": 0.4874197343145836, + "eval_webgpt_loss": 2.296875, + "eval_webgpt_runtime": 16.2391, + "eval_webgpt_samples_per_second": 241.147, + "eval_webgpt_steps_per_second": 5.05, + "step": 750 + }, + { + "epoch": 0.04, + "eval_squad_v2_accuracy": 0.8846798855677223, + "eval_squad_v2_loss": 0.363037109375, + "eval_squad_v2_runtime": 89.6157, + "eval_squad_v2_samples_per_second": 290.842, + "eval_squad_v2_steps_per_second": 6.059, + "step": 750 + }, + { + "epoch": 0.04, + "eval_adversarial_qa_accuracy": 0.8026626070863556, + "eval_adversarial_qa_loss": 0.8447265625, + "eval_adversarial_qa_runtime": 21.271, + "eval_adversarial_qa_samples_per_second": 282.075, + "eval_adversarial_qa_steps_per_second": 5.877, + "step": 750 + }, + { + "epoch": 0.04, + "eval_private_tuning_accuracy": 0.6632682821768406, + "eval_private_tuning_loss": 1.2705078125, + "eval_private_tuning_runtime": 65.8398, + "eval_private_tuning_samples_per_second": 321.659, + "eval_private_tuning_steps_per_second": 6.713, + "step": 750 + }, + { + "epoch": 0.04, + "eval_oa_translated_accuracy": 0.6840566265427955, + "eval_oa_translated_loss": 1.31640625, + "eval_oa_translated_runtime": 739.9323, + "eval_oa_translated_samples_per_second": 189.024, + "eval_oa_translated_steps_per_second": 3.938, + "step": 750 + }, + { + "epoch": 0.04, + "eval_prosocial_dialogue_accuracy": 0.5235860358801359, + "eval_prosocial_dialogue_loss": 1.8115234375, + "eval_prosocial_dialogue_runtime": 61.6452, + "eval_prosocial_dialogue_samples_per_second": 437.715, + "eval_prosocial_dialogue_steps_per_second": 9.133, + "step": 750 + }, + { + "epoch": 0.04, + "eval_math_qa_accuracy": 0.5631207954480619, + "eval_math_qa_loss": 1.921875, + "eval_math_qa_runtime": 17.923, + "eval_math_qa_samples_per_second": 332.98, + "eval_math_qa_steps_per_second": 6.974, + "step": 750 + }, + { + "epoch": 0.04, + "eval_wikihow_accuracy": 0.6049785050617112, + "eval_wikihow_loss": 1.923828125, + "eval_wikihow_runtime": 7.5032, + "eval_wikihow_samples_per_second": 305.602, + "eval_wikihow_steps_per_second": 6.397, + "step": 750 + }, + { + "epoch": 0.04, + "eval_joke_accuracy": 0.4799090219863533, + "eval_joke_loss": 2.30078125, + "eval_joke_runtime": 1.3898, + "eval_joke_samples_per_second": 54.685, + "eval_joke_steps_per_second": 1.439, + "step": 750 + }, + { + "epoch": 0.04, + "eval_gsm8k_accuracy": 0.7521515232084633, + "eval_gsm8k_loss": 0.94091796875, + "eval_gsm8k_runtime": 5.0267, + "eval_gsm8k_samples_per_second": 297.41, + "eval_gsm8k_steps_per_second": 6.366, + "step": 750 + }, + { + "epoch": 0.04, + "eval_ted_trans_en-hi_accuracy": 0.6381951731374607, + "eval_ted_trans_en-hi_loss": 1.3837890625, + "eval_ted_trans_en-hi_runtime": 3.1926, + "eval_ted_trans_en-hi_samples_per_second": 32.262, + "eval_ted_trans_en-hi_steps_per_second": 0.94, + "step": 750 + }, + { + "epoch": 0.04, + "eval_ted_trans_de-ja_accuracy": 0.6046756766931446, + "eval_ted_trans_de-ja_loss": 1.75390625, + "eval_ted_trans_de-ja_runtime": 4.4495, + "eval_ted_trans_de-ja_samples_per_second": 161.365, + "eval_ted_trans_de-ja_steps_per_second": 3.371, + "step": 750 + }, + { + "epoch": 0.04, + "eval_ted_trans_nl-en_accuracy": 0.7248159831756046, + "eval_ted_trans_nl-en_loss": 1.2216796875, + "eval_ted_trans_nl-en_runtime": 3.3794, + "eval_ted_trans_nl-en_samples_per_second": 228.144, + "eval_ted_trans_nl-en_steps_per_second": 5.03, + "step": 750 + }, + { + "epoch": 0.04, + "eval_ted_trans_en-ja_accuracy": 0.62236684020825, + "eval_ted_trans_en-ja_loss": 1.6015625, + "eval_ted_trans_en-ja_runtime": 4.3536, + "eval_ted_trans_en-ja_samples_per_second": 183.987, + "eval_ted_trans_en-ja_steps_per_second": 3.905, + "step": 750 + }, + { + "epoch": 0.04, + "eval_ted_trans_en-es_accuracy": 0.7690527730088826, + "eval_ted_trans_en-es_loss": 0.99658203125, + "eval_ted_trans_en-es_runtime": 4.0371, + "eval_ted_trans_en-es_samples_per_second": 204.604, + "eval_ted_trans_en-es_steps_per_second": 4.459, + "step": 750 + }, + { + "epoch": 0.04, + "eval_ted_trans_en-ms_accuracy": 0.6281984334203655, + "eval_ted_trans_en-ms_loss": 1.7548828125, + "eval_ted_trans_en-ms_runtime": 1.3143, + "eval_ted_trans_en-ms_samples_per_second": 31.955, + "eval_ted_trans_en-ms_steps_per_second": 0.761, + "step": 750 + }, + { + "epoch": 0.04, + "eval_xsum_accuracy": 0.6022120068856478, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 190.7745, + "eval_xsum_samples_per_second": 213.912, + "eval_xsum_steps_per_second": 4.461, + "step": 750 + }, + { + "epoch": 0.04, + "eval_cnn_dailymail_accuracy": 0.6730714054329214, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 278.1455, + "eval_cnn_dailymail_samples_per_second": 206.45, + "eval_cnn_dailymail_steps_per_second": 4.304, + "step": 750 + }, + { + "epoch": 0.04, + "eval_multi_news_accuracy": 0.5342626698844151, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 44.9541, + "eval_multi_news_samples_per_second": 200.093, + "eval_multi_news_steps_per_second": 4.182, + "step": 750 + }, + { + "epoch": 0.04, + "eval_tldr_news_accuracy": 0.5577447568889307, + "eval_tldr_news_loss": 2.03125, + "eval_tldr_news_runtime": 4.8998, + "eval_tldr_news_samples_per_second": 291.441, + "eval_tldr_news_steps_per_second": 6.123, + "step": 750 + }, + { + "epoch": 0.04, + "eval_scitldr_accuracy": 0.5008103727714749, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.5302, + "eval_scitldr_samples_per_second": 157.698, + "eval_scitldr_steps_per_second": 3.557, + "step": 750 + }, + { + "epoch": 0.04, + "eval_samsum_accuracy": 0.6208006056591274, + "eval_samsum_loss": 1.40625, + "eval_samsum_runtime": 14.6527, + "eval_samsum_samples_per_second": 201.123, + "eval_samsum_steps_per_second": 4.231, + "step": 750 + }, + { + "epoch": 0.04, + "eval_debate_sum_accuracy": 0.9359010655534944, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 244.5165, + "eval_debate_sum_samples_per_second": 196.772, + "eval_debate_sum_steps_per_second": 4.102, + "step": 750 + }, + { + "epoch": 0.04, + "eval_billsum_accuracy": 0.6674552130307286, + "eval_billsum_loss": 1.412109375, + "eval_billsum_runtime": 27.8571, + "eval_billsum_samples_per_second": 136.052, + "eval_billsum_steps_per_second": 2.836, + "step": 750 + }, + { + "epoch": 0.04, + "eval_wmt2019_zh-en_accuracy": 0.6281871623861053, + "eval_wmt2019_zh-en_loss": 1.689453125, + "eval_wmt2019_zh-en_runtime": 11.4111, + "eval_wmt2019_zh-en_samples_per_second": 348.872, + "eval_wmt2019_zh-en_steps_per_second": 7.274, + "step": 750 + }, + { + "epoch": 0.04, + "eval_wmt2019_ru-en_accuracy": 0.7208443499252788, + "eval_wmt2019_ru-en_loss": 1.1123046875, + "eval_wmt2019_ru-en_runtime": 10.8964, + "eval_wmt2019_ru-en_samples_per_second": 275.321, + "eval_wmt2019_ru-en_steps_per_second": 5.782, + "step": 750 + }, + { + "epoch": 0.04, + "eval_wmt2019_de-en_accuracy": 0.7330685618729097, + "eval_wmt2019_de-en_loss": 1.0859375, + "eval_wmt2019_de-en_runtime": 7.6477, + "eval_wmt2019_de-en_samples_per_second": 392.011, + "eval_wmt2019_de-en_steps_per_second": 8.238, + "step": 750 + }, + { + "epoch": 0.04, + "eval_wmt2019_fr-de_accuracy": 0.7199981458877335, + "eval_wmt2019_fr-de_loss": 1.1474609375, + "eval_wmt2019_fr-de_runtime": 5.4342, + "eval_wmt2019_fr-de_samples_per_second": 278.24, + "eval_wmt2019_fr-de_steps_per_second": 5.889, + "step": 750 + }, + { + "epoch": 0.04, + "eval_essay_instruction_accuracy": 0.5924197863918802, + "eval_essay_instruction_loss": 1.9921875, + "eval_essay_instruction_runtime": 4.79, + "eval_essay_instruction_samples_per_second": 86.221, + "eval_essay_instruction_steps_per_second": 1.879, + "step": 750 + }, + { + "epoch": 0.04, + "eval_reddit_eli5_accuracy": 0.4455537602670511, + "eval_reddit_eli5_loss": 2.5546875, + "eval_reddit_eli5_runtime": 268.3918, + "eval_reddit_eli5_samples_per_second": 203.162, + "eval_reddit_eli5_steps_per_second": 4.233, + "step": 750 + }, + { + "epoch": 0.04, + "eval_reddit_askh_accuracy": 0.44842410632057694, + "eval_reddit_askh_loss": 2.65234375, + "eval_reddit_askh_runtime": 150.7797, + "eval_reddit_askh_samples_per_second": 130.687, + "eval_reddit_askh_steps_per_second": 2.726, + "step": 750 + }, + { + "epoch": 0.04, + "eval_reddit_asks_accuracy": 0.4572903640086479, + "eval_reddit_asks_loss": 2.501953125, + "eval_reddit_asks_runtime": 135.6226, + "eval_reddit_asks_samples_per_second": 194.333, + "eval_reddit_asks_steps_per_second": 4.055, + "step": 750 + }, + { + "epoch": 0.04, + "learning_rate": 4.801355987134653e-06, + "loss": 1.5707, + "step": 760 + }, + { + "epoch": 0.04, + "learning_rate": 4.81081787528747e-06, + "loss": 1.6039, + "step": 770 + }, + { + "epoch": 0.04, + "learning_rate": 4.820157671150801e-06, + "loss": 1.5763, + "step": 780 + }, + { + "epoch": 0.04, + "learning_rate": 4.82937848548407e-06, + "loss": 1.6415, + "step": 790 + }, + { + "epoch": 0.04, + "learning_rate": 4.83848331165324e-06, + "loss": 1.6192, + "step": 800 + }, + { + "epoch": 0.04, + "learning_rate": 4.847475031464417e-06, + "loss": 1.8104, + "step": 810 + }, + { + "epoch": 0.04, + "learning_rate": 4.856356420639528e-06, + "loss": 1.6151, + "step": 820 + }, + { + "epoch": 0.04, + "learning_rate": 4.8651301539601235e-06, + "loss": 1.6213, + "step": 830 + }, + { + "epoch": 0.04, + "learning_rate": 4.873798810103137e-06, + "loss": 1.5999, + "step": 840 + }, + { + "epoch": 0.04, + "learning_rate": 4.882364876190489e-06, + "loss": 1.5919, + "step": 850 + }, + { + "epoch": 0.04, + "learning_rate": 4.890830752072613e-06, + "loss": 1.6093, + "step": 860 + }, + { + "epoch": 0.04, + "learning_rate": 4.899198754364365e-06, + "loss": 1.6407, + "step": 870 + }, + { + "epoch": 0.04, + "learning_rate": 4.907471120250281e-06, + "loss": 1.6171, + "step": 880 + }, + { + "epoch": 0.04, + "learning_rate": 4.915650011074855e-06, + "loss": 1.6894, + "step": 890 + }, + { + "epoch": 0.04, + "learning_rate": 4.923737515732209e-06, + "loss": 1.6495, + "step": 900 + }, + { + "epoch": 0.04, + "learning_rate": 4.931735653868489e-06, + "loss": 1.6688, + "step": 910 + }, + { + "epoch": 0.04, + "learning_rate": 4.93964637890926e-06, + "loss": 1.6085, + "step": 920 + }, + { + "epoch": 0.05, + "learning_rate": 4.9474715809232256e-06, + "loss": 1.6499, + "step": 930 + }, + { + "epoch": 0.05, + "learning_rate": 4.955213089332832e-06, + "loss": 1.6319, + "step": 940 + }, + { + "epoch": 0.05, + "learning_rate": 4.962872675481414e-06, + "loss": 1.5965, + "step": 950 + }, + { + "epoch": 0.05, + "learning_rate": 4.970452055065948e-06, + "loss": 1.5977, + "step": 960 + }, + { + "epoch": 0.05, + "learning_rate": 4.977952890443742e-06, + "loss": 1.6161, + "step": 970 + }, + { + "epoch": 0.05, + "learning_rate": 4.985376792820825e-06, + "loss": 1.5886, + "step": 980 + }, + { + "epoch": 0.05, + "learning_rate": 4.992725324329251e-06, + "loss": 1.5945, + "step": 990 + }, + { + "epoch": 0.05, + "learning_rate": 5e-06, + "loss": 1.6211, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_gsm8k_hard_accuracy": 0.9133382191278857, + "eval_gsm8k_hard_loss": 0.38720703125, + "eval_gsm8k_hard_runtime": 2.5023, + "eval_gsm8k_hard_samples_per_second": 105.503, + "eval_gsm8k_hard_steps_per_second": 2.398, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_webgpt_accuracy": 0.4873552074043802, + "eval_webgpt_loss": 2.29296875, + "eval_webgpt_runtime": 18.3874, + "eval_webgpt_samples_per_second": 212.972, + "eval_webgpt_steps_per_second": 4.46, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_squad_v2_accuracy": 0.8973280344987951, + "eval_squad_v2_loss": 0.33642578125, + "eval_squad_v2_runtime": 87.2934, + "eval_squad_v2_samples_per_second": 298.579, + "eval_squad_v2_steps_per_second": 6.22, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_adversarial_qa_accuracy": 0.810095221038178, + "eval_adversarial_qa_loss": 0.85498046875, + "eval_adversarial_qa_runtime": 21.5732, + "eval_adversarial_qa_samples_per_second": 278.123, + "eval_adversarial_qa_steps_per_second": 5.794, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_private_tuning_accuracy": 0.6643376777215743, + "eval_private_tuning_loss": 1.2626953125, + "eval_private_tuning_runtime": 61.2475, + "eval_private_tuning_samples_per_second": 345.777, + "eval_private_tuning_steps_per_second": 7.217, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_oa_translated_accuracy": 0.6876637305407464, + "eval_oa_translated_loss": 1.298828125, + "eval_oa_translated_runtime": 714.9582, + "eval_oa_translated_samples_per_second": 195.627, + "eval_oa_translated_steps_per_second": 4.076, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_prosocial_dialogue_accuracy": 0.533683742122458, + "eval_prosocial_dialogue_loss": 1.8115234375, + "eval_prosocial_dialogue_runtime": 77.2682, + "eval_prosocial_dialogue_samples_per_second": 349.212, + "eval_prosocial_dialogue_steps_per_second": 7.286, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_math_qa_accuracy": 0.5697754633111511, + "eval_math_qa_loss": 1.884765625, + "eval_math_qa_runtime": 19.141, + "eval_math_qa_samples_per_second": 311.791, + "eval_math_qa_steps_per_second": 6.53, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_wikihow_accuracy": 0.6076965746775759, + "eval_wikihow_loss": 1.91015625, + "eval_wikihow_runtime": 7.4493, + "eval_wikihow_samples_per_second": 307.815, + "eval_wikihow_steps_per_second": 6.444, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_joke_accuracy": 0.4844579226686884, + "eval_joke_loss": 2.287109375, + "eval_joke_runtime": 0.5532, + "eval_joke_samples_per_second": 137.392, + "eval_joke_steps_per_second": 3.616, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_gsm8k_accuracy": 0.7594911909992863, + "eval_gsm8k_loss": 0.9140625, + "eval_gsm8k_runtime": 5.3345, + "eval_gsm8k_samples_per_second": 280.251, + "eval_gsm8k_steps_per_second": 5.999, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_ted_trans_en-hi_accuracy": 0.6449365772509877, + "eval_ted_trans_en-hi_loss": 1.306640625, + "eval_ted_trans_en-hi_runtime": 1.7092, + "eval_ted_trans_en-hi_samples_per_second": 60.262, + "eval_ted_trans_en-hi_steps_per_second": 1.755, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_ted_trans_de-ja_accuracy": 0.6195650127106676, + "eval_ted_trans_de-ja_loss": 1.6767578125, + "eval_ted_trans_de-ja_runtime": 4.3842, + "eval_ted_trans_de-ja_samples_per_second": 163.768, + "eval_ted_trans_de-ja_steps_per_second": 3.421, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_ted_trans_nl-en_accuracy": 0.7198614136853986, + "eval_ted_trans_nl-en_loss": 1.236328125, + "eval_ted_trans_nl-en_runtime": 3.7037, + "eval_ted_trans_nl-en_samples_per_second": 208.171, + "eval_ted_trans_nl-en_steps_per_second": 4.59, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_ted_trans_en-ja_accuracy": 0.6259939079868666, + "eval_ted_trans_en-ja_loss": 1.5947265625, + "eval_ted_trans_en-ja_runtime": 4.2274, + "eval_ted_trans_en-ja_samples_per_second": 189.48, + "eval_ted_trans_en-ja_steps_per_second": 4.021, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_ted_trans_en-es_accuracy": 0.7666360545061928, + "eval_ted_trans_en-es_loss": 1.01171875, + "eval_ted_trans_en-es_runtime": 4.7495, + "eval_ted_trans_en-es_samples_per_second": 173.914, + "eval_ted_trans_en-es_steps_per_second": 3.79, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_ted_trans_en-ms_accuracy": 0.6276762402088772, + "eval_ted_trans_en-ms_loss": 1.7060546875, + "eval_ted_trans_en-ms_runtime": 0.3249, + "eval_ted_trans_en-ms_samples_per_second": 129.281, + "eval_ted_trans_en-ms_steps_per_second": 3.078, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_xsum_accuracy": 0.6038378677782439, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 194.7148, + "eval_xsum_samples_per_second": 209.583, + "eval_xsum_steps_per_second": 4.37, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_cnn_dailymail_accuracy": 0.6724346337174325, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 276.6313, + "eval_cnn_dailymail_samples_per_second": 207.58, + "eval_cnn_dailymail_steps_per_second": 4.327, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_multi_news_accuracy": 0.5417725136542614, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 44.7388, + "eval_multi_news_samples_per_second": 201.056, + "eval_multi_news_steps_per_second": 4.202, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_tldr_news_accuracy": 0.5576977334712687, + "eval_tldr_news_loss": 2.015625, + "eval_tldr_news_runtime": 4.3825, + "eval_tldr_news_samples_per_second": 325.843, + "eval_tldr_news_steps_per_second": 6.845, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_scitldr_accuracy": 0.5008103727714749, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.9018, + "eval_scitldr_samples_per_second": 137.5, + "eval_scitldr_steps_per_second": 3.102, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_samsum_accuracy": 0.6229095972637186, + "eval_samsum_loss": 1.4013671875, + "eval_samsum_runtime": 14.884, + "eval_samsum_samples_per_second": 197.998, + "eval_samsum_steps_per_second": 4.166, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_debate_sum_accuracy": 0.9370327058673479, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 243.4339, + "eval_debate_sum_samples_per_second": 197.647, + "eval_debate_sum_steps_per_second": 4.12, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_billsum_accuracy": 0.6730090224558118, + "eval_billsum_loss": 1.3984375, + "eval_billsum_runtime": 26.3919, + "eval_billsum_samples_per_second": 143.605, + "eval_billsum_steps_per_second": 2.993, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_wmt2019_zh-en_accuracy": 0.6340022173592529, + "eval_wmt2019_zh-en_loss": 1.66015625, + "eval_wmt2019_zh-en_runtime": 12.9276, + "eval_wmt2019_zh-en_samples_per_second": 307.945, + "eval_wmt2019_zh-en_steps_per_second": 6.42, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_wmt2019_ru-en_accuracy": 0.7246060048314736, + "eval_wmt2019_ru-en_loss": 1.09375, + "eval_wmt2019_ru-en_runtime": 10.2691, + "eval_wmt2019_ru-en_samples_per_second": 292.138, + "eval_wmt2019_ru-en_steps_per_second": 6.135, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_wmt2019_de-en_accuracy": 0.7373287943940118, + "eval_wmt2019_de-en_loss": 1.076171875, + "eval_wmt2019_de-en_runtime": 8.2087, + "eval_wmt2019_de-en_samples_per_second": 365.221, + "eval_wmt2019_de-en_steps_per_second": 7.675, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_wmt2019_fr-de_accuracy": 0.7239072325829329, + "eval_wmt2019_fr-de_loss": 1.1376953125, + "eval_wmt2019_fr-de_runtime": 5.7413, + "eval_wmt2019_fr-de_samples_per_second": 263.356, + "eval_wmt2019_fr-de_steps_per_second": 5.574, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_essay_instruction_accuracy": 0.5944486212767839, + "eval_essay_instruction_loss": 1.982421875, + "eval_essay_instruction_runtime": 4.9211, + "eval_essay_instruction_samples_per_second": 83.925, + "eval_essay_instruction_steps_per_second": 1.829, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_reddit_eli5_accuracy": 0.44636294670867693, + "eval_reddit_eli5_loss": 2.552734375, + "eval_reddit_eli5_runtime": 266.8136, + "eval_reddit_eli5_samples_per_second": 204.364, + "eval_reddit_eli5_steps_per_second": 4.258, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_reddit_askh_accuracy": 0.4487002153615098, + "eval_reddit_askh_loss": 2.65234375, + "eval_reddit_askh_runtime": 136.1822, + "eval_reddit_askh_samples_per_second": 144.696, + "eval_reddit_askh_steps_per_second": 3.018, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_reddit_asks_accuracy": 0.45779545724968684, + "eval_reddit_asks_loss": 2.501953125, + "eval_reddit_asks_runtime": 151.9509, + "eval_reddit_asks_samples_per_second": 173.451, + "eval_reddit_asks_steps_per_second": 3.62, + "step": 1000 + }, + { + "epoch": 0.06, + "learning_rate": 4.998642779587406e-06, + "loss": 1.5937, + "step": 1010 + }, + { + "epoch": 0.06, + "learning_rate": 4.997134756906744e-06, + "loss": 1.6393, + "step": 1020 + }, + { + "epoch": 0.06, + "learning_rate": 4.995626734226083e-06, + "loss": 1.64, + "step": 1030 + }, + { + "epoch": 0.06, + "learning_rate": 4.994118711545422e-06, + "loss": 1.6529, + "step": 1040 + }, + { + "epoch": 0.06, + "learning_rate": 4.992610688864761e-06, + "loss": 1.6339, + "step": 1050 + }, + { + "epoch": 0.06, + "learning_rate": 4.9911026661841e-06, + "loss": 1.7531, + "step": 1060 + }, + { + "epoch": 0.06, + "learning_rate": 4.989594643503439e-06, + "loss": 1.622, + "step": 1070 + }, + { + "epoch": 0.06, + "learning_rate": 4.988086620822777e-06, + "loss": 1.7041, + "step": 1080 + }, + { + "epoch": 0.06, + "learning_rate": 4.986578598142116e-06, + "loss": 1.5922, + "step": 1090 + }, + { + "epoch": 0.06, + "learning_rate": 4.9850705754614555e-06, + "loss": 1.5937, + "step": 1100 + }, + { + "epoch": 0.06, + "learning_rate": 4.983562552780794e-06, + "loss": 1.6301, + "step": 1110 + }, + { + "epoch": 0.07, + "learning_rate": 4.982054530100133e-06, + "loss": 1.6054, + "step": 1120 + }, + { + "epoch": 0.07, + "learning_rate": 4.980546507419472e-06, + "loss": 1.745, + "step": 1130 + }, + { + "epoch": 0.07, + "learning_rate": 4.9790384847388105e-06, + "loss": 1.6002, + "step": 1140 + }, + { + "epoch": 0.07, + "learning_rate": 4.97753046205815e-06, + "loss": 1.623, + "step": 1150 + }, + { + "epoch": 0.07, + "learning_rate": 4.976022439377489e-06, + "loss": 1.6432, + "step": 1160 + }, + { + "epoch": 0.07, + "learning_rate": 4.974514416696827e-06, + "loss": 1.5977, + "step": 1170 + }, + { + "epoch": 0.07, + "learning_rate": 4.973006394016166e-06, + "loss": 1.5784, + "step": 1180 + }, + { + "epoch": 0.07, + "learning_rate": 4.971498371335505e-06, + "loss": 1.7948, + "step": 1190 + }, + { + "epoch": 0.07, + "learning_rate": 4.969990348654844e-06, + "loss": 1.5515, + "step": 1200 + }, + { + "epoch": 0.07, + "learning_rate": 4.968482325974183e-06, + "loss": 1.6368, + "step": 1210 + }, + { + "epoch": 0.07, + "learning_rate": 4.966974303293522e-06, + "loss": 1.6672, + "step": 1220 + }, + { + "epoch": 0.07, + "learning_rate": 4.96546628061286e-06, + "loss": 1.6363, + "step": 1230 + }, + { + "epoch": 0.07, + "learning_rate": 4.9639582579321995e-06, + "loss": 1.6082, + "step": 1240 + }, + { + "epoch": 0.07, + "learning_rate": 4.962450235251539e-06, + "loss": 1.5335, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_gsm8k_hard_accuracy": 0.9144782378567647, + "eval_gsm8k_hard_loss": 0.38037109375, + "eval_gsm8k_hard_runtime": 2.4991, + "eval_gsm8k_hard_samples_per_second": 105.638, + "eval_gsm8k_hard_steps_per_second": 2.401, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_webgpt_accuracy": 0.48714508131217954, + "eval_webgpt_loss": 2.287109375, + "eval_webgpt_runtime": 17.8711, + "eval_webgpt_samples_per_second": 219.124, + "eval_webgpt_steps_per_second": 4.588, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_squad_v2_accuracy": 0.8998647106075339, + "eval_squad_v2_loss": 0.327392578125, + "eval_squad_v2_runtime": 89.3681, + "eval_squad_v2_samples_per_second": 291.648, + "eval_squad_v2_steps_per_second": 6.076, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_adversarial_qa_accuracy": 0.8150204471508313, + "eval_adversarial_qa_loss": 0.82080078125, + "eval_adversarial_qa_runtime": 21.2941, + "eval_adversarial_qa_samples_per_second": 281.769, + "eval_adversarial_qa_steps_per_second": 5.87, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_private_tuning_accuracy": 0.6658206227322084, + "eval_private_tuning_loss": 1.251953125, + "eval_private_tuning_runtime": 65.5451, + "eval_private_tuning_samples_per_second": 323.106, + "eval_private_tuning_steps_per_second": 6.743, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_oa_translated_accuracy": 0.692265258615827, + "eval_oa_translated_loss": 1.2744140625, + "eval_oa_translated_runtime": 743.3603, + "eval_oa_translated_samples_per_second": 188.499, + "eval_oa_translated_steps_per_second": 3.928, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_prosocial_dialogue_accuracy": 0.5311811844172045, + "eval_prosocial_dialogue_loss": 1.7724609375, + "eval_prosocial_dialogue_runtime": 51.2881, + "eval_prosocial_dialogue_samples_per_second": 526.106, + "eval_prosocial_dialogue_steps_per_second": 10.977, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_math_qa_accuracy": 0.5769073170225678, + "eval_math_qa_loss": 1.8466796875, + "eval_math_qa_runtime": 19.0785, + "eval_math_qa_samples_per_second": 312.813, + "eval_math_qa_steps_per_second": 6.552, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_wikihow_accuracy": 0.6035709332963528, + "eval_wikihow_loss": 1.896484375, + "eval_wikihow_runtime": 7.4264, + "eval_wikihow_samples_per_second": 308.762, + "eval_wikihow_steps_per_second": 6.463, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_joke_accuracy": 0.48474222896133434, + "eval_joke_loss": 2.275390625, + "eval_joke_runtime": 0.6166, + "eval_joke_samples_per_second": 123.256, + "eval_joke_steps_per_second": 3.244, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_gsm8k_accuracy": 0.7641650690586473, + "eval_gsm8k_loss": 0.8916015625, + "eval_gsm8k_runtime": 5.7365, + "eval_gsm8k_samples_per_second": 260.61, + "eval_gsm8k_steps_per_second": 5.578, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_ted_trans_en-hi_accuracy": 0.6325462370594394, + "eval_ted_trans_en-hi_loss": 1.4697265625, + "eval_ted_trans_en-hi_runtime": 1.283, + "eval_ted_trans_en-hi_samples_per_second": 80.282, + "eval_ted_trans_en-hi_steps_per_second": 2.338, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_ted_trans_de-ja_accuracy": 0.6211990075587098, + "eval_ted_trans_de-ja_loss": 1.662109375, + "eval_ted_trans_de-ja_runtime": 3.8274, + "eval_ted_trans_de-ja_samples_per_second": 187.596, + "eval_ted_trans_de-ja_steps_per_second": 3.919, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_ted_trans_nl-en_accuracy": 0.7194199077125907, + "eval_ted_trans_nl-en_loss": 1.2353515625, + "eval_ted_trans_nl-en_runtime": 4.0007, + "eval_ted_trans_nl-en_samples_per_second": 192.717, + "eval_ted_trans_nl-en_steps_per_second": 4.249, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_ted_trans_en-ja_accuracy": 0.6241126045950204, + "eval_ted_trans_en-ja_loss": 1.603515625, + "eval_ted_trans_en-ja_runtime": 4.7838, + "eval_ted_trans_en-ja_samples_per_second": 167.442, + "eval_ted_trans_en-ja_steps_per_second": 3.554, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_ted_trans_en-es_accuracy": 0.7645402663284718, + "eval_ted_trans_en-es_loss": 1.001953125, + "eval_ted_trans_en-es_runtime": 4.101, + "eval_ted_trans_en-es_samples_per_second": 201.413, + "eval_ted_trans_en-es_steps_per_second": 4.389, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_ted_trans_en-ms_accuracy": 0.6673585884795018, + "eval_ted_trans_en-ms_loss": 1.525390625, + "eval_ted_trans_en-ms_runtime": 1.2996, + "eval_ted_trans_en-ms_samples_per_second": 32.318, + "eval_ted_trans_en-ms_steps_per_second": 0.769, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_xsum_accuracy": 0.6053592197062103, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 191.7698, + "eval_xsum_samples_per_second": 212.802, + "eval_xsum_steps_per_second": 4.438, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_cnn_dailymail_accuracy": 0.6746041001434587, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 278.9634, + "eval_cnn_dailymail_samples_per_second": 205.844, + "eval_cnn_dailymail_steps_per_second": 4.291, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_multi_news_accuracy": 0.5434468524251806, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 45.3002, + "eval_multi_news_samples_per_second": 198.564, + "eval_multi_news_steps_per_second": 4.15, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_tldr_news_accuracy": 0.5667732530800339, + "eval_tldr_news_loss": 1.9794921875, + "eval_tldr_news_runtime": 3.9274, + "eval_tldr_news_samples_per_second": 363.601, + "eval_tldr_news_steps_per_second": 7.639, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_scitldr_accuracy": 0.49108589951377635, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.8246, + "eval_scitldr_samples_per_second": 141.259, + "eval_scitldr_steps_per_second": 3.186, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_samsum_accuracy": 0.6251537806378348, + "eval_samsum_loss": 1.3818359375, + "eval_samsum_runtime": 13.864, + "eval_samsum_samples_per_second": 212.565, + "eval_samsum_steps_per_second": 4.472, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_debate_sum_accuracy": 0.9387001084116295, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 243.8994, + "eval_debate_sum_samples_per_second": 197.27, + "eval_debate_sum_steps_per_second": 4.112, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_billsum_accuracy": 0.6722638786214665, + "eval_billsum_loss": 1.392578125, + "eval_billsum_runtime": 27.2836, + "eval_billsum_samples_per_second": 138.911, + "eval_billsum_steps_per_second": 2.896, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_wmt2019_zh-en_accuracy": 0.6307350879022818, + "eval_wmt2019_zh-en_loss": 1.67578125, + "eval_wmt2019_zh-en_runtime": 11.4742, + "eval_wmt2019_zh-en_samples_per_second": 346.954, + "eval_wmt2019_zh-en_steps_per_second": 7.234, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_wmt2019_ru-en_accuracy": 0.7234031766265182, + "eval_wmt2019_ru-en_loss": 1.1044921875, + "eval_wmt2019_ru-en_runtime": 11.1292, + "eval_wmt2019_ru-en_samples_per_second": 269.561, + "eval_wmt2019_ru-en_steps_per_second": 5.661, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_wmt2019_de-en_accuracy": 0.7347881745809497, + "eval_wmt2019_de-en_loss": 1.0703125, + "eval_wmt2019_de-en_runtime": 6.7145, + "eval_wmt2019_de-en_samples_per_second": 446.495, + "eval_wmt2019_de-en_steps_per_second": 9.383, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_wmt2019_fr-de_accuracy": 0.7223718008231981, + "eval_wmt2019_fr-de_loss": 1.150390625, + "eval_wmt2019_fr-de_runtime": 5.999, + "eval_wmt2019_fr-de_samples_per_second": 252.041, + "eval_wmt2019_fr-de_steps_per_second": 5.334, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_essay_instruction_accuracy": 0.5959412245981027, + "eval_essay_instruction_loss": 1.9736328125, + "eval_essay_instruction_runtime": 4.9416, + "eval_essay_instruction_samples_per_second": 83.576, + "eval_essay_instruction_steps_per_second": 1.821, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_reddit_eli5_accuracy": 0.44527728061091026, + "eval_reddit_eli5_loss": 2.548828125, + "eval_reddit_eli5_runtime": 290.9524, + "eval_reddit_eli5_samples_per_second": 187.409, + "eval_reddit_eli5_steps_per_second": 3.904, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_reddit_askh_accuracy": 0.44824522875076156, + "eval_reddit_askh_loss": 2.6484375, + "eval_reddit_askh_runtime": 112.213, + "eval_reddit_askh_samples_per_second": 175.604, + "eval_reddit_askh_steps_per_second": 3.663, + "step": 1250 + }, + { + "epoch": 0.07, + "eval_reddit_asks_accuracy": 0.45689164053284076, + "eval_reddit_asks_loss": 2.498046875, + "eval_reddit_asks_runtime": 165.2712, + "eval_reddit_asks_samples_per_second": 159.471, + "eval_reddit_asks_steps_per_second": 3.328, + "step": 1250 + }, + { + "epoch": 0.07, + "learning_rate": 4.960942212570877e-06, + "loss": 1.6061, + "step": 1260 + }, + { + "epoch": 0.07, + "learning_rate": 4.959434189890216e-06, + "loss": 1.6272, + "step": 1270 + }, + { + "epoch": 0.07, + "learning_rate": 4.957926167209555e-06, + "loss": 1.58, + "step": 1280 + }, + { + "epoch": 0.08, + "learning_rate": 4.9564181445288936e-06, + "loss": 1.5576, + "step": 1290 + }, + { + "epoch": 0.08, + "learning_rate": 4.9549101218482336e-06, + "loss": 1.5794, + "step": 1300 + }, + { + "epoch": 0.08, + "learning_rate": 4.953402099167572e-06, + "loss": 1.5694, + "step": 1310 + }, + { + "epoch": 0.08, + "learning_rate": 4.95189407648691e-06, + "loss": 1.6599, + "step": 1320 + }, + { + "epoch": 0.08, + "learning_rate": 4.95038605380625e-06, + "loss": 1.5774, + "step": 1330 + }, + { + "epoch": 0.08, + "learning_rate": 4.9488780311255885e-06, + "loss": 1.5876, + "step": 1340 + }, + { + "epoch": 0.08, + "learning_rate": 4.947370008444927e-06, + "loss": 1.6382, + "step": 1350 + }, + { + "epoch": 0.08, + "learning_rate": 4.945861985764267e-06, + "loss": 1.6248, + "step": 1360 + }, + { + "epoch": 0.08, + "learning_rate": 4.944353963083605e-06, + "loss": 1.6227, + "step": 1370 + }, + { + "epoch": 0.08, + "learning_rate": 4.9428459404029434e-06, + "loss": 1.592, + "step": 1380 + }, + { + "epoch": 0.08, + "learning_rate": 4.9413379177222834e-06, + "loss": 1.6044, + "step": 1390 + }, + { + "epoch": 0.08, + "learning_rate": 4.939829895041622e-06, + "loss": 1.6386, + "step": 1400 + }, + { + "epoch": 0.08, + "learning_rate": 4.938321872360961e-06, + "loss": 1.5863, + "step": 1410 + }, + { + "epoch": 0.08, + "learning_rate": 4.9368138496803e-06, + "loss": 1.6269, + "step": 1420 + }, + { + "epoch": 0.08, + "learning_rate": 4.935305826999638e-06, + "loss": 1.6145, + "step": 1430 + }, + { + "epoch": 0.08, + "learning_rate": 4.9337978043189775e-06, + "loss": 1.5194, + "step": 1440 + }, + { + "epoch": 0.08, + "learning_rate": 4.932289781638317e-06, + "loss": 1.5841, + "step": 1450 + }, + { + "epoch": 0.09, + "learning_rate": 4.930781758957655e-06, + "loss": 1.5677, + "step": 1460 + }, + { + "epoch": 0.09, + "learning_rate": 4.929273736276994e-06, + "loss": 1.6513, + "step": 1470 + }, + { + "epoch": 0.09, + "learning_rate": 4.927765713596333e-06, + "loss": 1.5514, + "step": 1480 + }, + { + "epoch": 0.09, + "learning_rate": 4.926257690915672e-06, + "loss": 1.6071, + "step": 1490 + }, + { + "epoch": 0.09, + "learning_rate": 4.924749668235011e-06, + "loss": 1.5484, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_gsm8k_hard_accuracy": 0.9160661210862749, + "eval_gsm8k_hard_loss": 0.36669921875, + "eval_gsm8k_hard_runtime": 2.426, + "eval_gsm8k_hard_samples_per_second": 108.822, + "eval_gsm8k_hard_steps_per_second": 2.473, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_webgpt_accuracy": 0.4889121259300561, + "eval_webgpt_loss": 2.275390625, + "eval_webgpt_runtime": 16.7849, + "eval_webgpt_samples_per_second": 233.305, + "eval_webgpt_steps_per_second": 4.885, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_squad_v2_accuracy": 0.8985118166828732, + "eval_squad_v2_loss": 0.3203125, + "eval_squad_v2_runtime": 89.0026, + "eval_squad_v2_samples_per_second": 292.845, + "eval_squad_v2_steps_per_second": 6.101, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_adversarial_qa_accuracy": 0.8153786454135697, + "eval_adversarial_qa_loss": 0.80224609375, + "eval_adversarial_qa_runtime": 21.5777, + "eval_adversarial_qa_samples_per_second": 278.064, + "eval_adversarial_qa_steps_per_second": 5.793, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_private_tuning_accuracy": 0.6673820897933299, + "eval_private_tuning_loss": 1.2412109375, + "eval_private_tuning_runtime": 61.4345, + "eval_private_tuning_samples_per_second": 344.725, + "eval_private_tuning_steps_per_second": 7.195, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_oa_translated_accuracy": 0.694931804897923, + "eval_oa_translated_loss": 1.259765625, + "eval_oa_translated_runtime": 747.634, + "eval_oa_translated_samples_per_second": 187.422, + "eval_oa_translated_steps_per_second": 3.906, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_prosocial_dialogue_accuracy": 0.5319418402546676, + "eval_prosocial_dialogue_loss": 1.7685546875, + "eval_prosocial_dialogue_runtime": 40.2134, + "eval_prosocial_dialogue_samples_per_second": 670.995, + "eval_prosocial_dialogue_steps_per_second": 14.0, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_math_qa_accuracy": 0.5807196170064887, + "eval_math_qa_loss": 1.8212890625, + "eval_math_qa_runtime": 19.1438, + "eval_math_qa_samples_per_second": 311.745, + "eval_math_qa_steps_per_second": 6.53, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_wikihow_accuracy": 0.6014214394674803, + "eval_wikihow_loss": 1.896484375, + "eval_wikihow_runtime": 7.4397, + "eval_wikihow_samples_per_second": 308.21, + "eval_wikihow_steps_per_second": 6.452, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_joke_accuracy": 0.4866376042456406, + "eval_joke_loss": 2.24609375, + "eval_joke_runtime": 0.5334, + "eval_joke_samples_per_second": 142.487, + "eval_joke_steps_per_second": 3.75, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_gsm8k_accuracy": 0.7673556205482711, + "eval_gsm8k_loss": 0.87646484375, + "eval_gsm8k_runtime": 6.0291, + "eval_gsm8k_samples_per_second": 247.963, + "eval_gsm8k_steps_per_second": 5.308, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_ted_trans_en-hi_accuracy": 0.6494347770862441, + "eval_ted_trans_en-hi_loss": 1.390625, + "eval_ted_trans_en-hi_runtime": 1.0351, + "eval_ted_trans_en-hi_samples_per_second": 99.508, + "eval_ted_trans_en-hi_steps_per_second": 2.898, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_ted_trans_de-ja_accuracy": 0.616848081514485, + "eval_ted_trans_de-ja_loss": 1.677734375, + "eval_ted_trans_de-ja_runtime": 3.7491, + "eval_ted_trans_de-ja_samples_per_second": 191.511, + "eval_ted_trans_de-ja_steps_per_second": 4.001, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_ted_trans_nl-en_accuracy": 0.727337334175616, + "eval_ted_trans_nl-en_loss": 1.2138671875, + "eval_ted_trans_nl-en_runtime": 3.6692, + "eval_ted_trans_nl-en_samples_per_second": 210.129, + "eval_ted_trans_nl-en_steps_per_second": 4.633, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_ted_trans_en-ja_accuracy": 0.626860854480171, + "eval_ted_trans_en-ja_loss": 1.591796875, + "eval_ted_trans_en-ja_runtime": 4.8535, + "eval_ted_trans_en-ja_samples_per_second": 165.036, + "eval_ted_trans_en-ja_steps_per_second": 3.503, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_ted_trans_en-es_accuracy": 0.7669356066372355, + "eval_ted_trans_en-es_loss": 0.98388671875, + "eval_ted_trans_en-es_runtime": 4.1425, + "eval_ted_trans_en-es_samples_per_second": 199.395, + "eval_ted_trans_en-es_steps_per_second": 4.345, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_ted_trans_en-ms_accuracy": 0.6616502335236119, + "eval_ted_trans_en-ms_loss": 1.5517578125, + "eval_ted_trans_en-ms_runtime": 0.9865, + "eval_ted_trans_en-ms_samples_per_second": 42.573, + "eval_ted_trans_en-ms_steps_per_second": 1.014, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_xsum_accuracy": 0.6076375151351375, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 192.6396, + "eval_xsum_samples_per_second": 211.841, + "eval_xsum_steps_per_second": 4.418, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_cnn_dailymail_accuracy": 0.6761071775649035, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 278.6917, + "eval_cnn_dailymail_samples_per_second": 206.045, + "eval_cnn_dailymail_steps_per_second": 4.295, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_multi_news_accuracy": 0.5442969643083958, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 45.8208, + "eval_multi_news_samples_per_second": 196.308, + "eval_multi_news_steps_per_second": 4.103, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_tldr_news_accuracy": 0.5842189410326343, + "eval_tldr_news_loss": 1.8740234375, + "eval_tldr_news_runtime": 3.8029, + "eval_tldr_news_samples_per_second": 375.502, + "eval_tldr_news_steps_per_second": 7.889, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_scitldr_accuracy": 0.49108589951377635, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.7953, + "eval_scitldr_samples_per_second": 142.738, + "eval_scitldr_steps_per_second": 3.22, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_samsum_accuracy": 0.6296691857399721, + "eval_samsum_loss": 1.359375, + "eval_samsum_runtime": 14.6559, + "eval_samsum_samples_per_second": 201.08, + "eval_samsum_steps_per_second": 4.23, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_debate_sum_accuracy": 0.9376602666300555, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 243.7385, + "eval_debate_sum_samples_per_second": 197.4, + "eval_debate_sum_steps_per_second": 4.115, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_billsum_accuracy": 0.674664779787867, + "eval_billsum_loss": 1.3837890625, + "eval_billsum_runtime": 28.345, + "eval_billsum_samples_per_second": 133.71, + "eval_billsum_steps_per_second": 2.787, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_wmt2019_zh-en_accuracy": 0.6330300731414937, + "eval_wmt2019_zh-en_loss": 1.6611328125, + "eval_wmt2019_zh-en_runtime": 11.8726, + "eval_wmt2019_zh-en_samples_per_second": 335.309, + "eval_wmt2019_zh-en_steps_per_second": 6.991, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_wmt2019_ru-en_accuracy": 0.7280151437129657, + "eval_wmt2019_ru-en_loss": 1.083984375, + "eval_wmt2019_ru-en_runtime": 10.0609, + "eval_wmt2019_ru-en_samples_per_second": 298.183, + "eval_wmt2019_ru-en_steps_per_second": 6.262, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_wmt2019_de-en_accuracy": 0.7374484938192584, + "eval_wmt2019_de-en_loss": 1.05859375, + "eval_wmt2019_de-en_runtime": 7.6833, + "eval_wmt2019_de-en_samples_per_second": 390.199, + "eval_wmt2019_de-en_steps_per_second": 8.2, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_wmt2019_fr-de_accuracy": 0.7249404264537492, + "eval_wmt2019_fr-de_loss": 1.1298828125, + "eval_wmt2019_fr-de_runtime": 4.7455, + "eval_wmt2019_fr-de_samples_per_second": 318.618, + "eval_wmt2019_fr-de_steps_per_second": 6.743, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_essay_instruction_accuracy": 0.5962563297437145, + "eval_essay_instruction_loss": 1.96484375, + "eval_essay_instruction_runtime": 4.8055, + "eval_essay_instruction_samples_per_second": 85.943, + "eval_essay_instruction_steps_per_second": 1.873, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_reddit_eli5_accuracy": 0.44550306361932773, + "eval_reddit_eli5_loss": 2.546875, + "eval_reddit_eli5_runtime": 268.1996, + "eval_reddit_eli5_samples_per_second": 203.308, + "eval_reddit_eli5_steps_per_second": 4.236, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_reddit_askh_accuracy": 0.44892602615508864, + "eval_reddit_askh_loss": 2.642578125, + "eval_reddit_askh_runtime": 135.1366, + "eval_reddit_askh_samples_per_second": 145.815, + "eval_reddit_askh_steps_per_second": 3.041, + "step": 1500 + }, + { + "epoch": 0.09, + "eval_reddit_asks_accuracy": 0.4570882832072159, + "eval_reddit_asks_loss": 2.49609375, + "eval_reddit_asks_runtime": 151.4006, + "eval_reddit_asks_samples_per_second": 174.081, + "eval_reddit_asks_steps_per_second": 3.633, + "step": 1500 + }, + { + "epoch": 0.09, + "learning_rate": 4.92324164555435e-06, + "loss": 1.5053, + "step": 1510 + }, + { + "epoch": 0.09, + "learning_rate": 4.921733622873688e-06, + "loss": 1.6214, + "step": 1520 + }, + { + "epoch": 0.09, + "learning_rate": 4.920225600193027e-06, + "loss": 1.5547, + "step": 1530 + }, + { + "epoch": 0.09, + "learning_rate": 4.9187175775123666e-06, + "loss": 1.5706, + "step": 1540 + }, + { + "epoch": 0.09, + "learning_rate": 4.917209554831705e-06, + "loss": 1.6485, + "step": 1550 + }, + { + "epoch": 0.09, + "learning_rate": 4.915701532151044e-06, + "loss": 1.6188, + "step": 1560 + }, + { + "epoch": 0.09, + "learning_rate": 4.914193509470383e-06, + "loss": 1.5883, + "step": 1570 + }, + { + "epoch": 0.09, + "learning_rate": 4.9126854867897215e-06, + "loss": 1.5876, + "step": 1580 + }, + { + "epoch": 0.09, + "learning_rate": 4.911177464109061e-06, + "loss": 1.5883, + "step": 1590 + }, + { + "epoch": 0.09, + "learning_rate": 4.9096694414284e-06, + "loss": 1.584, + "step": 1600 + }, + { + "epoch": 0.09, + "learning_rate": 4.908161418747738e-06, + "loss": 1.6226, + "step": 1610 + }, + { + "epoch": 0.09, + "learning_rate": 4.906653396067077e-06, + "loss": 1.5241, + "step": 1620 + }, + { + "epoch": 0.1, + "learning_rate": 4.9051453733864164e-06, + "loss": 1.552, + "step": 1630 + }, + { + "epoch": 0.1, + "learning_rate": 4.903637350705755e-06, + "loss": 1.6007, + "step": 1640 + }, + { + "epoch": 0.1, + "learning_rate": 4.902129328025094e-06, + "loss": 1.6413, + "step": 1650 + }, + { + "epoch": 0.1, + "learning_rate": 4.900621305344433e-06, + "loss": 1.5901, + "step": 1660 + }, + { + "epoch": 0.1, + "learning_rate": 4.899113282663771e-06, + "loss": 1.618, + "step": 1670 + }, + { + "epoch": 0.1, + "learning_rate": 4.8976052599831105e-06, + "loss": 1.6042, + "step": 1680 + }, + { + "epoch": 0.1, + "learning_rate": 4.89609723730245e-06, + "loss": 1.6138, + "step": 1690 + }, + { + "epoch": 0.1, + "learning_rate": 4.894589214621788e-06, + "loss": 1.5651, + "step": 1700 + }, + { + "epoch": 0.1, + "learning_rate": 4.893081191941127e-06, + "loss": 1.557, + "step": 1710 + }, + { + "epoch": 0.1, + "learning_rate": 4.891573169260466e-06, + "loss": 1.589, + "step": 1720 + }, + { + "epoch": 0.1, + "learning_rate": 4.890065146579805e-06, + "loss": 1.6019, + "step": 1730 + }, + { + "epoch": 0.1, + "learning_rate": 4.888557123899144e-06, + "loss": 1.5311, + "step": 1740 + }, + { + "epoch": 0.1, + "learning_rate": 4.887049101218483e-06, + "loss": 1.5996, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_gsm8k_hard_accuracy": 0.917939008998005, + "eval_gsm8k_hard_loss": 0.353759765625, + "eval_gsm8k_hard_runtime": 2.0742, + "eval_gsm8k_hard_samples_per_second": 127.281, + "eval_gsm8k_hard_steps_per_second": 2.893, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_webgpt_accuracy": 0.48935223254836624, + "eval_webgpt_loss": 2.275390625, + "eval_webgpt_runtime": 18.9851, + "eval_webgpt_samples_per_second": 206.267, + "eval_webgpt_steps_per_second": 4.319, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_squad_v2_accuracy": 0.8966656801815133, + "eval_squad_v2_loss": 0.312255859375, + "eval_squad_v2_runtime": 87.2749, + "eval_squad_v2_samples_per_second": 298.642, + "eval_squad_v2_steps_per_second": 6.222, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_adversarial_qa_accuracy": 0.7900062684695979, + "eval_adversarial_qa_loss": 0.8447265625, + "eval_adversarial_qa_runtime": 21.2806, + "eval_adversarial_qa_samples_per_second": 281.947, + "eval_adversarial_qa_steps_per_second": 5.874, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_private_tuning_accuracy": 0.6689839396232736, + "eval_private_tuning_loss": 1.2333984375, + "eval_private_tuning_runtime": 65.9372, + "eval_private_tuning_samples_per_second": 321.185, + "eval_private_tuning_steps_per_second": 6.703, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_oa_translated_accuracy": 0.697770983402211, + "eval_oa_translated_loss": 1.24609375, + "eval_oa_translated_runtime": 707.9067, + "eval_oa_translated_samples_per_second": 197.94, + "eval_oa_translated_steps_per_second": 4.125, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_prosocial_dialogue_accuracy": 0.5329268895641822, + "eval_prosocial_dialogue_loss": 1.771484375, + "eval_prosocial_dialogue_runtime": 94.875, + "eval_prosocial_dialogue_samples_per_second": 284.406, + "eval_prosocial_dialogue_steps_per_second": 5.934, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_math_qa_accuracy": 0.5841662474001151, + "eval_math_qa_loss": 1.80078125, + "eval_math_qa_runtime": 18.717, + "eval_math_qa_samples_per_second": 318.854, + "eval_math_qa_steps_per_second": 6.678, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_wikihow_accuracy": 0.6094855082512828, + "eval_wikihow_loss": 1.8798828125, + "eval_wikihow_runtime": 8.1158, + "eval_wikihow_samples_per_second": 282.535, + "eval_wikihow_steps_per_second": 5.914, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_joke_accuracy": 0.4837945413191812, + "eval_joke_loss": 2.259765625, + "eval_joke_runtime": 0.8936, + "eval_joke_samples_per_second": 85.053, + "eval_joke_steps_per_second": 2.238, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_gsm8k_accuracy": 0.7703712514518408, + "eval_gsm8k_loss": 0.86376953125, + "eval_gsm8k_runtime": 5.5568, + "eval_gsm8k_samples_per_second": 269.039, + "eval_gsm8k_steps_per_second": 5.759, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_ted_trans_en-hi_accuracy": 0.6408450704225352, + "eval_ted_trans_en-hi_loss": 1.4541015625, + "eval_ted_trans_en-hi_runtime": 1.7176, + "eval_ted_trans_en-hi_samples_per_second": 59.968, + "eval_ted_trans_en-hi_steps_per_second": 1.747, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_ted_trans_de-ja_accuracy": 0.6207718785454686, + "eval_ted_trans_de-ja_loss": 1.6513671875, + "eval_ted_trans_de-ja_runtime": 4.1297, + "eval_ted_trans_de-ja_samples_per_second": 173.862, + "eval_ted_trans_de-ja_steps_per_second": 3.632, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_ted_trans_nl-en_accuracy": 0.7211525867714473, + "eval_ted_trans_nl-en_loss": 1.2236328125, + "eval_ted_trans_nl-en_runtime": 4.0953, + "eval_ted_trans_nl-en_samples_per_second": 188.265, + "eval_ted_trans_nl-en_steps_per_second": 4.151, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_ted_trans_en-ja_accuracy": 0.6330966258927838, + "eval_ted_trans_en-ja_loss": 1.5634765625, + "eval_ted_trans_en-ja_runtime": 4.1542, + "eval_ted_trans_en-ja_samples_per_second": 192.817, + "eval_ted_trans_en-ja_steps_per_second": 4.092, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_ted_trans_en-es_accuracy": 0.7685958860355181, + "eval_ted_trans_en-es_loss": 0.984375, + "eval_ted_trans_en-es_runtime": 4.9915, + "eval_ted_trans_en-es_samples_per_second": 165.482, + "eval_ted_trans_en-es_steps_per_second": 3.606, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_ted_trans_en-ms_accuracy": 0.6491956408925791, + "eval_ted_trans_en-ms_loss": 1.5751953125, + "eval_ted_trans_en-ms_runtime": 0.4795, + "eval_ted_trans_en-ms_samples_per_second": 87.587, + "eval_ted_trans_en-ms_steps_per_second": 2.085, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_xsum_accuracy": 0.6072009770109794, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 194.4193, + "eval_xsum_samples_per_second": 209.902, + "eval_xsum_steps_per_second": 4.377, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_cnn_dailymail_accuracy": 0.6753130640011107, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 277.0946, + "eval_cnn_dailymail_samples_per_second": 207.232, + "eval_cnn_dailymail_steps_per_second": 4.32, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_multi_news_accuracy": 0.5451066937634955, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 46.0554, + "eval_multi_news_samples_per_second": 195.308, + "eval_multi_news_steps_per_second": 4.082, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_tldr_news_accuracy": 0.5815386062259005, + "eval_tldr_news_loss": 1.869140625, + "eval_tldr_news_runtime": 3.242, + "eval_tldr_news_samples_per_second": 440.47, + "eval_tldr_news_steps_per_second": 9.254, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_scitldr_accuracy": 0.49108589951377635, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.7516, + "eval_scitldr_samples_per_second": 145.006, + "eval_scitldr_steps_per_second": 3.271, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_samsum_accuracy": 0.6312103719125579, + "eval_samsum_loss": 1.3564453125, + "eval_samsum_runtime": 13.9794, + "eval_samsum_samples_per_second": 210.81, + "eval_samsum_steps_per_second": 4.435, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_debate_sum_accuracy": 0.9370392048784827, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 244.8227, + "eval_debate_sum_samples_per_second": 196.526, + "eval_debate_sum_steps_per_second": 4.097, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_billsum_accuracy": 0.6767193885992487, + "eval_billsum_loss": 1.3740234375, + "eval_billsum_runtime": 27.455, + "eval_billsum_samples_per_second": 138.044, + "eval_billsum_steps_per_second": 2.877, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_wmt2019_zh-en_accuracy": 0.6325426044271557, + "eval_wmt2019_zh-en_loss": 1.658203125, + "eval_wmt2019_zh-en_runtime": 12.0617, + "eval_wmt2019_zh-en_samples_per_second": 330.053, + "eval_wmt2019_zh-en_steps_per_second": 6.881, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_wmt2019_ru-en_accuracy": 0.7264274688562131, + "eval_wmt2019_ru-en_loss": 1.0927734375, + "eval_wmt2019_ru-en_runtime": 10.931, + "eval_wmt2019_ru-en_samples_per_second": 274.449, + "eval_wmt2019_ru-en_steps_per_second": 5.763, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_wmt2019_de-en_accuracy": 0.7404288514621754, + "eval_wmt2019_de-en_loss": 1.0498046875, + "eval_wmt2019_de-en_runtime": 7.1074, + "eval_wmt2019_de-en_samples_per_second": 421.813, + "eval_wmt2019_de-en_steps_per_second": 8.864, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_wmt2019_fr-de_accuracy": 0.7267868761037272, + "eval_wmt2019_fr-de_loss": 1.1259765625, + "eval_wmt2019_fr-de_runtime": 6.3509, + "eval_wmt2019_fr-de_samples_per_second": 238.077, + "eval_wmt2019_fr-de_steps_per_second": 5.039, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_essay_instruction_accuracy": 0.5968423147513433, + "eval_essay_instruction_loss": 1.9580078125, + "eval_essay_instruction_runtime": 4.8143, + "eval_essay_instruction_samples_per_second": 85.786, + "eval_essay_instruction_steps_per_second": 1.869, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_reddit_eli5_accuracy": 0.4457660851447585, + "eval_reddit_eli5_loss": 2.54296875, + "eval_reddit_eli5_runtime": 291.6459, + "eval_reddit_eli5_samples_per_second": 186.963, + "eval_reddit_eli5_steps_per_second": 3.895, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_reddit_askh_accuracy": 0.4493032630102438, + "eval_reddit_askh_loss": 2.640625, + "eval_reddit_askh_runtime": 111.9151, + "eval_reddit_askh_samples_per_second": 176.071, + "eval_reddit_askh_steps_per_second": 3.672, + "step": 1750 + }, + { + "epoch": 0.1, + "eval_reddit_asks_accuracy": 0.4576012073512167, + "eval_reddit_asks_loss": 2.4921875, + "eval_reddit_asks_runtime": 151.9908, + "eval_reddit_asks_samples_per_second": 173.405, + "eval_reddit_asks_steps_per_second": 3.619, + "step": 1750 + }, + { + "epoch": 0.1, + "learning_rate": 4.885541078537821e-06, + "loss": 1.5723, + "step": 1760 + }, + { + "epoch": 0.1, + "learning_rate": 4.88403305585716e-06, + "loss": 1.5749, + "step": 1770 + }, + { + "epoch": 0.1, + "learning_rate": 4.8825250331764996e-06, + "loss": 1.6074, + "step": 1780 + }, + { + "epoch": 0.1, + "learning_rate": 4.881167812763905e-06, + "loss": 1.611, + "step": 1790 + }, + { + "epoch": 0.11, + "learning_rate": 4.879659790083243e-06, + "loss": 1.5503, + "step": 1800 + }, + { + "epoch": 0.11, + "learning_rate": 4.878151767402582e-06, + "loss": 1.5197, + "step": 1810 + }, + { + "epoch": 0.11, + "learning_rate": 4.876643744721921e-06, + "loss": 1.6165, + "step": 1820 + }, + { + "epoch": 0.11, + "learning_rate": 4.87513572204126e-06, + "loss": 1.5622, + "step": 1830 + }, + { + "epoch": 0.11, + "learning_rate": 4.873627699360599e-06, + "loss": 1.6142, + "step": 1840 + }, + { + "epoch": 0.11, + "learning_rate": 4.872119676679938e-06, + "loss": 1.5699, + "step": 1850 + }, + { + "epoch": 0.11, + "learning_rate": 4.870611653999276e-06, + "loss": 1.5571, + "step": 1860 + }, + { + "epoch": 0.11, + "learning_rate": 4.8691036313186155e-06, + "loss": 1.7093, + "step": 1870 + }, + { + "epoch": 0.11, + "learning_rate": 4.867595608637955e-06, + "loss": 1.5411, + "step": 1880 + }, + { + "epoch": 0.11, + "learning_rate": 4.866087585957293e-06, + "loss": 1.6657, + "step": 1890 + }, + { + "epoch": 0.11, + "learning_rate": 4.864579563276632e-06, + "loss": 1.6215, + "step": 1900 + }, + { + "epoch": 0.11, + "learning_rate": 4.863071540595971e-06, + "loss": 1.6317, + "step": 1910 + }, + { + "epoch": 0.11, + "learning_rate": 4.86156351791531e-06, + "loss": 1.573, + "step": 1920 + }, + { + "epoch": 0.11, + "learning_rate": 4.860055495234649e-06, + "loss": 1.5732, + "step": 1930 + }, + { + "epoch": 0.11, + "learning_rate": 4.858547472553988e-06, + "loss": 1.5991, + "step": 1940 + }, + { + "epoch": 0.11, + "learning_rate": 4.857039449873326e-06, + "loss": 1.6105, + "step": 1950 + }, + { + "epoch": 0.11, + "learning_rate": 4.855531427192665e-06, + "loss": 1.627, + "step": 1960 + }, + { + "epoch": 0.12, + "learning_rate": 4.8540234045120045e-06, + "loss": 1.59, + "step": 1970 + }, + { + "epoch": 0.12, + "learning_rate": 4.852515381831343e-06, + "loss": 1.5923, + "step": 1980 + }, + { + "epoch": 0.12, + "learning_rate": 4.851007359150682e-06, + "loss": 1.5461, + "step": 1990 + }, + { + "epoch": 0.12, + "learning_rate": 4.849499336470021e-06, + "loss": 1.5792, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_gsm8k_hard_accuracy": 0.9194658197956109, + "eval_gsm8k_hard_loss": 0.34375, + "eval_gsm8k_hard_runtime": 2.5861, + "eval_gsm8k_hard_samples_per_second": 102.084, + "eval_gsm8k_hard_steps_per_second": 2.32, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_webgpt_accuracy": 0.48958552214679374, + "eval_webgpt_loss": 2.26953125, + "eval_webgpt_runtime": 18.454, + "eval_webgpt_samples_per_second": 212.203, + "eval_webgpt_steps_per_second": 4.443, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_squad_v2_accuracy": 0.902612776392001, + "eval_squad_v2_loss": 0.312744140625, + "eval_squad_v2_runtime": 87.2414, + "eval_squad_v2_samples_per_second": 298.757, + "eval_squad_v2_steps_per_second": 6.224, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_adversarial_qa_accuracy": 0.8242739022715739, + "eval_adversarial_qa_loss": 0.765625, + "eval_adversarial_qa_runtime": 21.8127, + "eval_adversarial_qa_samples_per_second": 275.069, + "eval_adversarial_qa_steps_per_second": 5.731, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_private_tuning_accuracy": 0.6703786408057409, + "eval_private_tuning_loss": 1.2275390625, + "eval_private_tuning_runtime": 61.3818, + "eval_private_tuning_samples_per_second": 345.021, + "eval_private_tuning_steps_per_second": 7.201, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_oa_translated_accuracy": 0.6998150442485734, + "eval_oa_translated_loss": 1.2353515625, + "eval_oa_translated_runtime": 717.5723, + "eval_oa_translated_samples_per_second": 195.274, + "eval_oa_translated_steps_per_second": 4.069, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_prosocial_dialogue_accuracy": 0.5407654479692391, + "eval_prosocial_dialogue_loss": 1.75390625, + "eval_prosocial_dialogue_runtime": 73.7989, + "eval_prosocial_dialogue_samples_per_second": 365.629, + "eval_prosocial_dialogue_steps_per_second": 7.629, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_math_qa_accuracy": 0.5880693164312722, + "eval_math_qa_loss": 1.77734375, + "eval_math_qa_runtime": 18.7243, + "eval_math_qa_samples_per_second": 318.731, + "eval_math_qa_steps_per_second": 6.676, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_wikihow_accuracy": 0.6112120371654417, + "eval_wikihow_loss": 1.8701171875, + "eval_wikihow_runtime": 8.0237, + "eval_wikihow_samples_per_second": 285.777, + "eval_wikihow_steps_per_second": 5.982, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_joke_accuracy": 0.4860689916603487, + "eval_joke_loss": 2.2421875, + "eval_joke_runtime": 0.9634, + "eval_joke_samples_per_second": 78.886, + "eval_joke_steps_per_second": 2.076, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_gsm8k_accuracy": 0.7713298162634165, + "eval_gsm8k_loss": 0.857421875, + "eval_gsm8k_runtime": 6.4806, + "eval_gsm8k_samples_per_second": 230.688, + "eval_gsm8k_steps_per_second": 4.938, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_ted_trans_en-hi_accuracy": 0.6369557351344832, + "eval_ted_trans_en-hi_loss": 1.482421875, + "eval_ted_trans_en-hi_runtime": 1.0804, + "eval_ted_trans_en-hi_samples_per_second": 95.332, + "eval_ted_trans_en-hi_steps_per_second": 2.777, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_ted_trans_de-ja_accuracy": 0.6203946747238221, + "eval_ted_trans_de-ja_loss": 1.6611328125, + "eval_ted_trans_de-ja_runtime": 4.4636, + "eval_ted_trans_de-ja_samples_per_second": 160.856, + "eval_ted_trans_de-ja_steps_per_second": 3.36, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_ted_trans_nl-en_accuracy": 0.7250118302749882, + "eval_ted_trans_nl-en_loss": 1.2099609375, + "eval_ted_trans_nl-en_runtime": 3.3901, + "eval_ted_trans_nl-en_samples_per_second": 227.426, + "eval_ted_trans_nl-en_steps_per_second": 5.015, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_ted_trans_en-ja_accuracy": 0.6299969453212504, + "eval_ted_trans_en-ja_loss": 1.5703125, + "eval_ted_trans_en-ja_runtime": 3.888, + "eval_ted_trans_en-ja_samples_per_second": 206.017, + "eval_ted_trans_en-ja_steps_per_second": 4.372, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_ted_trans_en-es_accuracy": 0.7667636252296387, + "eval_ted_trans_en-es_loss": 0.9814453125, + "eval_ted_trans_en-es_runtime": 4.4116, + "eval_ted_trans_en-es_samples_per_second": 187.233, + "eval_ted_trans_en-es_steps_per_second": 4.08, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_ted_trans_en-ms_accuracy": 0.6590555267254801, + "eval_ted_trans_en-ms_loss": 1.509765625, + "eval_ted_trans_en-ms_runtime": 1.256, + "eval_ted_trans_en-ms_samples_per_second": 33.438, + "eval_ted_trans_en-ms_steps_per_second": 0.796, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_xsum_accuracy": 0.6091027172290235, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 192.154, + "eval_xsum_samples_per_second": 212.377, + "eval_xsum_steps_per_second": 4.429, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_cnn_dailymail_accuracy": 0.6791855245499561, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 278.657, + "eval_cnn_dailymail_samples_per_second": 206.071, + "eval_cnn_dailymail_steps_per_second": 4.296, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_multi_news_accuracy": 0.547242906816125, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 46.0885, + "eval_multi_news_samples_per_second": 195.168, + "eval_multi_news_steps_per_second": 4.079, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_tldr_news_accuracy": 0.5899087745697358, + "eval_tldr_news_loss": 1.82421875, + "eval_tldr_news_runtime": 3.2945, + "eval_tldr_news_samples_per_second": 433.455, + "eval_tldr_news_steps_per_second": 9.106, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_scitldr_accuracy": 0.48946515397082657, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.999, + "eval_scitldr_samples_per_second": 133.043, + "eval_scitldr_steps_per_second": 3.001, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_samsum_accuracy": 0.6357663345455529, + "eval_samsum_loss": 1.3447265625, + "eval_samsum_runtime": 13.9699, + "eval_samsum_samples_per_second": 210.954, + "eval_samsum_steps_per_second": 4.438, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_debate_sum_accuracy": 0.937797152052083, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 251.6686, + "eval_debate_sum_samples_per_second": 191.18, + "eval_debate_sum_steps_per_second": 3.985, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_billsum_accuracy": 0.6771982567431513, + "eval_billsum_loss": 1.3671875, + "eval_billsum_runtime": 20.1117, + "eval_billsum_samples_per_second": 188.448, + "eval_billsum_steps_per_second": 3.928, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_wmt2019_zh-en_accuracy": 0.6312172508742782, + "eval_wmt2019_zh-en_loss": 1.6611328125, + "eval_wmt2019_zh-en_runtime": 12.8125, + "eval_wmt2019_zh-en_samples_per_second": 310.712, + "eval_wmt2019_zh-en_steps_per_second": 6.478, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_wmt2019_ru-en_accuracy": 0.7286970006594347, + "eval_wmt2019_ru-en_loss": 1.0859375, + "eval_wmt2019_ru-en_runtime": 10.0956, + "eval_wmt2019_ru-en_samples_per_second": 297.16, + "eval_wmt2019_ru-en_steps_per_second": 6.24, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_wmt2019_de-en_accuracy": 0.7417590110813298, + "eval_wmt2019_de-en_loss": 1.044921875, + "eval_wmt2019_de-en_runtime": 7.9617, + "eval_wmt2019_de-en_samples_per_second": 376.551, + "eval_wmt2019_de-en_steps_per_second": 7.913, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_wmt2019_fr-de_accuracy": 0.7250758560901603, + "eval_wmt2019_fr-de_loss": 1.130859375, + "eval_wmt2019_fr-de_runtime": 4.4498, + "eval_wmt2019_fr-de_samples_per_second": 339.792, + "eval_wmt2019_fr-de_steps_per_second": 7.191, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_essay_instruction_accuracy": 0.5974504124007696, + "eval_essay_instruction_loss": 1.953125, + "eval_essay_instruction_runtime": 5.4069, + "eval_essay_instruction_samples_per_second": 76.383, + "eval_essay_instruction_steps_per_second": 1.665, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_reddit_eli5_accuracy": 0.44636360001599296, + "eval_reddit_eli5_loss": 2.54296875, + "eval_reddit_eli5_runtime": 271.6541, + "eval_reddit_eli5_samples_per_second": 200.722, + "eval_reddit_eli5_steps_per_second": 4.182, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_reddit_askh_accuracy": 0.449544836282747, + "eval_reddit_askh_loss": 2.638671875, + "eval_reddit_askh_runtime": 133.9706, + "eval_reddit_askh_samples_per_second": 147.084, + "eval_reddit_askh_steps_per_second": 3.068, + "step": 2000 + }, + { + "epoch": 0.12, + "eval_reddit_asks_accuracy": 0.4580279915626371, + "eval_reddit_asks_loss": 2.490234375, + "eval_reddit_asks_runtime": 148.6473, + "eval_reddit_asks_samples_per_second": 177.306, + "eval_reddit_asks_steps_per_second": 3.7, + "step": 2000 + }, + { + "epoch": 0.12, + "learning_rate": 4.8479913137893595e-06, + "loss": 1.5467, + "step": 2010 + }, + { + "epoch": 0.12, + "learning_rate": 4.846483291108699e-06, + "loss": 1.6584, + "step": 2020 + }, + { + "epoch": 0.12, + "learning_rate": 4.844975268428038e-06, + "loss": 1.5414, + "step": 2030 + }, + { + "epoch": 0.12, + "learning_rate": 4.843467245747376e-06, + "loss": 1.5565, + "step": 2040 + }, + { + "epoch": 0.12, + "learning_rate": 4.841959223066715e-06, + "loss": 1.5377, + "step": 2050 + }, + { + "epoch": 0.12, + "learning_rate": 4.840451200386054e-06, + "loss": 1.5815, + "step": 2060 + }, + { + "epoch": 0.12, + "learning_rate": 4.838943177705393e-06, + "loss": 1.5763, + "step": 2070 + }, + { + "epoch": 0.12, + "learning_rate": 4.837435155024732e-06, + "loss": 1.5399, + "step": 2080 + }, + { + "epoch": 0.12, + "learning_rate": 4.835927132344071e-06, + "loss": 1.5486, + "step": 2090 + }, + { + "epoch": 0.12, + "learning_rate": 4.834419109663409e-06, + "loss": 1.5657, + "step": 2100 + }, + { + "epoch": 0.12, + "learning_rate": 4.8329110869827485e-06, + "loss": 1.5551, + "step": 2110 + }, + { + "epoch": 0.12, + "learning_rate": 4.831403064302088e-06, + "loss": 1.5926, + "step": 2120 + }, + { + "epoch": 0.12, + "learning_rate": 4.829895041621426e-06, + "loss": 1.5123, + "step": 2130 + }, + { + "epoch": 0.13, + "learning_rate": 4.828387018940765e-06, + "loss": 1.6461, + "step": 2140 + }, + { + "epoch": 0.13, + "learning_rate": 4.826878996260104e-06, + "loss": 1.5276, + "step": 2150 + }, + { + "epoch": 0.13, + "learning_rate": 4.825370973579443e-06, + "loss": 1.6597, + "step": 2160 + }, + { + "epoch": 0.13, + "learning_rate": 4.823862950898782e-06, + "loss": 1.5458, + "step": 2170 + }, + { + "epoch": 0.13, + "learning_rate": 4.822354928218121e-06, + "loss": 1.565, + "step": 2180 + }, + { + "epoch": 0.13, + "learning_rate": 4.820846905537459e-06, + "loss": 1.5876, + "step": 2190 + }, + { + "epoch": 0.13, + "learning_rate": 4.819338882856798e-06, + "loss": 1.5786, + "step": 2200 + }, + { + "epoch": 0.13, + "learning_rate": 4.8178308601761375e-06, + "loss": 1.5501, + "step": 2210 + }, + { + "epoch": 0.13, + "learning_rate": 4.816322837495476e-06, + "loss": 1.5303, + "step": 2220 + }, + { + "epoch": 0.13, + "learning_rate": 4.814814814814815e-06, + "loss": 1.6207, + "step": 2230 + }, + { + "epoch": 0.13, + "learning_rate": 4.813306792134154e-06, + "loss": 1.5776, + "step": 2240 + }, + { + "epoch": 0.13, + "learning_rate": 4.811798769453493e-06, + "loss": 1.6652, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_gsm8k_hard_accuracy": 0.9206872684336957, + "eval_gsm8k_hard_loss": 0.338134765625, + "eval_gsm8k_hard_runtime": 3.5208, + "eval_gsm8k_hard_samples_per_second": 74.983, + "eval_gsm8k_hard_steps_per_second": 1.704, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_webgpt_accuracy": 0.48976586658813137, + "eval_webgpt_loss": 2.265625, + "eval_webgpt_runtime": 18.5149, + "eval_webgpt_samples_per_second": 211.505, + "eval_webgpt_steps_per_second": 4.429, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_squad_v2_accuracy": 0.906819430938993, + "eval_squad_v2_loss": 0.292724609375, + "eval_squad_v2_runtime": 87.4932, + "eval_squad_v2_samples_per_second": 297.897, + "eval_squad_v2_steps_per_second": 6.206, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_adversarial_qa_accuracy": 0.8013193636010866, + "eval_adversarial_qa_loss": 0.8310546875, + "eval_adversarial_qa_runtime": 21.1349, + "eval_adversarial_qa_samples_per_second": 283.891, + "eval_adversarial_qa_steps_per_second": 5.914, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_private_tuning_accuracy": 0.6708063990236344, + "eval_private_tuning_loss": 1.2236328125, + "eval_private_tuning_runtime": 65.2825, + "eval_private_tuning_samples_per_second": 324.405, + "eval_private_tuning_steps_per_second": 6.771, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_oa_translated_accuracy": 0.7019998653150882, + "eval_oa_translated_loss": 1.2255859375, + "eval_oa_translated_runtime": 721.6165, + "eval_oa_translated_samples_per_second": 194.179, + "eval_oa_translated_steps_per_second": 4.046, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_prosocial_dialogue_accuracy": 0.5347828898075921, + "eval_prosocial_dialogue_loss": 1.7451171875, + "eval_prosocial_dialogue_runtime": 75.1624, + "eval_prosocial_dialogue_samples_per_second": 358.996, + "eval_prosocial_dialogue_steps_per_second": 7.49, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_math_qa_accuracy": 0.5915055732195003, + "eval_math_qa_loss": 1.7578125, + "eval_math_qa_runtime": 18.6859, + "eval_math_qa_samples_per_second": 319.384, + "eval_math_qa_steps_per_second": 6.69, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_wikihow_accuracy": 0.6088822632089863, + "eval_wikihow_loss": 1.8623046875, + "eval_wikihow_runtime": 7.0536, + "eval_wikihow_samples_per_second": 325.084, + "eval_wikihow_steps_per_second": 6.805, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_joke_accuracy": 0.4927975739196361, + "eval_joke_loss": 2.203125, + "eval_joke_runtime": 1.8797, + "eval_joke_samples_per_second": 40.431, + "eval_joke_steps_per_second": 1.064, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_gsm8k_accuracy": 0.7729670729488812, + "eval_gsm8k_loss": 0.84765625, + "eval_gsm8k_runtime": 5.4471, + "eval_gsm8k_samples_per_second": 274.458, + "eval_gsm8k_steps_per_second": 5.875, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_ted_trans_en-hi_accuracy": 0.6606576107655199, + "eval_ted_trans_en-hi_loss": 1.3681640625, + "eval_ted_trans_en-hi_runtime": 1.9419, + "eval_ted_trans_en-hi_samples_per_second": 53.04, + "eval_ted_trans_en-hi_steps_per_second": 1.545, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_ted_trans_de-ja_accuracy": 0.6181494047961354, + "eval_ted_trans_de-ja_loss": 1.669921875, + "eval_ted_trans_de-ja_runtime": 4.4657, + "eval_ted_trans_de-ja_samples_per_second": 160.781, + "eval_ted_trans_de-ja_steps_per_second": 3.359, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_ted_trans_nl-en_accuracy": 0.7275129478913435, + "eval_ted_trans_nl-en_loss": 1.193359375, + "eval_ted_trans_nl-en_runtime": 3.5649, + "eval_ted_trans_nl-en_samples_per_second": 216.275, + "eval_ted_trans_nl-en_steps_per_second": 4.769, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_ted_trans_en-ja_accuracy": 0.6332216902623955, + "eval_ted_trans_en-ja_loss": 1.5615234375, + "eval_ted_trans_en-ja_runtime": 4.3033, + "eval_ted_trans_en-ja_samples_per_second": 186.138, + "eval_ted_trans_en-ja_steps_per_second": 3.95, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_ted_trans_en-es_accuracy": 0.7696410515672396, + "eval_ted_trans_en-es_loss": 0.96875, + "eval_ted_trans_en-es_runtime": 5.0224, + "eval_ted_trans_en-es_samples_per_second": 164.462, + "eval_ted_trans_en-es_steps_per_second": 3.584, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_ted_trans_en-ms_accuracy": 0.6727053140096618, + "eval_ted_trans_en-ms_loss": 1.4013671875, + "eval_ted_trans_en-ms_runtime": 0.4904, + "eval_ted_trans_en-ms_samples_per_second": 85.65, + "eval_ted_trans_en-ms_steps_per_second": 2.039, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_xsum_accuracy": 0.6095022536611729, + "eval_xsum_loss": NaN, + "eval_xsum_runtime": 191.8464, + "eval_xsum_samples_per_second": 212.717, + "eval_xsum_steps_per_second": 4.436, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_cnn_dailymail_accuracy": 0.6799426165023833, + "eval_cnn_dailymail_loss": NaN, + "eval_cnn_dailymail_runtime": 276.5156, + "eval_cnn_dailymail_samples_per_second": 207.666, + "eval_cnn_dailymail_steps_per_second": 4.329, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_multi_news_accuracy": 0.547789914899022, + "eval_multi_news_loss": NaN, + "eval_multi_news_runtime": 46.7011, + "eval_multi_news_samples_per_second": 192.608, + "eval_multi_news_steps_per_second": 4.026, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_tldr_news_accuracy": 0.586805229004044, + "eval_tldr_news_loss": 1.861328125, + "eval_tldr_news_runtime": 4.1288, + "eval_tldr_news_samples_per_second": 345.86, + "eval_tldr_news_steps_per_second": 7.266, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_scitldr_accuracy": 0.49270664505672607, + "eval_scitldr_loss": NaN, + "eval_scitldr_runtime": 2.9031, + "eval_scitldr_samples_per_second": 137.437, + "eval_scitldr_steps_per_second": 3.1, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_samsum_accuracy": 0.6352255674674526, + "eval_samsum_loss": 1.3408203125, + "eval_samsum_runtime": 13.3091, + "eval_samsum_samples_per_second": 221.427, + "eval_samsum_steps_per_second": 4.658, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_debate_sum_accuracy": 0.9398914583902843, + "eval_debate_sum_loss": NaN, + "eval_debate_sum_runtime": 244.3103, + "eval_debate_sum_samples_per_second": 196.938, + "eval_debate_sum_steps_per_second": 4.105, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_billsum_accuracy": 0.6780196221313985, + "eval_billsum_loss": 1.3662109375, + "eval_billsum_runtime": 26.3691, + "eval_billsum_samples_per_second": 143.729, + "eval_billsum_steps_per_second": 2.996, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_wmt2019_zh-en_accuracy": 0.6364605543710021, + "eval_wmt2019_zh-en_loss": 1.6376953125, + "eval_wmt2019_zh-en_runtime": 14.2747, + "eval_wmt2019_zh-en_samples_per_second": 278.886, + "eval_wmt2019_zh-en_steps_per_second": 5.815, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_wmt2019_ru-en_accuracy": 0.7199560058060365, + "eval_wmt2019_ru-en_loss": 1.107421875, + "eval_wmt2019_ru-en_runtime": 8.7473, + "eval_wmt2019_ru-en_samples_per_second": 342.965, + "eval_wmt2019_ru-en_steps_per_second": 7.202, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_wmt2019_de-en_accuracy": 0.7420960891168457, + "eval_wmt2019_de-en_loss": 1.0400390625, + "eval_wmt2019_de-en_runtime": 8.7795, + "eval_wmt2019_de-en_samples_per_second": 341.476, + "eval_wmt2019_de-en_steps_per_second": 7.176, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_wmt2019_fr-de_accuracy": 0.7250604501209003, + "eval_wmt2019_fr-de_loss": 1.12890625, + "eval_wmt2019_fr-de_runtime": 4.8598, + "eval_wmt2019_fr-de_samples_per_second": 311.121, + "eval_wmt2019_fr-de_steps_per_second": 6.585, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_essay_instruction_accuracy": 0.5982741083077195, + "eval_essay_instruction_loss": 1.94921875, + "eval_essay_instruction_runtime": 5.5402, + "eval_essay_instruction_samples_per_second": 74.546, + "eval_essay_instruction_steps_per_second": 1.624, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_reddit_eli5_accuracy": 0.44749512828734445, + "eval_reddit_eli5_loss": 2.541015625, + "eval_reddit_eli5_runtime": 289.5058, + "eval_reddit_eli5_samples_per_second": 188.345, + "eval_reddit_eli5_steps_per_second": 3.924, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_reddit_askh_accuracy": 0.45043337961716656, + "eval_reddit_askh_loss": 2.63671875, + "eval_reddit_askh_runtime": 112.2629, + "eval_reddit_askh_samples_per_second": 175.525, + "eval_reddit_askh_steps_per_second": 3.661, + "step": 2250 + }, + { + "epoch": 0.13, + "eval_reddit_asks_accuracy": 0.4590568852017904, + "eval_reddit_asks_loss": 2.490234375, + "eval_reddit_asks_runtime": 152.7985, + "eval_reddit_asks_samples_per_second": 172.489, + "eval_reddit_asks_steps_per_second": 3.6, + "step": 2250 + } + ], + "max_steps": 34156, + "num_train_epochs": 2, + "total_flos": 1.6587065990940983e+19, + "trial_name": null, + "trial_params": null +}