diff --git a/.gitattributes b/.gitattributes index 118bab74a516326954c6b9e5914cf080b66369d0..de55ba550880b81c3070e352a1aacf35dda40d97 100644 --- a/.gitattributes +++ b/.gitattributes @@ -153,3 +153,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text 8b7178b25b/evaluation/generation/examples.8b7178b25b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text +*/evaluation/generation/examples*.jsonl filter=lfs diff=lfs merge=lfs -text diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..561006dabaf0b2163a0c3a189e873df3d2ebcf17 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.30679877376441267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003595563898423582}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.25602196677157735, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002878244091822486}, {"task_name": 
"GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24392276306894944, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023000547396419086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.09387841885638748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002194124801129067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0740439804559834, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001644608125064704}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.07100355255541753, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014153910474231026}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.23412738417408802, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029250825035858734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19426592228730813, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023089496783281213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.18445520873996413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001797621869515743}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.28859199947210284, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003452141980034977}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23966421160449644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027125548096589747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2286765314746336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", 
"subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021801688851002263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.074528254683577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10710940903995719}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0f04cbb44007e5ee52e0c2353ea0adef6da99db9 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.272311629129121, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0039505188546921826}, {"task_name": "GEM/wiki_lingua_en", 
"prompt_name": "tldr_en", "rouge1_recall": 0.21816270627496134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032448735585025937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2094195871488414, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027068429969493734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08329439938013247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021782819294409984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06480971019253767, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016827399941316222}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06204427812400913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014401113659882462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.20896331681491714, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031500865399556595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1664625275086072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025592493419649177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1590668148316741, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020766324795938023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2563037368454477, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037619221351167056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.20447650361202882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, 
"prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030540843932082493}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1964756114951262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025500807038816515}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.168617248016535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08041979746938775}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..158f2d4c0adbfca27ca7bf2546a91b0cdfb9a152 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"tldr_en", "rouge1_precision": 0.0910819200952716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033747683121654196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07028567198192245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002640964970940966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06757697669893872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023707490073172043}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.028646478253466095, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016076285180980135}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.020868526921797886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rouge2_recall_stderr": 0.0011537893245677335}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.019948424307073904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000986170441488306}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07082578661014885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002691634510208164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05448506471222756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020854622601667937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05201990108593777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018314763879696625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08551950005220435, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031983492312005384}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06543250975997009, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024532813023277916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06308719820573185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002213451741392458}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.07788444152267485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01315525656911281}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3905dfcc95d1f2bbdbc0b6f418c5ea1a02a97ca0 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.014515441910190443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001501466217296952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011301943813768834, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012288119306628567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010473201550267177, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010554421691169314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004591980814271401, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006515529824929627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.00416888931272546, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005930107839352501}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0035604598673033626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004625418185814249}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.011779972055249343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012586537746088814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009040049894954919, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009873007391796482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008371164394493067, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", 
"subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008468942819407445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.013708349668480058, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014293611241212706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010579263673799228, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011521180305532216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009841875351812794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000994934070773035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.4470192084599072e-15, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.1547920765171173e-14}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..37493f2214f899f8d11fb89a404f100763f36f78 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.2511767032658256, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004670743744963309}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2497224746972405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004054437554455434}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.23697855013529284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003931784147472208}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.06435388180319074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002934840810442915}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.060376215777618605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002521653823700491}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.059313458488013185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0025641694242472487}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.19057534922020994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038620821553629983}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1901634518957205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033714037003502494}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.17977672776005849, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00326317345808302}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.19211901704097034, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003850775083612694}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.19288110429850555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003421247538700275}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.18163235303784986, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003264460230665997}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.267835808853254, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.26356764701611307}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a991e28d62afe2611e8a62480471cd1add9df321 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06645727693786248, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004158365862658779}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.061174931724787315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_recall_stderr": 0.003797370735030977}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05891769720848019, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035369396004201713}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.01730890961224063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017698712455992984}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.016289532767626923, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017234032328131926}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.015297632126595406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00149002417901287}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.050637820115463826, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032826504244346}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04683971382625969, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003039332848507969}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04476173097758684, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027730017560448727}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.051192679124604304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033047254109378095}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0475395827212547, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030900986283582004}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04533434532971928, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028025558912498296}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.17970307621766216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04965869269887041}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6ce08644841383cdebaeb02f38d43a4ca14f3ea9 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0025272416860947825, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007813236132809655}, {"task_name": "gem_xsum", 
"prompt_name": "article_DOC_summary", "rouge1_recall": 0.002586618259740399, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008670624223009818}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0025036711184808556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000808568948388182}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0003236042782341223, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001381414258477455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0003324696313480705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00015129466488844655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0003242107125389267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": 
true, "comment": "", "rouge2_fmeasure_stderr": 0.00014221025181447073}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0016454419652040672, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004929867496820194}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.001639076487035711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005281423440330004}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.00159555474061474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004956987539243005}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0016454419652040672, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004929867496820194}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.001639076487035711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005281423440330004}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.00159555474061474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004956987539243005}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.393056311124318e-43, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.2487431746784071e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e80a17609872ebee0bc229899a7a27f032063c0e 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:a65f5bdecdcdbda148d7cf5f9a8083572b9b5126c74c6d30245dd0c6cd2c9014 +size 18548218 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d2d28c4622aa68e016991c8530f7e25e801bd7ee 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5374e9109304e0df5ed2574cfc9f109a1bb720175a205d37d9ac84ded27edfb +size 24030292 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d917c834c0846c028725a357e5e033d53f747fe0 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:310cb6d4407a529f7d950d4793387bcd9f135ca965d953c9b52223fe42eb205c +size 29368906 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..bc1a230ad49ad143969fbba097bcc677eedf3f05 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cfcf599522eee8de2bb128abdb6f9b462612cd1e029f7e9fff32a4a6eea5edf7 +size 34782587 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..546c5ab7a72f742565d7a23760677fc5b8e19e60 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b173fc150a337823b0701d7368bfb431e82df82576249bb67449c03fcb9ecdd +size 9475426 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4363441e078240686bbfb0ee01584b73e2beb88d 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46702c5bc39b1b96361477587114a9a3aa24c24502dbe7b328a34ab3585bef66 +size 11630243 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3bdcb89c30265fabc4c5c73c34e71d9c1d2f6a93 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e07d25012d66b17c9c8318583932c7cae300c19f37c931271015911964b1436a +size 13897359 diff 
--git a/8b7178b25b/evaluation/generation/merged.csv b/8b7178b25b/evaluation/generation/merged.csv index 34105cce50cee066f3752e3210f49a595101d540..4f25d65e9a4b5c339666eb27ae3fc2f04f82495c 100644 --- a/8b7178b25b/evaluation/generation/merged.csv +++ b/8b7178b25b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.05891708042714147 gem_xsum,1,median,rouge2_fmeasure,0.05891708042714147 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.06271701269177109 gem_xsum,2,median,rouge2_fmeasure,0.06271701269177109 -gem_xsum,2,average,multiple,0.05525598610960542 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.059313458488013185 +gem_xsum,3,median,rouge2_fmeasure,0.059313458488013185 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.015297632126595406 +gem_xsum,4,median,rouge2_fmeasure,0.015297632126595406 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003242107125389267 +gem_xsum,5,median,rouge2_fmeasure,0.0003242107125389267 +gem_xsum,5,average,multiple,0.04011720994266063 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04353421604632926 web_nlg_en,0,median,rouge2_fmeasure,0.04353421604632926 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07820906828691397 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04445750772336212 wiki_lingua_en,0,median,rouge2_fmeasure,0.04445750772336212 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.053780768787129375 wiki_lingua_en,1,median,rouge2_fmeasure,0.053780768787129375 -wiki_lingua_en,1,average,multiple,0.04911913825524575 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07100355255541753 +wiki_lingua_en,2,median,rouge2_fmeasure,0.07100355255541753 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.06204427812400913 +wiki_lingua_en,3,median,rouge2_fmeasure,0.06204427812400913 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.019948424307073904 +wiki_lingua_en,4,median,rouge2_fmeasure,0.019948424307073904 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0035604598673033626 
+wiki_lingua_en,5,median,rouge2_fmeasure,0.0035604598673033626 +wiki_lingua_en,5,average,multiple,0.04246583189404924 diff --git a/8b7178b25b/evaluation/generation/merged.json b/8b7178b25b/evaluation/generation/merged.json index c00c1d8eed082bff1c15b03a9cdee95841cebfc7..7972f639b78a3a5cb758559c1f29618b8e845a1f 100644 --- a/8b7178b25b/evaluation/generation/merged.json +++ b/8b7178b25b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33618386090404295, "bleu_stderr": 0.043286547374004225, "rouge1_fmeasure": 0.09550360928926843, "rouge1_fmeasure_stderr": 0.0021611192824620468, "rouge1_precision": 0.07608115633976532, "rouge1_precision_stderr": 0.0028424335866037475, "rouge1_recall": 0.243734587530279, "rouge1_recall_stderr": 0.005032751672173048, "rouge2_fmeasure": 0.04353421604632926, "rouge2_fmeasure_stderr": 0.0013022741040944032, "rouge2_precision": 0.03391274645846755, "rouge2_precision_stderr": 0.0017782441786429572, "rouge2_recall": 0.11477361563470867, "rouge2_recall_stderr": 0.0030177112931945443, "rougeL_fmeasure": 0.0908905716888711, "rougeL_fmeasure_stderr": 0.0019748576875776485, "rougeL_precision": 0.07249954185930807, "rougeL_precision_stderr": 0.002716866763111571, "rougeL_recall": 0.23447583587320983, "rougeL_recall_stderr": 0.004820896551192642, "rougeLsum_fmeasure": 0.09063319866429098, "rougeLsum_fmeasure_stderr": 0.002018782468480028, "rougeLsum_precision": 0.07260565563718514, "rougeLsum_precision_stderr": 0.0027622865524293628, "rougeLsum_recall": 0.2312007323793673, "rougeLsum_recall_stderr": 0.004666750865618609}}, "1": {"PALM_prompt": {"bleu": 0.5152510207869508, "bleu_stderr": 0.039697768768641464, "rouge1_fmeasure": 0.15419793855600986, "rouge1_fmeasure_stderr": 0.0037752055082234935, "rouge1_precision": 0.1335251141437586, "rouge1_precision_stderr": 0.004421796872864802, "rouge1_recall": 0.3040669240102272, "rouge1_recall_stderr": 0.0049926465687368735, "rouge2_fmeasure": 
0.07820906828691397, "rouge2_fmeasure_stderr": 0.0026222161094111586, "rouge2_precision": 0.06857339375782437, "rouge2_precision_stderr": 0.0030273264083127568, "rouge2_recall": 0.1555986112064758, "rouge2_recall_stderr": 0.0036341014671041087, "rougeL_fmeasure": 0.13965714245032726, "rougeL_fmeasure_stderr": 0.0032487392690607572, "rougeL_precision": 0.1195649771956543, "rougeL_precision_stderr": 0.0038518942294089972, "rougeL_recall": 0.2838951533263151, "rougeL_recall_stderr": 0.004599064364517073, "rougeLsum_fmeasure": 0.14150307262693815, "rougeLsum_fmeasure_stderr": 0.0033007395324502467, "rougeLsum_precision": 0.12140487798820795, "rougeLsum_precision_stderr": 0.003915984273012587, "rougeLsum_recall": 0.2857833042522088, "rougeLsum_recall_stderr": 0.004601461544101827}}, "2": {"PALM_prompt": {"bleu": 0.6450721424031474, "bleu_stderr": 0.035938592978876936, "rouge1_fmeasure": 0.19228537050298922, "rouge1_fmeasure_stderr": 0.004356183491682661, "rouge1_precision": 0.17715496451419446, "rouge1_precision_stderr": 0.005377686826508908, "rouge1_recall": 0.3442204927205235, "rouge1_recall_stderr": 0.00499209962075588, "rouge2_fmeasure": 0.1017490401630499, "rouge2_fmeasure_stderr": 0.003080557280075834, "rouge2_precision": 0.09659406582034752, "rouge2_precision_stderr": 0.0036720254666308344, "rouge2_recall": 0.1816245156374427, "rouge2_recall_stderr": 0.0038421174433974025, "rougeL_fmeasure": 0.17158392384169643, "rougeL_fmeasure_stderr": 0.0037128302671958994, "rougeL_precision": 0.15519367504713033, "rougeL_precision_stderr": 0.004580706449804959, "rougeL_recall": 0.31893881029765586, "rougeL_recall_stderr": 0.004597766706728441, "rougeLsum_fmeasure": 0.17498254398589475, "rougeLsum_fmeasure_stderr": 0.0037985612156152086, "rougeLsum_precision": 0.1592800915810083, "rougeLsum_precision_stderr": 0.0047149524911017215, "rougeLsum_recall": 0.3221825821541532, "rougeLsum_recall_stderr": 0.0046252310161258746}}, "3": {"PALM_prompt": {"bleu": 0.8785043636862591, 
"bleu_stderr": 0.02652000083973547, "rouge1_fmeasure": 0.21217682060449367, "rouge1_fmeasure_stderr": 0.004581243352642801, "rouge1_precision": 0.2002191357399255, "rouge1_precision_stderr": 0.005700191155087746, "rouge1_recall": 0.3622650735119951, "rouge1_recall_stderr": 0.005105318033672305, "rouge2_fmeasure": 0.11312048852329888, "rouge2_fmeasure_stderr": 0.003186877123824252, "rouge2_precision": 0.10955905558045385, "rouge2_precision_stderr": 0.0038849982073218873, "rouge2_recall": 0.19262377036723086, "rouge2_recall_stderr": 0.003902442560309681, "rougeL_fmeasure": 0.1874319081280966, "rougeL_fmeasure_stderr": 0.0038899168243038715, "rougeL_precision": 0.1744946205047383, "rougeL_precision_stderr": 0.004899868234689971, "rougeL_recall": 0.33190561350734876, "rougeL_recall_stderr": 0.004628476785750535, "rougeLsum_fmeasure": 0.19207381785134825, "rougeLsum_fmeasure_stderr": 0.004003703743860644, "rougeLsum_precision": 0.17972186968155374, "rougeLsum_precision_stderr": 0.005058207025529124, "rougeLsum_recall": 0.33723235220496645, "rougeLsum_recall_stderr": 0.0046889691038068775}}, "4": {"PALM_prompt": {"bleu": 1.020225734756704, "bleu_stderr": 0.06946466420275214, "rouge1_fmeasure": 0.22615455778377724, "rouge1_fmeasure_stderr": 0.004584303113870005, "rouge1_precision": 0.21447559212644274, "rouge1_precision_stderr": 0.005787602630220149, "rouge1_recall": 0.3797811786576114, "rouge1_recall_stderr": 0.005078572769442694, "rouge2_fmeasure": 0.12153309984630654, "rouge2_fmeasure_stderr": 0.0032009564132827184, "rouge2_precision": 0.11802182161980528, "rouge2_precision_stderr": 0.003954053081043237, "rouge2_recall": 0.20495230642568182, "rouge2_recall_stderr": 0.003956396099130858, "rougeL_fmeasure": 0.1985084531863174, "rougeL_fmeasure_stderr": 0.003863020108912202, "rougeL_precision": 0.18574729294363898, "rougeL_precision_stderr": 0.00493965653212434, "rougeL_recall": 0.34661430398431914, "rougeL_recall_stderr": 0.004643977843173891, "rougeLsum_fmeasure": 
0.20493355705865743, "rougeLsum_fmeasure_stderr": 0.004011218572127487, "rougeLsum_precision": 0.19295820334137398, "rougeLsum_precision_stderr": 0.005149923293184489, "rougeLsum_recall": 0.3532599821115898, "rougeLsum_recall_stderr": 0.004685486372918684}}, "5": {"PALM_prompt": {"bleu": 1.1738991703843653, "bleu_stderr": 0.05137420728965795, "rouge1_fmeasure": 0.2452749113827306, "rouge1_fmeasure_stderr": 0.004866057995099286, "rouge1_precision": 0.23842752486609714, "rouge1_precision_stderr": 0.00616558462817229, "rouge1_recall": 0.39104608539227964, "rouge1_recall_stderr": 0.005079097315439687, "rouge2_fmeasure": 0.13673599319945717, "rouge2_fmeasure_stderr": 0.003545382189555846, "rouge2_precision": 0.13743673376990653, "rouge2_precision_stderr": 0.004448566773563913, "rouge2_recall": 0.21617363071605727, "rouge2_recall_stderr": 0.004065117072186819, "rougeL_fmeasure": 0.2145149869812429, "rougeL_fmeasure_stderr": 0.004135695136449003, "rougeL_precision": 0.20669362004199143, "rougeL_precision_stderr": 0.0053303444381744315, "rougeL_recall": 0.35345362943589836, "rougeL_recall_stderr": 0.004581784867877382, "rougeLsum_fmeasure": 0.22169261840373009, "rougeLsum_fmeasure_stderr": 0.004292938565416988, "rougeLsum_precision": 0.21482050441032427, "rougeLsum_precision_stderr": 0.005549487778372521, "rougeLsum_recall": 0.3612051449037077, "rougeLsum_recall_stderr": 0.004657096205531439}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.2204812581094187, "bleu_stderr": 0.0968937530189579, "rouge1_fmeasure": 0.17281704832161807, "rouge1_fmeasure_stderr": 0.0025639705499622615, "rouge1_precision": 0.16222110345712215, "rouge1_precision_stderr": 0.002918181626000938, "rouge1_recall": 0.23556142979563985, "rouge1_recall_stderr": 0.0035092798940514463, "rouge2_fmeasure": 0.04445750772336212, "rouge2_fmeasure_stderr": 0.0010694173040695087, "rouge2_precision": 0.04035916262517482, "rouge2_precision_stderr": 0.0011089852877097363, "rouge2_recall": 0.06187933259063034, 
"rouge2_recall_stderr": 0.0015927286355937283, "rougeL_fmeasure": 0.12828595447707558, "rougeL_fmeasure_stderr": 0.0018630698897748783, "rougeL_precision": 0.12115800294799915, "rougeL_precision_stderr": 0.0023302786580353405, "rougeL_recall": 0.1793437869920672, "rougeL_recall_stderr": 0.0027904364481447434, "rougeLsum_fmeasure": 0.16125234602130137, "rougeLsum_fmeasure_stderr": 0.0024054029356366125, "rougeLsum_precision": 0.15193645088089489, "rougeLsum_precision_stderr": 0.002794596188541928, "rougeLsum_recall": 0.22017606445792967, "rougeLsum_recall_stderr": 0.003307741344782461}}, "1": {"tldr_en": {"bleu": 3.030764799858383, "bleu_stderr": 0.10180137949258324, "rouge1_fmeasure": 0.20057722026752264, "rouge1_fmeasure_stderr": 0.00237727127672953, "rouge1_precision": 0.24835405674131716, "rouge1_precision_stderr": 0.003595946507957674, "rouge1_recall": 0.22065222520601965, "rouge1_recall_stderr": 0.0030326234032250735, "rouge2_fmeasure": 0.053780768787129375, "rouge2_fmeasure_stderr": 0.0012652221510456923, "rouge2_precision": 0.07072572329707479, "rouge2_precision_stderr": 0.002023483136430399, "rouge2_recall": 0.05888749376061464, "rouge2_recall_stderr": 0.0015271270172799948, "rougeL_fmeasure": 0.15099195793433165, "rougeL_fmeasure_stderr": 0.0017939149386485946, "rougeL_precision": 0.18962847615836217, "rougeL_precision_stderr": 0.0029056148560385174, "rougeL_recall": 0.16669593752772216, "rougeL_recall_stderr": 0.0023452388157941375, "rougeLsum_fmeasure": 0.18619742012806825, "rougeLsum_fmeasure_stderr": 0.0022080861119325576, "rougeLsum_precision": 0.23119707647225618, "rougeLsum_precision_stderr": 0.0034029730598757074, "rougeLsum_recall": 0.20514467867088854, "rougeLsum_recall_stderr": 0.0028358663517956315}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.8080075016885704, "bleu_stderr": 0.061027545228237855, "rouge1_fmeasure": 0.17861979049522314, "rouge1_fmeasure_stderr": 0.0018002125016461491, "rouge1_precision": 
0.1356910382997071, "rouge1_precision_stderr": 0.0015410643240858764, "rouge1_recall": 0.2778453886224831, "rouge1_recall_stderr": 0.0023539268248121705, "rouge2_fmeasure": 0.046215425350075, "rouge2_fmeasure_stderr": 0.0010566397123776003, "rouge2_precision": 0.03490809623131593, "rouge2_precision_stderr": 0.0008357898041576343, "rouge2_recall": 0.07284634942140401, "rouge2_recall_stderr": 0.0015701168954429102, "rougeL_fmeasure": 0.16201597249905292, "rougeL_fmeasure_stderr": 0.0013971090679042123, "rougeL_precision": 0.12227403244327856, "rougeL_precision_stderr": 0.0011855454993101654, "rougeL_recall": 0.2551033251187225, "rougeL_recall_stderr": 0.0019031145709281366, "rougeLsum_fmeasure": 0.14908466648922475, "rougeLsum_fmeasure_stderr": 0.0016626826909044133, "rougeLsum_precision": 0.113242028542405, "rougeLsum_precision_stderr": 0.0013998996791185847, "rougeLsum_recall": 0.23203443734017606, "rougeLsum_recall_stderr": 0.0022390418934348437}}, "1": {"generate_text_restaurant": {"bleu": 12.173849341913149, "bleu_stderr": 0.0669092468478733, "rouge1_fmeasure": 0.48129644782721304, "rouge1_fmeasure_stderr": 0.0023219173855042303, "rouge1_precision": 0.5970385739901486, "rouge1_precision_stderr": 0.003272991717133695, "rouge1_recall": 0.44205108840831364, "rouge1_recall_stderr": 0.002990943407358534, "rouge2_fmeasure": 0.23032000484043905, "rouge2_fmeasure_stderr": 0.0021147211593944798, "rouge2_precision": 0.2911262146867607, "rouge2_precision_stderr": 0.002892523328423528, "rouge2_recall": 0.2107609477318019, "rouge2_recall_stderr": 0.0022166981158979714, "rougeL_fmeasure": 0.34966965256911114, "rougeL_fmeasure_stderr": 0.002108342891501849, "rougeL_precision": 0.4377488624210833, "rougeL_precision_stderr": 0.003110832463080634, "rougeL_recall": 0.3198170788492075, "rougeL_recall_stderr": 0.002444190246186975, "rougeLsum_fmeasure": 0.39331173897680705, "rougeLsum_fmeasure_stderr": 0.0023364505709058668, "rougeLsum_precision": 0.48957738460024036, 
"rougeLsum_precision_stderr": 0.00327157474691632, "rougeLsum_recall": 0.36068779636155734, "rougeLsum_recall_stderr": 0.0027524175233159713}}, "2": {"generate_text_restaurant": {"bleu": 14.360624153571306, "bleu_stderr": 0.15724700369597677, "rouge1_fmeasure": 0.5041317140273615, "rouge1_fmeasure_stderr": 0.0022753134233410278, "rouge1_precision": 0.6032331597687249, "rouge1_precision_stderr": 0.003142877448057622, "rouge1_recall": 0.4697575589134947, "rouge1_recall_stderr": 0.0029526957130554953, "rouge2_fmeasure": 0.25374280478792693, "rouge2_fmeasure_stderr": 0.002169411661246531, "rouge2_precision": 0.30851618494266825, "rouge2_precision_stderr": 0.002855313490834544, "rouge2_recall": 0.23616858770973606, "rouge2_recall_stderr": 0.002325964607821345, "rougeL_fmeasure": 0.3721296496966484, "rougeL_fmeasure_stderr": 0.002164018647000424, "rougeL_precision": 0.4477614821909179, "rougeL_precision_stderr": 0.003011118677636196, "rougeL_recall": 0.3459816794847227, "rougeL_recall_stderr": 0.002532543665969986, "rougeLsum_fmeasure": 0.42160904040581204, "rougeLsum_fmeasure_stderr": 0.002384317338752149, "rougeLsum_precision": 0.5054563233222107, "rougeLsum_precision_stderr": 0.0032185989379280733, "rougeLsum_recall": 0.39233324008569126, "rougeLsum_recall_stderr": 0.0027964602331100393}}, "3": {"generate_text_restaurant": {"bleu": 15.173778916293744, "bleu_stderr": 0.24020125390484678, "rouge1_fmeasure": 0.5122282123885756, "rouge1_fmeasure_stderr": 0.0022794016747342454, "rouge1_precision": 0.6038782195503243, "rouge1_precision_stderr": 0.003130218028564903, "rouge1_recall": 0.4797961783746489, "rouge1_recall_stderr": 0.0029264432136497623, "rouge2_fmeasure": 0.2611392423531194, "rouge2_fmeasure_stderr": 0.0021928684064362373, "rouge2_precision": 0.31128306325561933, "rouge2_precision_stderr": 0.0028044649859898306, "rouge2_recall": 0.2447494516430458, "rouge2_recall_stderr": 0.002368198668181156, "rougeL_fmeasure": 0.37928627551530325, "rougeL_fmeasure_stderr": 
0.0021871071072142663, "rougeL_precision": 0.44868202075612884, "rougeL_precision_stderr": 0.0029681578589764264, "rougeL_recall": 0.35482043982343436, "rougeL_recall_stderr": 0.002532913929369527, "rougeLsum_fmeasure": 0.43123333798993546, "rougeLsum_fmeasure_stderr": 0.002401597688343299, "rougeLsum_precision": 0.5087702457406414, "rougeLsum_precision_stderr": 0.0031818356512290486, "rougeLsum_recall": 0.4036850753818513, "rougeLsum_recall_stderr": 0.0028061971662930287}}, "4": {"generate_text_restaurant": {"bleu": 15.78836334652692, "bleu_stderr": 0.16277087917874616, "rouge1_fmeasure": 0.5201963806927633, "rouge1_fmeasure_stderr": 0.0022664890545995124, "rouge1_precision": 0.6052306848983445, "rouge1_precision_stderr": 0.0031205758451821283, "rouge1_recall": 0.49015291142822803, "rouge1_recall_stderr": 0.002870806275113502, "rouge2_fmeasure": 0.2653610425045864, "rouge2_fmeasure_stderr": 0.0021838284720614534, "rouge2_precision": 0.31194428367362853, "rouge2_precision_stderr": 0.0027609843407215874, "rouge2_recall": 0.2500276639929066, "rouge2_recall_stderr": 0.0023425058959290804, "rougeL_fmeasure": 0.3838642484464721, "rougeL_fmeasure_stderr": 0.002185515405072229, "rougeL_precision": 0.4475978408946653, "rougeL_precision_stderr": 0.002901824428209634, "rougeL_recall": 0.36147111146407906, "rougeL_recall_stderr": 0.002523200720958866, "rougeLsum_fmeasure": 0.43919378367441203, "rougeLsum_fmeasure_stderr": 0.002408659068024605, "rougeLsum_precision": 0.5110579537103325, "rougeLsum_precision_stderr": 0.0031404148777386892, "rougeLsum_recall": 0.4135967411759738, "rougeLsum_recall_stderr": 0.0027946622087117543}}, "5": {"generate_text_restaurant": {"bleu": 15.879323420491742, "bleu_stderr": 0.1522927233333457, "rouge1_fmeasure": 0.5216373530710218, "rouge1_fmeasure_stderr": 0.002262224044659468, "rouge1_precision": 0.6064323245894362, "rouge1_precision_stderr": 0.00315510278665659, "rouge1_recall": 0.49013366730559393, "rouge1_recall_stderr": 
0.0028136604439425025, "rouge2_fmeasure": 0.26786622625063644, "rouge2_fmeasure_stderr": 0.002208434520877025, "rouge2_precision": 0.3150817079641561, "rouge2_precision_stderr": 0.002829192112850875, "rouge2_recall": 0.25132514912770915, "rouge2_recall_stderr": 0.002318826430015561, "rougeL_fmeasure": 0.38681699896646954, "rougeL_fmeasure_stderr": 0.002220989687079745, "rougeL_precision": 0.45023806138186195, "rougeL_precision_stderr": 0.002949168521598567, "rougeL_recall": 0.36355539000425124, "rougeL_recall_stderr": 0.002526510160020316, "rougeLsum_fmeasure": 0.4417889417099761, "rougeLsum_fmeasure_stderr": 0.0024158193931717187, "rougeLsum_precision": 0.5137337228447252, "rougeLsum_precision_stderr": 0.0031936836223124105, "rougeLsum_recall": 0.41496788459604567, "rougeLsum_recall_stderr": 0.0027605438893900454}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9472662234201847, "bleu_stderr": 0.0954010664780363, "rouge1_fmeasure": 0.20062931818668506, "rouge1_fmeasure_stderr": 0.002892087092668822, "rouge1_precision": 0.1512189693389312, "rouge1_precision_stderr": 0.002590198483449934, "rouge1_recall": 0.32951007687075756, "rouge1_recall_stderr": 0.004923756202298708, "rouge2_fmeasure": 0.044133865209903714, "rouge2_fmeasure_stderr": 0.0015693045278532822, "rouge2_precision": 0.03240083882273609, "rouge2_precision_stderr": 0.0011887319293568278, "rouge2_recall": 0.07461053739480651, "rouge2_recall_stderr": 0.0027791525130900266, "rougeL_fmeasure": 0.14533977808404885, "rougeL_fmeasure_stderr": 0.0021630231020242576, "rougeL_precision": 0.11056422701824653, "rougeL_precision_stderr": 0.0021853162208514175, "rougeL_recall": 0.23921769394978765, "rougeL_recall_stderr": 0.003783594428094317, "rougeLsum_fmeasure": 0.15912733837926105, "rougeLsum_fmeasure_stderr": 0.0023937408312475765, "rougeLsum_precision": 0.12055829236568064, "rougeLsum_precision_stderr": 0.002297890666943901, "rougeLsum_recall": 0.2623026232832005, "rougeLsum_recall_stderr": 
0.004182995633448934}}, "1": {"article_DOC_summary": {"bleu": 2.857997728240847, "bleu_stderr": 0.15132505328438867, "rouge1_fmeasure": 0.24058051563117627, "rouge1_fmeasure_stderr": 0.003619950901783304, "rouge1_precision": 0.2431489069099882, "rouge1_precision_stderr": 0.004319149380207122, "rouge1_recall": 0.27274551008869324, "rouge1_recall_stderr": 0.004181125297952178, "rouge2_fmeasure": 0.05891708042714147, "rouge2_fmeasure_stderr": 0.002369248568269019, "rouge2_precision": 0.06156338993755893, "rouge2_precision_stderr": 0.0027090151794028584, "rouge2_recall": 0.06564279411701872, "rouge2_recall_stderr": 0.0026141410795920464, "rougeL_fmeasure": 0.1845786738464429, "rougeL_fmeasure_stderr": 0.0030093215424524295, "rougeL_precision": 0.18703509762652787, "rougeL_precision_stderr": 0.0036036431129194833, "rougeL_recall": 0.20956010556672522, "rougeL_recall_stderr": 0.00345405751329193, "rougeLsum_fmeasure": 0.18768186260751757, "rougeLsum_fmeasure_stderr": 0.003017598467168456, "rougeLsum_precision": 0.18954846036997042, "rougeLsum_precision_stderr": 0.0035941447605698394, "rougeLsum_recall": 0.21418956311713871, "rougeLsum_recall_stderr": 0.0035468160066617596}}, "2": {"article_DOC_summary": {"bleu": 3.19195992073562, "bleu_stderr": 0.2067955639958893, "rouge1_fmeasure": 0.24657160673903905, "rouge1_fmeasure_stderr": 0.0037739176215048444, "rouge1_precision": 0.25234682823435783, "rouge1_precision_stderr": 0.004406932928762655, "rouge1_recall": 0.26739840698638306, "rouge1_recall_stderr": 0.0039999072862688684, "rouge2_fmeasure": 0.06271701269177109, "rouge2_fmeasure_stderr": 0.0025663699767628305, "rouge2_precision": 0.06567480327233229, "rouge2_precision_stderr": 0.0027842377535723913, "rouge2_recall": 0.06599412859644187, "rouge2_recall_stderr": 0.0026775951767211374, "rougeL_fmeasure": 0.1876781101224112, "rougeL_fmeasure_stderr": 0.0031437463539169125, "rougeL_precision": 0.1921361411782326, "rougeL_precision_stderr": 0.0036263536814562453, 
"rougeL_recall": 0.20386135964058816, "rougeL_recall_stderr": 0.0033147209011893724, "rougeLsum_fmeasure": 0.1896272533115015, "rougeLsum_fmeasure_stderr": 0.0031550558124475133, "rougeLsum_precision": 0.19374913452852363, "rougeLsum_precision_stderr": 0.0036251531161871544, "rougeLsum_recall": 0.20676693183261305, "rougeLsum_recall_stderr": 0.0034014354284824795}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33618386090404295, "bleu_stderr": 0.043286547374004225, "rouge1_fmeasure": 0.09550360928926843, "rouge1_fmeasure_stderr": 0.0021611192824620468, "rouge1_precision": 0.07608115633976532, "rouge1_precision_stderr": 0.0028424335866037475, "rouge1_recall": 0.243734587530279, "rouge1_recall_stderr": 0.005032751672173048, "rouge2_fmeasure": 0.04353421604632926, "rouge2_fmeasure_stderr": 0.0013022741040944032, "rouge2_precision": 0.03391274645846755, "rouge2_precision_stderr": 0.0017782441786429572, "rouge2_recall": 0.11477361563470867, "rouge2_recall_stderr": 0.0030177112931945443, "rougeL_fmeasure": 0.0908905716888711, "rougeL_fmeasure_stderr": 0.0019748576875776485, "rougeL_precision": 0.07249954185930807, "rougeL_precision_stderr": 0.002716866763111571, "rougeL_recall": 0.23447583587320983, "rougeL_recall_stderr": 0.004820896551192642, "rougeLsum_fmeasure": 0.09063319866429098, "rougeLsum_fmeasure_stderr": 0.002018782468480028, "rougeLsum_precision": 0.07260565563718514, "rougeLsum_precision_stderr": 0.0027622865524293628, "rougeLsum_recall": 0.2312007323793673, "rougeLsum_recall_stderr": 0.004666750865618609}}, "1": {"PALM_prompt": {"bleu": 0.5152510207869508, "bleu_stderr": 0.039697768768641464, "rouge1_fmeasure": 0.15419793855600986, "rouge1_fmeasure_stderr": 0.0037752055082234935, "rouge1_precision": 0.1335251141437586, "rouge1_precision_stderr": 0.004421796872864802, "rouge1_recall": 0.3040669240102272, "rouge1_recall_stderr": 0.0049926465687368735, "rouge2_fmeasure": 0.07820906828691397, "rouge2_fmeasure_stderr": 
0.0026222161094111586, "rouge2_precision": 0.06857339375782437, "rouge2_precision_stderr": 0.0030273264083127568, "rouge2_recall": 0.1555986112064758, "rouge2_recall_stderr": 0.0036341014671041087, "rougeL_fmeasure": 0.13965714245032726, "rougeL_fmeasure_stderr": 0.0032487392690607572, "rougeL_precision": 0.1195649771956543, "rougeL_precision_stderr": 0.0038518942294089972, "rougeL_recall": 0.2838951533263151, "rougeL_recall_stderr": 0.004599064364517073, "rougeLsum_fmeasure": 0.14150307262693815, "rougeLsum_fmeasure_stderr": 0.0033007395324502467, "rougeLsum_precision": 0.12140487798820795, "rougeLsum_precision_stderr": 0.003915984273012587, "rougeLsum_recall": 0.2857833042522088, "rougeLsum_recall_stderr": 0.004601461544101827}}, "2": {"PALM_prompt": {"bleu": 0.6450721424031474, "bleu_stderr": 0.035938592978876936, "rouge1_fmeasure": 0.19228537050298922, "rouge1_fmeasure_stderr": 0.004356183491682661, "rouge1_precision": 0.17715496451419446, "rouge1_precision_stderr": 0.005377686826508908, "rouge1_recall": 0.3442204927205235, "rouge1_recall_stderr": 0.00499209962075588, "rouge2_fmeasure": 0.1017490401630499, "rouge2_fmeasure_stderr": 0.003080557280075834, "rouge2_precision": 0.09659406582034752, "rouge2_precision_stderr": 0.0036720254666308344, "rouge2_recall": 0.1816245156374427, "rouge2_recall_stderr": 0.0038421174433974025, "rougeL_fmeasure": 0.17158392384169643, "rougeL_fmeasure_stderr": 0.0037128302671958994, "rougeL_precision": 0.15519367504713033, "rougeL_precision_stderr": 0.004580706449804959, "rougeL_recall": 0.31893881029765586, "rougeL_recall_stderr": 0.004597766706728441, "rougeLsum_fmeasure": 0.17498254398589475, "rougeLsum_fmeasure_stderr": 0.0037985612156152086, "rougeLsum_precision": 0.1592800915810083, "rougeLsum_precision_stderr": 0.0047149524911017215, "rougeLsum_recall": 0.3221825821541532, "rougeLsum_recall_stderr": 0.0046252310161258746}}, "3": {"PALM_prompt": {"bleu": 0.8785043636862591, "bleu_stderr": 0.02652000083973547, 
"rouge1_fmeasure": 0.21217682060449367, "rouge1_fmeasure_stderr": 0.004581243352642801, "rouge1_precision": 0.2002191357399255, "rouge1_precision_stderr": 0.005700191155087746, "rouge1_recall": 0.3622650735119951, "rouge1_recall_stderr": 0.005105318033672305, "rouge2_fmeasure": 0.11312048852329888, "rouge2_fmeasure_stderr": 0.003186877123824252, "rouge2_precision": 0.10955905558045385, "rouge2_precision_stderr": 0.0038849982073218873, "rouge2_recall": 0.19262377036723086, "rouge2_recall_stderr": 0.003902442560309681, "rougeL_fmeasure": 0.1874319081280966, "rougeL_fmeasure_stderr": 0.0038899168243038715, "rougeL_precision": 0.1744946205047383, "rougeL_precision_stderr": 0.004899868234689971, "rougeL_recall": 0.33190561350734876, "rougeL_recall_stderr": 0.004628476785750535, "rougeLsum_fmeasure": 0.19207381785134825, "rougeLsum_fmeasure_stderr": 0.004003703743860644, "rougeLsum_precision": 0.17972186968155374, "rougeLsum_precision_stderr": 0.005058207025529124, "rougeLsum_recall": 0.33723235220496645, "rougeLsum_recall_stderr": 0.0046889691038068775}}, "4": {"PALM_prompt": {"bleu": 1.020225734756704, "bleu_stderr": 0.06946466420275214, "rouge1_fmeasure": 0.22615455778377724, "rouge1_fmeasure_stderr": 0.004584303113870005, "rouge1_precision": 0.21447559212644274, "rouge1_precision_stderr": 0.005787602630220149, "rouge1_recall": 0.3797811786576114, "rouge1_recall_stderr": 0.005078572769442694, "rouge2_fmeasure": 0.12153309984630654, "rouge2_fmeasure_stderr": 0.0032009564132827184, "rouge2_precision": 0.11802182161980528, "rouge2_precision_stderr": 0.003954053081043237, "rouge2_recall": 0.20495230642568182, "rouge2_recall_stderr": 0.003956396099130858, "rougeL_fmeasure": 0.1985084531863174, "rougeL_fmeasure_stderr": 0.003863020108912202, "rougeL_precision": 0.18574729294363898, "rougeL_precision_stderr": 0.00493965653212434, "rougeL_recall": 0.34661430398431914, "rougeL_recall_stderr": 0.004643977843173891, "rougeLsum_fmeasure": 0.20493355705865743, 
"rougeLsum_fmeasure_stderr": 0.004011218572127487, "rougeLsum_precision": 0.19295820334137398, "rougeLsum_precision_stderr": 0.005149923293184489, "rougeLsum_recall": 0.3532599821115898, "rougeLsum_recall_stderr": 0.004685486372918684}}, "5": {"PALM_prompt": {"bleu": 1.1738991703843653, "bleu_stderr": 0.05137420728965795, "rouge1_fmeasure": 0.2452749113827306, "rouge1_fmeasure_stderr": 0.004866057995099286, "rouge1_precision": 0.23842752486609714, "rouge1_precision_stderr": 0.00616558462817229, "rouge1_recall": 0.39104608539227964, "rouge1_recall_stderr": 0.005079097315439687, "rouge2_fmeasure": 0.13673599319945717, "rouge2_fmeasure_stderr": 0.003545382189555846, "rouge2_precision": 0.13743673376990653, "rouge2_precision_stderr": 0.004448566773563913, "rouge2_recall": 0.21617363071605727, "rouge2_recall_stderr": 0.004065117072186819, "rougeL_fmeasure": 0.2145149869812429, "rougeL_fmeasure_stderr": 0.004135695136449003, "rougeL_precision": 0.20669362004199143, "rougeL_precision_stderr": 0.0053303444381744315, "rougeL_recall": 0.35345362943589836, "rougeL_recall_stderr": 0.004581784867877382, "rougeLsum_fmeasure": 0.22169261840373009, "rougeLsum_fmeasure_stderr": 0.004292938565416988, "rougeLsum_precision": 0.21482050441032427, "rougeLsum_precision_stderr": 0.005549487778372521, "rougeLsum_recall": 0.3612051449037077, "rougeLsum_recall_stderr": 0.004657096205531439}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.2204812581094187, "bleu_stderr": 0.0968937530189579, "rouge1_fmeasure": 0.17281704832161807, "rouge1_fmeasure_stderr": 0.0025639705499622615, "rouge1_precision": 0.16222110345712215, "rouge1_precision_stderr": 0.002918181626000938, "rouge1_recall": 0.23556142979563985, "rouge1_recall_stderr": 0.0035092798940514463, "rouge2_fmeasure": 0.04445750772336212, "rouge2_fmeasure_stderr": 0.0010694173040695087, "rouge2_precision": 0.04035916262517482, "rouge2_precision_stderr": 0.0011089852877097363, "rouge2_recall": 0.06187933259063034, "rouge2_recall_stderr": 
0.0015927286355937283, "rougeL_fmeasure": 0.12828595447707558, "rougeL_fmeasure_stderr": 0.0018630698897748783, "rougeL_precision": 0.12115800294799915, "rougeL_precision_stderr": 0.0023302786580353405, "rougeL_recall": 0.1793437869920672, "rougeL_recall_stderr": 0.0027904364481447434, "rougeLsum_fmeasure": 0.16125234602130137, "rougeLsum_fmeasure_stderr": 0.0024054029356366125, "rougeLsum_precision": 0.15193645088089489, "rougeLsum_precision_stderr": 0.002794596188541928, "rougeLsum_recall": 0.22017606445792967, "rougeLsum_recall_stderr": 0.003307741344782461}}, "1": {"tldr_en": {"bleu": 3.030764799858383, "bleu_stderr": 0.10180137949258324, "rouge1_fmeasure": 0.20057722026752264, "rouge1_fmeasure_stderr": 0.00237727127672953, "rouge1_precision": 0.24835405674131716, "rouge1_precision_stderr": 0.003595946507957674, "rouge1_recall": 0.22065222520601965, "rouge1_recall_stderr": 0.0030326234032250735, "rouge2_fmeasure": 0.053780768787129375, "rouge2_fmeasure_stderr": 0.0012652221510456923, "rouge2_precision": 0.07072572329707479, "rouge2_precision_stderr": 0.002023483136430399, "rouge2_recall": 0.05888749376061464, "rouge2_recall_stderr": 0.0015271270172799948, "rougeL_fmeasure": 0.15099195793433165, "rougeL_fmeasure_stderr": 0.0017939149386485946, "rougeL_precision": 0.18962847615836217, "rougeL_precision_stderr": 0.0029056148560385174, "rougeL_recall": 0.16669593752772216, "rougeL_recall_stderr": 0.0023452388157941375, "rougeLsum_fmeasure": 0.18619742012806825, "rougeLsum_fmeasure_stderr": 0.0022080861119325576, "rougeLsum_precision": 0.23119707647225618, "rougeLsum_precision_stderr": 0.0034029730598757074, "rougeLsum_recall": 0.20514467867088854, "rougeLsum_recall_stderr": 0.0028358663517956315}}, "2": {"tldr_en": {"bleu": 4.074528254683577, "bleu_stderr": 0.10710940903995719, "rouge1_fmeasure": 0.24392276306894944, "rouge1_fmeasure_stderr": 0.0023000547396419086, "rouge1_precision": 0.30679877376441267, "rouge1_precision_stderr": 0.003595563898423582, 
"rouge1_recall": 0.25602196677157735, "rouge1_recall_stderr": 0.002878244091822486, "rouge2_fmeasure": 0.07100355255541753, "rouge2_fmeasure_stderr": 0.0014153910474231026, "rouge2_precision": 0.09387841885638748, "rouge2_precision_stderr": 0.002194124801129067, "rouge2_recall": 0.0740439804559834, "rouge2_recall_stderr": 0.001644608125064704, "rougeL_fmeasure": 0.18445520873996413, "rougeL_fmeasure_stderr": 0.001797621869515743, "rougeL_precision": 0.23412738417408802, "rougeL_precision_stderr": 0.0029250825035858734, "rougeL_recall": 0.19426592228730813, "rougeL_recall_stderr": 0.0023089496783281213, "rougeLsum_fmeasure": 0.2286765314746336, "rougeLsum_fmeasure_stderr": 0.0021801688851002263, "rougeLsum_precision": 0.28859199947210284, "rougeLsum_precision_stderr": 0.003452141980034977, "rougeLsum_recall": 0.23966421160449644, "rougeLsum_recall_stderr": 0.0027125548096589747}}, "3": {"tldr_en": {"bleu": 3.168617248016535, "bleu_stderr": 0.08041979746938775, "rouge1_fmeasure": 0.2094195871488414, "rouge1_fmeasure_stderr": 0.0027068429969493734, "rouge1_precision": 0.272311629129121, "rouge1_precision_stderr": 0.0039505188546921826, "rouge1_recall": 0.21816270627496134, "rouge1_recall_stderr": 0.0032448735585025937, "rouge2_fmeasure": 0.06204427812400913, "rouge2_fmeasure_stderr": 0.0014401113659882462, "rouge2_precision": 0.08329439938013247, "rouge2_precision_stderr": 0.0021782819294409984, "rouge2_recall": 0.06480971019253767, "rouge2_recall_stderr": 0.0016827399941316222, "rougeL_fmeasure": 0.1590668148316741, "rougeL_fmeasure_stderr": 0.0020766324795938023, "rougeL_precision": 0.20896331681491714, "rougeL_precision_stderr": 0.0031500865399556595, "rougeL_recall": 0.1664625275086072, "rougeL_recall_stderr": 0.0025592493419649177, "rougeLsum_fmeasure": 0.1964756114951262, "rougeLsum_fmeasure_stderr": 0.0025500807038816515, "rougeLsum_precision": 0.2563037368454477, "rougeLsum_precision_stderr": 0.0037619221351167056, "rougeLsum_recall": 0.20447650361202882, 
"rougeLsum_recall_stderr": 0.0030540843932082493}}, "4": {"tldr_en": {"bleu": 0.07788444152267485, "bleu_stderr": 0.01315525656911281, "rouge1_fmeasure": 0.06757697669893872, "rouge1_fmeasure_stderr": 0.0023707490073172043, "rouge1_precision": 0.0910819200952716, "rouge1_precision_stderr": 0.0033747683121654196, "rouge1_recall": 0.07028567198192245, "rouge1_recall_stderr": 0.002640964970940966, "rouge2_fmeasure": 0.019948424307073904, "rouge2_fmeasure_stderr": 0.000986170441488306, "rouge2_precision": 0.028646478253466095, "rouge2_precision_stderr": 0.0016076285180980135, "rouge2_recall": 0.020868526921797886, "rouge2_recall_stderr": 0.0011537893245677335, "rougeL_fmeasure": 0.05201990108593777, "rougeL_fmeasure_stderr": 0.0018314763879696625, "rougeL_precision": 0.07082578661014885, "rougeL_precision_stderr": 0.002691634510208164, "rougeL_recall": 0.05448506471222756, "rougeL_recall_stderr": 0.0020854622601667937, "rougeLsum_fmeasure": 0.06308719820573185, "rougeLsum_fmeasure_stderr": 0.002213451741392458, "rougeLsum_precision": 0.08551950005220435, "rougeLsum_precision_stderr": 0.0031983492312005384, "rougeLsum_recall": 0.06543250975997009, "rougeLsum_recall_stderr": 0.0024532813023277916}}, "5": {"tldr_en": {"bleu": 1.4470192084599072e-15, "bleu_stderr": 5.1547920765171173e-14, "rouge1_fmeasure": 0.010473201550267177, "rouge1_fmeasure_stderr": 0.0010554421691169314, "rouge1_precision": 0.014515441910190443, "rouge1_precision_stderr": 0.001501466217296952, "rouge1_recall": 0.011301943813768834, "rouge1_recall_stderr": 0.0012288119306628567, "rouge2_fmeasure": 0.0035604598673033626, "rouge2_fmeasure_stderr": 0.0004625418185814249, "rouge2_precision": 0.004591980814271401, "rouge2_precision_stderr": 0.0006515529824929627, "rouge2_recall": 0.00416888931272546, "rouge2_recall_stderr": 0.0005930107839352501, "rougeL_fmeasure": 0.008371164394493067, "rougeL_fmeasure_stderr": 0.0008468942819407445, "rougeL_precision": 0.011779972055249343, "rougeL_precision_stderr": 
0.0012586537746088814, "rougeL_recall": 0.009040049894954919, "rougeL_recall_stderr": 0.0009873007391796482, "rougeLsum_fmeasure": 0.009841875351812794, "rougeLsum_fmeasure_stderr": 0.000994934070773035, "rougeLsum_precision": 0.013708349668480058, "rougeLsum_precision_stderr": 0.0014293611241212706, "rougeLsum_recall": 0.010579263673799228, "rougeLsum_recall_stderr": 0.0011521180305532216}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.8080075016885704, "bleu_stderr": 0.061027545228237855, "rouge1_fmeasure": 0.17861979049522314, "rouge1_fmeasure_stderr": 0.0018002125016461491, "rouge1_precision": 0.1356910382997071, "rouge1_precision_stderr": 0.0015410643240858764, "rouge1_recall": 0.2778453886224831, "rouge1_recall_stderr": 0.0023539268248121705, "rouge2_fmeasure": 0.046215425350075, "rouge2_fmeasure_stderr": 0.0010566397123776003, "rouge2_precision": 0.03490809623131593, "rouge2_precision_stderr": 0.0008357898041576343, "rouge2_recall": 0.07284634942140401, "rouge2_recall_stderr": 0.0015701168954429102, "rougeL_fmeasure": 0.16201597249905292, "rougeL_fmeasure_stderr": 0.0013971090679042123, "rougeL_precision": 0.12227403244327856, "rougeL_precision_stderr": 0.0011855454993101654, "rougeL_recall": 0.2551033251187225, "rougeL_recall_stderr": 0.0019031145709281366, "rougeLsum_fmeasure": 0.14908466648922475, "rougeLsum_fmeasure_stderr": 0.0016626826909044133, "rougeLsum_precision": 0.113242028542405, "rougeLsum_precision_stderr": 0.0013998996791185847, "rougeLsum_recall": 0.23203443734017606, "rougeLsum_recall_stderr": 0.0022390418934348437}}, "1": {"generate_text_restaurant": {"bleu": 12.173849341913149, "bleu_stderr": 0.0669092468478733, "rouge1_fmeasure": 0.48129644782721304, "rouge1_fmeasure_stderr": 0.0023219173855042303, "rouge1_precision": 0.5970385739901486, "rouge1_precision_stderr": 0.003272991717133695, "rouge1_recall": 0.44205108840831364, "rouge1_recall_stderr": 0.002990943407358534, "rouge2_fmeasure": 0.23032000484043905, 
"rouge2_fmeasure_stderr": 0.0021147211593944798, "rouge2_precision": 0.2911262146867607, "rouge2_precision_stderr": 0.002892523328423528, "rouge2_recall": 0.2107609477318019, "rouge2_recall_stderr": 0.0022166981158979714, "rougeL_fmeasure": 0.34966965256911114, "rougeL_fmeasure_stderr": 0.002108342891501849, "rougeL_precision": 0.4377488624210833, "rougeL_precision_stderr": 0.003110832463080634, "rougeL_recall": 0.3198170788492075, "rougeL_recall_stderr": 0.002444190246186975, "rougeLsum_fmeasure": 0.39331173897680705, "rougeLsum_fmeasure_stderr": 0.0023364505709058668, "rougeLsum_precision": 0.48957738460024036, "rougeLsum_precision_stderr": 0.00327157474691632, "rougeLsum_recall": 0.36068779636155734, "rougeLsum_recall_stderr": 0.0027524175233159713}}, "2": {"generate_text_restaurant": {"bleu": 14.360624153571306, "bleu_stderr": 0.15724700369597677, "rouge1_fmeasure": 0.5041317140273615, "rouge1_fmeasure_stderr": 0.0022753134233410278, "rouge1_precision": 0.6032331597687249, "rouge1_precision_stderr": 0.003142877448057622, "rouge1_recall": 0.4697575589134947, "rouge1_recall_stderr": 0.0029526957130554953, "rouge2_fmeasure": 0.25374280478792693, "rouge2_fmeasure_stderr": 0.002169411661246531, "rouge2_precision": 0.30851618494266825, "rouge2_precision_stderr": 0.002855313490834544, "rouge2_recall": 0.23616858770973606, "rouge2_recall_stderr": 0.002325964607821345, "rougeL_fmeasure": 0.3721296496966484, "rougeL_fmeasure_stderr": 0.002164018647000424, "rougeL_precision": 0.4477614821909179, "rougeL_precision_stderr": 0.003011118677636196, "rougeL_recall": 0.3459816794847227, "rougeL_recall_stderr": 0.002532543665969986, "rougeLsum_fmeasure": 0.42160904040581204, "rougeLsum_fmeasure_stderr": 0.002384317338752149, "rougeLsum_precision": 0.5054563233222107, "rougeLsum_precision_stderr": 0.0032185989379280733, "rougeLsum_recall": 0.39233324008569126, "rougeLsum_recall_stderr": 0.0027964602331100393}}, "3": {"generate_text_restaurant": {"bleu": 15.173778916293744, 
"bleu_stderr": 0.24020125390484678, "rouge1_fmeasure": 0.5122282123885756, "rouge1_fmeasure_stderr": 0.0022794016747342454, "rouge1_precision": 0.6038782195503243, "rouge1_precision_stderr": 0.003130218028564903, "rouge1_recall": 0.4797961783746489, "rouge1_recall_stderr": 0.0029264432136497623, "rouge2_fmeasure": 0.2611392423531194, "rouge2_fmeasure_stderr": 0.0021928684064362373, "rouge2_precision": 0.31128306325561933, "rouge2_precision_stderr": 0.0028044649859898306, "rouge2_recall": 0.2447494516430458, "rouge2_recall_stderr": 0.002368198668181156, "rougeL_fmeasure": 0.37928627551530325, "rougeL_fmeasure_stderr": 0.0021871071072142663, "rougeL_precision": 0.44868202075612884, "rougeL_precision_stderr": 0.0029681578589764264, "rougeL_recall": 0.35482043982343436, "rougeL_recall_stderr": 0.002532913929369527, "rougeLsum_fmeasure": 0.43123333798993546, "rougeLsum_fmeasure_stderr": 0.002401597688343299, "rougeLsum_precision": 0.5087702457406414, "rougeLsum_precision_stderr": 0.0031818356512290486, "rougeLsum_recall": 0.4036850753818513, "rougeLsum_recall_stderr": 0.0028061971662930287}}, "4": {"generate_text_restaurant": {"bleu": 15.78836334652692, "bleu_stderr": 0.16277087917874616, "rouge1_fmeasure": 0.5201963806927633, "rouge1_fmeasure_stderr": 0.0022664890545995124, "rouge1_precision": 0.6052306848983445, "rouge1_precision_stderr": 0.0031205758451821283, "rouge1_recall": 0.49015291142822803, "rouge1_recall_stderr": 0.002870806275113502, "rouge2_fmeasure": 0.2653610425045864, "rouge2_fmeasure_stderr": 0.0021838284720614534, "rouge2_precision": 0.31194428367362853, "rouge2_precision_stderr": 0.0027609843407215874, "rouge2_recall": 0.2500276639929066, "rouge2_recall_stderr": 0.0023425058959290804, "rougeL_fmeasure": 0.3838642484464721, "rougeL_fmeasure_stderr": 0.002185515405072229, "rougeL_precision": 0.4475978408946653, "rougeL_precision_stderr": 0.002901824428209634, "rougeL_recall": 0.36147111146407906, "rougeL_recall_stderr": 0.002523200720958866, 
"rougeLsum_fmeasure": 0.43919378367441203, "rougeLsum_fmeasure_stderr": 0.002408659068024605, "rougeLsum_precision": 0.5110579537103325, "rougeLsum_precision_stderr": 0.0031404148777386892, "rougeLsum_recall": 0.4135967411759738, "rougeLsum_recall_stderr": 0.0027946622087117543}}, "5": {"generate_text_restaurant": {"bleu": 15.879323420491742, "bleu_stderr": 0.1522927233333457, "rouge1_fmeasure": 0.5216373530710218, "rouge1_fmeasure_stderr": 0.002262224044659468, "rouge1_precision": 0.6064323245894362, "rouge1_precision_stderr": 0.00315510278665659, "rouge1_recall": 0.49013366730559393, "rouge1_recall_stderr": 0.0028136604439425025, "rouge2_fmeasure": 0.26786622625063644, "rouge2_fmeasure_stderr": 0.002208434520877025, "rouge2_precision": 0.3150817079641561, "rouge2_precision_stderr": 0.002829192112850875, "rouge2_recall": 0.25132514912770915, "rouge2_recall_stderr": 0.002318826430015561, "rougeL_fmeasure": 0.38681699896646954, "rougeL_fmeasure_stderr": 0.002220989687079745, "rougeL_precision": 0.45023806138186195, "rougeL_precision_stderr": 0.002949168521598567, "rougeL_recall": 0.36355539000425124, "rougeL_recall_stderr": 0.002526510160020316, "rougeLsum_fmeasure": 0.4417889417099761, "rougeLsum_fmeasure_stderr": 0.0024158193931717187, "rougeLsum_precision": 0.5137337228447252, "rougeLsum_precision_stderr": 0.0031936836223124105, "rougeLsum_recall": 0.41496788459604567, "rougeLsum_recall_stderr": 0.0027605438893900454}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9472662234201847, "bleu_stderr": 0.0954010664780363, "rouge1_fmeasure": 0.20062931818668506, "rouge1_fmeasure_stderr": 0.002892087092668822, "rouge1_precision": 0.1512189693389312, "rouge1_precision_stderr": 0.002590198483449934, "rouge1_recall": 0.32951007687075756, "rouge1_recall_stderr": 0.004923756202298708, "rouge2_fmeasure": 0.044133865209903714, "rouge2_fmeasure_stderr": 0.0015693045278532822, "rouge2_precision": 0.03240083882273609, "rouge2_precision_stderr": 0.0011887319293568278, 
"rouge2_recall": 0.07461053739480651, "rouge2_recall_stderr": 0.0027791525130900266, "rougeL_fmeasure": 0.14533977808404885, "rougeL_fmeasure_stderr": 0.0021630231020242576, "rougeL_precision": 0.11056422701824653, "rougeL_precision_stderr": 0.0021853162208514175, "rougeL_recall": 0.23921769394978765, "rougeL_recall_stderr": 0.003783594428094317, "rougeLsum_fmeasure": 0.15912733837926105, "rougeLsum_fmeasure_stderr": 0.0023937408312475765, "rougeLsum_precision": 0.12055829236568064, "rougeLsum_precision_stderr": 0.002297890666943901, "rougeLsum_recall": 0.2623026232832005, "rougeLsum_recall_stderr": 0.004182995633448934}}, "1": {"article_DOC_summary": {"bleu": 2.857997728240847, "bleu_stderr": 0.15132505328438867, "rouge1_fmeasure": 0.24058051563117627, "rouge1_fmeasure_stderr": 0.003619950901783304, "rouge1_precision": 0.2431489069099882, "rouge1_precision_stderr": 0.004319149380207122, "rouge1_recall": 0.27274551008869324, "rouge1_recall_stderr": 0.004181125297952178, "rouge2_fmeasure": 0.05891708042714147, "rouge2_fmeasure_stderr": 0.002369248568269019, "rouge2_precision": 0.06156338993755893, "rouge2_precision_stderr": 0.0027090151794028584, "rouge2_recall": 0.06564279411701872, "rouge2_recall_stderr": 0.0026141410795920464, "rougeL_fmeasure": 0.1845786738464429, "rougeL_fmeasure_stderr": 0.0030093215424524295, "rougeL_precision": 0.18703509762652787, "rougeL_precision_stderr": 0.0036036431129194833, "rougeL_recall": 0.20956010556672522, "rougeL_recall_stderr": 0.00345405751329193, "rougeLsum_fmeasure": 0.18768186260751757, "rougeLsum_fmeasure_stderr": 0.003017598467168456, "rougeLsum_precision": 0.18954846036997042, "rougeLsum_precision_stderr": 0.0035941447605698394, "rougeLsum_recall": 0.21418956311713871, "rougeLsum_recall_stderr": 0.0035468160066617596}}, "2": {"article_DOC_summary": {"bleu": 3.19195992073562, "bleu_stderr": 0.2067955639958893, "rouge1_fmeasure": 0.24657160673903905, "rouge1_fmeasure_stderr": 0.0037739176215048444, "rouge1_precision": 
0.25234682823435783, "rouge1_precision_stderr": 0.004406932928762655, "rouge1_recall": 0.26739840698638306, "rouge1_recall_stderr": 0.0039999072862688684, "rouge2_fmeasure": 0.06271701269177109, "rouge2_fmeasure_stderr": 0.0025663699767628305, "rouge2_precision": 0.06567480327233229, "rouge2_precision_stderr": 0.0027842377535723913, "rouge2_recall": 0.06599412859644187, "rouge2_recall_stderr": 0.0026775951767211374, "rougeL_fmeasure": 0.1876781101224112, "rougeL_fmeasure_stderr": 0.0031437463539169125, "rougeL_precision": 0.1921361411782326, "rougeL_precision_stderr": 0.0036263536814562453, "rougeL_recall": 0.20386135964058816, "rougeL_recall_stderr": 0.0033147209011893724, "rougeLsum_fmeasure": 0.1896272533115015, "rougeLsum_fmeasure_stderr": 0.0031550558124475133, "rougeLsum_precision": 0.19374913452852363, "rougeLsum_precision_stderr": 0.0036251531161871544, "rougeLsum_recall": 0.20676693183261305, "rougeLsum_recall_stderr": 0.0034014354284824795}}, "3": {"article_DOC_summary": {"bleu": 3.267835808853254, "bleu_stderr": 0.26356764701611307, "rouge1_fmeasure": 0.23697855013529284, "rouge1_fmeasure_stderr": 0.003931784147472208, "rouge1_precision": 0.2511767032658256, "rouge1_precision_stderr": 0.004670743744963309, "rouge1_recall": 0.2497224746972405, "rouge1_recall_stderr": 0.004054437554455434, "rouge2_fmeasure": 0.059313458488013185, "rouge2_fmeasure_stderr": 0.0025641694242472487, "rouge2_precision": 0.06435388180319074, "rouge2_precision_stderr": 0.002934840810442915, "rouge2_recall": 0.060376215777618605, "rouge2_recall_stderr": 0.002521653823700491, "rougeL_fmeasure": 0.17977672776005849, "rougeL_fmeasure_stderr": 0.00326317345808302, "rougeL_precision": 0.19057534922020994, "rougeL_precision_stderr": 0.0038620821553629983, "rougeL_recall": 0.1901634518957205, "rougeL_recall_stderr": 0.0033714037003502494, "rougeLsum_fmeasure": 0.18163235303784986, "rougeLsum_fmeasure_stderr": 0.003264460230665997, "rougeLsum_precision": 0.19211901704097034, 
"rougeLsum_precision_stderr": 0.003850775083612694, "rougeLsum_recall": 0.19288110429850555, "rougeLsum_recall_stderr": 0.003421247538700275}}, "4": {"article_DOC_summary": {"bleu": 0.17970307621766216, "bleu_stderr": 0.04965869269887041, "rouge1_fmeasure": 0.05891769720848019, "rouge1_fmeasure_stderr": 0.0035369396004201713, "rouge1_precision": 0.06645727693786248, "rouge1_precision_stderr": 0.004158365862658779, "rouge1_recall": 0.061174931724787315, "rouge1_recall_stderr": 0.003797370735030977, "rouge2_fmeasure": 0.015297632126595406, "rouge2_fmeasure_stderr": 0.00149002417901287, "rouge2_precision": 0.01730890961224063, "rouge2_precision_stderr": 0.0017698712455992984, "rouge2_recall": 0.016289532767626923, "rouge2_recall_stderr": 0.0017234032328131926, "rougeL_fmeasure": 0.04476173097758684, "rougeL_fmeasure_stderr": 0.0027730017560448727, "rougeL_precision": 0.050637820115463826, "rougeL_precision_stderr": 0.0032826504244346, "rougeL_recall": 0.04683971382625969, "rougeL_recall_stderr": 0.003039332848507969, "rougeLsum_fmeasure": 0.04533434532971928, "rougeLsum_fmeasure_stderr": 0.0028025558912498296, "rougeLsum_precision": 0.051192679124604304, "rougeLsum_precision_stderr": 0.0033047254109378095, "rougeLsum_recall": 0.0475395827212547, "rougeLsum_recall_stderr": 0.0030900986283582004}}, "5": {"article_DOC_summary": {"bleu": 1.393056311124318e-43, "bleu_stderr": 1.2487431746784071e-33, "rouge1_fmeasure": 0.0025036711184808556, "rouge1_fmeasure_stderr": 0.000808568948388182, "rouge1_precision": 0.0025272416860947825, "rouge1_precision_stderr": 0.0007813236132809655, "rouge1_recall": 0.002586618259740399, "rouge1_recall_stderr": 0.0008670624223009818, "rouge2_fmeasure": 0.0003242107125389267, "rouge2_fmeasure_stderr": 0.00014221025181447073, "rouge2_precision": 0.0003236042782341223, "rouge2_precision_stderr": 0.0001381414258477455, "rouge2_recall": 0.0003324696313480705, "rouge2_recall_stderr": 0.00015129466488844655, "rougeL_fmeasure": 0.00159555474061474, 
"rougeL_fmeasure_stderr": 0.0004956987539243005, "rougeL_precision": 0.0016454419652040672, "rougeL_precision_stderr": 0.0004929867496820194, "rougeL_recall": 0.001639076487035711, "rougeL_recall_stderr": 0.0005281423440330004, "rougeLsum_fmeasure": 0.00159555474061474, "rougeLsum_fmeasure_stderr": 0.0004956987539243005, "rougeLsum_precision": 0.0016454419652040672, "rougeLsum_precision_stderr": 0.0004929867496820194, "rougeLsum_recall": 0.001639076487035711, "rougeLsum_recall_stderr": 0.0005281423440330004}}}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e9b6acb2773086a15e35da55ac6efe27c64d66f1 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.30679877376441267, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003595563898423582 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25602196677157735, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002878244091822486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.24392276306894944, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023000547396419086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.09387841885638748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002194124801129067 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge2_recall": 0.0740439804559834, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001644608125064704 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.07100355255541753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014153910474231026 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.23412738417408802, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029250825035858734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.19426592228730813, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023089496783281213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.18445520873996413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001797621869515743 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.28859199947210284, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003452141980034977 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23966421160449644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027125548096589747 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2286765314746336, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021801688851002263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"bleu": 4.074528254683577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10710940903995719 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..0f6eeadd318fe6599de56f9f6f34039701fa64c1 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.272311629129121, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0039505188546921826 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.21816270627496134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032448735585025937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2094195871488414, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0027068429969493734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08329439938013247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rouge2_precision_stderr": 0.0021782819294409984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06480971019253767, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016827399941316222 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06204427812400913, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014401113659882462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.20896331681491714, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031500865399556595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1664625275086072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025592493419649177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1590668148316741, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020766324795938023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2563037368454477, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0037619221351167056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.20447650361202882, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030540843932082493 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1964756114951262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_fmeasure_stderr": 0.0025500807038816515 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.168617248016535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08041979746938775 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b78428db475f3807b81bf599eafc8134a182a2d4 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0910819200952716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0033747683121654196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07028567198192245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002640964970940966 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06757697669893872, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023707490073172043 + }, + { + "task_name": "GEM/wiki_lingua_en", + 
"prompt_name": "tldr_en", + "rouge2_precision": 0.028646478253466095, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0016076285180980135 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.020868526921797886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011537893245677335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.019948424307073904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000986170441488306 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07082578661014885, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002691634510208164 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05448506471222756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020854622601667937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05201990108593777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018314763879696625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08551950005220435, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0031983492312005384 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06543250975997009, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024532813023277916 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rougeLsum_fmeasure": 0.06308719820573185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002213451741392458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.07788444152267485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01315525656911281 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3f2450f095661119454e03566e7c0e9f08a6c32b --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.014515441910190443, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001501466217296952 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011301943813768834, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012288119306628567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010473201550267177, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010554421691169314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004591980814271401, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006515529824929627 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.00416888931272546, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005930107839352501 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0035604598673033626, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004625418185814249 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.011779972055249343, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012586537746088814 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009040049894954919, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009873007391796482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008371164394493067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008468942819407445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.013708349668480058, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014293611241212706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010579263673799228, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeLsum_recall_stderr": 0.0011521180305532216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009841875351812794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.000994934070773035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.4470192084599072e-15, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 5.1547920765171173e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6b736d2c037aecd8978496077622a292cd1801e7 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.2511767032658256, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004670743744963309 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2497224746972405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004054437554455434 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rouge1_fmeasure": 0.23697855013529284, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003931784147472208 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.06435388180319074, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002934840810442915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.060376215777618605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002521653823700491 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.059313458488013185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0025641694242472487 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.19057534922020994, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0038620821553629983 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1901634518957205, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033714037003502494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17977672776005849, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00326317345808302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.19211901704097034, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003850775083612694 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.19288110429850555, + "dataset_path": "GEM/xsum", 
+ "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003421247538700275 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.18163235303784986, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003264460230665997 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.267835808853254, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.26356764701611307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..07b56db52138bd2092451e899e6365d925c81d45 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06645727693786248, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004158365862658779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.061174931724787315, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003797370735030977 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05891769720848019, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0035369396004201713 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01730890961224063, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0017698712455992984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.016289532767626923, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017234032328131926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.015297632126595406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00149002417901287 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.050637820115463826, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0032826504244346 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.04683971382625969, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003039332848507969 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04476173097758684, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0027730017560448727 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.051192679124604304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0033047254109378095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0475395827212547, 
+ "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0030900986283582004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04533434532971928, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0028025558912498296 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.17970307621766216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04965869269887041 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..29ee2825609bd5283955ead3bc5a9df9473dad57 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0025272416860947825, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007813236132809655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002586618259740399, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 
0.0008670624223009818 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0025036711184808556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.000808568948388182 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0003236042782341223, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0001381414258477455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0003324696313480705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00015129466488844655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0003242107125389267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00014221025181447073 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0016454419652040672, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0004929867496820194 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.001639076487035711, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005281423440330004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.00159555474061474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004956987539243005 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0016454419652040672, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004929867496820194 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rougeLsum_recall": 0.001639076487035711, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005281423440330004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.00159555474061474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004956987539243005 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.393056311124318e-43, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.2487431746784071e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3c65b7ee452b90d77c76bd59d6307bc0a1322143 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.31947645525912327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 
0.0038030155885502574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.26784034634508286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003035447513908994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24919698955059236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002338885547952402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.10032559893960638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024161327250120574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07957621599352918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017828844980089247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.07449487544541136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": 
"{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015020810496653879}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2468115494253994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032025411700306672}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20411776076410448, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024277091219038756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.18970698519609924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018593336285269075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.3009096197940564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036645356865323267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.251048984299023, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002859798214873002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.23380521600569548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022238011394774266}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.480739832822098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11578330856151871}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cb5275e2e2561ece30c8623dd13c0083ab077150 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": 
[{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2871095429771904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004185534949827048}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22207861869609608, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033297349416556445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.21276268379590219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002745499539353653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08919737395825947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00234436562476581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06668770358204071, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017001378203776699}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06395005883547085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014814659874998457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22428029391025195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034506563669751745}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.17117251055926017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002654839665946902}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.16381135583162096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021664607071098315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.27186005782650335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", 
"subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004024882931968101}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2092333223784394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031542921648456132}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20067896546125685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026118338406594167}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.416486588059767, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05871038450450248}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9cea65934801e377a7e07c444b6dcf601d1118b9 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09854206688714345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036437541450359593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07142922091930162, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026886044555192072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06934635599486277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002419160037746626}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03200012684242646, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001809559832121984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02183618792959388, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011694185534509838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.021348512828022314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001055380041694426}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07858578853182602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003017215712567151}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.055804214936503324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021264630991137314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05429349387264579, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001919877615338037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0930444540425215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003479952193050714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06690767452071034, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025201480735759655}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06504921478523501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002272098654651633}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.08573923963165347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01281159560771889}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6cc0ba025286979165c28dde9e04525d002517ce --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.019085080359257173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018850577941172406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011645843404748045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012042546813690255}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.012068612050969258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in 
English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001175572535963405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.006894559713384093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010187138526863938}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003894968708693124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000532555761738665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004052279635441424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005200626541191212}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.016020963520094682, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016304077510915223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.00943399649665596, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": 
"en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000984307352962584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.009809956651925644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009599920054622314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.018298677104570716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001820476829671769}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.011022020415497191, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011405019701246609}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.011452198317124414, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011152369987060954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"tldr_en", "bleu": 7.249514036066506e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.339336428482265e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..203fb8bd7b444106d5b0ee7b0c6bc8cf4e5aa0a1 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.27838096040655913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004710736944579292}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.256517363989891, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043630766599847375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.25539660890379623, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00408774094209769}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.07429604893692207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029381253193927104}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06987562130359767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002769991072408768}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.06904105989761854, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026844346914174943}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.21060638862089187, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003887329749618103}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.19474922081143328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036536236006364338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.19320917420626382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0033971194258629626}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.21158267461706523, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003892248180779862}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.1959669205905632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036938624037444975}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.19420752036343683, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034107307485400193}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.892045553878533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15338529710936155}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0c087ba7f6b5005e971cc92ef1ab9c05b6bce763 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06908582074094831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_precision_stderr": 0.004346958400381964}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0576659953376829, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036525448833407675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05924716020491812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036326176527612343}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.018055976337857795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001722423057966192}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.015572542649221232, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014824376881666455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.015889553201926813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001483207722000358}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0526906303370273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00345962655866441}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04371274927728761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00285083893180292}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04481653988120624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028149978857180923}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05278495458510948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034751372117411933}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04367091698209838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002855118245232499}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.044863049752252554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002827990480856111}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.0793183178082136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.033794375862044715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1fbf1fd043068d586839bb4d695e7a1a57758701 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": 
"article_DOC_summary", "rouge1_precision": 0.002843906337499315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009956544746623187}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0024927455220140884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008268054962235649}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002561173851810196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008604007460830072}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0005943099210680686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00031949865564033644}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0004189368714895907, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, 
"comment": "", "rouge2_recall_stderr": 0.00021998461769511408}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00047911010108716033, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00025331584786903913}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0019474370851628444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000688363115056785}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0016525178188876207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005530966506647648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0017324699968285832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005921585827612698}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0019474370851628444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000688363115056785}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0016525178188876207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005530966506647648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0017324699968285832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005921585827612698}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 4.872051358237048e-56, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0994911419227211e-38}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2cede6422445e4d42e0b5040ed89a104e725765d 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cef9e23c45b67405beb3ff4d8e97a8060db73f0e64df98df4af5d1acd5a1c31 +size 18570445 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5b137a0099eb5f95c5e439e607607e808516f305 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d8ab41153f8c5cbdc0e5db1ae88eb357e47d8a89ad418763eecd322d316ca53 +size 24036414 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..14a14bb0a2ea41e5f86a76b2f1d35e67e4b32f12 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b63a61d5ecbd64af6b936ef71578a174d5f6e8a17c62f394bac9e9bc9a989e2e +size 29368136 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl 
b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0bba1414b68e7570caf5e5b99a63ce38cc78b2de 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c3059fcd2c910dc6f6c2c416d6d9b01454f1e9159acc33a0ca091228d556a9f +size 34782138 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8b7a78c6a454604ac4249b2c5a43b489e2429bae 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d43594e15f1d34ea0fbf2c7239b5a7c541f3b067f5cc9e2c9642eaa2e12a98 +size 9452835 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c5dbdb439b0bee4abf3a3d954826ada342d07e0c 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94abf8a4ac5eb39351ff0818bc2877bbf39ffecb2edb70b911dc7ab3111798c9 +size 11623564 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1d67bfbffb2bd70cdf1201593c336499e28e621c 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826540fe8c81c1c81615e699be345425a39b09165794b22a59b7bae5dea47af6 +size 13897110 diff --git a/8b7178b35b/evaluation/generation/merged.csv b/8b7178b35b/evaluation/generation/merged.csv index 94e96839377380f1faa8ddf2dbeb12c2d127e226..0815b8b70a600e32f019d92b9586827c8c52fc2f 100644 --- a/8b7178b35b/evaluation/generation/merged.csv +++ b/8b7178b35b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.05697362472735911 gem_xsum,1,median,rouge2_fmeasure,0.05697362472735911 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.06850101702956525 gem_xsum,2,median,rouge2_fmeasure,0.06850101702956525 -gem_xsum,2,average,multiple,0.056975464704954716 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.06904105989761854 +gem_xsum,3,median,rouge2_fmeasure,0.06904105989761854 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.015889553201926813 +gem_xsum,4,median,rouge2_fmeasure,0.015889553201926813 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00047911010108716033 +gem_xsum,5,median,rouge2_fmeasure,0.00047911010108716033 +gem_xsum,5,average,multiple,0.04272268621924944 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05172430149730381 web_nlg_en,0,median,rouge2_fmeasure,0.05172430149730381 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0838728953683248 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.049529716403418626 wiki_lingua_en,0,median,rouge2_fmeasure,0.049529716403418626 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04749462077218236 wiki_lingua_en,1,median,rouge2_fmeasure,0.04749462077218236 -wiki_lingua_en,1,average,multiple,0.0485121685878005 
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07449487544541136 +wiki_lingua_en,2,median,rouge2_fmeasure,0.07449487544541136 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.06395005883547085 +wiki_lingua_en,3,median,rouge2_fmeasure,0.06395005883547085 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.021348512828022314 +wiki_lingua_en,4,median,rouge2_fmeasure,0.021348512828022314 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.004052279635441424 +wiki_lingua_en,5,median,rouge2_fmeasure,0.004052279635441424 +wiki_lingua_en,5,average,multiple,0.04347834398665782 diff --git a/8b7178b35b/evaluation/generation/merged.json b/8b7178b35b/evaluation/generation/merged.json index 2091ecf15b978177515d491c0194e18f74193615..be98f9026842a945eebe279967fbe9ffa3874b5f 100644 --- a/8b7178b35b/evaluation/generation/merged.json +++ b/8b7178b35b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.347289354458488, "bleu_stderr": 0.03777797007924809, "rouge1_fmeasure": 0.10890954353426278, "rouge1_fmeasure_stderr": 0.002110258935484955, "rouge1_precision": 0.079226368742227, "rouge1_precision_stderr": 0.0023231969845277887, "rouge1_recall": 0.28110828630725015, "rouge1_recall_stderr": 0.004882004381200993, "rouge2_fmeasure": 0.05172430149730381, "rouge2_fmeasure_stderr": 0.0013180585408877222, "rouge2_precision": 0.03743726607123581, "rouge2_precision_stderr": 0.001494228694268955, "rouge2_recall": 0.13690374884005102, "rouge2_recall_stderr": 0.003169133187878269, "rougeL_fmeasure": 0.10554978776184795, "rougeL_fmeasure_stderr": 0.0020007799763877027, "rougeL_precision": 0.07666350308702279, "rougeL_precision_stderr": 0.0022367957170880146, "rougeL_recall": 0.2745019482026448, "rougeL_recall_stderr": 0.004789753920178336, "rougeLsum_fmeasure": 0.10411236694388043, "rougeLsum_fmeasure_stderr": 0.0019983020211639833, "rougeLsum_precision": 0.07592638133051025, "rougeLsum_precision_stderr": 0.002251359653603086, "rougeLsum_recall": 0.2686557447211624, 
"rougeLsum_recall_stderr": 0.004613052990575398}}, "1": {"PALM_prompt": {"bleu": 0.5749625372797772, "bleu_stderr": 0.042168587225678913, "rouge1_fmeasure": 0.1639618887300654, "rouge1_fmeasure_stderr": 0.003768473720694237, "rouge1_precision": 0.1384507797063748, "rouge1_precision_stderr": 0.004372707845611215, "rouge1_recall": 0.322644364844527, "rouge1_recall_stderr": 0.004855940304467451, "rouge2_fmeasure": 0.0838728953683248, "rouge2_fmeasure_stderr": 0.002551976059024005, "rouge2_precision": 0.0725164426950792, "rouge2_precision_stderr": 0.0030197107273513208, "rouge2_recall": 0.16687390506721525, "rouge2_recall_stderr": 0.0035088962671374286, "rougeL_fmeasure": 0.14973459838273895, "rougeL_fmeasure_stderr": 0.003227143648327153, "rougeL_precision": 0.12455299793053924, "rougeL_precision_stderr": 0.0037936801945088155, "rougeL_recall": 0.3050028170207448, "rougeL_recall_stderr": 0.004518320595879189, "rougeLsum_fmeasure": 0.15125353657464896, "rougeLsum_fmeasure_stderr": 0.0032779530465988412, "rougeLsum_precision": 0.1263163648899627, "rougeLsum_precision_stderr": 0.0038597130269363692, "rougeLsum_recall": 0.3062159795615415, "rougeLsum_recall_stderr": 0.004522396040564721}}, "2": {"PALM_prompt": {"bleu": 0.6896289267842831, "bleu_stderr": 0.04363432869368013, "rouge1_fmeasure": 0.19577491362940544, "rouge1_fmeasure_stderr": 0.004183820712066921, "rouge1_precision": 0.1736763143186271, "rouge1_precision_stderr": 0.005190144069158226, "rouge1_recall": 0.3690495199973296, "rouge1_recall_stderr": 0.004674550419980399, "rouge2_fmeasure": 0.10468204406915697, "rouge2_fmeasure_stderr": 0.002958633901985044, "rouge2_precision": 0.09630923485520457, "rouge2_precision_stderr": 0.003591484421705004, "rouge2_recall": 0.19774238776363945, "rouge2_recall_stderr": 0.0036773455774089777, "rougeL_fmeasure": 0.17655301985743188, "rougeL_fmeasure_stderr": 0.003543990303517691, "rougeL_precision": 0.15346964126360133, "rougeL_precision_stderr": 0.004394358403285555, 
"rougeL_recall": 0.34605615384304983, "rougeL_recall_stderr": 0.0043553659806620975, "rougeLsum_fmeasure": 0.17970682314011696, "rougeLsum_fmeasure_stderr": 0.003629660902642662, "rougeLsum_precision": 0.15737438288230765, "rougeLsum_precision_stderr": 0.004551780258151066, "rougeLsum_recall": 0.34891501370883077, "rougeLsum_recall_stderr": 0.004358767783710016}}, "3": {"PALM_prompt": {"bleu": 0.7168649256709774, "bleu_stderr": 0.04675221435801664, "rouge1_fmeasure": 0.20039314889286589, "rouge1_fmeasure_stderr": 0.00433393844787534, "rouge1_precision": 0.1766984056786614, "rouge1_precision_stderr": 0.0052889863501627, "rouge1_recall": 0.3784023798729607, "rouge1_recall_stderr": 0.004834949146967397, "rouge2_fmeasure": 0.10711581894262515, "rouge2_fmeasure_stderr": 0.00302754276771825, "rouge2_precision": 0.09764044765164406, "rouge2_precision_stderr": 0.0036619448797318528, "rouge2_recall": 0.2030335209107597, "rouge2_recall_stderr": 0.003804312974743258, "rougeL_fmeasure": 0.18005823477527308, "rougeL_fmeasure_stderr": 0.003625480259314596, "rougeL_precision": 0.15566438191020568, "rougeL_precision_stderr": 0.004489975630701125, "rougeL_recall": 0.3537310697482285, "rougeL_recall_stderr": 0.004433439714918285, "rougeLsum_fmeasure": 0.18404319632429025, "rougeLsum_fmeasure_stderr": 0.0037645994118713916, "rougeLsum_precision": 0.1606361822943105, "rougeLsum_precision_stderr": 0.004693428623221418, "rougeLsum_recall": 0.3571516509838202, "rougeLsum_recall_stderr": 0.004469628307077723}}, "4": {"PALM_prompt": {"bleu": 0.8264767377736691, "bleu_stderr": 0.053779296385247366, "rouge1_fmeasure": 0.20221675887809062, "rouge1_fmeasure_stderr": 0.004283401665288551, "rouge1_precision": 0.1812068418232479, "rouge1_precision_stderr": 0.005398716331751978, "rouge1_recall": 0.38369035451056227, "rouge1_recall_stderr": 0.00470013713630282, "rouge2_fmeasure": 0.10880911597097326, "rouge2_fmeasure_stderr": 0.0029539167777379253, "rouge2_precision": 0.1006261222318028, 
"rouge2_precision_stderr": 0.0036927051039995647, "rouge2_recall": 0.2074416749104224, "rouge2_recall_stderr": 0.0036680698832641224, "rougeL_fmeasure": 0.18068778740178215, "rougeL_fmeasure_stderr": 0.0035531966464061853, "rougeL_precision": 0.15862122681407606, "rougeL_precision_stderr": 0.004530792062039435, "rougeL_recall": 0.35743845572365024, "rougeL_recall_stderr": 0.004293662429385583, "rougeLsum_fmeasure": 0.185234182101754, "rougeLsum_fmeasure_stderr": 0.00371289144584664, "rougeLsum_precision": 0.16438209047726673, "rougeLsum_precision_stderr": 0.004775185780341334, "rougeLsum_recall": 0.3611996661245711, "rougeLsum_recall_stderr": 0.00431294873519373}}, "5": {"PALM_prompt": {"bleu": 0.8765489182001927, "bleu_stderr": 0.040019456401903304, "rouge1_fmeasure": 0.20880684983729342, "rouge1_fmeasure_stderr": 0.004530179904806841, "rouge1_precision": 0.1926905497088421, "rouge1_precision_stderr": 0.0057538188799125855, "rouge1_recall": 0.3858328625547419, "rouge1_recall_stderr": 0.004823164304577201, "rouge2_fmeasure": 0.11565656452634464, "rouge2_fmeasure_stderr": 0.003314764756228703, "rouge2_precision": 0.11101027585585174, "rouge2_precision_stderr": 0.004140682260724748, "rouge2_recall": 0.21158156307350992, "rouge2_recall_stderr": 0.003943074328986311, "rougeL_fmeasure": 0.18694354311801048, "rougeL_fmeasure_stderr": 0.0038293768377802495, "rougeL_precision": 0.16975713372550302, "rougeL_precision_stderr": 0.0049633182199218225, "rougeL_recall": 0.3591348058816613, "rougeL_recall_stderr": 0.004445791053730718, "rougeLsum_fmeasure": 0.1915290665625795, "rougeLsum_fmeasure_stderr": 0.00396816747113494, "rougeLsum_precision": 0.17553725023286618, "rougeLsum_precision_stderr": 0.005177574879953629, "rougeLsum_recall": 0.36311828129199303, "rougeLsum_recall_stderr": 0.0044570305199087895}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.586033759423314, "bleu_stderr": 0.09221854298529836, "rouge1_fmeasure": 0.18563962193300107, "rouge1_fmeasure_stderr": 
0.00254964484535964, "rouge1_precision": 0.1716696703091672, "rouge1_precision_stderr": 0.0028501359020609955, "rouge1_recall": 0.25185497103999677, "rouge1_recall_stderr": 0.0034752671023028777, "rouge2_fmeasure": 0.049529716403418626, "rouge2_fmeasure_stderr": 0.001152868080944952, "rouge2_precision": 0.04455795738149438, "rouge2_precision_stderr": 0.0011159745679413468, "rouge2_recall": 0.06850978198098658, "rouge2_recall_stderr": 0.0017237693104594396, "rougeL_fmeasure": 0.13995045719718288, "rougeL_fmeasure_stderr": 0.0018763707447302035, "rougeL_precision": 0.1292734642683531, "rougeL_precision_stderr": 0.0022188669387071605, "rougeL_recall": 0.1943570396317403, "rougeL_recall_stderr": 0.0027761702028766522, "rougeLsum_fmeasure": 0.17363683324091103, "rougeLsum_fmeasure_stderr": 0.0023982582496962375, "rougeLsum_precision": 0.1610229468194331, "rougeLsum_precision_stderr": 0.002727168940548498, "rougeLsum_recall": 0.2357960382787708, "rougeLsum_recall_stderr": 0.0032789217359646683}}, "1": {"tldr_en": {"bleu": 3.0885083531376645, "bleu_stderr": 0.09894775701896347, "rouge1_fmeasure": 0.18742769556123198, "rouge1_fmeasure_stderr": 0.0024179522944743578, "rouge1_precision": 0.22141486924288373, "rouge1_precision_stderr": 0.003489092311505774, "rouge1_recall": 0.21774103131853748, "rouge1_recall_stderr": 0.003219238261544544, "rouge2_fmeasure": 0.04749462077218236, "rouge2_fmeasure_stderr": 0.0013391994596319457, "rouge2_precision": 0.05911494441028749, "rouge2_precision_stderr": 0.001977492552657427, "rouge2_recall": 0.056021987595245265, "rouge2_recall_stderr": 0.0016925719239639409, "rougeL_fmeasure": 0.14177658902154563, "rougeL_fmeasure_stderr": 0.0018452092691051947, "rougeL_precision": 0.17052813755243684, "rougeL_precision_stderr": 0.0028543436290086777, "rougeL_recall": 0.16498921391985125, "rougeL_recall_stderr": 0.0024957799702815765, "rougeLsum_fmeasure": 0.17566654570551468, "rougeLsum_fmeasure_stderr": 0.0022619479293200127, "rougeLsum_precision": 
0.20843084756852662, "rougeLsum_precision_stderr": 0.003321140474934239, "rougeLsum_recall": 0.20397276049127364, "rougeLsum_recall_stderr": 0.0030133958209501476}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.8008125605460465, "bleu_stderr": 0.08334728702857473, "rouge1_fmeasure": 0.2613118947613821, "rouge1_fmeasure_stderr": 0.002127912583794242, "rouge1_precision": 0.2090865590305055, "rouge1_precision_stderr": 0.0018470177010718961, "rouge1_recall": 0.38188549726437987, "rouge1_recall_stderr": 0.003279728734456605, "rouge2_fmeasure": 0.07297635058372318, "rouge2_fmeasure_stderr": 0.001261248605220041, "rouge2_precision": 0.05673694028468428, "rouge2_precision_stderr": 0.0009833704090490367, "rouge2_recall": 0.1101409383371642, "rouge2_recall_stderr": 0.001995492794975982, "rougeL_fmeasure": 0.22836186321205557, "rougeL_fmeasure_stderr": 0.0015605125797972495, "rougeL_precision": 0.18275732889261928, "rougeL_precision_stderr": 0.0013921067012969746, "rougeL_recall": 0.3338079929800571, "rougeL_recall_stderr": 0.0024505755903694226, "rougeLsum_fmeasure": 0.21899459401932223, "rougeLsum_fmeasure_stderr": 0.002102657843438912, "rougeLsum_precision": 0.17512630400560542, "rougeLsum_precision_stderr": 0.0017783983400275266, "rougeLsum_recall": 0.32141092125405557, "rougeLsum_recall_stderr": 0.0032662985068743528}}, "1": {"generate_text_restaurant": {"bleu": 12.455563496168116, "bleu_stderr": 0.15428951986028144, "rouge1_fmeasure": 0.4844618589871223, "rouge1_fmeasure_stderr": 0.002388878380866762, "rouge1_precision": 0.5954981850070853, "rouge1_precision_stderr": 0.0032755377697093577, "rouge1_recall": 0.44832902186902923, "rouge1_recall_stderr": 0.0030688387452721486, "rouge2_fmeasure": 0.23224231794118463, "rouge2_fmeasure_stderr": 0.0020931392807158806, "rouge2_precision": 0.29040989982948295, "rouge2_precision_stderr": 0.0028204861869818784, "rouge2_recall": 0.21450317916657013, "rouge2_recall_stderr": 0.002222395267553394, 
"rougeL_fmeasure": 0.34990734371381305, "rougeL_fmeasure_stderr": 0.0021300316703662904, "rougeL_precision": 0.43368332329208353, "rougeL_precision_stderr": 0.003056248156608264, "rougeL_recall": 0.32270327842717356, "rougeL_recall_stderr": 0.0024894724353540653, "rougeLsum_fmeasure": 0.39499604706438096, "rougeLsum_fmeasure_stderr": 0.0023754095690034817, "rougeLsum_precision": 0.48684291022612447, "rougeLsum_precision_stderr": 0.0032370809243618716, "rougeLsum_recall": 0.3651475528395549, "rougeLsum_recall_stderr": 0.002807567486848305}}, "2": {"generate_text_restaurant": {"bleu": 15.187891510159636, "bleu_stderr": 0.2005340847158173, "rouge1_fmeasure": 0.5149400564373232, "rouge1_fmeasure_stderr": 0.002281960188799717, "rouge1_precision": 0.6089225605030864, "rouge1_precision_stderr": 0.0031749506024181815, "rouge1_recall": 0.4850253270309617, "rouge1_recall_stderr": 0.0029712295443402523, "rouge2_fmeasure": 0.26064964037026866, "rouge2_fmeasure_stderr": 0.002149918786285521, "rouge2_precision": 0.3125612676733449, "rouge2_precision_stderr": 0.002807570058211188, "rouge2_recall": 0.24520830668256238, "rouge2_recall_stderr": 0.0023166169808109976, "rougeL_fmeasure": 0.3791251783681887, "rougeL_fmeasure_stderr": 0.0021638392150190966, "rougeL_precision": 0.45032557438452553, "rougeL_precision_stderr": 0.002995769757075173, "rougeL_recall": 0.3564750276333542, "rougeL_recall_stderr": 0.002545550505071048, "rougeLsum_fmeasure": 0.4311876269636124, "rougeLsum_fmeasure_stderr": 0.0024222285980316075, "rougeLsum_precision": 0.5099476065779874, "rougeLsum_precision_stderr": 0.003210910234266457, "rougeLsum_recall": 0.40596954377109, "rougeLsum_recall_stderr": 0.002856919494329858}}, "3": {"generate_text_restaurant": {"bleu": 16.060659394975822, "bleu_stderr": 0.17284642655126536, "rouge1_fmeasure": 0.5239878618101873, "rouge1_fmeasure_stderr": 0.0022891503962730515, "rouge1_precision": 0.6148474805598093, "rouge1_precision_stderr": 0.003175147795291498, "rouge1_recall": 
0.4938729679272436, "rouge1_recall_stderr": 0.0029612490025592896, "rouge2_fmeasure": 0.27038735747786175, "rouge2_fmeasure_stderr": 0.002196480015942097, "rouge2_precision": 0.32141615003361074, "rouge2_precision_stderr": 0.00284369777288176, "rouge2_recall": 0.2546854150197406, "rouge2_recall_stderr": 0.0023748524797935944, "rougeL_fmeasure": 0.3858853477225852, "rougeL_fmeasure_stderr": 0.002195201839278644, "rougeL_precision": 0.45482796385524316, "rougeL_precision_stderr": 0.0030285934039142345, "rougeL_recall": 0.3632799719766207, "rougeL_recall_stderr": 0.0025747603467030653, "rougeLsum_fmeasure": 0.44212298577697895, "rougeLsum_fmeasure_stderr": 0.0024314611034434628, "rougeLsum_precision": 0.5189202264105984, "rougeLsum_precision_stderr": 0.0032211525451001028, "rougeLsum_recall": 0.41656979202046523, "rougeLsum_recall_stderr": 0.0028606302793073527}}, "4": {"generate_text_restaurant": {"bleu": 16.35064977606228, "bleu_stderr": 0.15318703610951076, "rouge1_fmeasure": 0.5272948463681918, "rouge1_fmeasure_stderr": 0.002276632828321617, "rouge1_precision": 0.6141579219291876, "rouge1_precision_stderr": 0.0031375276423873367, "rouge1_recall": 0.49642790420160215, "rouge1_recall_stderr": 0.002890279129158977, "rouge2_fmeasure": 0.2730606307438099, "rouge2_fmeasure_stderr": 0.002234438055141524, "rouge2_precision": 0.3216796674918701, "rouge2_precision_stderr": 0.0028452669990831196, "rouge2_recall": 0.2567141692534647, "rouge2_recall_stderr": 0.002372105255220152, "rougeL_fmeasure": 0.38874867977872807, "rougeL_fmeasure_stderr": 0.0021952914125996665, "rougeL_precision": 0.45388329842280073, "rougeL_precision_stderr": 0.0029409250663779253, "rougeL_recall": 0.36571295934350256, "rougeL_recall_stderr": 0.0025294206995360793, "rougeLsum_fmeasure": 0.44624941166081916, "rougeLsum_fmeasure_stderr": 0.0024399419515776116, "rougeLsum_precision": 0.5193905306564236, "rougeLsum_precision_stderr": 0.0031731739172339655, "rougeLsum_recall": 0.4201701863764282, 
"rougeLsum_recall_stderr": 0.0028340856821376906}}, "5": {"generate_text_restaurant": {"bleu": 16.33989554218624, "bleu_stderr": 0.17192498787094718, "rouge1_fmeasure": 0.5272012483217712, "rouge1_fmeasure_stderr": 0.0022727033449929193, "rouge1_precision": 0.6138887802372733, "rouge1_precision_stderr": 0.003148762043973577, "rouge1_recall": 0.49579908715969256, "rouge1_recall_stderr": 0.002855704257905023, "rouge2_fmeasure": 0.2739732304449431, "rouge2_fmeasure_stderr": 0.0022083117263179994, "rouge2_precision": 0.32296239014683903, "rouge2_precision_stderr": 0.002836651143474931, "rouge2_recall": 0.25721295886814427, "rouge2_recall_stderr": 0.0023323863742251865, "rougeL_fmeasure": 0.39053942113540757, "rougeL_fmeasure_stderr": 0.0022084680095602566, "rougeL_precision": 0.4557756119232901, "rougeL_precision_stderr": 0.002957529253678058, "rougeL_recall": 0.36723275796263094, "rougeL_recall_stderr": 0.002545583277155397, "rougeLsum_fmeasure": 0.44667560043167237, "rougeLsum_fmeasure_stderr": 0.002425231007898886, "rougeLsum_precision": 0.520413988660044, "rougeLsum_precision_stderr": 0.003202124211985598, "rougeLsum_recall": 0.41985936033161986, "rougeLsum_recall_stderr": 0.002793790259009851}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9172897233734647, "bleu_stderr": 0.07612979258741162, "rouge1_fmeasure": 0.19497689185235167, "rouge1_fmeasure_stderr": 0.003037459759774, "rouge1_precision": 0.14704474756783373, "rouge1_precision_stderr": 0.00260243994472115, "rouge1_recall": 0.3179426585120001, "rouge1_recall_stderr": 0.005091359061361595, "rouge2_fmeasure": 0.04545175235793978, "rouge2_fmeasure_stderr": 0.0016613895167432173, "rouge2_precision": 0.033133611529725375, "rouge2_precision_stderr": 0.0012222134099385885, "rouge2_recall": 0.07643755617885226, "rouge2_recall_stderr": 0.002852925969928893, "rougeL_fmeasure": 0.14329622002037548, "rougeL_fmeasure_stderr": 0.002251083734524946, "rougeL_precision": 0.10872152825582168, 
"rougeL_precision_stderr": 0.0020985136406316504, "rougeL_recall": 0.23420658227277613, "rougeL_recall_stderr": 0.0038579727721169913, "rougeLsum_fmeasure": 0.15671768300152755, "rougeLsum_fmeasure_stderr": 0.0024938518539829066, "rougeLsum_precision": 0.11878580563146314, "rougeLsum_precision_stderr": 0.0022791200374934033, "rougeLsum_recall": 0.2561567987942721, "rougeLsum_recall_stderr": 0.004250537243559008}}, "1": {"article_DOC_summary": {"bleu": 2.9617243214867686, "bleu_stderr": 0.2121625238447871, "rouge1_fmeasure": 0.2429048922678839, "rouge1_fmeasure_stderr": 0.0035470294629619853, "rouge1_precision": 0.2506117820029391, "rouge1_precision_stderr": 0.004197193925421806, "rouge1_recall": 0.265051427357207, "rouge1_recall_stderr": 0.004113838412416427, "rouge2_fmeasure": 0.05697362472735911, "rouge2_fmeasure_stderr": 0.002319852424027265, "rouge2_precision": 0.059355725931601024, "rouge2_precision_stderr": 0.0026146656506255265, "rouge2_recall": 0.06240115907434781, "rouge2_recall_stderr": 0.002528707836330792, "rougeL_fmeasure": 0.1806939148235656, "rougeL_fmeasure_stderr": 0.002897868234740968, "rougeL_precision": 0.18675397233452354, "rougeL_precision_stderr": 0.003437969762757092, "rougeL_recall": 0.19744938072178414, "rougeL_recall_stderr": 0.0033460143453592026, "rougeLsum_fmeasure": 0.18340626109421165, "rougeLsum_fmeasure_stderr": 0.002921137289513508, "rougeLsum_precision": 0.18896297297256576, "rougeLsum_precision_stderr": 0.003436743475620793, "rougeLsum_recall": 0.20147068575801652, "rougeLsum_recall_stderr": 0.0034574795296360844}}, "2": {"article_DOC_summary": {"bleu": 3.790180948775114, "bleu_stderr": 0.21949698226608796, "rouge1_fmeasure": 0.2684969037756941, "rouge1_fmeasure_stderr": 0.003538515661007479, "rouge1_precision": 0.2862450933885503, "rouge1_precision_stderr": 0.0042447208222623255, "rouge1_recall": 0.2741637614630613, "rouge1_recall_stderr": 0.0038690227267741645, "rouge2_fmeasure": 0.06850101702956525, "rouge2_fmeasure_stderr": 
0.002469575816155894, "rouge2_precision": 0.07370429393444561, "rouge2_precision_stderr": 0.002782856999759695, "rouge2_recall": 0.07014627514180007, "rouge2_recall_stderr": 0.002596637645417416, "rougeL_fmeasure": 0.20209449252255896, "rougeL_fmeasure_stderr": 0.003025686120342655, "rougeL_precision": 0.21544246954500543, "rougeL_precision_stderr": 0.003580176542199536, "rougeL_recall": 0.20674126511922747, "rougeL_recall_stderr": 0.0032887383484612206, "rougeLsum_fmeasure": 0.20333325134792413, "rougeLsum_fmeasure_stderr": 0.0030149542501899957, "rougeLsum_precision": 0.21658328451330233, "rougeLsum_precision_stderr": 0.003567997453456091, "rougeLsum_recall": 0.2083566653493878, "rougeLsum_recall_stderr": 0.0032948366397107273}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.347289354458488, "bleu_stderr": 0.03777797007924809, "rouge1_fmeasure": 0.10890954353426278, "rouge1_fmeasure_stderr": 0.002110258935484955, "rouge1_precision": 0.079226368742227, "rouge1_precision_stderr": 0.0023231969845277887, "rouge1_recall": 0.28110828630725015, "rouge1_recall_stderr": 0.004882004381200993, "rouge2_fmeasure": 0.05172430149730381, "rouge2_fmeasure_stderr": 0.0013180585408877222, "rouge2_precision": 0.03743726607123581, "rouge2_precision_stderr": 0.001494228694268955, "rouge2_recall": 0.13690374884005102, "rouge2_recall_stderr": 0.003169133187878269, "rougeL_fmeasure": 0.10554978776184795, "rougeL_fmeasure_stderr": 0.0020007799763877027, "rougeL_precision": 0.07666350308702279, "rougeL_precision_stderr": 0.0022367957170880146, "rougeL_recall": 0.2745019482026448, "rougeL_recall_stderr": 0.004789753920178336, "rougeLsum_fmeasure": 0.10411236694388043, "rougeLsum_fmeasure_stderr": 0.0019983020211639833, "rougeLsum_precision": 0.07592638133051025, "rougeLsum_precision_stderr": 0.002251359653603086, "rougeLsum_recall": 0.2686557447211624, "rougeLsum_recall_stderr": 0.004613052990575398}}, "1": {"PALM_prompt": {"bleu": 0.5749625372797772, 
"bleu_stderr": 0.042168587225678913, "rouge1_fmeasure": 0.1639618887300654, "rouge1_fmeasure_stderr": 0.003768473720694237, "rouge1_precision": 0.1384507797063748, "rouge1_precision_stderr": 0.004372707845611215, "rouge1_recall": 0.322644364844527, "rouge1_recall_stderr": 0.004855940304467451, "rouge2_fmeasure": 0.0838728953683248, "rouge2_fmeasure_stderr": 0.002551976059024005, "rouge2_precision": 0.0725164426950792, "rouge2_precision_stderr": 0.0030197107273513208, "rouge2_recall": 0.16687390506721525, "rouge2_recall_stderr": 0.0035088962671374286, "rougeL_fmeasure": 0.14973459838273895, "rougeL_fmeasure_stderr": 0.003227143648327153, "rougeL_precision": 0.12455299793053924, "rougeL_precision_stderr": 0.0037936801945088155, "rougeL_recall": 0.3050028170207448, "rougeL_recall_stderr": 0.004518320595879189, "rougeLsum_fmeasure": 0.15125353657464896, "rougeLsum_fmeasure_stderr": 0.0032779530465988412, "rougeLsum_precision": 0.1263163648899627, "rougeLsum_precision_stderr": 0.0038597130269363692, "rougeLsum_recall": 0.3062159795615415, "rougeLsum_recall_stderr": 0.004522396040564721}}, "2": {"PALM_prompt": {"bleu": 0.6896289267842831, "bleu_stderr": 0.04363432869368013, "rouge1_fmeasure": 0.19577491362940544, "rouge1_fmeasure_stderr": 0.004183820712066921, "rouge1_precision": 0.1736763143186271, "rouge1_precision_stderr": 0.005190144069158226, "rouge1_recall": 0.3690495199973296, "rouge1_recall_stderr": 0.004674550419980399, "rouge2_fmeasure": 0.10468204406915697, "rouge2_fmeasure_stderr": 0.002958633901985044, "rouge2_precision": 0.09630923485520457, "rouge2_precision_stderr": 0.003591484421705004, "rouge2_recall": 0.19774238776363945, "rouge2_recall_stderr": 0.0036773455774089777, "rougeL_fmeasure": 0.17655301985743188, "rougeL_fmeasure_stderr": 0.003543990303517691, "rougeL_precision": 0.15346964126360133, "rougeL_precision_stderr": 0.004394358403285555, "rougeL_recall": 0.34605615384304983, "rougeL_recall_stderr": 0.0043553659806620975, "rougeLsum_fmeasure": 
0.17970682314011696, "rougeLsum_fmeasure_stderr": 0.003629660902642662, "rougeLsum_precision": 0.15737438288230765, "rougeLsum_precision_stderr": 0.004551780258151066, "rougeLsum_recall": 0.34891501370883077, "rougeLsum_recall_stderr": 0.004358767783710016}}, "3": {"PALM_prompt": {"bleu": 0.7168649256709774, "bleu_stderr": 0.04675221435801664, "rouge1_fmeasure": 0.20039314889286589, "rouge1_fmeasure_stderr": 0.00433393844787534, "rouge1_precision": 0.1766984056786614, "rouge1_precision_stderr": 0.0052889863501627, "rouge1_recall": 0.3784023798729607, "rouge1_recall_stderr": 0.004834949146967397, "rouge2_fmeasure": 0.10711581894262515, "rouge2_fmeasure_stderr": 0.00302754276771825, "rouge2_precision": 0.09764044765164406, "rouge2_precision_stderr": 0.0036619448797318528, "rouge2_recall": 0.2030335209107597, "rouge2_recall_stderr": 0.003804312974743258, "rougeL_fmeasure": 0.18005823477527308, "rougeL_fmeasure_stderr": 0.003625480259314596, "rougeL_precision": 0.15566438191020568, "rougeL_precision_stderr": 0.004489975630701125, "rougeL_recall": 0.3537310697482285, "rougeL_recall_stderr": 0.004433439714918285, "rougeLsum_fmeasure": 0.18404319632429025, "rougeLsum_fmeasure_stderr": 0.0037645994118713916, "rougeLsum_precision": 0.1606361822943105, "rougeLsum_precision_stderr": 0.004693428623221418, "rougeLsum_recall": 0.3571516509838202, "rougeLsum_recall_stderr": 0.004469628307077723}}, "4": {"PALM_prompt": {"bleu": 0.8264767377736691, "bleu_stderr": 0.053779296385247366, "rouge1_fmeasure": 0.20221675887809062, "rouge1_fmeasure_stderr": 0.004283401665288551, "rouge1_precision": 0.1812068418232479, "rouge1_precision_stderr": 0.005398716331751978, "rouge1_recall": 0.38369035451056227, "rouge1_recall_stderr": 0.00470013713630282, "rouge2_fmeasure": 0.10880911597097326, "rouge2_fmeasure_stderr": 0.0029539167777379253, "rouge2_precision": 0.1006261222318028, "rouge2_precision_stderr": 0.0036927051039995647, "rouge2_recall": 0.2074416749104224, "rouge2_recall_stderr": 
0.0036680698832641224, "rougeL_fmeasure": 0.18068778740178215, "rougeL_fmeasure_stderr": 0.0035531966464061853, "rougeL_precision": 0.15862122681407606, "rougeL_precision_stderr": 0.004530792062039435, "rougeL_recall": 0.35743845572365024, "rougeL_recall_stderr": 0.004293662429385583, "rougeLsum_fmeasure": 0.185234182101754, "rougeLsum_fmeasure_stderr": 0.00371289144584664, "rougeLsum_precision": 0.16438209047726673, "rougeLsum_precision_stderr": 0.004775185780341334, "rougeLsum_recall": 0.3611996661245711, "rougeLsum_recall_stderr": 0.00431294873519373}}, "5": {"PALM_prompt": {"bleu": 0.8765489182001927, "bleu_stderr": 0.040019456401903304, "rouge1_fmeasure": 0.20880684983729342, "rouge1_fmeasure_stderr": 0.004530179904806841, "rouge1_precision": 0.1926905497088421, "rouge1_precision_stderr": 0.0057538188799125855, "rouge1_recall": 0.3858328625547419, "rouge1_recall_stderr": 0.004823164304577201, "rouge2_fmeasure": 0.11565656452634464, "rouge2_fmeasure_stderr": 0.003314764756228703, "rouge2_precision": 0.11101027585585174, "rouge2_precision_stderr": 0.004140682260724748, "rouge2_recall": 0.21158156307350992, "rouge2_recall_stderr": 0.003943074328986311, "rougeL_fmeasure": 0.18694354311801048, "rougeL_fmeasure_stderr": 0.0038293768377802495, "rougeL_precision": 0.16975713372550302, "rougeL_precision_stderr": 0.0049633182199218225, "rougeL_recall": 0.3591348058816613, "rougeL_recall_stderr": 0.004445791053730718, "rougeLsum_fmeasure": 0.1915290665625795, "rougeLsum_fmeasure_stderr": 0.00396816747113494, "rougeLsum_precision": 0.17553725023286618, "rougeLsum_precision_stderr": 0.005177574879953629, "rougeLsum_recall": 0.36311828129199303, "rougeLsum_recall_stderr": 0.0044570305199087895}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.586033759423314, "bleu_stderr": 0.09221854298529836, "rouge1_fmeasure": 0.18563962193300107, "rouge1_fmeasure_stderr": 0.00254964484535964, "rouge1_precision": 0.1716696703091672, "rouge1_precision_stderr": 0.0028501359020609955, 
"rouge1_recall": 0.25185497103999677, "rouge1_recall_stderr": 0.0034752671023028777, "rouge2_fmeasure": 0.049529716403418626, "rouge2_fmeasure_stderr": 0.001152868080944952, "rouge2_precision": 0.04455795738149438, "rouge2_precision_stderr": 0.0011159745679413468, "rouge2_recall": 0.06850978198098658, "rouge2_recall_stderr": 0.0017237693104594396, "rougeL_fmeasure": 0.13995045719718288, "rougeL_fmeasure_stderr": 0.0018763707447302035, "rougeL_precision": 0.1292734642683531, "rougeL_precision_stderr": 0.0022188669387071605, "rougeL_recall": 0.1943570396317403, "rougeL_recall_stderr": 0.0027761702028766522, "rougeLsum_fmeasure": 0.17363683324091103, "rougeLsum_fmeasure_stderr": 0.0023982582496962375, "rougeLsum_precision": 0.1610229468194331, "rougeLsum_precision_stderr": 0.002727168940548498, "rougeLsum_recall": 0.2357960382787708, "rougeLsum_recall_stderr": 0.0032789217359646683}}, "1": {"tldr_en": {"bleu": 3.0885083531376645, "bleu_stderr": 0.09894775701896347, "rouge1_fmeasure": 0.18742769556123198, "rouge1_fmeasure_stderr": 0.0024179522944743578, "rouge1_precision": 0.22141486924288373, "rouge1_precision_stderr": 0.003489092311505774, "rouge1_recall": 0.21774103131853748, "rouge1_recall_stderr": 0.003219238261544544, "rouge2_fmeasure": 0.04749462077218236, "rouge2_fmeasure_stderr": 0.0013391994596319457, "rouge2_precision": 0.05911494441028749, "rouge2_precision_stderr": 0.001977492552657427, "rouge2_recall": 0.056021987595245265, "rouge2_recall_stderr": 0.0016925719239639409, "rougeL_fmeasure": 0.14177658902154563, "rougeL_fmeasure_stderr": 0.0018452092691051947, "rougeL_precision": 0.17052813755243684, "rougeL_precision_stderr": 0.0028543436290086777, "rougeL_recall": 0.16498921391985125, "rougeL_recall_stderr": 0.0024957799702815765, "rougeLsum_fmeasure": 0.17566654570551468, "rougeLsum_fmeasure_stderr": 0.0022619479293200127, "rougeLsum_precision": 0.20843084756852662, "rougeLsum_precision_stderr": 0.003321140474934239, "rougeLsum_recall": 
0.20397276049127364, "rougeLsum_recall_stderr": 0.0030133958209501476}}, "2": {"tldr_en": {"bleu": 4.480739832822098, "bleu_stderr": 0.11578330856151871, "rouge1_fmeasure": 0.24919698955059236, "rouge1_fmeasure_stderr": 0.002338885547952402, "rouge1_precision": 0.31947645525912327, "rouge1_precision_stderr": 0.0038030155885502574, "rouge1_recall": 0.26784034634508286, "rouge1_recall_stderr": 0.003035447513908994, "rouge2_fmeasure": 0.07449487544541136, "rouge2_fmeasure_stderr": 0.0015020810496653879, "rouge2_precision": 0.10032559893960638, "rouge2_precision_stderr": 0.0024161327250120574, "rouge2_recall": 0.07957621599352918, "rouge2_recall_stderr": 0.0017828844980089247, "rougeL_fmeasure": 0.18970698519609924, "rougeL_fmeasure_stderr": 0.0018593336285269075, "rougeL_precision": 0.2468115494253994, "rougeL_precision_stderr": 0.0032025411700306672, "rougeL_recall": 0.20411776076410448, "rougeL_recall_stderr": 0.0024277091219038756, "rougeLsum_fmeasure": 0.23380521600569548, "rougeLsum_fmeasure_stderr": 0.0022238011394774266, "rougeLsum_precision": 0.3009096197940564, "rougeLsum_precision_stderr": 0.0036645356865323267, "rougeLsum_recall": 0.251048984299023, "rougeLsum_recall_stderr": 0.002859798214873002}}, "3": {"tldr_en": {"bleu": 3.416486588059767, "bleu_stderr": 0.05871038450450248, "rouge1_fmeasure": 0.21276268379590219, "rouge1_fmeasure_stderr": 0.002745499539353653, "rouge1_precision": 0.2871095429771904, "rouge1_precision_stderr": 0.004185534949827048, "rouge1_recall": 0.22207861869609608, "rouge1_recall_stderr": 0.0033297349416556445, "rouge2_fmeasure": 0.06395005883547085, "rouge2_fmeasure_stderr": 0.0014814659874998457, "rouge2_precision": 0.08919737395825947, "rouge2_precision_stderr": 0.00234436562476581, "rouge2_recall": 0.06668770358204071, "rouge2_recall_stderr": 0.0017001378203776699, "rougeL_fmeasure": 0.16381135583162096, "rougeL_fmeasure_stderr": 0.0021664607071098315, "rougeL_precision": 0.22428029391025195, "rougeL_precision_stderr": 
0.0034506563669751745, "rougeL_recall": 0.17117251055926017, "rougeL_recall_stderr": 0.002654839665946902, "rougeLsum_fmeasure": 0.20067896546125685, "rougeLsum_fmeasure_stderr": 0.0026118338406594167, "rougeLsum_precision": 0.27186005782650335, "rougeLsum_precision_stderr": 0.004024882931968101, "rougeLsum_recall": 0.2092333223784394, "rougeLsum_recall_stderr": 0.0031542921648456132}}, "4": {"tldr_en": {"bleu": 0.08573923963165347, "bleu_stderr": 0.01281159560771889, "rouge1_fmeasure": 0.06934635599486277, "rouge1_fmeasure_stderr": 0.002419160037746626, "rouge1_precision": 0.09854206688714345, "rouge1_precision_stderr": 0.0036437541450359593, "rouge1_recall": 0.07142922091930162, "rouge1_recall_stderr": 0.0026886044555192072, "rouge2_fmeasure": 0.021348512828022314, "rouge2_fmeasure_stderr": 0.001055380041694426, "rouge2_precision": 0.03200012684242646, "rouge2_precision_stderr": 0.001809559832121984, "rouge2_recall": 0.02183618792959388, "rouge2_recall_stderr": 0.0011694185534509838, "rougeL_fmeasure": 0.05429349387264579, "rougeL_fmeasure_stderr": 0.001919877615338037, "rougeL_precision": 0.07858578853182602, "rougeL_precision_stderr": 0.003017215712567151, "rougeL_recall": 0.055804214936503324, "rougeL_recall_stderr": 0.0021264630991137314, "rougeLsum_fmeasure": 0.06504921478523501, "rougeLsum_fmeasure_stderr": 0.002272098654651633, "rougeLsum_precision": 0.0930444540425215, "rougeLsum_precision_stderr": 0.003479952193050714, "rougeLsum_recall": 0.06690767452071034, "rougeLsum_recall_stderr": 0.0025201480735759655}}, "5": {"tldr_en": {"bleu": 7.249514036066506e-16, "bleu_stderr": 3.339336428482265e-14, "rouge1_fmeasure": 0.012068612050969258, "rouge1_fmeasure_stderr": 0.001175572535963405, "rouge1_precision": 0.019085080359257173, "rouge1_precision_stderr": 0.0018850577941172406, "rouge1_recall": 0.011645843404748045, "rouge1_recall_stderr": 0.0012042546813690255, "rouge2_fmeasure": 0.004052279635441424, "rouge2_fmeasure_stderr": 0.0005200626541191212, 
"rouge2_precision": 0.006894559713384093, "rouge2_precision_stderr": 0.0010187138526863938, "rouge2_recall": 0.003894968708693124, "rouge2_recall_stderr": 0.000532555761738665, "rougeL_fmeasure": 0.009809956651925644, "rougeL_fmeasure_stderr": 0.0009599920054622314, "rougeL_precision": 0.016020963520094682, "rougeL_precision_stderr": 0.0016304077510915223, "rougeL_recall": 0.00943399649665596, "rougeL_recall_stderr": 0.000984307352962584, "rougeLsum_fmeasure": 0.011452198317124414, "rougeLsum_fmeasure_stderr": 0.0011152369987060954, "rougeLsum_precision": 0.018298677104570716, "rougeLsum_precision_stderr": 0.001820476829671769, "rougeLsum_recall": 0.011022020415497191, "rougeLsum_recall_stderr": 0.0011405019701246609}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.8008125605460465, "bleu_stderr": 0.08334728702857473, "rouge1_fmeasure": 0.2613118947613821, "rouge1_fmeasure_stderr": 0.002127912583794242, "rouge1_precision": 0.2090865590305055, "rouge1_precision_stderr": 0.0018470177010718961, "rouge1_recall": 0.38188549726437987, "rouge1_recall_stderr": 0.003279728734456605, "rouge2_fmeasure": 0.07297635058372318, "rouge2_fmeasure_stderr": 0.001261248605220041, "rouge2_precision": 0.05673694028468428, "rouge2_precision_stderr": 0.0009833704090490367, "rouge2_recall": 0.1101409383371642, "rouge2_recall_stderr": 0.001995492794975982, "rougeL_fmeasure": 0.22836186321205557, "rougeL_fmeasure_stderr": 0.0015605125797972495, "rougeL_precision": 0.18275732889261928, "rougeL_precision_stderr": 0.0013921067012969746, "rougeL_recall": 0.3338079929800571, "rougeL_recall_stderr": 0.0024505755903694226, "rougeLsum_fmeasure": 0.21899459401932223, "rougeLsum_fmeasure_stderr": 0.002102657843438912, "rougeLsum_precision": 0.17512630400560542, "rougeLsum_precision_stderr": 0.0017783983400275266, "rougeLsum_recall": 0.32141092125405557, "rougeLsum_recall_stderr": 0.0032662985068743528}}, "1": {"generate_text_restaurant": {"bleu": 12.455563496168116, "bleu_stderr": 
0.15428951986028144, "rouge1_fmeasure": 0.4844618589871223, "rouge1_fmeasure_stderr": 0.002388878380866762, "rouge1_precision": 0.5954981850070853, "rouge1_precision_stderr": 0.0032755377697093577, "rouge1_recall": 0.44832902186902923, "rouge1_recall_stderr": 0.0030688387452721486, "rouge2_fmeasure": 0.23224231794118463, "rouge2_fmeasure_stderr": 0.0020931392807158806, "rouge2_precision": 0.29040989982948295, "rouge2_precision_stderr": 0.0028204861869818784, "rouge2_recall": 0.21450317916657013, "rouge2_recall_stderr": 0.002222395267553394, "rougeL_fmeasure": 0.34990734371381305, "rougeL_fmeasure_stderr": 0.0021300316703662904, "rougeL_precision": 0.43368332329208353, "rougeL_precision_stderr": 0.003056248156608264, "rougeL_recall": 0.32270327842717356, "rougeL_recall_stderr": 0.0024894724353540653, "rougeLsum_fmeasure": 0.39499604706438096, "rougeLsum_fmeasure_stderr": 0.0023754095690034817, "rougeLsum_precision": 0.48684291022612447, "rougeLsum_precision_stderr": 0.0032370809243618716, "rougeLsum_recall": 0.3651475528395549, "rougeLsum_recall_stderr": 0.002807567486848305}}, "2": {"generate_text_restaurant": {"bleu": 15.187891510159636, "bleu_stderr": 0.2005340847158173, "rouge1_fmeasure": 0.5149400564373232, "rouge1_fmeasure_stderr": 0.002281960188799717, "rouge1_precision": 0.6089225605030864, "rouge1_precision_stderr": 0.0031749506024181815, "rouge1_recall": 0.4850253270309617, "rouge1_recall_stderr": 0.0029712295443402523, "rouge2_fmeasure": 0.26064964037026866, "rouge2_fmeasure_stderr": 0.002149918786285521, "rouge2_precision": 0.3125612676733449, "rouge2_precision_stderr": 0.002807570058211188, "rouge2_recall": 0.24520830668256238, "rouge2_recall_stderr": 0.0023166169808109976, "rougeL_fmeasure": 0.3791251783681887, "rougeL_fmeasure_stderr": 0.0021638392150190966, "rougeL_precision": 0.45032557438452553, "rougeL_precision_stderr": 0.002995769757075173, "rougeL_recall": 0.3564750276333542, "rougeL_recall_stderr": 0.002545550505071048, "rougeLsum_fmeasure": 
0.4311876269636124, "rougeLsum_fmeasure_stderr": 0.0024222285980316075, "rougeLsum_precision": 0.5099476065779874, "rougeLsum_precision_stderr": 0.003210910234266457, "rougeLsum_recall": 0.40596954377109, "rougeLsum_recall_stderr": 0.002856919494329858}}, "3": {"generate_text_restaurant": {"bleu": 16.060659394975822, "bleu_stderr": 0.17284642655126536, "rouge1_fmeasure": 0.5239878618101873, "rouge1_fmeasure_stderr": 0.0022891503962730515, "rouge1_precision": 0.6148474805598093, "rouge1_precision_stderr": 0.003175147795291498, "rouge1_recall": 0.4938729679272436, "rouge1_recall_stderr": 0.0029612490025592896, "rouge2_fmeasure": 0.27038735747786175, "rouge2_fmeasure_stderr": 0.002196480015942097, "rouge2_precision": 0.32141615003361074, "rouge2_precision_stderr": 0.00284369777288176, "rouge2_recall": 0.2546854150197406, "rouge2_recall_stderr": 0.0023748524797935944, "rougeL_fmeasure": 0.3858853477225852, "rougeL_fmeasure_stderr": 0.002195201839278644, "rougeL_precision": 0.45482796385524316, "rougeL_precision_stderr": 0.0030285934039142345, "rougeL_recall": 0.3632799719766207, "rougeL_recall_stderr": 0.0025747603467030653, "rougeLsum_fmeasure": 0.44212298577697895, "rougeLsum_fmeasure_stderr": 0.0024314611034434628, "rougeLsum_precision": 0.5189202264105984, "rougeLsum_precision_stderr": 0.0032211525451001028, "rougeLsum_recall": 0.41656979202046523, "rougeLsum_recall_stderr": 0.0028606302793073527}}, "4": {"generate_text_restaurant": {"bleu": 16.35064977606228, "bleu_stderr": 0.15318703610951076, "rouge1_fmeasure": 0.5272948463681918, "rouge1_fmeasure_stderr": 0.002276632828321617, "rouge1_precision": 0.6141579219291876, "rouge1_precision_stderr": 0.0031375276423873367, "rouge1_recall": 0.49642790420160215, "rouge1_recall_stderr": 0.002890279129158977, "rouge2_fmeasure": 0.2730606307438099, "rouge2_fmeasure_stderr": 0.002234438055141524, "rouge2_precision": 0.3216796674918701, "rouge2_precision_stderr": 0.0028452669990831196, "rouge2_recall": 0.2567141692534647, 
"rouge2_recall_stderr": 0.002372105255220152, "rougeL_fmeasure": 0.38874867977872807, "rougeL_fmeasure_stderr": 0.0021952914125996665, "rougeL_precision": 0.45388329842280073, "rougeL_precision_stderr": 0.0029409250663779253, "rougeL_recall": 0.36571295934350256, "rougeL_recall_stderr": 0.0025294206995360793, "rougeLsum_fmeasure": 0.44624941166081916, "rougeLsum_fmeasure_stderr": 0.0024399419515776116, "rougeLsum_precision": 0.5193905306564236, "rougeLsum_precision_stderr": 0.0031731739172339655, "rougeLsum_recall": 0.4201701863764282, "rougeLsum_recall_stderr": 0.0028340856821376906}}, "5": {"generate_text_restaurant": {"bleu": 16.33989554218624, "bleu_stderr": 0.17192498787094718, "rouge1_fmeasure": 0.5272012483217712, "rouge1_fmeasure_stderr": 0.0022727033449929193, "rouge1_precision": 0.6138887802372733, "rouge1_precision_stderr": 0.003148762043973577, "rouge1_recall": 0.49579908715969256, "rouge1_recall_stderr": 0.002855704257905023, "rouge2_fmeasure": 0.2739732304449431, "rouge2_fmeasure_stderr": 0.0022083117263179994, "rouge2_precision": 0.32296239014683903, "rouge2_precision_stderr": 0.002836651143474931, "rouge2_recall": 0.25721295886814427, "rouge2_recall_stderr": 0.0023323863742251865, "rougeL_fmeasure": 0.39053942113540757, "rougeL_fmeasure_stderr": 0.0022084680095602566, "rougeL_precision": 0.4557756119232901, "rougeL_precision_stderr": 0.002957529253678058, "rougeL_recall": 0.36723275796263094, "rougeL_recall_stderr": 0.002545583277155397, "rougeLsum_fmeasure": 0.44667560043167237, "rougeLsum_fmeasure_stderr": 0.002425231007898886, "rougeLsum_precision": 0.520413988660044, "rougeLsum_precision_stderr": 0.003202124211985598, "rougeLsum_recall": 0.41985936033161986, "rougeLsum_recall_stderr": 0.002793790259009851}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9172897233734647, "bleu_stderr": 0.07612979258741162, "rouge1_fmeasure": 0.19497689185235167, "rouge1_fmeasure_stderr": 0.003037459759774, "rouge1_precision": 0.14704474756783373, 
"rouge1_precision_stderr": 0.00260243994472115, "rouge1_recall": 0.3179426585120001, "rouge1_recall_stderr": 0.005091359061361595, "rouge2_fmeasure": 0.04545175235793978, "rouge2_fmeasure_stderr": 0.0016613895167432173, "rouge2_precision": 0.033133611529725375, "rouge2_precision_stderr": 0.0012222134099385885, "rouge2_recall": 0.07643755617885226, "rouge2_recall_stderr": 0.002852925969928893, "rougeL_fmeasure": 0.14329622002037548, "rougeL_fmeasure_stderr": 0.002251083734524946, "rougeL_precision": 0.10872152825582168, "rougeL_precision_stderr": 0.0020985136406316504, "rougeL_recall": 0.23420658227277613, "rougeL_recall_stderr": 0.0038579727721169913, "rougeLsum_fmeasure": 0.15671768300152755, "rougeLsum_fmeasure_stderr": 0.0024938518539829066, "rougeLsum_precision": 0.11878580563146314, "rougeLsum_precision_stderr": 0.0022791200374934033, "rougeLsum_recall": 0.2561567987942721, "rougeLsum_recall_stderr": 0.004250537243559008}}, "1": {"article_DOC_summary": {"bleu": 2.9617243214867686, "bleu_stderr": 0.2121625238447871, "rouge1_fmeasure": 0.2429048922678839, "rouge1_fmeasure_stderr": 0.0035470294629619853, "rouge1_precision": 0.2506117820029391, "rouge1_precision_stderr": 0.004197193925421806, "rouge1_recall": 0.265051427357207, "rouge1_recall_stderr": 0.004113838412416427, "rouge2_fmeasure": 0.05697362472735911, "rouge2_fmeasure_stderr": 0.002319852424027265, "rouge2_precision": 0.059355725931601024, "rouge2_precision_stderr": 0.0026146656506255265, "rouge2_recall": 0.06240115907434781, "rouge2_recall_stderr": 0.002528707836330792, "rougeL_fmeasure": 0.1806939148235656, "rougeL_fmeasure_stderr": 0.002897868234740968, "rougeL_precision": 0.18675397233452354, "rougeL_precision_stderr": 0.003437969762757092, "rougeL_recall": 0.19744938072178414, "rougeL_recall_stderr": 0.0033460143453592026, "rougeLsum_fmeasure": 0.18340626109421165, "rougeLsum_fmeasure_stderr": 0.002921137289513508, "rougeLsum_precision": 0.18896297297256576, "rougeLsum_precision_stderr": 
0.003436743475620793, "rougeLsum_recall": 0.20147068575801652, "rougeLsum_recall_stderr": 0.0034574795296360844}}, "2": {"article_DOC_summary": {"bleu": 3.790180948775114, "bleu_stderr": 0.21949698226608796, "rouge1_fmeasure": 0.2684969037756941, "rouge1_fmeasure_stderr": 0.003538515661007479, "rouge1_precision": 0.2862450933885503, "rouge1_precision_stderr": 0.0042447208222623255, "rouge1_recall": 0.2741637614630613, "rouge1_recall_stderr": 0.0038690227267741645, "rouge2_fmeasure": 0.06850101702956525, "rouge2_fmeasure_stderr": 0.002469575816155894, "rouge2_precision": 0.07370429393444561, "rouge2_precision_stderr": 0.002782856999759695, "rouge2_recall": 0.07014627514180007, "rouge2_recall_stderr": 0.002596637645417416, "rougeL_fmeasure": 0.20209449252255896, "rougeL_fmeasure_stderr": 0.003025686120342655, "rougeL_precision": 0.21544246954500543, "rougeL_precision_stderr": 0.003580176542199536, "rougeL_recall": 0.20674126511922747, "rougeL_recall_stderr": 0.0032887383484612206, "rougeLsum_fmeasure": 0.20333325134792413, "rougeLsum_fmeasure_stderr": 0.0030149542501899957, "rougeLsum_precision": 0.21658328451330233, "rougeLsum_precision_stderr": 0.003567997453456091, "rougeLsum_recall": 0.2083566653493878, "rougeLsum_recall_stderr": 0.0032948366397107273}}, "3": {"article_DOC_summary": {"bleu": 3.892045553878533, "bleu_stderr": 0.15338529710936155, "rouge1_fmeasure": 0.25539660890379623, "rouge1_fmeasure_stderr": 0.00408774094209769, "rouge1_precision": 0.27838096040655913, "rouge1_precision_stderr": 0.004710736944579292, "rouge1_recall": 0.256517363989891, "rouge1_recall_stderr": 0.0043630766599847375, "rouge2_fmeasure": 0.06904105989761854, "rouge2_fmeasure_stderr": 0.0026844346914174943, "rouge2_precision": 0.07429604893692207, "rouge2_precision_stderr": 0.0029381253193927104, "rouge2_recall": 0.06987562130359767, "rouge2_recall_stderr": 0.002769991072408768, "rougeL_fmeasure": 0.19320917420626382, "rougeL_fmeasure_stderr": 0.0033971194258629626, 
"rougeL_precision": 0.21060638862089187, "rougeL_precision_stderr": 0.003887329749618103, "rougeL_recall": 0.19474922081143328, "rougeL_recall_stderr": 0.0036536236006364338, "rougeLsum_fmeasure": 0.19420752036343683, "rougeLsum_fmeasure_stderr": 0.0034107307485400193, "rougeLsum_precision": 0.21158267461706523, "rougeLsum_precision_stderr": 0.003892248180779862, "rougeLsum_recall": 0.1959669205905632, "rougeLsum_recall_stderr": 0.0036938624037444975}}, "4": {"article_DOC_summary": {"bleu": 0.0793183178082136, "bleu_stderr": 0.033794375862044715, "rouge1_fmeasure": 0.05924716020491812, "rouge1_fmeasure_stderr": 0.0036326176527612343, "rouge1_precision": 0.06908582074094831, "rouge1_precision_stderr": 0.004346958400381964, "rouge1_recall": 0.0576659953376829, "rouge1_recall_stderr": 0.0036525448833407675, "rouge2_fmeasure": 0.015889553201926813, "rouge2_fmeasure_stderr": 0.001483207722000358, "rouge2_precision": 0.018055976337857795, "rouge2_precision_stderr": 0.001722423057966192, "rouge2_recall": 0.015572542649221232, "rouge2_recall_stderr": 0.0014824376881666455, "rougeL_fmeasure": 0.04481653988120624, "rougeL_fmeasure_stderr": 0.0028149978857180923, "rougeL_precision": 0.0526906303370273, "rougeL_precision_stderr": 0.00345962655866441, "rougeL_recall": 0.04371274927728761, "rougeL_recall_stderr": 0.00285083893180292, "rougeLsum_fmeasure": 0.044863049752252554, "rougeLsum_fmeasure_stderr": 0.002827990480856111, "rougeLsum_precision": 0.05278495458510948, "rougeLsum_precision_stderr": 0.0034751372117411933, "rougeLsum_recall": 0.04367091698209838, "rougeLsum_recall_stderr": 0.002855118245232499}}, "5": {"article_DOC_summary": {"bleu": 4.872051358237048e-56, "bleu_stderr": 1.0994911419227211e-38, "rouge1_fmeasure": 0.002561173851810196, "rouge1_fmeasure_stderr": 0.0008604007460830072, "rouge1_precision": 0.002843906337499315, "rouge1_precision_stderr": 0.0009956544746623187, "rouge1_recall": 0.0024927455220140884, "rouge1_recall_stderr": 0.0008268054962235649, 
"rouge2_fmeasure": 0.00047911010108716033, "rouge2_fmeasure_stderr": 0.00025331584786903913, "rouge2_precision": 0.0005943099210680686, "rouge2_precision_stderr": 0.00031949865564033644, "rouge2_recall": 0.0004189368714895907, "rouge2_recall_stderr": 0.00021998461769511408, "rougeL_fmeasure": 0.0017324699968285832, "rougeL_fmeasure_stderr": 0.0005921585827612698, "rougeL_precision": 0.0019474370851628444, "rougeL_precision_stderr": 0.000688363115056785, "rougeL_recall": 0.0016525178188876207, "rougeL_recall_stderr": 0.0005530966506647648, "rougeLsum_fmeasure": 0.0017324699968285832, "rougeLsum_fmeasure_stderr": 0.0005921585827612698, "rougeLsum_precision": 0.0019474370851628444, "rougeLsum_precision_stderr": 0.000688363115056785, "rougeLsum_recall": 0.0016525178188876207, "rougeLsum_recall_stderr": 0.0005530966506647648}}}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5e06a11bbbf9f0ece85ce170b0d1ac013d0f68e3 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.31947645525912327, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0038030155885502574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.26784034634508286, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003035447513908994 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.24919698955059236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 
0.002338885547952402 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.10032559893960638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024161327250120574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07957621599352918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017828844980089247 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.07449487544541136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0015020810496653879 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.2468115494253994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0032025411700306672 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20411776076410448, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024277091219038756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.18970698519609924, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018593336285269075 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.3009096197940564, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0036645356865323267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.251048984299023, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002859798214873002 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.23380521600569548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022238011394774266 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.480739832822098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11578330856151871 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..18f14f27236cd5a5fcc5eaece19b4f24b66b8a2f --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2871095429771904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004185534949827048 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22207861869609608, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033297349416556445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 
0.21276268379590219, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002745499539353653 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08919737395825947, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00234436562476581 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06668770358204071, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017001378203776699 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06395005883547085, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014814659874998457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.22428029391025195, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034506563669751745 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.17117251055926017, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002654839665946902 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.16381135583162096, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0021664607071098315 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.27186005782650335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004024882931968101 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2092333223784394, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031542921648456132 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20067896546125685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0026118338406594167 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.416486588059767, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05871038450450248 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..afcc8cb1da3bd4989dc0a686dc165f75a25cf7d7 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09854206688714345, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0036437541450359593 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07142922091930162, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 
0.0026886044555192072 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06934635599486277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002419160037746626 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03200012684242646, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001809559832121984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02183618792959388, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011694185534509838 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.021348512828022314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001055380041694426 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07858578853182602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003017215712567151 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.055804214936503324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021264630991137314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05429349387264579, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001919877615338037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0930444540425215, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003479952193050714 + }, + { + 
"task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06690767452071034, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025201480735759655 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06504921478523501, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002272098654651633 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.08573923963165347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01281159560771889 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f835bf9a5122df2b9856c121a66df77e907a40a9 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.019085080359257173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018850577941172406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 
0.011645843404748045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012042546813690255 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.012068612050969258, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001175572535963405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.006894559713384093, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010187138526863938 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003894968708693124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000532555761738665 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.004052279635441424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005200626541191212 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.016020963520094682, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016304077510915223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.00943399649665596, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000984307352962584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.009809956651925644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0009599920054622314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.018298677104570716, + 
"dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001820476829671769 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.011022020415497191, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011405019701246609 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.011452198317124414, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011152369987060954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 7.249514036066506e-16, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.339336428482265e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..58ac14274a537ad144c1f94230caaa90e6963555 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.27838096040655913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rouge1_precision_stderr": 0.004710736944579292 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.256517363989891, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0043630766599847375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.25539660890379623, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00408774094209769 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.07429604893692207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0029381253193927104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06987562130359767, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002769991072408768 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.06904105989761854, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0026844346914174943 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.21060638862089187, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003887329749618103 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.19474922081143328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036536236006364338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.19320917420626382, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0033971194258629626 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rougeLsum_precision": 0.21158267461706523, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003892248180779862 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.1959669205905632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036938624037444975 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.19420752036343683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0034107307485400193 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.892045553878533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.15338529710936155 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..25110e2a7ff18c9d3a7ef86955d3c7bb296bbffa --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06908582074094831, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004346958400381964 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0576659953376829, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036525448833407675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05924716020491812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0036326176527612343 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.018055976337857795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001722423057966192 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.015572542649221232, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0014824376881666455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.015889553201926813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001483207722000358 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0526906303370273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00345962655866441 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.04371274927728761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00285083893180292 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04481653988120624, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0028149978857180923 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05278495458510948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0034751372117411933 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04367091698209838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002855118245232499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.044863049752252554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002827990480856111 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.0793183178082136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.033794375862044715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..7ccd1d5ac5c470e26057fd2956c5f87ac5bcb15f --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002843906337499315, 
+ "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009956544746623187 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0024927455220140884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0008268054962235649 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002561173851810196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008604007460830072 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0005943099210680686, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00031949865564033644 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0004189368714895907, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00021998461769511408 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00047911010108716033, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00025331584786903913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0019474370851628444, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000688363115056785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0016525178188876207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005530966506647648 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0017324699968285832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + 
"rougeL_fmeasure_stderr": 0.0005921585827612698 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0019474370851628444, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000688363115056785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0016525178188876207, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005530966506647648 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0017324699968285832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005921585827612698 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 4.872051358237048e-56, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.0994911419227211e-38 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d061faa8c22e9ad747a9dc14b908a413383e6add --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rouge1_precision": 0.26704682686051745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003346061664825914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28723900360273236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030648378055716376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24048738248155943, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022924559211728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07760883007326022, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018992689472675252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07930488429722847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge2_recall_stderr": 0.0016958561414342646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0668489557832933, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013397924519285302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2005421334534283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00272610264193461}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.2147121574085488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023921943267692113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17873946616791977, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017622551083101064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.25120585518065286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", 
"prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032054522314792226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2689555721836676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002872035212266304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22548796825222664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002166294099617983}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.287770006656117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11835519853257852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json 
b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..01427b5018aca5bde05e0e80509023cbf0f15be8 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.24693572848760356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036808375259794837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.24354376203183603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034150456640924024}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.21120410580427312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002645585016214612}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07085834748957563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 
0.001874517454430376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0691245743627346, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017265463033494647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05941725493999427, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013446993130627589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.18619149092636542, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029566758253007693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.18275407900501345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002678372152846199}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1574847757182755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": 
"{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020235834993420653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.23179629616820094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003517652101595812}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.22762811767303537, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031986554975203343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19767422939876575, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00249381697530238}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.8253545690458486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11601169899524703}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..575330d19f389e004b69a18947128f61a06512ac --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.08491288146025691, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003092982485525366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.08011242516269225, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028916714775573983}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.069564585739879, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in 
English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002354877693129134}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02565888234744232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014806953806721635}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.023735577183813514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012482652822662537}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.02020024245182335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009720348342506413}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06624573735046896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025139925050591855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06166301644336901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": 
"en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002275982977075628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0532771556440879, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001820427400127109}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07976639037062659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029385774245711256}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07471788462506215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002705709953549525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06493333871562736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022026091321757347}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"bleu": 0.2364078795997809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.024391601699964672}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c1335c6103a81836438007103e4b31f093e205a5 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.015915821872377674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016702869678876702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011688601517501347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011952240401007063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010622171790435694, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010302186253132802}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.005627833349821597, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009009236122981561}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0037129642663769444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005184603335382291}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0035013576004744333, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00047232485179348453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.013277596011797644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": 
null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014699518837369332}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009310479193227725, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009665047884074748}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008452079824288578, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008242289394208465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.015271784138576902, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001626300881272708}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010986042036294772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011291877819487723}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rougeLsum_fmeasure": 0.009996482786372373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009674414655995565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.099893037305623e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.420929985001087e-11}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_3.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa6100a1b704b7772084152d14b02b142686ab3 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.22724912362863622, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004617605872422519}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.26380458901019693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004323907531182028}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.22535248615052844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003791735783514838}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.06038128094861811, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002741435955597867}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06597238885255258, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0026568422218367894}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05815173892712961, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002414756084543016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.17388874536583043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0037665824275065446}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2036906299119507, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035595683530041146}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.17298927032856967, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031337802161657533}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.17573844022643165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003788816404981867}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.20748309723587363, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037272616939512094}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17531050854597707, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003177216988533208}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.911154273868146, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15584859367239043}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_4.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..59b36c3ac4b7383d32946cb94c32a19e535a3107 --- /dev/null +++ 
b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06384997695789031, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004081417327764998}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06700890326086541, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00398883760575528}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.058891655759634634, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034788169061019582}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.01497412391522327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015535957922032293}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.016447549994388213, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015411299403550512}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.014176278266192046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013476830056293153}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.04936509536592748, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003202638138772514}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.05218241102055928, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003145249972269267}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.045492505164083784, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00272525124648742}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.049919934471052654, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003226500601808736}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05295420506561277, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032077224941783403}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04608216819829664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0027590740017696767}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.3728774440118906, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09579003392790254}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 
1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_5.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4445b309cac4ad3b6a5ca858b92c969403976bd0 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.003245026115016397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010018752109838872}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.003816108316949039, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010591733379027508}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.003209685928721461, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009321057176689659}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0006149053716742533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002603220643343989}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0005865993976426828, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00023171003012774805}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0005731642694331555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00023579174692404015}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0022714483278982152, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007108947692843597}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0028112298473500735, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008069199908163127}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.002272633054035741, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006656958601438186}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002332315692638469, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000733094697158169}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0027357899134195732, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007566942257832804}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0023210463979317724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006849794862388629}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 4.5775755670070436e-24, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.165194982722847e-20}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..356ca99b8543859c842ae199774fb681a23d86b2 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce1765e434db4f5fc6d553534a0fcd1e039b47a3aa2f1a2642a1495a0a592805 +size 18685976 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9425c5a6cc26c957c04ee5b26dc4ef21ec3461d6 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab1731c46b616e96fbfe40546e76be5088854579fdcb04b96ab0995ef69a276b +size 24132973 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d5f24f2552685eb825dd6eac79a73e8a33a472cf 100644 --- 
a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:099168b0bc8145951575dea20b44d4ff6e12d4cd7b370d1b707cde6769e2399e +size 29401562 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..77c0d1a5b1d18756e37e1c0a23b52c857edf09b3 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff38b505ed0203cca5b2a54781943532268f6cfc9cab6b12844d82c7a8becacf +size 34785938 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7ebefa3995a67d1783975e92fecc55b4883ce797 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:411be7fe6c319eff619c0424cd1b2d08fd157794cb88398318828af456cc566d +size 9511518 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4ca3cc01ddc373727d49ef61197d6684490b7c45 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl +++ 
b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:146e1d39dadc2532be20c9638d557b54a78ca954098d27de6068f96e99dcb759 +size 11641661 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e651b3e714a4f4169c54536b1252f3759c685ca8 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30aa57257e181e3be0e8190d515ecdbc3c60195be93cf8569a8305c4c5d5ea4a +size 13898477 diff --git a/8b7178b44b/evaluation/generation/merged.csv b/8b7178b44b/evaluation/generation/merged.csv index 7ffb27a96e8c80795eae284d1ad84113262e7574..fb43139daa0a766a7551262f2e0277ef6cb9de9e 100644 --- a/8b7178b44b/evaluation/generation/merged.csv +++ b/8b7178b44b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.046694043648379924 gem_xsum,1,median,rouge2_fmeasure,0.046694043648379924 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05750919277461529 gem_xsum,2,median,rouge2_fmeasure,0.05750919277461529 -gem_xsum,2,average,multiple,0.05160986059071043 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05815173892712961 +gem_xsum,3,median,rouge2_fmeasure,0.05815173892712961 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014176278266192046 +gem_xsum,4,median,rouge2_fmeasure,0.014176278266192046 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005731642694331555 +gem_xsum,5,median,rouge2_fmeasure,0.0005731642694331555 +gem_xsum,5,average,multiple,0.03795512720581435 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.051776101353314175 
web_nlg_en,0,median,rouge2_fmeasure,0.051776101353314175 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08797235452625987 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.02778920003491775 wiki_lingua_en,0,median,rouge2_fmeasure,0.02778920003491775 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.044909001327117046 wiki_lingua_en,1,median,rouge2_fmeasure,0.044909001327117046 -wiki_lingua_en,1,average,multiple,0.0363491006810174 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.0668489557832933 +wiki_lingua_en,2,median,rouge2_fmeasure,0.0668489557832933 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05941725493999427 +wiki_lingua_en,3,median,rouge2_fmeasure,0.05941725493999427 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.02020024245182335 +wiki_lingua_en,4,median,rouge2_fmeasure,0.02020024245182335 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0035013576004744333 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0035013576004744333 +wiki_lingua_en,5,average,multiple,0.03711100202293669 diff --git a/8b7178b44b/evaluation/generation/merged.json b/8b7178b44b/evaluation/generation/merged.json index e95ebd09f202c4d5ad2c5fd452d4aa862b67ea72..40756473f433d1e217a132a262ca1134db64d6a1 100644 --- a/8b7178b44b/evaluation/generation/merged.json +++ b/8b7178b44b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4712440224858371, "bleu_stderr": 0.041795896662364315, "rouge1_fmeasure": 0.11420475482093911, "rouge1_fmeasure_stderr": 0.0022183305831393195, "rouge1_precision": 0.08494278433659254, "rouge1_precision_stderr": 0.002682014199464667, "rouge1_recall": 0.3010315100774798, "rouge1_recall_stderr": 0.005434382926718196, "rouge2_fmeasure": 0.051776101353314175, "rouge2_fmeasure_stderr": 0.0012810666933093713, "rouge2_precision": 0.03708415721222578, "rouge2_precision_stderr": 0.0015055606328312114, "rouge2_recall": 0.14224175280721357, "rouge2_recall_stderr": 0.0032367826210838984, "rougeL_fmeasure": 0.10723944522759159, 
"rougeL_fmeasure_stderr": 0.002017538602140417, "rougeL_precision": 0.07986488274116094, "rougeL_precision_stderr": 0.0025364066921602967, "rougeL_recall": 0.2851084703862567, "rougeL_recall_stderr": 0.005103727035204417, "rougeLsum_fmeasure": 0.10754705129811067, "rougeLsum_fmeasure_stderr": 0.002066923006506901, "rougeLsum_precision": 0.08031203138023377, "rougeLsum_precision_stderr": 0.0025700547990470703, "rougeLsum_recall": 0.2832049885841092, "rougeLsum_recall_stderr": 0.00501983173550152}}, "1": {"PALM_prompt": {"bleu": 0.6352422858353469, "bleu_stderr": 0.04386319262846487, "rouge1_fmeasure": 0.17595828606649466, "rouge1_fmeasure_stderr": 0.0037103128590180736, "rouge1_precision": 0.14729082149250516, "rouge1_precision_stderr": 0.004315116801617204, "rouge1_recall": 0.34501701041772753, "rouge1_recall_stderr": 0.004883560454809294, "rouge2_fmeasure": 0.08797235452625987, "rouge2_fmeasure_stderr": 0.002563800160615811, "rouge2_precision": 0.0749912603946425, "rouge2_precision_stderr": 0.0029697645981284103, "rouge2_recall": 0.17507177387031794, "rouge2_recall_stderr": 0.0035680879509635034, "rougeL_fmeasure": 0.1587241385605771, "rougeL_fmeasure_stderr": 0.0031813191221144485, "rougeL_precision": 0.13087012462145003, "rougeL_precision_stderr": 0.0037359089786958557, "rougeL_recall": 0.3214469196295103, "rougeL_recall_stderr": 0.004518231473535444, "rougeLsum_fmeasure": 0.16209364614512511, "rougeLsum_fmeasure_stderr": 0.003268839028418146, "rougeLsum_precision": 0.13443162008077075, "rougeLsum_precision_stderr": 0.0038480943977239573, "rougeLsum_recall": 0.32447309188867546, "rougeLsum_recall_stderr": 0.0044995663315275155}}, "2": {"PALM_prompt": {"bleu": 0.9195810919624675, "bleu_stderr": 0.04315216107683015, "rouge1_fmeasure": 0.21485950562026987, "rouge1_fmeasure_stderr": 0.004330200790716057, "rouge1_precision": 0.19281853446958125, "rouge1_precision_stderr": 0.005348179657635172, "rouge1_recall": 0.3839239693259562, "rouge1_recall_stderr": 
0.004896794421772872, "rouge2_fmeasure": 0.11382579286837109, "rouge2_fmeasure_stderr": 0.0029731216918058084, "rouge2_precision": 0.1059225294754287, "rouge2_precision_stderr": 0.0036916446521448806, "rouge2_recall": 0.20457409251843786, "rouge2_recall_stderr": 0.0037775743582373947, "rougeL_fmeasure": 0.1905777152487244, "rougeL_fmeasure_stderr": 0.0036266046372882984, "rougeL_precision": 0.16888883147055286, "rougeL_precision_stderr": 0.00457575327708451, "rougeL_recall": 0.35302151670878984, "rougeL_recall_stderr": 0.004443535122018173, "rougeLsum_fmeasure": 0.1966781681343204, "rougeLsum_fmeasure_stderr": 0.0037869984327305512, "rougeLsum_precision": 0.17526191475368208, "rougeLsum_precision_stderr": 0.004769580382692894, "rougeLsum_recall": 0.35984651407202956, "rougeLsum_recall_stderr": 0.004507478564774203}}, "3": {"PALM_prompt": {"bleu": 1.0349566025343735, "bleu_stderr": 0.03887674612290353, "rouge1_fmeasure": 0.2237082219543607, "rouge1_fmeasure_stderr": 0.004572034500678942, "rouge1_precision": 0.20438835211537978, "rouge1_precision_stderr": 0.005665805208271208, "rouge1_recall": 0.39198992418866246, "rouge1_recall_stderr": 0.004947498335222262, "rouge2_fmeasure": 0.1207306381621409, "rouge2_fmeasure_stderr": 0.003262380279344123, "rouge2_precision": 0.11365499071190853, "rouge2_precision_stderr": 0.00397252439813096, "rouge2_recall": 0.2111302605559281, "rouge2_recall_stderr": 0.003926393767113415, "rougeL_fmeasure": 0.196789710556852, "rougeL_fmeasure_stderr": 0.0038281461566836554, "rougeL_precision": 0.17756713475764335, "rougeL_precision_stderr": 0.004833439612735641, "rougeL_recall": 0.3583014136638203, "rougeL_recall_stderr": 0.004476979554028604, "rougeLsum_fmeasure": 0.20416935580581227, "rougeLsum_fmeasure_stderr": 0.004021347054930177, "rougeLsum_precision": 0.18556820599751273, "rougeLsum_precision_stderr": 0.00508629808674153, "rougeLsum_recall": 0.3664900622762122, "rougeLsum_recall_stderr": 0.004561176586485258}}, "4": {"PALM_prompt": 
{"bleu": 1.1673998473534046, "bleu_stderr": 0.09133948510400369, "rouge1_fmeasure": 0.23196338111553216, "rouge1_fmeasure_stderr": 0.004558457306896048, "rouge1_precision": 0.2125195111528658, "rouge1_precision_stderr": 0.005730453983133926, "rouge1_recall": 0.40637639113635166, "rouge1_recall_stderr": 0.004933922377880338, "rouge2_fmeasure": 0.12559736589051002, "rouge2_fmeasure_stderr": 0.003188817274548042, "rouge2_precision": 0.11875092253811664, "rouge2_precision_stderr": 0.0039714331213516595, "rouge2_recall": 0.22248575515884728, "rouge2_recall_stderr": 0.004014145150661029, "rougeL_fmeasure": 0.2038615706419507, "rougeL_fmeasure_stderr": 0.003812860468542949, "rougeL_precision": 0.1843607825179914, "rougeL_precision_stderr": 0.0048800492055134925, "rougeL_recall": 0.370665372766866, "rougeL_recall_stderr": 0.004464677380992271, "rougeLsum_fmeasure": 0.2125920854603434, "rougeLsum_fmeasure_stderr": 0.004058013873763605, "rougeLsum_precision": 0.19370515184964268, "rougeLsum_precision_stderr": 0.005184788234116766, "rougeLsum_recall": 0.3806829975264072, "rougeLsum_recall_stderr": 0.004583640895651089}}, "5": {"PALM_prompt": {"bleu": 1.194760741144373, "bleu_stderr": 0.09038400286323779, "rouge1_fmeasure": 0.24067238350377898, "rouge1_fmeasure_stderr": 0.004627635578763758, "rouge1_precision": 0.22388107728965712, "rouge1_precision_stderr": 0.0058375878448932725, "rouge1_recall": 0.41258053843820386, "rouge1_recall_stderr": 0.004808891702805189, "rouge2_fmeasure": 0.12988590207293185, "rouge2_fmeasure_stderr": 0.003239297680717399, "rouge2_precision": 0.12470779957706617, "rouge2_precision_stderr": 0.004011608962377894, "rouge2_recall": 0.22396081182592556, "rouge2_recall_stderr": 0.003932938864761309, "rougeL_fmeasure": 0.20930811896578502, "rougeL_fmeasure_stderr": 0.0038259333936250185, "rougeL_precision": 0.19195297464964095, "rougeL_precision_stderr": 0.004914089701123312, "rougeL_recall": 0.37374160359881364, "rougeL_recall_stderr": 0.00433847098671097, 
"rougeLsum_fmeasure": 0.21923266732565286, "rougeLsum_fmeasure_stderr": 0.004087600756703223, "rougeLsum_precision": 0.20275170487892208, "rougeLsum_precision_stderr": 0.005253109590732199, "rougeLsum_recall": 0.3844755575959121, "rougeLsum_recall_stderr": 0.004448663215506157}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.170472303270534, "bleu_stderr": 0.09245843828343826, "rouge1_fmeasure": 0.10762579083182928, "rouge1_fmeasure_stderr": 0.0025835076322910672, "rouge1_precision": 0.11259234653795358, "rouge1_precision_stderr": 0.0032411862971363117, "rouge1_recall": 0.14340893843595093, "rouge1_recall_stderr": 0.0034694907773717048, "rouge2_fmeasure": 0.02778920003491775, "rouge2_fmeasure_stderr": 0.000982242456017776, "rouge2_precision": 0.02582999428983234, "rouge2_precision_stderr": 0.0010000072112516562, "rouge2_recall": 0.03784241933814092, "rouge2_recall_stderr": 0.001450453403192633, "rougeL_fmeasure": 0.08192620888873649, "rougeL_fmeasure_stderr": 0.0019339948879164794, "rougeL_precision": 0.08841484558766141, "rougeL_precision_stderr": 0.0027853934411114305, "rougeL_recall": 0.1114317644839379, "rougeL_recall_stderr": 0.0027453844256091257, "rougeLsum_fmeasure": 0.10120163014874665, "rougeLsum_fmeasure_stderr": 0.002435237757238233, "rougeLsum_precision": 0.1068082984161991, "rougeLsum_precision_stderr": 0.0031424461382987463, "rougeLsum_recall": 0.13490305217744453, "rougeLsum_recall_stderr": 0.0032803950524800847}}, "1": {"tldr_en": {"bleu": 3.020335174960559, "bleu_stderr": 0.06787715772935045, "rouge1_fmeasure": 0.18153788588353004, "rouge1_fmeasure_stderr": 0.002424482153914544, "rouge1_precision": 0.19504169763237164, "rouge1_precision_stderr": 0.003161655570483573, "rouge1_recall": 0.2268980166797919, "rouge1_recall_stderr": 0.00336143207911689, "rouge2_fmeasure": 0.044909001327117046, "rouge2_fmeasure_stderr": 0.0011943824375694858, "rouge2_precision": 0.049740220187040864, "rouge2_precision_stderr": 0.001561722025312558, "rouge2_recall": 
0.05782059703115125, "rouge2_recall_stderr": 0.0016860297998920056, "rougeL_fmeasure": 0.1341843957584949, "rougeL_fmeasure_stderr": 0.0017857657209747223, "rougeL_precision": 0.14594590659077963, "rougeL_precision_stderr": 0.002476347943750222, "rougeL_recall": 0.1691189101402509, "rougeL_recall_stderr": 0.002588936970621776, "rougeLsum_fmeasure": 0.16951014920125493, "rougeLsum_fmeasure_stderr": 0.0022640636265336893, "rougeLsum_precision": 0.18281924229155466, "rougeLsum_precision_stderr": 0.0029977938731759806, "rougeLsum_recall": 0.21164415433278383, "rougeLsum_recall_stderr": 0.0031361900114708872}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.235702101845383, "bleu_stderr": 0.14717774293969751, "rouge1_fmeasure": 0.18121867050334917, "rouge1_fmeasure_stderr": 0.0018474922816842606, "rouge1_precision": 0.7622052797070503, "rouge1_precision_stderr": 0.006647669795578922, "rouge1_recall": 0.1532954300944369, "rouge1_recall_stderr": 0.0031267049368978316, "rouge2_fmeasure": 0.08752141617950068, "rouge2_fmeasure_stderr": 0.0010542837911429734, "rouge2_precision": 0.7122733552799622, "rouge2_precision_stderr": 0.007806658717987775, "rouge2_recall": 0.07189313722539607, "rouge2_recall_stderr": 0.0017250469433453682, "rougeL_fmeasure": 0.1698326390033901, "rougeL_fmeasure_stderr": 0.0014973431260616968, "rougeL_precision": 0.7500255707124592, "rougeL_precision_stderr": 0.006889071529543362, "rougeL_recall": 0.13882842661597, "rougeL_recall_stderr": 0.0025055451424711027, "rougeLsum_fmeasure": 0.17261995880897366, "rougeLsum_fmeasure_stderr": 0.0016987141071618745, "rougeLsum_precision": 0.752191135927755, "rougeLsum_precision_stderr": 0.00685154752320934, "rougeLsum_recall": 0.14346581040848494, "rougeLsum_recall_stderr": 0.002891637286051436}}, "1": {"generate_text_restaurant": {"bleu": 12.507959373818647, "bleu_stderr": 0.12774308491764494, "rouge1_fmeasure": 0.48125855004986695, "rouge1_fmeasure_stderr": 0.0023747652067436864, 
"rouge1_precision": 0.5901037503482837, "rouge1_precision_stderr": 0.0032159724501758416, "rouge1_recall": 0.4453954075758376, "rouge1_recall_stderr": 0.0030647232677422412, "rouge2_fmeasure": 0.23034576083422903, "rouge2_fmeasure_stderr": 0.0020975083120057516, "rouge2_precision": 0.2860806077756621, "rouge2_precision_stderr": 0.0027716337523015977, "rouge2_recall": 0.21297260057374331, "rouge2_recall_stderr": 0.002224122053216537, "rougeL_fmeasure": 0.34846752107389445, "rougeL_fmeasure_stderr": 0.0021243881165946155, "rougeL_precision": 0.4308357021587011, "rougeL_precision_stderr": 0.003009420690844223, "rougeL_recall": 0.3212543906087175, "rougeL_recall_stderr": 0.002480078078958591, "rougeLsum_fmeasure": 0.3931631455827191, "rougeLsum_fmeasure_stderr": 0.0023981333184838593, "rougeLsum_precision": 0.4830396285828985, "rougeLsum_precision_stderr": 0.003213107569523447, "rougeLsum_recall": 0.3634977467417529, "rougeLsum_recall_stderr": 0.002819214726631758}}, "2": {"generate_text_restaurant": {"bleu": 15.158653678134238, "bleu_stderr": 0.11217626725691181, "rouge1_fmeasure": 0.5131292320126455, "rouge1_fmeasure_stderr": 0.002253031761361406, "rouge1_precision": 0.6066145287034505, "rouge1_precision_stderr": 0.003147539042422012, "rouge1_recall": 0.48239587619310037, "rouge1_recall_stderr": 0.0029532891322940407, "rouge2_fmeasure": 0.2578530121604481, "rouge2_fmeasure_stderr": 0.0021628607938910917, "rouge2_precision": 0.3089787427206784, "rouge2_precision_stderr": 0.0028203707876968153, "rouge2_recall": 0.24188957715036025, "rouge2_recall_stderr": 0.002292974031208558, "rougeL_fmeasure": 0.37525966202827027, "rougeL_fmeasure_stderr": 0.0021725307399361672, "rougeL_precision": 0.44560511468454206, "rougeL_precision_stderr": 0.002995038040318342, "rougeL_recall": 0.35193582043955735, "rougeL_recall_stderr": 0.0025196283655795136, "rougeLsum_fmeasure": 0.4285131129511556, "rougeLsum_fmeasure_stderr": 0.0024045780604980463, "rougeLsum_precision": 
0.5064590234844139, "rougeLsum_precision_stderr": 0.003190583549175626, "rougeLsum_recall": 0.4026845079975637, "rougeLsum_recall_stderr": 0.002836627735824717}}, "3": {"generate_text_restaurant": {"bleu": 15.759812537730546, "bleu_stderr": 0.16130432495289687, "rouge1_fmeasure": 0.5204784544081206, "rouge1_fmeasure_stderr": 0.002251275712244091, "rouge1_precision": 0.6077270325813979, "rouge1_precision_stderr": 0.0031150320109689183, "rouge1_recall": 0.49124527491366843, "rouge1_recall_stderr": 0.0029113142559981263, "rouge2_fmeasure": 0.26545692823299993, "rouge2_fmeasure_stderr": 0.0021835553197599713, "rouge2_precision": 0.31332017254120853, "rouge2_precision_stderr": 0.0027772905192189512, "rouge2_recall": 0.2502643652420692, "rouge2_recall_stderr": 0.0023315173133087275, "rougeL_fmeasure": 0.3810216920994399, "rougeL_fmeasure_stderr": 0.002194042986121944, "rougeL_precision": 0.4459205694292829, "rougeL_precision_stderr": 0.0029361567575171836, "rougeL_recall": 0.3592970906286205, "rougeL_recall_stderr": 0.0025375120589350396, "rougeLsum_fmeasure": 0.4357908055166842, "rougeLsum_fmeasure_stderr": 0.0024150704709467426, "rougeLsum_precision": 0.5082193147554575, "rougeLsum_precision_stderr": 0.003142597202698407, "rougeLsum_recall": 0.41160458845561226, "rougeLsum_recall_stderr": 0.002840955053130549}}, "4": {"generate_text_restaurant": {"bleu": 16.134799256710256, "bleu_stderr": 0.1070119918698532, "rouge1_fmeasure": 0.523909375920506, "rouge1_fmeasure_stderr": 0.0022880595790812155, "rouge1_precision": 0.6054119860771526, "rouge1_precision_stderr": 0.0031401965578103136, "rouge1_recall": 0.4946565816842518, "rouge1_recall_stderr": 0.0028364911596105765, "rouge2_fmeasure": 0.26757332273287654, "rouge2_fmeasure_stderr": 0.0022434333099258102, "rouge2_precision": 0.3124961122227526, "rouge2_precision_stderr": 0.002808954903697186, "rouge2_recall": 0.25220130152340275, "rouge2_recall_stderr": 0.0023493523906758557, "rougeL_fmeasure": 0.382175935620986, 
"rougeL_fmeasure_stderr": 0.002236237383536334, "rougeL_precision": 0.44233843950127705, "rougeL_precision_stderr": 0.0029306698549013154, "rougeL_recall": 0.3607369273273119, "rougeL_recall_stderr": 0.002523691933961996, "rougeLsum_fmeasure": 0.4381722454639816, "rougeLsum_fmeasure_stderr": 0.0024443106668905357, "rougeLsum_precision": 0.5057660051353207, "rougeLsum_precision_stderr": 0.003153645376968123, "rougeLsum_recall": 0.4139501510871044, "rougeLsum_recall_stderr": 0.002788136076678763}}, "5": {"generate_text_restaurant": {"bleu": 16.107165763126492, "bleu_stderr": 0.1725403495293288, "rouge1_fmeasure": 0.5240001141378704, "rouge1_fmeasure_stderr": 0.0022053636458810423, "rouge1_precision": 0.6037882438302521, "rouge1_precision_stderr": 0.00307437348704947, "rouge1_recall": 0.49496382545882733, "rouge1_recall_stderr": 0.0027753751046680057, "rouge2_fmeasure": 0.26817812765923393, "rouge2_fmeasure_stderr": 0.002197626024453061, "rouge2_precision": 0.31296117294303094, "rouge2_precision_stderr": 0.0027976539566726744, "rouge2_recall": 0.25262525126987617, "rouge2_recall_stderr": 0.0023072545439166217, "rougeL_fmeasure": 0.3843695820781079, "rougeL_fmeasure_stderr": 0.0022098458849370652, "rougeL_precision": 0.44434419215812615, "rougeL_precision_stderr": 0.002949180064465491, "rougeL_recall": 0.3625276939571968, "rougeL_recall_stderr": 0.0024933568400759844, "rougeLsum_fmeasure": 0.4393845856350276, "rougeLsum_fmeasure_stderr": 0.002395852175826684, "rougeLsum_precision": 0.5068534628215157, "rougeLsum_precision_stderr": 0.0031588710330076797, "rougeLsum_recall": 0.41448240798432473, "rougeLsum_recall_stderr": 0.0027203178472831524}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.1587870129118882, "bleu_stderr": 0.09029922216218629, "rouge1_fmeasure": 0.2131475561803473, "rouge1_fmeasure_stderr": 0.002809398976147748, "rouge1_precision": 0.1554932484923166, "rouge1_precision_stderr": 0.002221691062513405, "rouge1_recall": 0.35841398237522276, 
"rouge1_recall_stderr": 0.004880460975300627, "rouge2_fmeasure": 0.05062634534913608, "rouge2_fmeasure_stderr": 0.001690768330530915, "rouge2_precision": 0.036146559262545196, "rouge2_precision_stderr": 0.0012086761533876766, "rouge2_recall": 0.08868762877243597, "rouge2_recall_stderr": 0.0030835154047321074, "rougeL_fmeasure": 0.15595658881556435, "rougeL_fmeasure_stderr": 0.002127497880378741, "rougeL_precision": 0.11383026012975615, "rougeL_precision_stderr": 0.001748646718497343, "rougeL_recall": 0.2638855275835188, "rougeL_recall_stderr": 0.0038587410761858755, "rougeLsum_fmeasure": 0.16967238254142056, "rougeLsum_fmeasure_stderr": 0.00237624607133809, "rougeLsum_precision": 0.12373439304630478, "rougeLsum_precision_stderr": 0.0019078089158618033, "rougeLsum_recall": 0.2868552037024372, "rougeLsum_recall_stderr": 0.004272131408381669}}, "1": {"article_DOC_summary": {"bleu": 1.9813214649410344, "bleu_stderr": 0.06605541875330423, "rouge1_fmeasure": 0.20475174329785, "rouge1_fmeasure_stderr": 0.0030292857270426262, "rouge1_precision": 0.17602273459783957, "rouge1_precision_stderr": 0.0034912567740095035, "rouge1_recall": 0.2978672198691626, "rouge1_recall_stderr": 0.004325712043489534, "rouge2_fmeasure": 0.046694043648379924, "rouge2_fmeasure_stderr": 0.001891957305961457, "rouge2_precision": 0.04044850363752258, "rouge2_precision_stderr": 0.0019137570100629107, "rouge2_recall": 0.06884979773351274, "rouge2_recall_stderr": 0.002724070824860895, "rougeL_fmeasure": 0.1593397594005996, "rougeL_fmeasure_stderr": 0.0024118116760541847, "rougeL_precision": 0.13694272638916472, "rougeL_precision_stderr": 0.0028412360369673493, "rougeL_recall": 0.2334897227634632, "rougeL_recall_stderr": 0.003556590427451311, "rougeLsum_fmeasure": 0.15946764140405875, "rougeLsum_fmeasure_stderr": 0.0025141424269190023, "rougeLsum_precision": 0.137142234387202, "rougeLsum_precision_stderr": 0.0029073893715640745, "rougeLsum_recall": 0.23357550407307784, "rougeLsum_recall_stderr": 
0.003728086289055363}}, "2": {"article_DOC_summary": {"bleu": 2.568315279137756, "bleu_stderr": 0.13888090668923836, "rouge1_fmeasure": 0.2303304665083427, "rouge1_fmeasure_stderr": 0.003468001334593542, "rouge1_precision": 0.2229374260378376, "rouge1_precision_stderr": 0.004280928411283304, "rouge1_recall": 0.28304069580526636, "rouge1_recall_stderr": 0.003972947611177992, "rouge2_fmeasure": 0.05750919277461529, "rouge2_fmeasure_stderr": 0.002337201303458201, "rouge2_precision": 0.057639208977718646, "rouge2_precision_stderr": 0.002614759341604805, "rouge2_recall": 0.06850665995022794, "rouge2_recall_stderr": 0.0026514610571335023, "rougeL_fmeasure": 0.17795352717964225, "rougeL_fmeasure_stderr": 0.0028890194692426686, "rougeL_precision": 0.17219483167439611, "rougeL_precision_stderr": 0.003538412807763811, "rougeL_recall": 0.21973513766849492, "rougeL_recall_stderr": 0.0033149507650464807, "rougeLsum_fmeasure": 0.17909978174756772, "rougeLsum_fmeasure_stderr": 0.002921283919599673, "rougeLsum_precision": 0.17313600638730706, "rougeLsum_precision_stderr": 0.0035630889990072444, "rougeLsum_recall": 0.2218290481310322, "rougeLsum_recall_stderr": 0.0034301100563963126}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4712440224858371, "bleu_stderr": 0.041795896662364315, "rouge1_fmeasure": 0.11420475482093911, "rouge1_fmeasure_stderr": 0.0022183305831393195, "rouge1_precision": 0.08494278433659254, "rouge1_precision_stderr": 0.002682014199464667, "rouge1_recall": 0.3010315100774798, "rouge1_recall_stderr": 0.005434382926718196, "rouge2_fmeasure": 0.051776101353314175, "rouge2_fmeasure_stderr": 0.0012810666933093713, "rouge2_precision": 0.03708415721222578, "rouge2_precision_stderr": 0.0015055606328312114, "rouge2_recall": 0.14224175280721357, "rouge2_recall_stderr": 0.0032367826210838984, "rougeL_fmeasure": 0.10723944522759159, "rougeL_fmeasure_stderr": 0.002017538602140417, "rougeL_precision": 0.07986488274116094, 
"rougeL_precision_stderr": 0.0025364066921602967, "rougeL_recall": 0.2851084703862567, "rougeL_recall_stderr": 0.005103727035204417, "rougeLsum_fmeasure": 0.10754705129811067, "rougeLsum_fmeasure_stderr": 0.002066923006506901, "rougeLsum_precision": 0.08031203138023377, "rougeLsum_precision_stderr": 0.0025700547990470703, "rougeLsum_recall": 0.2832049885841092, "rougeLsum_recall_stderr": 0.00501983173550152}}, "1": {"PALM_prompt": {"bleu": 0.6352422858353469, "bleu_stderr": 0.04386319262846487, "rouge1_fmeasure": 0.17595828606649466, "rouge1_fmeasure_stderr": 0.0037103128590180736, "rouge1_precision": 0.14729082149250516, "rouge1_precision_stderr": 0.004315116801617204, "rouge1_recall": 0.34501701041772753, "rouge1_recall_stderr": 0.004883560454809294, "rouge2_fmeasure": 0.08797235452625987, "rouge2_fmeasure_stderr": 0.002563800160615811, "rouge2_precision": 0.0749912603946425, "rouge2_precision_stderr": 0.0029697645981284103, "rouge2_recall": 0.17507177387031794, "rouge2_recall_stderr": 0.0035680879509635034, "rougeL_fmeasure": 0.1587241385605771, "rougeL_fmeasure_stderr": 0.0031813191221144485, "rougeL_precision": 0.13087012462145003, "rougeL_precision_stderr": 0.0037359089786958557, "rougeL_recall": 0.3214469196295103, "rougeL_recall_stderr": 0.004518231473535444, "rougeLsum_fmeasure": 0.16209364614512511, "rougeLsum_fmeasure_stderr": 0.003268839028418146, "rougeLsum_precision": 0.13443162008077075, "rougeLsum_precision_stderr": 0.0038480943977239573, "rougeLsum_recall": 0.32447309188867546, "rougeLsum_recall_stderr": 0.0044995663315275155}}, "2": {"PALM_prompt": {"bleu": 0.9195810919624675, "bleu_stderr": 0.04315216107683015, "rouge1_fmeasure": 0.21485950562026987, "rouge1_fmeasure_stderr": 0.004330200790716057, "rouge1_precision": 0.19281853446958125, "rouge1_precision_stderr": 0.005348179657635172, "rouge1_recall": 0.3839239693259562, "rouge1_recall_stderr": 0.004896794421772872, "rouge2_fmeasure": 0.11382579286837109, "rouge2_fmeasure_stderr": 
0.0029731216918058084, "rouge2_precision": 0.1059225294754287, "rouge2_precision_stderr": 0.0036916446521448806, "rouge2_recall": 0.20457409251843786, "rouge2_recall_stderr": 0.0037775743582373947, "rougeL_fmeasure": 0.1905777152487244, "rougeL_fmeasure_stderr": 0.0036266046372882984, "rougeL_precision": 0.16888883147055286, "rougeL_precision_stderr": 0.00457575327708451, "rougeL_recall": 0.35302151670878984, "rougeL_recall_stderr": 0.004443535122018173, "rougeLsum_fmeasure": 0.1966781681343204, "rougeLsum_fmeasure_stderr": 0.0037869984327305512, "rougeLsum_precision": 0.17526191475368208, "rougeLsum_precision_stderr": 0.004769580382692894, "rougeLsum_recall": 0.35984651407202956, "rougeLsum_recall_stderr": 0.004507478564774203}}, "3": {"PALM_prompt": {"bleu": 1.0349566025343735, "bleu_stderr": 0.03887674612290353, "rouge1_fmeasure": 0.2237082219543607, "rouge1_fmeasure_stderr": 0.004572034500678942, "rouge1_precision": 0.20438835211537978, "rouge1_precision_stderr": 0.005665805208271208, "rouge1_recall": 0.39198992418866246, "rouge1_recall_stderr": 0.004947498335222262, "rouge2_fmeasure": 0.1207306381621409, "rouge2_fmeasure_stderr": 0.003262380279344123, "rouge2_precision": 0.11365499071190853, "rouge2_precision_stderr": 0.00397252439813096, "rouge2_recall": 0.2111302605559281, "rouge2_recall_stderr": 0.003926393767113415, "rougeL_fmeasure": 0.196789710556852, "rougeL_fmeasure_stderr": 0.0038281461566836554, "rougeL_precision": 0.17756713475764335, "rougeL_precision_stderr": 0.004833439612735641, "rougeL_recall": 0.3583014136638203, "rougeL_recall_stderr": 0.004476979554028604, "rougeLsum_fmeasure": 0.20416935580581227, "rougeLsum_fmeasure_stderr": 0.004021347054930177, "rougeLsum_precision": 0.18556820599751273, "rougeLsum_precision_stderr": 0.00508629808674153, "rougeLsum_recall": 0.3664900622762122, "rougeLsum_recall_stderr": 0.004561176586485258}}, "4": {"PALM_prompt": {"bleu": 1.1673998473534046, "bleu_stderr": 0.09133948510400369, "rouge1_fmeasure": 
0.23196338111553216, "rouge1_fmeasure_stderr": 0.004558457306896048, "rouge1_precision": 0.2125195111528658, "rouge1_precision_stderr": 0.005730453983133926, "rouge1_recall": 0.40637639113635166, "rouge1_recall_stderr": 0.004933922377880338, "rouge2_fmeasure": 0.12559736589051002, "rouge2_fmeasure_stderr": 0.003188817274548042, "rouge2_precision": 0.11875092253811664, "rouge2_precision_stderr": 0.0039714331213516595, "rouge2_recall": 0.22248575515884728, "rouge2_recall_stderr": 0.004014145150661029, "rougeL_fmeasure": 0.2038615706419507, "rougeL_fmeasure_stderr": 0.003812860468542949, "rougeL_precision": 0.1843607825179914, "rougeL_precision_stderr": 0.0048800492055134925, "rougeL_recall": 0.370665372766866, "rougeL_recall_stderr": 0.004464677380992271, "rougeLsum_fmeasure": 0.2125920854603434, "rougeLsum_fmeasure_stderr": 0.004058013873763605, "rougeLsum_precision": 0.19370515184964268, "rougeLsum_precision_stderr": 0.005184788234116766, "rougeLsum_recall": 0.3806829975264072, "rougeLsum_recall_stderr": 0.004583640895651089}}, "5": {"PALM_prompt": {"bleu": 1.194760741144373, "bleu_stderr": 0.09038400286323779, "rouge1_fmeasure": 0.24067238350377898, "rouge1_fmeasure_stderr": 0.004627635578763758, "rouge1_precision": 0.22388107728965712, "rouge1_precision_stderr": 0.0058375878448932725, "rouge1_recall": 0.41258053843820386, "rouge1_recall_stderr": 0.004808891702805189, "rouge2_fmeasure": 0.12988590207293185, "rouge2_fmeasure_stderr": 0.003239297680717399, "rouge2_precision": 0.12470779957706617, "rouge2_precision_stderr": 0.004011608962377894, "rouge2_recall": 0.22396081182592556, "rouge2_recall_stderr": 0.003932938864761309, "rougeL_fmeasure": 0.20930811896578502, "rougeL_fmeasure_stderr": 0.0038259333936250185, "rougeL_precision": 0.19195297464964095, "rougeL_precision_stderr": 0.004914089701123312, "rougeL_recall": 0.37374160359881364, "rougeL_recall_stderr": 0.00433847098671097, "rougeLsum_fmeasure": 0.21923266732565286, "rougeLsum_fmeasure_stderr": 
0.004087600756703223, "rougeLsum_precision": 0.20275170487892208, "rougeLsum_precision_stderr": 0.005253109590732199, "rougeLsum_recall": 0.3844755575959121, "rougeLsum_recall_stderr": 0.004448663215506157}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.170472303270534, "bleu_stderr": 0.09245843828343826, "rouge1_fmeasure": 0.10762579083182928, "rouge1_fmeasure_stderr": 0.0025835076322910672, "rouge1_precision": 0.11259234653795358, "rouge1_precision_stderr": 0.0032411862971363117, "rouge1_recall": 0.14340893843595093, "rouge1_recall_stderr": 0.0034694907773717048, "rouge2_fmeasure": 0.02778920003491775, "rouge2_fmeasure_stderr": 0.000982242456017776, "rouge2_precision": 0.02582999428983234, "rouge2_precision_stderr": 0.0010000072112516562, "rouge2_recall": 0.03784241933814092, "rouge2_recall_stderr": 0.001450453403192633, "rougeL_fmeasure": 0.08192620888873649, "rougeL_fmeasure_stderr": 0.0019339948879164794, "rougeL_precision": 0.08841484558766141, "rougeL_precision_stderr": 0.0027853934411114305, "rougeL_recall": 0.1114317644839379, "rougeL_recall_stderr": 0.0027453844256091257, "rougeLsum_fmeasure": 0.10120163014874665, "rougeLsum_fmeasure_stderr": 0.002435237757238233, "rougeLsum_precision": 0.1068082984161991, "rougeLsum_precision_stderr": 0.0031424461382987463, "rougeLsum_recall": 0.13490305217744453, "rougeLsum_recall_stderr": 0.0032803950524800847}}, "1": {"tldr_en": {"bleu": 3.020335174960559, "bleu_stderr": 0.06787715772935045, "rouge1_fmeasure": 0.18153788588353004, "rouge1_fmeasure_stderr": 0.002424482153914544, "rouge1_precision": 0.19504169763237164, "rouge1_precision_stderr": 0.003161655570483573, "rouge1_recall": 0.2268980166797919, "rouge1_recall_stderr": 0.00336143207911689, "rouge2_fmeasure": 0.044909001327117046, "rouge2_fmeasure_stderr": 0.0011943824375694858, "rouge2_precision": 0.049740220187040864, "rouge2_precision_stderr": 0.001561722025312558, "rouge2_recall": 0.05782059703115125, "rouge2_recall_stderr": 0.0016860297998920056, 
"rougeL_fmeasure": 0.1341843957584949, "rougeL_fmeasure_stderr": 0.0017857657209747223, "rougeL_precision": 0.14594590659077963, "rougeL_precision_stderr": 0.002476347943750222, "rougeL_recall": 0.1691189101402509, "rougeL_recall_stderr": 0.002588936970621776, "rougeLsum_fmeasure": 0.16951014920125493, "rougeLsum_fmeasure_stderr": 0.0022640636265336893, "rougeLsum_precision": 0.18281924229155466, "rougeLsum_precision_stderr": 0.0029977938731759806, "rougeLsum_recall": 0.21164415433278383, "rougeLsum_recall_stderr": 0.0031361900114708872}}, "2": {"tldr_en": {"bleu": 4.287770006656117, "bleu_stderr": 0.11835519853257852, "rouge1_fmeasure": 0.24048738248155943, "rouge1_fmeasure_stderr": 0.0022924559211728, "rouge1_precision": 0.26704682686051745, "rouge1_precision_stderr": 0.003346061664825914, "rouge1_recall": 0.28723900360273236, "rouge1_recall_stderr": 0.0030648378055716376, "rouge2_fmeasure": 0.0668489557832933, "rouge2_fmeasure_stderr": 0.0013397924519285302, "rouge2_precision": 0.07760883007326022, "rouge2_precision_stderr": 0.0018992689472675252, "rouge2_recall": 0.07930488429722847, "rouge2_recall_stderr": 0.0016958561414342646, "rougeL_fmeasure": 0.17873946616791977, "rougeL_fmeasure_stderr": 0.0017622551083101064, "rougeL_precision": 0.2005421334534283, "rougeL_precision_stderr": 0.00272610264193461, "rougeL_recall": 0.2147121574085488, "rougeL_recall_stderr": 0.0023921943267692113, "rougeLsum_fmeasure": 0.22548796825222664, "rougeLsum_fmeasure_stderr": 0.002166294099617983, "rougeLsum_precision": 0.25120585518065286, "rougeLsum_precision_stderr": 0.0032054522314792226, "rougeLsum_recall": 0.2689555721836676, "rougeLsum_recall_stderr": 0.002872035212266304}}, "3": {"tldr_en": {"bleu": 3.8253545690458486, "bleu_stderr": 0.11601169899524703, "rouge1_fmeasure": 0.21120410580427312, "rouge1_fmeasure_stderr": 0.002645585016214612, "rouge1_precision": 0.24693572848760356, "rouge1_precision_stderr": 0.0036808375259794837, "rouge1_recall": 0.24354376203183603, 
"rouge1_recall_stderr": 0.0034150456640924024, "rouge2_fmeasure": 0.05941725493999427, "rouge2_fmeasure_stderr": 0.0013446993130627589, "rouge2_precision": 0.07085834748957563, "rouge2_precision_stderr": 0.001874517454430376, "rouge2_recall": 0.0691245743627346, "rouge2_recall_stderr": 0.0017265463033494647, "rougeL_fmeasure": 0.1574847757182755, "rougeL_fmeasure_stderr": 0.0020235834993420653, "rougeL_precision": 0.18619149092636542, "rougeL_precision_stderr": 0.0029566758253007693, "rougeL_recall": 0.18275407900501345, "rougeL_recall_stderr": 0.002678372152846199, "rougeLsum_fmeasure": 0.19767422939876575, "rougeLsum_fmeasure_stderr": 0.00249381697530238, "rougeLsum_precision": 0.23179629616820094, "rougeLsum_precision_stderr": 0.003517652101595812, "rougeLsum_recall": 0.22762811767303537, "rougeLsum_recall_stderr": 0.0031986554975203343}}, "4": {"tldr_en": {"bleu": 0.2364078795997809, "bleu_stderr": 0.024391601699964672, "rouge1_fmeasure": 0.069564585739879, "rouge1_fmeasure_stderr": 0.002354877693129134, "rouge1_precision": 0.08491288146025691, "rouge1_precision_stderr": 0.003092982485525366, "rouge1_recall": 0.08011242516269225, "rouge1_recall_stderr": 0.0028916714775573983, "rouge2_fmeasure": 0.02020024245182335, "rouge2_fmeasure_stderr": 0.0009720348342506413, "rouge2_precision": 0.02565888234744232, "rouge2_precision_stderr": 0.0014806953806721635, "rouge2_recall": 0.023735577183813514, "rouge2_recall_stderr": 0.0012482652822662537, "rougeL_fmeasure": 0.0532771556440879, "rougeL_fmeasure_stderr": 0.001820427400127109, "rougeL_precision": 0.06624573735046896, "rougeL_precision_stderr": 0.0025139925050591855, "rougeL_recall": 0.06166301644336901, "rougeL_recall_stderr": 0.002275982977075628, "rougeLsum_fmeasure": 0.06493333871562736, "rougeLsum_fmeasure_stderr": 0.0022026091321757347, "rougeLsum_precision": 0.07976639037062659, "rougeLsum_precision_stderr": 0.0029385774245711256, "rougeLsum_recall": 0.07471788462506215, "rougeLsum_recall_stderr": 
0.002705709953549525}}, "5": {"tldr_en": {"bleu": 2.099893037305623e-12, "bleu_stderr": 2.420929985001087e-11, "rouge1_fmeasure": 0.010622171790435694, "rouge1_fmeasure_stderr": 0.0010302186253132802, "rouge1_precision": 0.015915821872377674, "rouge1_precision_stderr": 0.0016702869678876702, "rouge1_recall": 0.011688601517501347, "rouge1_recall_stderr": 0.0011952240401007063, "rouge2_fmeasure": 0.0035013576004744333, "rouge2_fmeasure_stderr": 0.00047232485179348453, "rouge2_precision": 0.005627833349821597, "rouge2_precision_stderr": 0.0009009236122981561, "rouge2_recall": 0.0037129642663769444, "rouge2_recall_stderr": 0.0005184603335382291, "rougeL_fmeasure": 0.008452079824288578, "rougeL_fmeasure_stderr": 0.0008242289394208465, "rougeL_precision": 0.013277596011797644, "rougeL_precision_stderr": 0.0014699518837369332, "rougeL_recall": 0.009310479193227725, "rougeL_recall_stderr": 0.0009665047884074748, "rougeLsum_fmeasure": 0.009996482786372373, "rougeLsum_fmeasure_stderr": 0.0009674414655995565, "rougeLsum_precision": 0.015271784138576902, "rougeLsum_precision_stderr": 0.001626300881272708, "rougeLsum_recall": 0.010986042036294772, "rougeLsum_recall_stderr": 0.0011291877819487723}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.235702101845383, "bleu_stderr": 0.14717774293969751, "rouge1_fmeasure": 0.18121867050334917, "rouge1_fmeasure_stderr": 0.0018474922816842606, "rouge1_precision": 0.7622052797070503, "rouge1_precision_stderr": 0.006647669795578922, "rouge1_recall": 0.1532954300944369, "rouge1_recall_stderr": 0.0031267049368978316, "rouge2_fmeasure": 0.08752141617950068, "rouge2_fmeasure_stderr": 0.0010542837911429734, "rouge2_precision": 0.7122733552799622, "rouge2_precision_stderr": 0.007806658717987775, "rouge2_recall": 0.07189313722539607, "rouge2_recall_stderr": 0.0017250469433453682, "rougeL_fmeasure": 0.1698326390033901, "rougeL_fmeasure_stderr": 0.0014973431260616968, "rougeL_precision": 0.7500255707124592, 
"rougeL_precision_stderr": 0.006889071529543362, "rougeL_recall": 0.13882842661597, "rougeL_recall_stderr": 0.0025055451424711027, "rougeLsum_fmeasure": 0.17261995880897366, "rougeLsum_fmeasure_stderr": 0.0016987141071618745, "rougeLsum_precision": 0.752191135927755, "rougeLsum_precision_stderr": 0.00685154752320934, "rougeLsum_recall": 0.14346581040848494, "rougeLsum_recall_stderr": 0.002891637286051436}}, "1": {"generate_text_restaurant": {"bleu": 12.507959373818647, "bleu_stderr": 0.12774308491764494, "rouge1_fmeasure": 0.48125855004986695, "rouge1_fmeasure_stderr": 0.0023747652067436864, "rouge1_precision": 0.5901037503482837, "rouge1_precision_stderr": 0.0032159724501758416, "rouge1_recall": 0.4453954075758376, "rouge1_recall_stderr": 0.0030647232677422412, "rouge2_fmeasure": 0.23034576083422903, "rouge2_fmeasure_stderr": 0.0020975083120057516, "rouge2_precision": 0.2860806077756621, "rouge2_precision_stderr": 0.0027716337523015977, "rouge2_recall": 0.21297260057374331, "rouge2_recall_stderr": 0.002224122053216537, "rougeL_fmeasure": 0.34846752107389445, "rougeL_fmeasure_stderr": 0.0021243881165946155, "rougeL_precision": 0.4308357021587011, "rougeL_precision_stderr": 0.003009420690844223, "rougeL_recall": 0.3212543906087175, "rougeL_recall_stderr": 0.002480078078958591, "rougeLsum_fmeasure": 0.3931631455827191, "rougeLsum_fmeasure_stderr": 0.0023981333184838593, "rougeLsum_precision": 0.4830396285828985, "rougeLsum_precision_stderr": 0.003213107569523447, "rougeLsum_recall": 0.3634977467417529, "rougeLsum_recall_stderr": 0.002819214726631758}}, "2": {"generate_text_restaurant": {"bleu": 15.158653678134238, "bleu_stderr": 0.11217626725691181, "rouge1_fmeasure": 0.5131292320126455, "rouge1_fmeasure_stderr": 0.002253031761361406, "rouge1_precision": 0.6066145287034505, "rouge1_precision_stderr": 0.003147539042422012, "rouge1_recall": 0.48239587619310037, "rouge1_recall_stderr": 0.0029532891322940407, "rouge2_fmeasure": 0.2578530121604481, 
"rouge2_fmeasure_stderr": 0.0021628607938910917, "rouge2_precision": 0.3089787427206784, "rouge2_precision_stderr": 0.0028203707876968153, "rouge2_recall": 0.24188957715036025, "rouge2_recall_stderr": 0.002292974031208558, "rougeL_fmeasure": 0.37525966202827027, "rougeL_fmeasure_stderr": 0.0021725307399361672, "rougeL_precision": 0.44560511468454206, "rougeL_precision_stderr": 0.002995038040318342, "rougeL_recall": 0.35193582043955735, "rougeL_recall_stderr": 0.0025196283655795136, "rougeLsum_fmeasure": 0.4285131129511556, "rougeLsum_fmeasure_stderr": 0.0024045780604980463, "rougeLsum_precision": 0.5064590234844139, "rougeLsum_precision_stderr": 0.003190583549175626, "rougeLsum_recall": 0.4026845079975637, "rougeLsum_recall_stderr": 0.002836627735824717}}, "3": {"generate_text_restaurant": {"bleu": 15.759812537730546, "bleu_stderr": 0.16130432495289687, "rouge1_fmeasure": 0.5204784544081206, "rouge1_fmeasure_stderr": 0.002251275712244091, "rouge1_precision": 0.6077270325813979, "rouge1_precision_stderr": 0.0031150320109689183, "rouge1_recall": 0.49124527491366843, "rouge1_recall_stderr": 0.0029113142559981263, "rouge2_fmeasure": 0.26545692823299993, "rouge2_fmeasure_stderr": 0.0021835553197599713, "rouge2_precision": 0.31332017254120853, "rouge2_precision_stderr": 0.0027772905192189512, "rouge2_recall": 0.2502643652420692, "rouge2_recall_stderr": 0.0023315173133087275, "rougeL_fmeasure": 0.3810216920994399, "rougeL_fmeasure_stderr": 0.002194042986121944, "rougeL_precision": 0.4459205694292829, "rougeL_precision_stderr": 0.0029361567575171836, "rougeL_recall": 0.3592970906286205, "rougeL_recall_stderr": 0.0025375120589350396, "rougeLsum_fmeasure": 0.4357908055166842, "rougeLsum_fmeasure_stderr": 0.0024150704709467426, "rougeLsum_precision": 0.5082193147554575, "rougeLsum_precision_stderr": 0.003142597202698407, "rougeLsum_recall": 0.41160458845561226, "rougeLsum_recall_stderr": 0.002840955053130549}}, "4": {"generate_text_restaurant": {"bleu": 16.134799256710256, 
"bleu_stderr": 0.1070119918698532, "rouge1_fmeasure": 0.523909375920506, "rouge1_fmeasure_stderr": 0.0022880595790812155, "rouge1_precision": 0.6054119860771526, "rouge1_precision_stderr": 0.0031401965578103136, "rouge1_recall": 0.4946565816842518, "rouge1_recall_stderr": 0.0028364911596105765, "rouge2_fmeasure": 0.26757332273287654, "rouge2_fmeasure_stderr": 0.0022434333099258102, "rouge2_precision": 0.3124961122227526, "rouge2_precision_stderr": 0.002808954903697186, "rouge2_recall": 0.25220130152340275, "rouge2_recall_stderr": 0.0023493523906758557, "rougeL_fmeasure": 0.382175935620986, "rougeL_fmeasure_stderr": 0.002236237383536334, "rougeL_precision": 0.44233843950127705, "rougeL_precision_stderr": 0.0029306698549013154, "rougeL_recall": 0.3607369273273119, "rougeL_recall_stderr": 0.002523691933961996, "rougeLsum_fmeasure": 0.4381722454639816, "rougeLsum_fmeasure_stderr": 0.0024443106668905357, "rougeLsum_precision": 0.5057660051353207, "rougeLsum_precision_stderr": 0.003153645376968123, "rougeLsum_recall": 0.4139501510871044, "rougeLsum_recall_stderr": 0.002788136076678763}}, "5": {"generate_text_restaurant": {"bleu": 16.107165763126492, "bleu_stderr": 0.1725403495293288, "rouge1_fmeasure": 0.5240001141378704, "rouge1_fmeasure_stderr": 0.0022053636458810423, "rouge1_precision": 0.6037882438302521, "rouge1_precision_stderr": 0.00307437348704947, "rouge1_recall": 0.49496382545882733, "rouge1_recall_stderr": 0.0027753751046680057, "rouge2_fmeasure": 0.26817812765923393, "rouge2_fmeasure_stderr": 0.002197626024453061, "rouge2_precision": 0.31296117294303094, "rouge2_precision_stderr": 0.0027976539566726744, "rouge2_recall": 0.25262525126987617, "rouge2_recall_stderr": 0.0023072545439166217, "rougeL_fmeasure": 0.3843695820781079, "rougeL_fmeasure_stderr": 0.0022098458849370652, "rougeL_precision": 0.44434419215812615, "rougeL_precision_stderr": 0.002949180064465491, "rougeL_recall": 0.3625276939571968, "rougeL_recall_stderr": 0.0024933568400759844, 
"rougeLsum_fmeasure": 0.4393845856350276, "rougeLsum_fmeasure_stderr": 0.002395852175826684, "rougeLsum_precision": 0.5068534628215157, "rougeLsum_precision_stderr": 0.0031588710330076797, "rougeLsum_recall": 0.41448240798432473, "rougeLsum_recall_stderr": 0.0027203178472831524}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.1587870129118882, "bleu_stderr": 0.09029922216218629, "rouge1_fmeasure": 0.2131475561803473, "rouge1_fmeasure_stderr": 0.002809398976147748, "rouge1_precision": 0.1554932484923166, "rouge1_precision_stderr": 0.002221691062513405, "rouge1_recall": 0.35841398237522276, "rouge1_recall_stderr": 0.004880460975300627, "rouge2_fmeasure": 0.05062634534913608, "rouge2_fmeasure_stderr": 0.001690768330530915, "rouge2_precision": 0.036146559262545196, "rouge2_precision_stderr": 0.0012086761533876766, "rouge2_recall": 0.08868762877243597, "rouge2_recall_stderr": 0.0030835154047321074, "rougeL_fmeasure": 0.15595658881556435, "rougeL_fmeasure_stderr": 0.002127497880378741, "rougeL_precision": 0.11383026012975615, "rougeL_precision_stderr": 0.001748646718497343, "rougeL_recall": 0.2638855275835188, "rougeL_recall_stderr": 0.0038587410761858755, "rougeLsum_fmeasure": 0.16967238254142056, "rougeLsum_fmeasure_stderr": 0.00237624607133809, "rougeLsum_precision": 0.12373439304630478, "rougeLsum_precision_stderr": 0.0019078089158618033, "rougeLsum_recall": 0.2868552037024372, "rougeLsum_recall_stderr": 0.004272131408381669}}, "1": {"article_DOC_summary": {"bleu": 1.9813214649410344, "bleu_stderr": 0.06605541875330423, "rouge1_fmeasure": 0.20475174329785, "rouge1_fmeasure_stderr": 0.0030292857270426262, "rouge1_precision": 0.17602273459783957, "rouge1_precision_stderr": 0.0034912567740095035, "rouge1_recall": 0.2978672198691626, "rouge1_recall_stderr": 0.004325712043489534, "rouge2_fmeasure": 0.046694043648379924, "rouge2_fmeasure_stderr": 0.001891957305961457, "rouge2_precision": 0.04044850363752258, "rouge2_precision_stderr": 0.0019137570100629107, 
"rouge2_recall": 0.06884979773351274, "rouge2_recall_stderr": 0.002724070824860895, "rougeL_fmeasure": 0.1593397594005996, "rougeL_fmeasure_stderr": 0.0024118116760541847, "rougeL_precision": 0.13694272638916472, "rougeL_precision_stderr": 0.0028412360369673493, "rougeL_recall": 0.2334897227634632, "rougeL_recall_stderr": 0.003556590427451311, "rougeLsum_fmeasure": 0.15946764140405875, "rougeLsum_fmeasure_stderr": 0.0025141424269190023, "rougeLsum_precision": 0.137142234387202, "rougeLsum_precision_stderr": 0.0029073893715640745, "rougeLsum_recall": 0.23357550407307784, "rougeLsum_recall_stderr": 0.003728086289055363}}, "2": {"article_DOC_summary": {"bleu": 2.568315279137756, "bleu_stderr": 0.13888090668923836, "rouge1_fmeasure": 0.2303304665083427, "rouge1_fmeasure_stderr": 0.003468001334593542, "rouge1_precision": 0.2229374260378376, "rouge1_precision_stderr": 0.004280928411283304, "rouge1_recall": 0.28304069580526636, "rouge1_recall_stderr": 0.003972947611177992, "rouge2_fmeasure": 0.05750919277461529, "rouge2_fmeasure_stderr": 0.002337201303458201, "rouge2_precision": 0.057639208977718646, "rouge2_precision_stderr": 0.002614759341604805, "rouge2_recall": 0.06850665995022794, "rouge2_recall_stderr": 0.0026514610571335023, "rougeL_fmeasure": 0.17795352717964225, "rougeL_fmeasure_stderr": 0.0028890194692426686, "rougeL_precision": 0.17219483167439611, "rougeL_precision_stderr": 0.003538412807763811, "rougeL_recall": 0.21973513766849492, "rougeL_recall_stderr": 0.0033149507650464807, "rougeLsum_fmeasure": 0.17909978174756772, "rougeLsum_fmeasure_stderr": 0.002921283919599673, "rougeLsum_precision": 0.17313600638730706, "rougeLsum_precision_stderr": 0.0035630889990072444, "rougeLsum_recall": 0.2218290481310322, "rougeLsum_recall_stderr": 0.0034301100563963126}}, "3": {"article_DOC_summary": {"bleu": 2.911154273868146, "bleu_stderr": 0.15584859367239043, "rouge1_fmeasure": 0.22535248615052844, "rouge1_fmeasure_stderr": 0.003791735783514838, "rouge1_precision": 
0.22724912362863622, "rouge1_precision_stderr": 0.004617605872422519, "rouge1_recall": 0.26380458901019693, "rouge1_recall_stderr": 0.004323907531182028, "rouge2_fmeasure": 0.05815173892712961, "rouge2_fmeasure_stderr": 0.002414756084543016, "rouge2_precision": 0.06038128094861811, "rouge2_precision_stderr": 0.002741435955597867, "rouge2_recall": 0.06597238885255258, "rouge2_recall_stderr": 0.0026568422218367894, "rougeL_fmeasure": 0.17298927032856967, "rougeL_fmeasure_stderr": 0.0031337802161657533, "rougeL_precision": 0.17388874536583043, "rougeL_precision_stderr": 0.0037665824275065446, "rougeL_recall": 0.2036906299119507, "rougeL_recall_stderr": 0.0035595683530041146, "rougeLsum_fmeasure": 0.17531050854597707, "rougeLsum_fmeasure_stderr": 0.003177216988533208, "rougeLsum_precision": 0.17573844022643165, "rougeLsum_precision_stderr": 0.003788816404981867, "rougeLsum_recall": 0.20748309723587363, "rougeLsum_recall_stderr": 0.0037272616939512094}}, "4": {"article_DOC_summary": {"bleu": 0.3728774440118906, "bleu_stderr": 0.09579003392790254, "rouge1_fmeasure": 0.058891655759634634, "rouge1_fmeasure_stderr": 0.0034788169061019582, "rouge1_precision": 0.06384997695789031, "rouge1_precision_stderr": 0.004081417327764998, "rouge1_recall": 0.06700890326086541, "rouge1_recall_stderr": 0.00398883760575528, "rouge2_fmeasure": 0.014176278266192046, "rouge2_fmeasure_stderr": 0.0013476830056293153, "rouge2_precision": 0.01497412391522327, "rouge2_precision_stderr": 0.0015535957922032293, "rouge2_recall": 0.016447549994388213, "rouge2_recall_stderr": 0.0015411299403550512, "rougeL_fmeasure": 0.045492505164083784, "rougeL_fmeasure_stderr": 0.00272525124648742, "rougeL_precision": 0.04936509536592748, "rougeL_precision_stderr": 0.003202638138772514, "rougeL_recall": 0.05218241102055928, "rougeL_recall_stderr": 0.003145249972269267, "rougeLsum_fmeasure": 0.04608216819829664, "rougeLsum_fmeasure_stderr": 0.0027590740017696767, "rougeLsum_precision": 0.049919934471052654, 
"rougeLsum_precision_stderr": 0.003226500601808736, "rougeLsum_recall": 0.05295420506561277, "rougeLsum_recall_stderr": 0.0032077224941783403}}, "5": {"article_DOC_summary": {"bleu": 4.5775755670070436e-24, "bleu_stderr": 1.165194982722847e-20, "rouge1_fmeasure": 0.003209685928721461, "rouge1_fmeasure_stderr": 0.0009321057176689659, "rouge1_precision": 0.003245026115016397, "rouge1_precision_stderr": 0.0010018752109838872, "rouge1_recall": 0.003816108316949039, "rouge1_recall_stderr": 0.0010591733379027508, "rouge2_fmeasure": 0.0005731642694331555, "rouge2_fmeasure_stderr": 0.00023579174692404015, "rouge2_precision": 0.0006149053716742533, "rouge2_precision_stderr": 0.0002603220643343989, "rouge2_recall": 0.0005865993976426828, "rouge2_recall_stderr": 0.00023171003012774805, "rougeL_fmeasure": 0.002272633054035741, "rougeL_fmeasure_stderr": 0.0006656958601438186, "rougeL_precision": 0.0022714483278982152, "rougeL_precision_stderr": 0.0007108947692843597, "rougeL_recall": 0.0028112298473500735, "rougeL_recall_stderr": 0.0008069199908163127, "rougeLsum_fmeasure": 0.0023210463979317724, "rougeLsum_fmeasure_stderr": 0.0006849794862388629, "rougeLsum_precision": 0.002332315692638469, "rougeLsum_precision_stderr": 0.000733094697158169, "rougeLsum_recall": 0.0027357899134195732, "rougeLsum_recall_stderr": 0.0007566942257832804}}}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0436d105ed31292e1525a67e952483a82ce8571b --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.26704682686051745, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge1_precision_stderr": 0.003346061664825914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.28723900360273236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030648378055716376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.24048738248155943, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0022924559211728 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07760883007326022, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0018992689472675252 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07930488429722847, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016958561414342646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0668489557832933, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013397924519285302 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.2005421334534283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00272610264193461 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2147121574085488, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023921943267692113 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17873946616791977, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017622551083101064 + }, + 
{ + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.25120585518065286, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0032054522314792226 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2689555721836676, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002872035212266304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.22548796825222664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002166294099617983 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.287770006656117, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11835519853257852 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..acbed32b765204cf144a423241be00a3b0b67008 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 
0.24693572848760356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0036808375259794837 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24354376203183603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0034150456640924024 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.21120410580427312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002645585016214612 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07085834748957563, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001874517454430376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0691245743627346, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017265463033494647 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05941725493999427, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013446993130627589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.18619149092636542, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0029566758253007693 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18275407900501345, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002678372152846199 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1574847757182755, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020235834993420653 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.23179629616820094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003517652101595812 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22762811767303537, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0031986554975203343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19767422939876575, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00249381697530238 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.8253545690458486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11601169899524703 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a1ee2bc4c83726c5176715382176dde336d6478d --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 
+1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.08491288146025691, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003092982485525366 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08011242516269225, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028916714775573983 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.069564585739879, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002354877693129134 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02565888234744232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014806953806721635 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.023735577183813514, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012482652822662537 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.02020024245182335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009720348342506413 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.06624573735046896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0025139925050591855 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.06166301644336901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002275982977075628 + }, + { + "task_name": 
"GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0532771556440879, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001820427400127109 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.07976639037062659, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0029385774245711256 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07471788462506215, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002705709953549525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06493333871562736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022026091321757347 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.2364078795997809, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.024391601699964672 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..501ab08bdd6edb6a6a89985c95b871231ef63849 
--- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.015915821872377674, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016702869678876702 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011688601517501347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011952240401007063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010622171790435694, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010302186253132802 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.005627833349821597, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009009236122981561 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0037129642663769444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005184603335382291 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0035013576004744333, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00047232485179348453 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.013277596011797644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014699518837369332 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009310479193227725, + "dataset_path": 
"GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009665047884074748 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008452079824288578, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008242289394208465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.015271784138576902, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001626300881272708 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010986042036294772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011291877819487723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009996482786372373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009674414655995565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.099893037305623e-12, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.420929985001087e-11 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_3.json 
b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d043f2182bc6e4deb8093b50e2bb052f8f474680 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.22724912362863622, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004617605872422519 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.26380458901019693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004323907531182028 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22535248615052844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003791735783514838 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.06038128094861811, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.002741435955597867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06597238885255258, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0026568422218367894 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05815173892712961, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.002414756084543016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.17388874536583043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0037665824275065446 + }, + 
{ + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2036906299119507, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035595683530041146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.17298927032856967, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0031337802161657533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.17573844022643165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003788816404981867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.20748309723587363, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037272616939512094 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17531050854597707, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003177216988533208 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.911154273868146, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.15584859367239043 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_4.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a921247d34aa883baad008847607631ad82e4257 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06384997695789031, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004081417327764998 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.06700890326086541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00398883760575528 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.058891655759634634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0034788169061019582 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01497412391522327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0015535957922032293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.016447549994388213, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015411299403550512 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.014176278266192046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013476830056293153 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.04936509536592748, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003202638138772514 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.05218241102055928, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003145249972269267 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.045492505164083784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00272525124648742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.049919934471052654, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003226500601808736 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.05295420506561277, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0032077224941783403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04608216819829664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0027590740017696767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.3728774440118906, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09579003392790254 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No 
newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_5.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2efa759bf4b515d83572a4abf2266649da4b0045 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.003245026115016397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0010018752109838872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.003816108316949039, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0010591733379027508 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.003209685928721461, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0009321057176689659 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0006149053716742533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002603220643343989 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0005865993976426828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00023171003012774805 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0005731642694331555, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00023579174692404015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 
0.0022714483278982152, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0007108947692843597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0028112298473500735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0008069199908163127 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.002272633054035741, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0006656958601438186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002332315692638469, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000733094697158169 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0027357899134195732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0007566942257832804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0023210463979317724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006849794862388629 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 4.5775755670070436e-24, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.165194982722847e-20 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + 
"limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2fd23189bdfb01def8ec0f4595119c840e88891a --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.31402819280145294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00372043085747532}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.27476531134514864, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003004834706287235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2502603578204361, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002293574688170618}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0964374050617497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023942829869708893}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07950609744170317, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017736713632069314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.07312196156858046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015175238657054205}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.23993957539542324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031070932808128005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20656602583791495, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024047364648286966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.18837114117652573, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018392223420105429}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.29611740779831397, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035725589380756783}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2582729289090492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002859502904870282}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.23530722744582147, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021867085010560715}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.611723071615721, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.09604703447972755}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..67c363da4195e1c56cbd5401ccd0c39e8e18c3d3 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.27636674426156177, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004101651110307418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22930947512843322, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003338024591740698}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2137655552336782, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002714393421558324}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08702087298573732, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0023613075212299754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06834799049896098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017608046858656524}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06389847418480928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001481197293938296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2135610881070536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033828812360183476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1743188444814511, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002648347151651851}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.162300378297079, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002115846911993043}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.260193792903579, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003915057196799965}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2155361794060783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003167125886789834}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2007815371334278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 
0.002564977939596769}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.6534705242416563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0881203300496025}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4c4edb11ce2b0a088f01bf1c866fe9073052a6 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09318115237631248, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034313497448162753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07510657766582833, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027823499468141023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06966991898451057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024061665565282536}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0306990992199194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001732779147193654}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.023135462034409776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012681089305181597}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.021410981626472696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010806600066785071}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07441428298557279, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0028438647198895358}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05853529439287516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022113797700769144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05441308852467703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019026572440828432}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08804803757911164, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003292180554603894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06984747732679251, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 
0.0025861205751654633}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06515832990606861, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022584661554748706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.14336768966011842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01664751835367322}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..46fa5506549a3a6cd809bb929d3f1d070d1a0290 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.016751606857569483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016597025910088947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.012874957063334068, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001320598626607211}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.011789826148853048, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0011310450894670711}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004789908933072763, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006572780023821424}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003932891692847358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005535394810295029}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0035301672488990635, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00046931341477467973}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.013288040221462351, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013727762159921608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.00989469802053577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010241634280214566}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.009092320709271332, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000875179657864614}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.015518838953812768, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 
0.001554235047112553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.01190208294502919, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001221212576212141}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.010891546354591043, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001040987925967451}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5519142183400993e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.955628631620419e-11}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_3.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 
0000000000000000000000000000000000000000..031fa3a3330c1097b206b057e429680713731ce1 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.262476289887743, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004621204276158766}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2404612136480006, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004125414843698027}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.24229133087381188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004031858321395662}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.06842316364282378, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0027647383711749615}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.061651170011214104, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002429297519350672}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.06268337272766085, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0024567035789010046}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.20156202346447022, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003847300881262445}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18433196409317407, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033927670396723853}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.18567208218878792, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0033333790452062923}, {"task_name": 
"gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.20245862347934387, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003853570083238628}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.18518636196696317, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034107354023354916}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.18654971088104422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003345098888199193}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.406816013953246, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.24161861472249982}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, 
"batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_4.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3224059f7a64011f6557b38d31c41120bcb20c34 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.069527922152048, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004415548165696612}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.058255960572026616, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036893676818933246}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.06037720875233413, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003756291069253472}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.01884212610032935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001864365662358469}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01541077730496209, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015005237179700724}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.016076749046209322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00155018273945325}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.05366548982395334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003589390411326478}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04435979503971948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0029246377632095007}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04601022042911696, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002972538395939391}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05393140916782405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003600323595663311}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04463374272617106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029386715942067774}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.046274903650287996, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029855052852481925}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.07621103585426778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02328218785956861}], "config": {"model": 
"hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_5.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..afe01d2bc6b1be4586d706c6489067cf8aad628c --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002519205393633051, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009010685681674164}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002553406526725967, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008674535432962121}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0024677589985080198, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": 
"Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008512467369069438}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0005748785133997738, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00032234854700264514}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00047620248785620596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002558552981869991}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0005053609550867747, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00027556435520113126}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.001782337656185266, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006875286151113505}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0016948473649144099, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", 
"dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005810537809262554}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0016858136591016738, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006017432419253131}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0018196260445811194, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006924871080774834}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0017377290115696417, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005888117009644554}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0017257035629670058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000608227728173233}, {"task_name": "gem_xsum", "prompt_name": 
"article_DOC_summary", "bleu": 1.2907915607447047e-48, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 8.181410959491275e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5e5a98d85ee83c070657333d4416f347778e110c 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6830aff08e57c83436ae685160a990367fab34910511deb7c41fa57ca22d540a +size 18606839 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a2453a227634ecc1f9789e8d08b49f9ffefe1929 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:76a913fbf2897f9422d2b5ba8010168b34cd7dbe549f731886c6d06a933ba77a +size 24070640 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8f501cd597a5ffdfa7ba2963e136a5053f9dd11b 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b031c15ee293894908b190ad65b79e1e1e3ca463f7e7a98e055ef3b06246bc5 +size 29382271 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2d3f0b6aec2d9bb1de84e59af7fd0a9a926aea37 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb4d531bbcce50895d65aa2091f4184100e6c362996fc78d9175b884d1ca9550 +size 34785918 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cdcbddbdc20fddb0d9bbab6a44975cfae37e69a9 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f56e7afe4803298b4503b99413ffddde087e02c9bab626dcc9e1e06417115ec +size 9448692 diff --git 
a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..626f370f683c92c303b41f0f35440550c93fd98a 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc07f7f8d2b5928ecc092c9d47a6e20742115ad7fbdb00c59b0f40fe3441f816 +size 11623408 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..410cc541db9352943132a64eab16f6befdd421d0 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3db29ec060198282a3b20170fbf5880a8cf0e0e60bff18e0d2df7ce31aaf72b2 +size 13897232 diff --git a/8b7178b88b/evaluation/generation/merged.csv b/8b7178b88b/evaluation/generation/merged.csv index 7f2fac6cd9121f082616c95d9cf060ed3fb3d42f..8df18a090dac05346fcd019c9a09b365a77cd0e8 100644 --- a/8b7178b88b/evaluation/generation/merged.csv +++ b/8b7178b88b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.05054236360888691 gem_xsum,1,median,rouge2_fmeasure,0.05054236360888691 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.06264124531231369 gem_xsum,2,median,rouge2_fmeasure,0.06264124531231369 -gem_xsum,2,average,multiple,0.05053264327830809 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.06268337272766085 +gem_xsum,3,median,rouge2_fmeasure,0.06268337272766085 
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.016076749046209322 +gem_xsum,4,median,rouge2_fmeasure,0.016076749046209322 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005053609550867747 +gem_xsum,5,median,rouge2_fmeasure,0.0005053609550867747 +gem_xsum,5,average,multiple,0.03847723542731354 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.049040742381146675 web_nlg_en,0,median,rouge2_fmeasure,0.049040742381146675 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08445836822356238 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03307271090107231 wiki_lingua_en,0,median,rouge2_fmeasure,0.03307271090107231 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04664136277906473 wiki_lingua_en,1,median,rouge2_fmeasure,0.04664136277906473 -wiki_lingua_en,1,average,multiple,0.03985703684006852 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07312196156858046 +wiki_lingua_en,2,median,rouge2_fmeasure,0.07312196156858046 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.06389847418480928 +wiki_lingua_en,3,median,rouge2_fmeasure,0.06389847418480928 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.021410981626472696 +wiki_lingua_en,4,median,rouge2_fmeasure,0.021410981626472696 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0035301672488990635 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0035301672488990635 +wiki_lingua_en,5,average,multiple,0.040279276384816425 diff --git a/8b7178b88b/evaluation/generation/merged.json b/8b7178b88b/evaluation/generation/merged.json index b680c0bc1329193f65aa2ced33782e6a73af1634..e7c47425b658aac739df861291733f033d63fe1b 100644 --- a/8b7178b88b/evaluation/generation/merged.json +++ b/8b7178b88b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3018678413532531, "bleu_stderr": 0.02844023781425127, "rouge1_fmeasure": 0.10892691674779821, "rouge1_fmeasure_stderr": 0.0019071235551653418, "rouge1_precision": 0.07529613947856881, "rouge1_precision_stderr": 0.001982838396838402, "rouge1_recall": 0.3128650370000908, 
"rouge1_recall_stderr": 0.0050801382033981305, "rouge2_fmeasure": 0.049040742381146675, "rouge2_fmeasure_stderr": 0.0011519113898695909, "rouge2_precision": 0.03417694669369951, "rouge2_precision_stderr": 0.001313368572681446, "rouge2_recall": 0.14493621807358514, "rouge2_recall_stderr": 0.0032191726931486194, "rougeL_fmeasure": 0.10396046493580034, "rougeL_fmeasure_stderr": 0.001778756221046735, "rougeL_precision": 0.07180450626627975, "rougeL_precision_stderr": 0.001883124040127771, "rougeL_recall": 0.30107392723259574, "rougeL_recall_stderr": 0.004911398619361963, "rougeLsum_fmeasure": 0.10404327940071008, "rougeLsum_fmeasure_stderr": 0.001820581250965178, "rougeLsum_precision": 0.0720836832710309, "rougeLsum_precision_stderr": 0.001909971490636727, "rougeLsum_recall": 0.29709085493162957, "rougeLsum_recall_stderr": 0.004719450806012623}}, "1": {"PALM_prompt": {"bleu": 0.5232740840707778, "bleu_stderr": 0.020854898797383486, "rouge1_fmeasure": 0.1654988415912321, "rouge1_fmeasure_stderr": 0.003746858529251122, "rouge1_precision": 0.14535656258632043, "rouge1_precision_stderr": 0.00467037788995567, "rouge1_recall": 0.32467105717536826, "rouge1_recall_stderr": 0.004844586825107793, "rouge2_fmeasure": 0.08445836822356238, "rouge2_fmeasure_stderr": 0.0026512482133539865, "rouge2_precision": 0.07725516531902364, "rouge2_precision_stderr": 0.0033727094011323654, "rouge2_recall": 0.16618004223187002, "rouge2_recall_stderr": 0.003573100663317745, "rougeL_fmeasure": 0.1515457985267535, "rougeL_fmeasure_stderr": 0.003242282817029173, "rougeL_precision": 0.1318447100821655, "rougeL_precision_stderr": 0.004146806578291652, "rougeL_recall": 0.30561460626596343, "rougeL_recall_stderr": 0.004514632290906585, "rougeLsum_fmeasure": 0.15355200233921673, "rougeLsum_fmeasure_stderr": 0.0032933307640919023, "rougeLsum_precision": 0.1339783021316465, "rougeLsum_precision_stderr": 0.004209899048276762, "rougeLsum_recall": 0.3075713065076715, "rougeLsum_recall_stderr": 
0.004515232151240997}}, "2": {"PALM_prompt": {"bleu": 0.8216652782232776, "bleu_stderr": 0.04062910442416713, "rouge1_fmeasure": 0.2027123169825771, "rouge1_fmeasure_stderr": 0.004395624756947157, "rouge1_precision": 0.18008691481284958, "rouge1_precision_stderr": 0.0052757038925836395, "rouge1_recall": 0.3711673998047281, "rouge1_recall_stderr": 0.00487879034297811, "rouge2_fmeasure": 0.10928693373297448, "rouge2_fmeasure_stderr": 0.003154986876758784, "rouge2_precision": 0.10008962637316307, "rouge2_precision_stderr": 0.0037529405438827225, "rouge2_recall": 0.20016469360773192, "rouge2_recall_stderr": 0.003813720451519567, "rougeL_fmeasure": 0.18244327655886838, "rougeL_fmeasure_stderr": 0.003749619328233065, "rougeL_precision": 0.15996735275766777, "rougeL_precision_stderr": 0.004577626944141576, "rougeL_recall": 0.3460684822761284, "rougeL_recall_stderr": 0.004492766954524856, "rougeLsum_fmeasure": 0.18698187602734404, "rougeLsum_fmeasure_stderr": 0.0038921774054845996, "rougeLsum_precision": 0.1650914247834489, "rougeLsum_precision_stderr": 0.00477544492053463, "rougeLsum_recall": 0.3505702667663343, "rougeLsum_recall_stderr": 0.004548048493270462}}, "3": {"PALM_prompt": {"bleu": 0.8830672069742221, "bleu_stderr": 0.028000457075678158, "rouge1_fmeasure": 0.21071007543003908, "rouge1_fmeasure_stderr": 0.004463218149424481, "rouge1_precision": 0.188815921549058, "rouge1_precision_stderr": 0.005405782039170827, "rouge1_recall": 0.38372152833073453, "rouge1_recall_stderr": 0.004859603610175178, "rouge2_fmeasure": 0.11292635699291721, "rouge2_fmeasure_stderr": 0.0031281095189860052, "rouge2_precision": 0.10432859230180822, "rouge2_precision_stderr": 0.003707124967324291, "rouge2_recall": 0.2043813472297843, "rouge2_recall_stderr": 0.003801503662629156, "rougeL_fmeasure": 0.18796910977796713, "rougeL_fmeasure_stderr": 0.003750496425693158, "rougeL_precision": 0.16609714125737446, "rougeL_precision_stderr": 0.004603337178258819, "rougeL_recall": 0.3553325064502732, 
"rougeL_recall_stderr": 0.0044309664783451575, "rougeLsum_fmeasure": 0.1924829285681012, "rougeLsum_fmeasure_stderr": 0.0038880601940687624, "rougeLsum_precision": 0.17111254219249275, "rougeLsum_precision_stderr": 0.004780700019520514, "rougeLsum_recall": 0.36026081429848233, "rougeLsum_recall_stderr": 0.004488184055350965}}, "4": {"PALM_prompt": {"bleu": 1.0358671081792514, "bleu_stderr": 0.04263566752630496, "rouge1_fmeasure": 0.22245843694786877, "rouge1_fmeasure_stderr": 0.0045583402573195686, "rouge1_precision": 0.19989615669545296, "rouge1_precision_stderr": 0.005599957130646531, "rouge1_recall": 0.40336166710123866, "rouge1_recall_stderr": 0.004818721124125534, "rouge2_fmeasure": 0.12100972003049645, "rouge2_fmeasure_stderr": 0.0032743404066677153, "rouge2_precision": 0.1124336236373582, "rouge2_precision_stderr": 0.003935741095779227, "rouge2_recall": 0.21819990615637125, "rouge2_recall_stderr": 0.00385618079142143, "rougeL_fmeasure": 0.19760963957512798, "rougeL_fmeasure_stderr": 0.00383542084139647, "rougeL_precision": 0.17492321519899293, "rougeL_precision_stderr": 0.004773378480561351, "rougeL_recall": 0.37224158952457587, "rougeL_recall_stderr": 0.004377948019839408, "rougeLsum_fmeasure": 0.20440757227679296, "rougeLsum_fmeasure_stderr": 0.004022054785395138, "rougeLsum_precision": 0.18235385466161755, "rougeLsum_precision_stderr": 0.005021694344891093, "rougeLsum_recall": 0.3793687467005697, "rougeLsum_recall_stderr": 0.004423507289060084}}, "5": {"PALM_prompt": {"bleu": 1.204171361933669, "bleu_stderr": 0.07613961296168276, "rouge1_fmeasure": 0.2366466068769033, "rouge1_fmeasure_stderr": 0.0048055385839361016, "rouge1_precision": 0.21975255516712816, "rouge1_precision_stderr": 0.0060000476488098085, "rouge1_recall": 0.4066635344523642, "rouge1_recall_stderr": 0.0048262902071662655, "rouge2_fmeasure": 0.13171103508163604, "rouge2_fmeasure_stderr": 0.0034923443640983974, "rouge2_precision": 0.1275911476917841, "rouge2_precision_stderr": 
0.004309205748397978, "rouge2_recall": 0.2235792080539432, "rouge2_recall_stderr": 0.003950954664759423, "rougeL_fmeasure": 0.2088535130444153, "rougeL_fmeasure_stderr": 0.004052842030593682, "rougeL_precision": 0.19131328031391062, "rougeL_precision_stderr": 0.005134255771123987, "rougeL_recall": 0.37336352525066424, "rougeL_recall_stderr": 0.0044216260882892515, "rougeLsum_fmeasure": 0.21625717477809833, "rougeLsum_fmeasure_stderr": 0.004260506456488727, "rougeLsum_precision": 0.1997593788485136, "rougeLsum_precision_stderr": 0.005406525039653976, "rougeLsum_recall": 0.3803285154915348, "rougeLsum_recall_stderr": 0.004488995109880062}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.7397274944771954, "bleu_stderr": 0.10039261451919343, "rouge1_fmeasure": 0.1238366627407439, "rouge1_fmeasure_stderr": 0.0026637547390690967, "rouge1_precision": 0.12462462729719587, "rouge1_precision_stderr": 0.0031826046662497306, "rouge1_recall": 0.16449607884346923, "rouge1_recall_stderr": 0.003588981148112544, "rouge2_fmeasure": 0.03307271090107231, "rouge2_fmeasure_stderr": 0.0010639050828558138, "rouge2_precision": 0.030579818116461738, "rouge2_precision_stderr": 0.001114447996676589, "rouge2_recall": 0.045348988871228374, "rouge2_recall_stderr": 0.0015776302264520795, "rougeL_fmeasure": 0.0940193069900903, "rougeL_fmeasure_stderr": 0.001985212421750187, "rougeL_precision": 0.09689369540491309, "rougeL_precision_stderr": 0.0026833442642966994, "rougeL_recall": 0.1273558135465041, "rougeL_recall_stderr": 0.002838117953205465, "rougeLsum_fmeasure": 0.11606269945283079, "rougeLsum_fmeasure_stderr": 0.0025094061265404873, "rougeLsum_precision": 0.11762669226995941, "rougeLsum_precision_stderr": 0.0030721629727453585, "rougeLsum_recall": 0.15435308236518808, "rougeLsum_recall_stderr": 0.003394722386908597}}, "1": {"tldr_en": {"bleu": 2.980212171285624, "bleu_stderr": 0.08986977717018042, "rouge1_fmeasure": 0.18706368194466189, "rouge1_fmeasure_stderr": 0.0023760001514770367, 
"rouge1_precision": 0.2303358349811005, "rouge1_precision_stderr": 0.003690727542733754, "rouge1_recall": 0.2154464843014795, "rouge1_recall_stderr": 0.00312287409246033, "rouge2_fmeasure": 0.04664136277906473, "rouge2_fmeasure_stderr": 0.0012607234019787598, "rouge2_precision": 0.0623131824874716, "rouge2_precision_stderr": 0.0021997319954224622, "rouge2_recall": 0.05366477872164223, "rouge2_recall_stderr": 0.0015294014686067932, "rougeL_fmeasure": 0.14101873450061833, "rougeL_fmeasure_stderr": 0.0017902413086473704, "rougeL_precision": 0.1774352175816246, "rougeL_precision_stderr": 0.003056714448069494, "rougeL_recall": 0.16262682837459477, "rougeL_recall_stderr": 0.00239676054267987, "rougeLsum_fmeasure": 0.175474051697367, "rougeLsum_fmeasure_stderr": 0.0022207055448280263, "rougeLsum_precision": 0.21694264697187762, "rougeLsum_precision_stderr": 0.0035242626334641026, "rougeLsum_recall": 0.2020549195476134, "rougeLsum_recall_stderr": 0.0029196969560855153}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 7.805428848697077, "bleu_stderr": 0.12760294204588482, "rouge1_fmeasure": 0.3241138419996619, "rouge1_fmeasure_stderr": 0.0023488121086233404, "rouge1_precision": 0.27630659363671284, "rouge1_precision_stderr": 0.0026635424916828915, "rouge1_recall": 0.4517766787964781, "rouge1_recall_stderr": 0.003086695070399088, "rouge2_fmeasure": 0.14611347950675788, "rouge2_fmeasure_stderr": 0.001555781159556189, "rouge2_precision": 0.1264233869651961, "rouge2_precision_stderr": 0.0021472423489400424, "rouge2_recall": 0.21006531093284228, "rouge2_recall_stderr": 0.002273053133823739, "rougeL_fmeasure": 0.2782112735172708, "rougeL_fmeasure_stderr": 0.0017379173406301361, "rougeL_precision": 0.23647123285513574, "rougeL_precision_stderr": 0.0021694427334222896, "rougeL_recall": 0.39302733892651276, "rougeL_recall_stderr": 0.0026453773863021526, "rougeLsum_fmeasure": 0.2838669853732326, "rougeLsum_fmeasure_stderr": 0.0022710804975857187, 
"rougeLsum_precision": 0.24330576145803912, "rougeLsum_precision_stderr": 0.0025941508735257053, "rougeLsum_recall": 0.3957393841813685, "rougeLsum_recall_stderr": 0.0030500827793352625}}, "1": {"generate_text_restaurant": {"bleu": 12.182758345079787, "bleu_stderr": 0.16518841123830086, "rouge1_fmeasure": 0.4813086962286221, "rouge1_fmeasure_stderr": 0.0023626326089443398, "rouge1_precision": 0.6058724528221369, "rouge1_precision_stderr": 0.0033311930729353, "rouge1_recall": 0.4375685096869626, "rouge1_recall_stderr": 0.003004503828354488, "rouge2_fmeasure": 0.23028289344422687, "rouge2_fmeasure_stderr": 0.0021131218145040835, "rouge2_precision": 0.29572879977984745, "rouge2_precision_stderr": 0.0029383694383037506, "rouge2_recall": 0.20878267638207582, "rouge2_recall_stderr": 0.002207007421280634, "rougeL_fmeasure": 0.3505103154607651, "rougeL_fmeasure_stderr": 0.002138758436636428, "rougeL_precision": 0.4452525524920764, "rougeL_precision_stderr": 0.003155514665754343, "rougeL_recall": 0.31741659640932457, "rougeL_recall_stderr": 0.0024579048629975297, "rougeLsum_fmeasure": 0.39279988924924625, "rougeLsum_fmeasure_stderr": 0.002377886979824964, "rougeLsum_precision": 0.4958740637334028, "rougeLsum_precision_stderr": 0.0033266551400865404, "rougeLsum_recall": 0.356592637216114, "rougeLsum_recall_stderr": 0.0027567285724463706}}, "2": {"generate_text_restaurant": {"bleu": 14.77827774961412, "bleu_stderr": 0.17074445414549996, "rouge1_fmeasure": 0.5169724859470212, "rouge1_fmeasure_stderr": 0.0022847828063681107, "rouge1_precision": 0.6222132815532128, "rouge1_precision_stderr": 0.0032116680629870925, "rouge1_recall": 0.4773787870946972, "rouge1_recall_stderr": 0.002880300695039262, "rouge2_fmeasure": 0.2598408586362927, "rouge2_fmeasure_stderr": 0.002199049529931736, "rouge2_precision": 0.31734278580959246, "rouge2_precision_stderr": 0.002901203617815359, "rouge2_recall": 0.23934523412335545, "rouge2_recall_stderr": 0.0022881777132200292, "rougeL_fmeasure": 
0.37830224304770604, "rougeL_fmeasure_stderr": 0.0021704191814246308, "rougeL_precision": 0.45801859712166865, "rougeL_precision_stderr": 0.0030651884383412443, "rougeL_recall": 0.3483543845938943, "rougeL_recall_stderr": 0.002450753947224449, "rougeLsum_fmeasure": 0.42874770680726115, "rougeLsum_fmeasure_stderr": 0.00239536120318971, "rougeLsum_precision": 0.5166167827562745, "rougeLsum_precision_stderr": 0.003243801734266107, "rougeLsum_recall": 0.3956919145577125, "rougeLsum_recall_stderr": 0.002759335166433648}}, "3": {"generate_text_restaurant": {"bleu": 15.660507663936528, "bleu_stderr": 0.14749957372060007, "rouge1_fmeasure": 0.5280713486662109, "rouge1_fmeasure_stderr": 0.0022869985346214445, "rouge1_precision": 0.6220372006914571, "rouge1_precision_stderr": 0.0031139706058727593, "rouge1_recall": 0.4921683601168692, "rouge1_recall_stderr": 0.0029083346931655855, "rouge2_fmeasure": 0.26903286770558915, "rouge2_fmeasure_stderr": 0.0022307748577894758, "rouge2_precision": 0.31964099722948836, "rouge2_precision_stderr": 0.002777708595069122, "rouge2_recall": 0.2509916858334456, "rouge2_recall_stderr": 0.0023901154240661446, "rougeL_fmeasure": 0.38565827502841116, "rougeL_fmeasure_stderr": 0.0022144421481799675, "rougeL_precision": 0.455321793928234, "rougeL_precision_stderr": 0.0029321326061168074, "rougeL_recall": 0.35903739623304465, "rougeL_recall_stderr": 0.0025252866229682975, "rougeLsum_fmeasure": 0.4394144928671385, "rougeLsum_fmeasure_stderr": 0.002446885490860059, "rougeLsum_precision": 0.5172893895354551, "rougeLsum_precision_stderr": 0.003153574847912617, "rougeLsum_recall": 0.40954269048002123, "rougeLsum_recall_stderr": 0.002825679167152443}}, "4": {"generate_text_restaurant": {"bleu": 16.046714521986758, "bleu_stderr": 0.20143548468737246, "rouge1_fmeasure": 0.5343300631658281, "rouge1_fmeasure_stderr": 0.002276059193634907, "rouge1_precision": 0.6240126615936337, "rouge1_precision_stderr": 0.00312720979038774, "rouge1_recall": 
0.4989840309368499, "rouge1_recall_stderr": 0.0028177351641601373, "rouge2_fmeasure": 0.2724112279454815, "rouge2_fmeasure_stderr": 0.0022911561617215254, "rouge2_precision": 0.3206493529913017, "rouge2_precision_stderr": 0.0028268414398358497, "rouge2_recall": 0.2544472343665272, "rouge2_recall_stderr": 0.0024038250271984246, "rougeL_fmeasure": 0.38733097602553734, "rougeL_fmeasure_stderr": 0.0022465126119210273, "rougeL_precision": 0.4530934873979934, "rougeL_precision_stderr": 0.0029431217749683455, "rougeL_recall": 0.3614836535348617, "rougeL_recall_stderr": 0.0025050776663639207, "rougeLsum_fmeasure": 0.4465935641244627, "rougeLsum_fmeasure_stderr": 0.0024867305930155463, "rougeLsum_precision": 0.5210977780903223, "rougeLsum_precision_stderr": 0.003193746615616614, "rougeLsum_recall": 0.41707944621670306, "rougeLsum_recall_stderr": 0.002802635738849298}}, "5": {"generate_text_restaurant": {"bleu": 16.068878582309953, "bleu_stderr": 0.16081121483163321, "rouge1_fmeasure": 0.5355837269029773, "rouge1_fmeasure_stderr": 0.002256980681599985, "rouge1_precision": 0.6224102862193265, "rouge1_precision_stderr": 0.003110234568961701, "rouge1_recall": 0.500933748593845, "rouge1_recall_stderr": 0.0027924958992030795, "rouge2_fmeasure": 0.27475970807249195, "rouge2_fmeasure_stderr": 0.0022615182075459324, "rouge2_precision": 0.32190257045006593, "rouge2_precision_stderr": 0.0027959933340407817, "rouge2_recall": 0.2569886621162686, "rouge2_recall_stderr": 0.002381426382114005, "rougeL_fmeasure": 0.3914922502865105, "rougeL_fmeasure_stderr": 0.0022467536238069325, "rougeL_precision": 0.45606984360921154, "rougeL_precision_stderr": 0.002956375747033492, "rougeL_recall": 0.3657446339488385, "rougeL_recall_stderr": 0.002509353106433301, "rougeLsum_fmeasure": 0.4482824729485371, "rougeLsum_fmeasure_stderr": 0.002444818682804833, "rougeLsum_precision": 0.5210361752122326, "rougeLsum_precision_stderr": 0.003175466781222956, "rougeLsum_recall": 0.4190489396414274, 
"rougeLsum_recall_stderr": 0.0027531052100554392}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9818055644823127, "bleu_stderr": 0.13978823814879407, "rouge1_fmeasure": 0.16636180556058044, "rouge1_fmeasure_stderr": 0.003516551179676636, "rouge1_precision": 0.12872236323811218, "rouge1_precision_stderr": 0.0032462823876021106, "rouge1_recall": 0.2715393984942335, "rouge1_recall_stderr": 0.005900736678094209, "rouge2_fmeasure": 0.03841432091372367, "rouge2_fmeasure_stderr": 0.001586981058162795, "rouge2_precision": 0.028024322491153688, "rouge2_precision_stderr": 0.0011655819703985627, "rouge2_recall": 0.06517448572813263, "rouge2_recall_stderr": 0.002766560068276156, "rougeL_fmeasure": 0.1211817583352299, "rougeL_fmeasure_stderr": 0.0025835665514872593, "rougeL_precision": 0.09534495061706572, "rougeL_precision_stderr": 0.0027658531338571454, "rougeL_recall": 0.19816999370668548, "rougeL_recall_stderr": 0.004414878706261276, "rougeLsum_fmeasure": 0.13466615178498392, "rougeLsum_fmeasure_stderr": 0.002869955857600535, "rougeLsum_precision": 0.10514740754117864, "rougeLsum_precision_stderr": 0.0028961374555318475, "rougeLsum_recall": 0.22067225131355028, "rougeLsum_recall_stderr": 0.004901108311721729}}, "1": {"article_DOC_summary": {"bleu": 2.718621562424974, "bleu_stderr": 0.25634902183378955, "rouge1_fmeasure": 0.2252201544517192, "rouge1_fmeasure_stderr": 0.003572521220076497, "rouge1_precision": 0.23797749292717868, "rouge1_precision_stderr": 0.0044248829684158586, "rouge1_recall": 0.23607563998039394, "rouge1_recall_stderr": 0.0037207865629804763, "rouge2_fmeasure": 0.05054236360888691, "rouge2_fmeasure_stderr": 0.0023023749748183903, "rouge2_precision": 0.05608458865614393, "rouge2_precision_stderr": 0.0026939326656790527, "rouge2_recall": 0.0508692034245319, "rouge2_recall_stderr": 0.0022931658802725287, "rougeL_fmeasure": 0.1713099421580951, "rougeL_fmeasure_stderr": 0.0029320038478532935, "rougeL_precision": 0.18150902719333478, 
"rougeL_precision_stderr": 0.0036305277559064066, "rougeL_recall": 0.179561582857957, "rougeL_recall_stderr": 0.003045132003822578, "rougeLsum_fmeasure": 0.17382401249824367, "rougeLsum_fmeasure_stderr": 0.0029535439613312367, "rougeLsum_precision": 0.18366739499452403, "rougeLsum_precision_stderr": 0.003629882854081677, "rougeLsum_recall": 0.18297857106589824, "rougeLsum_recall_stderr": 0.003137997544172217}}, "2": {"article_DOC_summary": {"bleu": 3.6613673832300515, "bleu_stderr": 0.29305939325429936, "rouge1_fmeasure": 0.2516525371976862, "rouge1_fmeasure_stderr": 0.0037511199505683374, "rouge1_precision": 0.26927175770553385, "rouge1_precision_stderr": 0.004455761662742256, "rouge1_recall": 0.25342716562216744, "rouge1_recall_stderr": 0.0038549474849294127, "rouge2_fmeasure": 0.06264124531231369, "rouge2_fmeasure_stderr": 0.0025022071058794777, "rouge2_precision": 0.06831364960977458, "rouge2_precision_stderr": 0.0028257306475568847, "rouge2_recall": 0.06217985063286193, "rouge2_recall_stderr": 0.0025187198385362495, "rougeL_fmeasure": 0.19077644710608424, "rougeL_fmeasure_stderr": 0.003126557023491442, "rougeL_precision": 0.20401963131847098, "rougeL_precision_stderr": 0.0036937944194900436, "rougeL_recall": 0.19255452445504442, "rougeL_recall_stderr": 0.0032146416769602863, "rougeLsum_fmeasure": 0.19254279514804024, "rougeLsum_fmeasure_stderr": 0.00313403184014492, "rougeLsum_precision": 0.20564280351844771, "rougeLsum_precision_stderr": 0.003691582561213757, "rougeLsum_recall": 0.1946786446186705, "rougeLsum_recall_stderr": 0.0032424612854966943}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3018678413532531, "bleu_stderr": 0.02844023781425127, "rouge1_fmeasure": 0.10892691674779821, "rouge1_fmeasure_stderr": 0.0019071235551653418, "rouge1_precision": 0.07529613947856881, "rouge1_precision_stderr": 0.001982838396838402, "rouge1_recall": 0.3128650370000908, "rouge1_recall_stderr": 0.0050801382033981305, "rouge2_fmeasure": 
0.049040742381146675, "rouge2_fmeasure_stderr": 0.0011519113898695909, "rouge2_precision": 0.03417694669369951, "rouge2_precision_stderr": 0.001313368572681446, "rouge2_recall": 0.14493621807358514, "rouge2_recall_stderr": 0.0032191726931486194, "rougeL_fmeasure": 0.10396046493580034, "rougeL_fmeasure_stderr": 0.001778756221046735, "rougeL_precision": 0.07180450626627975, "rougeL_precision_stderr": 0.001883124040127771, "rougeL_recall": 0.30107392723259574, "rougeL_recall_stderr": 0.004911398619361963, "rougeLsum_fmeasure": 0.10404327940071008, "rougeLsum_fmeasure_stderr": 0.001820581250965178, "rougeLsum_precision": 0.0720836832710309, "rougeLsum_precision_stderr": 0.001909971490636727, "rougeLsum_recall": 0.29709085493162957, "rougeLsum_recall_stderr": 0.004719450806012623}}, "1": {"PALM_prompt": {"bleu": 0.5232740840707778, "bleu_stderr": 0.020854898797383486, "rouge1_fmeasure": 0.1654988415912321, "rouge1_fmeasure_stderr": 0.003746858529251122, "rouge1_precision": 0.14535656258632043, "rouge1_precision_stderr": 0.00467037788995567, "rouge1_recall": 0.32467105717536826, "rouge1_recall_stderr": 0.004844586825107793, "rouge2_fmeasure": 0.08445836822356238, "rouge2_fmeasure_stderr": 0.0026512482133539865, "rouge2_precision": 0.07725516531902364, "rouge2_precision_stderr": 0.0033727094011323654, "rouge2_recall": 0.16618004223187002, "rouge2_recall_stderr": 0.003573100663317745, "rougeL_fmeasure": 0.1515457985267535, "rougeL_fmeasure_stderr": 0.003242282817029173, "rougeL_precision": 0.1318447100821655, "rougeL_precision_stderr": 0.004146806578291652, "rougeL_recall": 0.30561460626596343, "rougeL_recall_stderr": 0.004514632290906585, "rougeLsum_fmeasure": 0.15355200233921673, "rougeLsum_fmeasure_stderr": 0.0032933307640919023, "rougeLsum_precision": 0.1339783021316465, "rougeLsum_precision_stderr": 0.004209899048276762, "rougeLsum_recall": 0.3075713065076715, "rougeLsum_recall_stderr": 0.004515232151240997}}, "2": {"PALM_prompt": {"bleu": 0.8216652782232776, 
"bleu_stderr": 0.04062910442416713, "rouge1_fmeasure": 0.2027123169825771, "rouge1_fmeasure_stderr": 0.004395624756947157, "rouge1_precision": 0.18008691481284958, "rouge1_precision_stderr": 0.0052757038925836395, "rouge1_recall": 0.3711673998047281, "rouge1_recall_stderr": 0.00487879034297811, "rouge2_fmeasure": 0.10928693373297448, "rouge2_fmeasure_stderr": 0.003154986876758784, "rouge2_precision": 0.10008962637316307, "rouge2_precision_stderr": 0.0037529405438827225, "rouge2_recall": 0.20016469360773192, "rouge2_recall_stderr": 0.003813720451519567, "rougeL_fmeasure": 0.18244327655886838, "rougeL_fmeasure_stderr": 0.003749619328233065, "rougeL_precision": 0.15996735275766777, "rougeL_precision_stderr": 0.004577626944141576, "rougeL_recall": 0.3460684822761284, "rougeL_recall_stderr": 0.004492766954524856, "rougeLsum_fmeasure": 0.18698187602734404, "rougeLsum_fmeasure_stderr": 0.0038921774054845996, "rougeLsum_precision": 0.1650914247834489, "rougeLsum_precision_stderr": 0.00477544492053463, "rougeLsum_recall": 0.3505702667663343, "rougeLsum_recall_stderr": 0.004548048493270462}}, "3": {"PALM_prompt": {"bleu": 0.8830672069742221, "bleu_stderr": 0.028000457075678158, "rouge1_fmeasure": 0.21071007543003908, "rouge1_fmeasure_stderr": 0.004463218149424481, "rouge1_precision": 0.188815921549058, "rouge1_precision_stderr": 0.005405782039170827, "rouge1_recall": 0.38372152833073453, "rouge1_recall_stderr": 0.004859603610175178, "rouge2_fmeasure": 0.11292635699291721, "rouge2_fmeasure_stderr": 0.0031281095189860052, "rouge2_precision": 0.10432859230180822, "rouge2_precision_stderr": 0.003707124967324291, "rouge2_recall": 0.2043813472297843, "rouge2_recall_stderr": 0.003801503662629156, "rougeL_fmeasure": 0.18796910977796713, "rougeL_fmeasure_stderr": 0.003750496425693158, "rougeL_precision": 0.16609714125737446, "rougeL_precision_stderr": 0.004603337178258819, "rougeL_recall": 0.3553325064502732, "rougeL_recall_stderr": 0.0044309664783451575, "rougeLsum_fmeasure": 
0.1924829285681012, "rougeLsum_fmeasure_stderr": 0.0038880601940687624, "rougeLsum_precision": 0.17111254219249275, "rougeLsum_precision_stderr": 0.004780700019520514, "rougeLsum_recall": 0.36026081429848233, "rougeLsum_recall_stderr": 0.004488184055350965}}, "4": {"PALM_prompt": {"bleu": 1.0358671081792514, "bleu_stderr": 0.04263566752630496, "rouge1_fmeasure": 0.22245843694786877, "rouge1_fmeasure_stderr": 0.0045583402573195686, "rouge1_precision": 0.19989615669545296, "rouge1_precision_stderr": 0.005599957130646531, "rouge1_recall": 0.40336166710123866, "rouge1_recall_stderr": 0.004818721124125534, "rouge2_fmeasure": 0.12100972003049645, "rouge2_fmeasure_stderr": 0.0032743404066677153, "rouge2_precision": 0.1124336236373582, "rouge2_precision_stderr": 0.003935741095779227, "rouge2_recall": 0.21819990615637125, "rouge2_recall_stderr": 0.00385618079142143, "rougeL_fmeasure": 0.19760963957512798, "rougeL_fmeasure_stderr": 0.00383542084139647, "rougeL_precision": 0.17492321519899293, "rougeL_precision_stderr": 0.004773378480561351, "rougeL_recall": 0.37224158952457587, "rougeL_recall_stderr": 0.004377948019839408, "rougeLsum_fmeasure": 0.20440757227679296, "rougeLsum_fmeasure_stderr": 0.004022054785395138, "rougeLsum_precision": 0.18235385466161755, "rougeLsum_precision_stderr": 0.005021694344891093, "rougeLsum_recall": 0.3793687467005697, "rougeLsum_recall_stderr": 0.004423507289060084}}, "5": {"PALM_prompt": {"bleu": 1.204171361933669, "bleu_stderr": 0.07613961296168276, "rouge1_fmeasure": 0.2366466068769033, "rouge1_fmeasure_stderr": 0.0048055385839361016, "rouge1_precision": 0.21975255516712816, "rouge1_precision_stderr": 0.0060000476488098085, "rouge1_recall": 0.4066635344523642, "rouge1_recall_stderr": 0.0048262902071662655, "rouge2_fmeasure": 0.13171103508163604, "rouge2_fmeasure_stderr": 0.0034923443640983974, "rouge2_precision": 0.1275911476917841, "rouge2_precision_stderr": 0.004309205748397978, "rouge2_recall": 0.2235792080539432, "rouge2_recall_stderr": 
0.003950954664759423, "rougeL_fmeasure": 0.2088535130444153, "rougeL_fmeasure_stderr": 0.004052842030593682, "rougeL_precision": 0.19131328031391062, "rougeL_precision_stderr": 0.005134255771123987, "rougeL_recall": 0.37336352525066424, "rougeL_recall_stderr": 0.0044216260882892515, "rougeLsum_fmeasure": 0.21625717477809833, "rougeLsum_fmeasure_stderr": 0.004260506456488727, "rougeLsum_precision": 0.1997593788485136, "rougeLsum_precision_stderr": 0.005406525039653976, "rougeLsum_recall": 0.3803285154915348, "rougeLsum_recall_stderr": 0.004488995109880062}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.7397274944771954, "bleu_stderr": 0.10039261451919343, "rouge1_fmeasure": 0.1238366627407439, "rouge1_fmeasure_stderr": 0.0026637547390690967, "rouge1_precision": 0.12462462729719587, "rouge1_precision_stderr": 0.0031826046662497306, "rouge1_recall": 0.16449607884346923, "rouge1_recall_stderr": 0.003588981148112544, "rouge2_fmeasure": 0.03307271090107231, "rouge2_fmeasure_stderr": 0.0010639050828558138, "rouge2_precision": 0.030579818116461738, "rouge2_precision_stderr": 0.001114447996676589, "rouge2_recall": 0.045348988871228374, "rouge2_recall_stderr": 0.0015776302264520795, "rougeL_fmeasure": 0.0940193069900903, "rougeL_fmeasure_stderr": 0.001985212421750187, "rougeL_precision": 0.09689369540491309, "rougeL_precision_stderr": 0.0026833442642966994, "rougeL_recall": 0.1273558135465041, "rougeL_recall_stderr": 0.002838117953205465, "rougeLsum_fmeasure": 0.11606269945283079, "rougeLsum_fmeasure_stderr": 0.0025094061265404873, "rougeLsum_precision": 0.11762669226995941, "rougeLsum_precision_stderr": 0.0030721629727453585, "rougeLsum_recall": 0.15435308236518808, "rougeLsum_recall_stderr": 0.003394722386908597}}, "1": {"tldr_en": {"bleu": 2.980212171285624, "bleu_stderr": 0.08986977717018042, "rouge1_fmeasure": 0.18706368194466189, "rouge1_fmeasure_stderr": 0.0023760001514770367, "rouge1_precision": 0.2303358349811005, "rouge1_precision_stderr": 
0.003690727542733754, "rouge1_recall": 0.2154464843014795, "rouge1_recall_stderr": 0.00312287409246033, "rouge2_fmeasure": 0.04664136277906473, "rouge2_fmeasure_stderr": 0.0012607234019787598, "rouge2_precision": 0.0623131824874716, "rouge2_precision_stderr": 0.0021997319954224622, "rouge2_recall": 0.05366477872164223, "rouge2_recall_stderr": 0.0015294014686067932, "rougeL_fmeasure": 0.14101873450061833, "rougeL_fmeasure_stderr": 0.0017902413086473704, "rougeL_precision": 0.1774352175816246, "rougeL_precision_stderr": 0.003056714448069494, "rougeL_recall": 0.16262682837459477, "rougeL_recall_stderr": 0.00239676054267987, "rougeLsum_fmeasure": 0.175474051697367, "rougeLsum_fmeasure_stderr": 0.0022207055448280263, "rougeLsum_precision": 0.21694264697187762, "rougeLsum_precision_stderr": 0.0035242626334641026, "rougeLsum_recall": 0.2020549195476134, "rougeLsum_recall_stderr": 0.0029196969560855153}}, "2": {"tldr_en": {"bleu": 4.611723071615721, "bleu_stderr": 0.09604703447972755, "rouge1_fmeasure": 0.2502603578204361, "rouge1_fmeasure_stderr": 0.002293574688170618, "rouge1_precision": 0.31402819280145294, "rouge1_precision_stderr": 0.00372043085747532, "rouge1_recall": 0.27476531134514864, "rouge1_recall_stderr": 0.003004834706287235, "rouge2_fmeasure": 0.07312196156858046, "rouge2_fmeasure_stderr": 0.0015175238657054205, "rouge2_precision": 0.0964374050617497, "rouge2_precision_stderr": 0.0023942829869708893, "rouge2_recall": 0.07950609744170317, "rouge2_recall_stderr": 0.0017736713632069314, "rougeL_fmeasure": 0.18837114117652573, "rougeL_fmeasure_stderr": 0.0018392223420105429, "rougeL_precision": 0.23993957539542324, "rougeL_precision_stderr": 0.0031070932808128005, "rougeL_recall": 0.20656602583791495, "rougeL_recall_stderr": 0.0024047364648286966, "rougeLsum_fmeasure": 0.23530722744582147, "rougeLsum_fmeasure_stderr": 0.0021867085010560715, "rougeLsum_precision": 0.29611740779831397, "rougeLsum_precision_stderr": 0.0035725589380756783, "rougeLsum_recall": 
0.2582729289090492, "rougeLsum_recall_stderr": 0.002859502904870282}}, "3": {"tldr_en": {"bleu": 3.6534705242416563, "bleu_stderr": 0.0881203300496025, "rouge1_fmeasure": 0.2137655552336782, "rouge1_fmeasure_stderr": 0.002714393421558324, "rouge1_precision": 0.27636674426156177, "rouge1_precision_stderr": 0.004101651110307418, "rouge1_recall": 0.22930947512843322, "rouge1_recall_stderr": 0.003338024591740698, "rouge2_fmeasure": 0.06389847418480928, "rouge2_fmeasure_stderr": 0.001481197293938296, "rouge2_precision": 0.08702087298573732, "rouge2_precision_stderr": 0.0023613075212299754, "rouge2_recall": 0.06834799049896098, "rouge2_recall_stderr": 0.0017608046858656524, "rougeL_fmeasure": 0.162300378297079, "rougeL_fmeasure_stderr": 0.002115846911993043, "rougeL_precision": 0.2135610881070536, "rougeL_precision_stderr": 0.0033828812360183476, "rougeL_recall": 0.1743188444814511, "rougeL_recall_stderr": 0.002648347151651851, "rougeLsum_fmeasure": 0.2007815371334278, "rougeLsum_fmeasure_stderr": 0.002564977939596769, "rougeLsum_precision": 0.260193792903579, "rougeLsum_precision_stderr": 0.003915057196799965, "rougeLsum_recall": 0.2155361794060783, "rougeLsum_recall_stderr": 0.003167125886789834}}, "4": {"tldr_en": {"bleu": 0.14336768966011842, "bleu_stderr": 0.01664751835367322, "rouge1_fmeasure": 0.06966991898451057, "rouge1_fmeasure_stderr": 0.0024061665565282536, "rouge1_precision": 0.09318115237631248, "rouge1_precision_stderr": 0.0034313497448162753, "rouge1_recall": 0.07510657766582833, "rouge1_recall_stderr": 0.0027823499468141023, "rouge2_fmeasure": 0.021410981626472696, "rouge2_fmeasure_stderr": 0.0010806600066785071, "rouge2_precision": 0.0306990992199194, "rouge2_precision_stderr": 0.001732779147193654, "rouge2_recall": 0.023135462034409776, "rouge2_recall_stderr": 0.0012681089305181597, "rougeL_fmeasure": 0.05441308852467703, "rougeL_fmeasure_stderr": 0.0019026572440828432, "rougeL_precision": 0.07441428298557279, "rougeL_precision_stderr": 
0.0028438647198895358, "rougeL_recall": 0.05853529439287516, "rougeL_recall_stderr": 0.0022113797700769144, "rougeLsum_fmeasure": 0.06515832990606861, "rougeLsum_fmeasure_stderr": 0.0022584661554748706, "rougeLsum_precision": 0.08804803757911164, "rougeLsum_precision_stderr": 0.003292180554603894, "rougeLsum_recall": 0.06984747732679251, "rougeLsum_recall_stderr": 0.0025861205751654633}}, "5": {"tldr_en": {"bleu": 1.5519142183400993e-12, "bleu_stderr": 6.955628631620419e-11, "rouge1_fmeasure": 0.011789826148853048, "rouge1_fmeasure_stderr": 0.0011310450894670711, "rouge1_precision": 0.016751606857569483, "rouge1_precision_stderr": 0.0016597025910088947, "rouge1_recall": 0.012874957063334068, "rouge1_recall_stderr": 0.001320598626607211, "rouge2_fmeasure": 0.0035301672488990635, "rouge2_fmeasure_stderr": 0.00046931341477467973, "rouge2_precision": 0.004789908933072763, "rouge2_precision_stderr": 0.0006572780023821424, "rouge2_recall": 0.003932891692847358, "rouge2_recall_stderr": 0.0005535394810295029, "rougeL_fmeasure": 0.009092320709271332, "rougeL_fmeasure_stderr": 0.000875179657864614, "rougeL_precision": 0.013288040221462351, "rougeL_precision_stderr": 0.0013727762159921608, "rougeL_recall": 0.00989469802053577, "rougeL_recall_stderr": 0.0010241634280214566, "rougeLsum_fmeasure": 0.010891546354591043, "rougeLsum_fmeasure_stderr": 0.001040987925967451, "rougeLsum_precision": 0.015518838953812768, "rougeLsum_precision_stderr": 0.001554235047112553, "rougeLsum_recall": 0.01190208294502919, "rougeLsum_recall_stderr": 0.001221212576212141}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 7.805428848697077, "bleu_stderr": 0.12760294204588482, "rouge1_fmeasure": 0.3241138419996619, "rouge1_fmeasure_stderr": 0.0023488121086233404, "rouge1_precision": 0.27630659363671284, "rouge1_precision_stderr": 0.0026635424916828915, "rouge1_recall": 0.4517766787964781, "rouge1_recall_stderr": 0.003086695070399088, "rouge2_fmeasure": 0.14611347950675788, 
"rouge2_fmeasure_stderr": 0.001555781159556189, "rouge2_precision": 0.1264233869651961, "rouge2_precision_stderr": 0.0021472423489400424, "rouge2_recall": 0.21006531093284228, "rouge2_recall_stderr": 0.002273053133823739, "rougeL_fmeasure": 0.2782112735172708, "rougeL_fmeasure_stderr": 0.0017379173406301361, "rougeL_precision": 0.23647123285513574, "rougeL_precision_stderr": 0.0021694427334222896, "rougeL_recall": 0.39302733892651276, "rougeL_recall_stderr": 0.0026453773863021526, "rougeLsum_fmeasure": 0.2838669853732326, "rougeLsum_fmeasure_stderr": 0.0022710804975857187, "rougeLsum_precision": 0.24330576145803912, "rougeLsum_precision_stderr": 0.0025941508735257053, "rougeLsum_recall": 0.3957393841813685, "rougeLsum_recall_stderr": 0.0030500827793352625}}, "1": {"generate_text_restaurant": {"bleu": 12.182758345079787, "bleu_stderr": 0.16518841123830086, "rouge1_fmeasure": 0.4813086962286221, "rouge1_fmeasure_stderr": 0.0023626326089443398, "rouge1_precision": 0.6058724528221369, "rouge1_precision_stderr": 0.0033311930729353, "rouge1_recall": 0.4375685096869626, "rouge1_recall_stderr": 0.003004503828354488, "rouge2_fmeasure": 0.23028289344422687, "rouge2_fmeasure_stderr": 0.0021131218145040835, "rouge2_precision": 0.29572879977984745, "rouge2_precision_stderr": 0.0029383694383037506, "rouge2_recall": 0.20878267638207582, "rouge2_recall_stderr": 0.002207007421280634, "rougeL_fmeasure": 0.3505103154607651, "rougeL_fmeasure_stderr": 0.002138758436636428, "rougeL_precision": 0.4452525524920764, "rougeL_precision_stderr": 0.003155514665754343, "rougeL_recall": 0.31741659640932457, "rougeL_recall_stderr": 0.0024579048629975297, "rougeLsum_fmeasure": 0.39279988924924625, "rougeLsum_fmeasure_stderr": 0.002377886979824964, "rougeLsum_precision": 0.4958740637334028, "rougeLsum_precision_stderr": 0.0033266551400865404, "rougeLsum_recall": 0.356592637216114, "rougeLsum_recall_stderr": 0.0027567285724463706}}, "2": {"generate_text_restaurant": {"bleu": 14.77827774961412, 
"bleu_stderr": 0.17074445414549996, "rouge1_fmeasure": 0.5169724859470212, "rouge1_fmeasure_stderr": 0.0022847828063681107, "rouge1_precision": 0.6222132815532128, "rouge1_precision_stderr": 0.0032116680629870925, "rouge1_recall": 0.4773787870946972, "rouge1_recall_stderr": 0.002880300695039262, "rouge2_fmeasure": 0.2598408586362927, "rouge2_fmeasure_stderr": 0.002199049529931736, "rouge2_precision": 0.31734278580959246, "rouge2_precision_stderr": 0.002901203617815359, "rouge2_recall": 0.23934523412335545, "rouge2_recall_stderr": 0.0022881777132200292, "rougeL_fmeasure": 0.37830224304770604, "rougeL_fmeasure_stderr": 0.0021704191814246308, "rougeL_precision": 0.45801859712166865, "rougeL_precision_stderr": 0.0030651884383412443, "rougeL_recall": 0.3483543845938943, "rougeL_recall_stderr": 0.002450753947224449, "rougeLsum_fmeasure": 0.42874770680726115, "rougeLsum_fmeasure_stderr": 0.00239536120318971, "rougeLsum_precision": 0.5166167827562745, "rougeLsum_precision_stderr": 0.003243801734266107, "rougeLsum_recall": 0.3956919145577125, "rougeLsum_recall_stderr": 0.002759335166433648}}, "3": {"generate_text_restaurant": {"bleu": 15.660507663936528, "bleu_stderr": 0.14749957372060007, "rouge1_fmeasure": 0.5280713486662109, "rouge1_fmeasure_stderr": 0.0022869985346214445, "rouge1_precision": 0.6220372006914571, "rouge1_precision_stderr": 0.0031139706058727593, "rouge1_recall": 0.4921683601168692, "rouge1_recall_stderr": 0.0029083346931655855, "rouge2_fmeasure": 0.26903286770558915, "rouge2_fmeasure_stderr": 0.0022307748577894758, "rouge2_precision": 0.31964099722948836, "rouge2_precision_stderr": 0.002777708595069122, "rouge2_recall": 0.2509916858334456, "rouge2_recall_stderr": 0.0023901154240661446, "rougeL_fmeasure": 0.38565827502841116, "rougeL_fmeasure_stderr": 0.0022144421481799675, "rougeL_precision": 0.455321793928234, "rougeL_precision_stderr": 0.0029321326061168074, "rougeL_recall": 0.35903739623304465, "rougeL_recall_stderr": 0.0025252866229682975, 
"rougeLsum_fmeasure": 0.4394144928671385, "rougeLsum_fmeasure_stderr": 0.002446885490860059, "rougeLsum_precision": 0.5172893895354551, "rougeLsum_precision_stderr": 0.003153574847912617, "rougeLsum_recall": 0.40954269048002123, "rougeLsum_recall_stderr": 0.002825679167152443}}, "4": {"generate_text_restaurant": {"bleu": 16.046714521986758, "bleu_stderr": 0.20143548468737246, "rouge1_fmeasure": 0.5343300631658281, "rouge1_fmeasure_stderr": 0.002276059193634907, "rouge1_precision": 0.6240126615936337, "rouge1_precision_stderr": 0.00312720979038774, "rouge1_recall": 0.4989840309368499, "rouge1_recall_stderr": 0.0028177351641601373, "rouge2_fmeasure": 0.2724112279454815, "rouge2_fmeasure_stderr": 0.0022911561617215254, "rouge2_precision": 0.3206493529913017, "rouge2_precision_stderr": 0.0028268414398358497, "rouge2_recall": 0.2544472343665272, "rouge2_recall_stderr": 0.0024038250271984246, "rougeL_fmeasure": 0.38733097602553734, "rougeL_fmeasure_stderr": 0.0022465126119210273, "rougeL_precision": 0.4530934873979934, "rougeL_precision_stderr": 0.0029431217749683455, "rougeL_recall": 0.3614836535348617, "rougeL_recall_stderr": 0.0025050776663639207, "rougeLsum_fmeasure": 0.4465935641244627, "rougeLsum_fmeasure_stderr": 0.0024867305930155463, "rougeLsum_precision": 0.5210977780903223, "rougeLsum_precision_stderr": 0.003193746615616614, "rougeLsum_recall": 0.41707944621670306, "rougeLsum_recall_stderr": 0.002802635738849298}}, "5": {"generate_text_restaurant": {"bleu": 16.068878582309953, "bleu_stderr": 0.16081121483163321, "rouge1_fmeasure": 0.5355837269029773, "rouge1_fmeasure_stderr": 0.002256980681599985, "rouge1_precision": 0.6224102862193265, "rouge1_precision_stderr": 0.003110234568961701, "rouge1_recall": 0.500933748593845, "rouge1_recall_stderr": 0.0027924958992030795, "rouge2_fmeasure": 0.27475970807249195, "rouge2_fmeasure_stderr": 0.0022615182075459324, "rouge2_precision": 0.32190257045006593, "rouge2_precision_stderr": 0.0027959933340407817, "rouge2_recall": 
0.2569886621162686, "rouge2_recall_stderr": 0.002381426382114005, "rougeL_fmeasure": 0.3914922502865105, "rougeL_fmeasure_stderr": 0.0022467536238069325, "rougeL_precision": 0.45606984360921154, "rougeL_precision_stderr": 0.002956375747033492, "rougeL_recall": 0.3657446339488385, "rougeL_recall_stderr": 0.002509353106433301, "rougeLsum_fmeasure": 0.4482824729485371, "rougeLsum_fmeasure_stderr": 0.002444818682804833, "rougeLsum_precision": 0.5210361752122326, "rougeLsum_precision_stderr": 0.003175466781222956, "rougeLsum_recall": 0.4190489396414274, "rougeLsum_recall_stderr": 0.0027531052100554392}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9818055644823127, "bleu_stderr": 0.13978823814879407, "rouge1_fmeasure": 0.16636180556058044, "rouge1_fmeasure_stderr": 0.003516551179676636, "rouge1_precision": 0.12872236323811218, "rouge1_precision_stderr": 0.0032462823876021106, "rouge1_recall": 0.2715393984942335, "rouge1_recall_stderr": 0.005900736678094209, "rouge2_fmeasure": 0.03841432091372367, "rouge2_fmeasure_stderr": 0.001586981058162795, "rouge2_precision": 0.028024322491153688, "rouge2_precision_stderr": 0.0011655819703985627, "rouge2_recall": 0.06517448572813263, "rouge2_recall_stderr": 0.002766560068276156, "rougeL_fmeasure": 0.1211817583352299, "rougeL_fmeasure_stderr": 0.0025835665514872593, "rougeL_precision": 0.09534495061706572, "rougeL_precision_stderr": 0.0027658531338571454, "rougeL_recall": 0.19816999370668548, "rougeL_recall_stderr": 0.004414878706261276, "rougeLsum_fmeasure": 0.13466615178498392, "rougeLsum_fmeasure_stderr": 0.002869955857600535, "rougeLsum_precision": 0.10514740754117864, "rougeLsum_precision_stderr": 0.0028961374555318475, "rougeLsum_recall": 0.22067225131355028, "rougeLsum_recall_stderr": 0.004901108311721729}}, "1": {"article_DOC_summary": {"bleu": 2.718621562424974, "bleu_stderr": 0.25634902183378955, "rouge1_fmeasure": 0.2252201544517192, "rouge1_fmeasure_stderr": 0.003572521220076497, "rouge1_precision": 
0.23797749292717868, "rouge1_precision_stderr": 0.0044248829684158586, "rouge1_recall": 0.23607563998039394, "rouge1_recall_stderr": 0.0037207865629804763, "rouge2_fmeasure": 0.05054236360888691, "rouge2_fmeasure_stderr": 0.0023023749748183903, "rouge2_precision": 0.05608458865614393, "rouge2_precision_stderr": 0.0026939326656790527, "rouge2_recall": 0.0508692034245319, "rouge2_recall_stderr": 0.0022931658802725287, "rougeL_fmeasure": 0.1713099421580951, "rougeL_fmeasure_stderr": 0.0029320038478532935, "rougeL_precision": 0.18150902719333478, "rougeL_precision_stderr": 0.0036305277559064066, "rougeL_recall": 0.179561582857957, "rougeL_recall_stderr": 0.003045132003822578, "rougeLsum_fmeasure": 0.17382401249824367, "rougeLsum_fmeasure_stderr": 0.0029535439613312367, "rougeLsum_precision": 0.18366739499452403, "rougeLsum_precision_stderr": 0.003629882854081677, "rougeLsum_recall": 0.18297857106589824, "rougeLsum_recall_stderr": 0.003137997544172217}}, "2": {"article_DOC_summary": {"bleu": 3.6613673832300515, "bleu_stderr": 0.29305939325429936, "rouge1_fmeasure": 0.2516525371976862, "rouge1_fmeasure_stderr": 0.0037511199505683374, "rouge1_precision": 0.26927175770553385, "rouge1_precision_stderr": 0.004455761662742256, "rouge1_recall": 0.25342716562216744, "rouge1_recall_stderr": 0.0038549474849294127, "rouge2_fmeasure": 0.06264124531231369, "rouge2_fmeasure_stderr": 0.0025022071058794777, "rouge2_precision": 0.06831364960977458, "rouge2_precision_stderr": 0.0028257306475568847, "rouge2_recall": 0.06217985063286193, "rouge2_recall_stderr": 0.0025187198385362495, "rougeL_fmeasure": 0.19077644710608424, "rougeL_fmeasure_stderr": 0.003126557023491442, "rougeL_precision": 0.20401963131847098, "rougeL_precision_stderr": 0.0036937944194900436, "rougeL_recall": 0.19255452445504442, "rougeL_recall_stderr": 0.0032146416769602863, "rougeLsum_fmeasure": 0.19254279514804024, "rougeLsum_fmeasure_stderr": 0.00313403184014492, "rougeLsum_precision": 0.20564280351844771, 
"rougeLsum_precision_stderr": 0.003691582561213757, "rougeLsum_recall": 0.1946786446186705, "rougeLsum_recall_stderr": 0.0032424612854966943}}, "3": {"article_DOC_summary": {"bleu": 3.406816013953246, "bleu_stderr": 0.24161861472249982, "rouge1_fmeasure": 0.24229133087381188, "rouge1_fmeasure_stderr": 0.004031858321395662, "rouge1_precision": 0.262476289887743, "rouge1_precision_stderr": 0.004621204276158766, "rouge1_recall": 0.2404612136480006, "rouge1_recall_stderr": 0.004125414843698027, "rouge2_fmeasure": 0.06268337272766085, "rouge2_fmeasure_stderr": 0.0024567035789010046, "rouge2_precision": 0.06842316364282378, "rouge2_precision_stderr": 0.0027647383711749615, "rouge2_recall": 0.061651170011214104, "rouge2_recall_stderr": 0.002429297519350672, "rougeL_fmeasure": 0.18567208218878792, "rougeL_fmeasure_stderr": 0.0033333790452062923, "rougeL_precision": 0.20156202346447022, "rougeL_precision_stderr": 0.003847300881262445, "rougeL_recall": 0.18433196409317407, "rougeL_recall_stderr": 0.0033927670396723853, "rougeLsum_fmeasure": 0.18654971088104422, "rougeLsum_fmeasure_stderr": 0.003345098888199193, "rougeLsum_precision": 0.20245862347934387, "rougeLsum_precision_stderr": 0.003853570083238628, "rougeLsum_recall": 0.18518636196696317, "rougeLsum_recall_stderr": 0.0034107354023354916}}, "4": {"article_DOC_summary": {"bleu": 0.07621103585426778, "bleu_stderr": 0.02328218785956861, "rouge1_fmeasure": 0.06037720875233413, "rouge1_fmeasure_stderr": 0.003756291069253472, "rouge1_precision": 0.069527922152048, "rouge1_precision_stderr": 0.004415548165696612, "rouge1_recall": 0.058255960572026616, "rouge1_recall_stderr": 0.0036893676818933246, "rouge2_fmeasure": 0.016076749046209322, "rouge2_fmeasure_stderr": 0.00155018273945325, "rouge2_precision": 0.01884212610032935, "rouge2_precision_stderr": 0.001864365662358469, "rouge2_recall": 0.01541077730496209, "rouge2_recall_stderr": 0.0015005237179700724, "rougeL_fmeasure": 0.04601022042911696, "rougeL_fmeasure_stderr": 
0.002972538395939391, "rougeL_precision": 0.05366548982395334, "rougeL_precision_stderr": 0.003589390411326478, "rougeL_recall": 0.04435979503971948, "rougeL_recall_stderr": 0.0029246377632095007, "rougeLsum_fmeasure": 0.046274903650287996, "rougeLsum_fmeasure_stderr": 0.0029855052852481925, "rougeLsum_precision": 0.05393140916782405, "rougeLsum_precision_stderr": 0.003600323595663311, "rougeLsum_recall": 0.04463374272617106, "rougeLsum_recall_stderr": 0.0029386715942067774}}, "5": {"article_DOC_summary": {"bleu": 1.2907915607447047e-48, "bleu_stderr": 8.181410959491275e-33, "rouge1_fmeasure": 0.0024677589985080198, "rouge1_fmeasure_stderr": 0.0008512467369069438, "rouge1_precision": 0.002519205393633051, "rouge1_precision_stderr": 0.0009010685681674164, "rouge1_recall": 0.002553406526725967, "rouge1_recall_stderr": 0.0008674535432962121, "rouge2_fmeasure": 0.0005053609550867747, "rouge2_fmeasure_stderr": 0.00027556435520113126, "rouge2_precision": 0.0005748785133997738, "rouge2_precision_stderr": 0.00032234854700264514, "rouge2_recall": 0.00047620248785620596, "rouge2_recall_stderr": 0.0002558552981869991, "rougeL_fmeasure": 0.0016858136591016738, "rougeL_fmeasure_stderr": 0.0006017432419253131, "rougeL_precision": 0.001782337656185266, "rougeL_precision_stderr": 0.0006875286151113505, "rougeL_recall": 0.0016948473649144099, "rougeL_recall_stderr": 0.0005810537809262554, "rougeLsum_fmeasure": 0.0017257035629670058, "rougeLsum_fmeasure_stderr": 0.000608227728173233, "rougeLsum_precision": 0.0018196260445811194, "rougeLsum_precision_stderr": 0.0006924871080774834, "rougeLsum_recall": 0.0017377290115696417, "rougeLsum_recall_stderr": 0.0005888117009644554}}}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 
0000000000000000000000000000000000000000..2f0c71a2f5aad937cf81bc76ee1515299028fda4 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.31402819280145294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00372043085747532 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.27476531134514864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003004834706287235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2502603578204361, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002293574688170618 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0964374050617497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023942829869708893 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07950609744170317, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017736713632069314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.07312196156858046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0015175238657054205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.23993957539542324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0031070932808128005 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + 
"rougeL_recall": 0.20656602583791495, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024047364648286966 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.18837114117652573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018392223420105429 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.29611740779831397, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035725589380756783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2582729289090492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002859502904870282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.23530722744582147, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021867085010560715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.611723071615721, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09604703447972755 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json 
b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3db68d5cbb938554633c1b0c9d182f952d84a07c --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.27636674426156177, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004101651110307418 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22930947512843322, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003338024591740698 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2137655552336782, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002714393421558324 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.08702087298573732, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023613075212299754 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06834799049896098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017608046858656524 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06389847418480928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001481197293938296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.2135610881070536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rougeL_precision_stderr": 0.0033828812360183476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1743188444814511, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002648347151651851 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.162300378297079, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.002115846911993043 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.260193792903579, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003915057196799965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2155361794060783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003167125886789834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2007815371334278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002564977939596769 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.6534705242416563, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0881203300496025 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at 
end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..268c1a2f024cd5b3b912fc88919e6c47a90b4390 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09318115237631248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0034313497448162753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07510657766582833, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027823499468141023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06966991898451057, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024061665565282536 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0306990992199194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001732779147193654 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.023135462034409776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012681089305181597 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.021410981626472696, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010806600066785071 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 
0.07441428298557279, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0028438647198895358 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05853529439287516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022113797700769144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.05441308852467703, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019026572440828432 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08804803757911164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003292180554603894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06984747732679251, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025861205751654633 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06515832990606861, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022584661554748706 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.14336768966011842, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01664751835367322 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + 
"device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..80179e69b10fcbe0e29af067c8e83c5cfb59d592 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.016751606857569483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016597025910088947 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.012874957063334068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001320598626607211 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.011789826148853048, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0011310450894670711 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004789908933072763, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006572780023821424 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003932891692847358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005535394810295029 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0035301672488990635, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge2_fmeasure_stderr": 0.00046931341477467973 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.013288040221462351, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013727762159921608 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.00989469802053577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010241634280214566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.009092320709271332, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000875179657864614 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.015518838953812768, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001554235047112553 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.01190208294502919, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001221212576212141 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.010891546354591043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001040987925967451 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.5519142183400993e-12, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.955628631620419e-11 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_3.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..59d072f18290f8ea5f28e281bc3bbaf719a62d57 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.262476289887743, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004621204276158766 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2404612136480006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004125414843698027 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.24229133087381188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.004031858321395662 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.06842316364282378, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0027647383711749615 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.061651170011214104, + "dataset_path": "GEM/xsum", + "dataset_name": 
null, + "subset": "", + "rouge2_recall_stderr": 0.002429297519350672 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.06268337272766085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0024567035789010046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.20156202346447022, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003847300881262445 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18433196409317407, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033927670396723853 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.18567208218878792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0033333790452062923 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.20245862347934387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003853570083238628 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.18518636196696317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0034107354023354916 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.18654971088104422, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.003345098888199193 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.406816013953246, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.24161861472249982 + } + ], + "config": { + "model": 
"hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_4.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d3ede473b2526abec45a8af282574dfe892ef415 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.069527922152048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.004415548165696612 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.058255960572026616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0036893676818933246 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.06037720875233413, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.003756291069253472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01884212610032935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001864365662358469 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01541077730496209, + "dataset_path": 
"GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015005237179700724 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.016076749046209322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00155018273945325 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.05366548982395334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.003589390411326478 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.04435979503971948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0029246377632095007 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04601022042911696, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002972538395939391 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05393140916782405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.003600323595663311 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04463374272617106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029386715942067774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.046274903650287996, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0029855052852481925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.07621103585426778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.02328218785956861 + } + ], 
+ "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_5.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8979050afd395bcda963bd9362752e5ab282d6ce --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002519205393633051, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009010685681674164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002553406526725967, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0008674535432962121 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0024677589985080198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008512467369069438 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0005748785133997738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00032234854700264514 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 
0.00047620248785620596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0002558552981869991 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0005053609550867747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00027556435520113126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.001782337656185266, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006875286151113505 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0016948473649144099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005810537809262554 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0016858136591016738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0006017432419253131 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0018196260445811194, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006924871080774834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0017377290115696417, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005888117009644554 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0017257035629670058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.000608227728173233 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.2907915607447047e-48, + "dataset_path": "GEM/xsum", + "dataset_name": null, 
+ "subset": "", + "bleu_stderr": 8.181410959491275e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file