diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..adf67fd249a4ff0ccc89f9ff92bcf2e4cd2ce470 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.20824164042833873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002234214827494051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3342680558497238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002719351845027267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.235556648636045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018848294908913575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0553447765818299, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001097279257804817}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.09158152498363732, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00182638815550646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06251395339429003, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011032340747877004}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14804254896591035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001582109014998804}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.24506636928994607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002277815665553641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.16869690772741758, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013497024922833358}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.19708639575292217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021089064966209187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.31720897725992786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002596567797319611}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22311024372097865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017776265096001729}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.5155467010727994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06141556601214586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..19de62d0cf1ea598edc39e3f2819ebb471bbebfc --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17811439171989488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002560275363708373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.27886090799993984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034448901496326586}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1960859652047859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023366060464774423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04742286059353845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011604513875678288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07742946215061461, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001866825934930981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0523326312861241, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011285281285534974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12852552196126066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018964172540042934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20545470726815437, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027326532184944424}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14162844117143206, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016899728317895791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.16868014464807685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002438611490893621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.26411127102133913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003281374640568146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18556326066261042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002215627597176747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.504980488837104, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07033302280883086}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d655f44f730e210bb8f4f004d0b7f94349d689b8 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.057686885490487505, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002077970469715233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09385574649347103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0031890083282469467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06282501006696624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020888213951751034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.015071313264611244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007774016714959856}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.026883790080977472, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014075436212441558}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.016837586892487218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000790811753626707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.042638568215269866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015504702364272222}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.07032908173779572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024572632014890276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0461316780279477, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015235672554804573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.05433748981297747, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019639155165544035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.0885417619284403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030172562108610624}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05917832063435797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001970707904332247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.9086939324156383, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05855823137144976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1b55d4b3765faeccc32a3b76e880517d1c5d027b --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.009722491969586005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009564982386894138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.015357271032115101, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001448960021557378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010180728590080238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009430571774107825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0024973101781513925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000343733403974376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0045816927074040755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006153391357680226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.00281382806546607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00035214119256937883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.0072997926636164424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007272617797612332}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.011767619522932812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011356853451163903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.00759156476319381, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007014066070016197}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.009129679261559958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009077175108677888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.014481417293915742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013773388496390326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.00955145315759681, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008912980968449334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 6.161453193982374e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.2644510555241896e-05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..641efd76ba3e8dcc2078744fed9d21fd8b4a1745 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.12661685012408697, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021093732895884356}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.30042742106922904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004793262475513874}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.1741691220448831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027529808698691072}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.027457383236278155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011005894460101174}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06872339351563556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028264005600779733}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03848207108872355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015349514843449124}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10054376826357136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016355832838556733}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.24044972226758587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038558430038978684}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.13848485640839917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002136410452186076}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10022509495467177, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001733799927568896}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2391221551244155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0040668979594426224}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.13794850154717367, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002276629118568194}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.6273581251014975, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08297152976943713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..55844e587aca959eaa97fffbebf12bc5f0499ac7 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.03984641222755382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023583006672399276}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.07781373363270898, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044678999283725125}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.049025842019891894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002761462686946734}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.008839770813015032, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009132501416122141}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.018595501088417683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018046371518142589}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.011276208078698832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010882501335885415}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03193279708670165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019219705874772618}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06201212154363477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036181165069829016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.03899309107571578, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022095258634566655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03172097901935682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019113003105201993}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.06162603968016436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003613145314591428}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.03871502132726552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021963924186294845}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.9297944407512662, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10452568299951195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0c507aa9e6fff48a9ffddc1a43b51beceff4d535 --- /dev/null +++ b/8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0018747157977051885, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005217271715436668}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.003549283577549333, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010767599221403455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002273937140206459, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006514011124893405}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00047917769370046107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00017934012188651556}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0007855487711203583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00030506211146057416}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0005538280315731646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021055797770595916}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.001591076043425948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00044966429790604164}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.002863749112737054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008323073183414411}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0019019709421522583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005434789167320575}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.00156298958828556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00043239685994550666}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0027767015907513405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007833271709551212}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0018351130157390045, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005055637602691865}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.3519576766092788e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.7148754157361587e-13}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..06078db84afe94fa007b5c06606e10948c453165 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf52e7e89cb05892fcb65745e1779ab79a7bf338c0fd951fce559bbf967d309c +size 18902094 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9533b38ed59d4b7d6d1eeda3359fa896d8d3d459 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d48b05d6c928cef63c3cbdedcfb8bca843c623bb49ed5e05c5ba8bafd912330a +size 24327036 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c85acdd821f52bc4b006d2250de9a14e0e95fdbd 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f03b9fc6ca034ddb19e6558c89401d6667967bb919e5e43cbb227c773e165f4 +size 29475398 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6eea236d2f021da177d346f3d4a2ebd5ba8bb6db 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0696f3da8c2c1666d68d54ba9c6a0adf0015721431da594f6fd3d5ffd0de34b +size 34801699 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d7f73f7f7b9889d8f6f3dba3f676e28e0b246115 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66dd94ebc21d8fb6bc777be380e845ad59128f54b1275c0ea4139ffc162849d5 +size 9647268 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a6e696d26820027279a6aa3b3c4364f08177ac48 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f81a6460b234aa2e50301e03345295e21bd5c5a28290727bc037d1865867fef +size 11673456 diff --git a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..bc9d3c2c4a663d99999179087b60cf8a54f3f68c 100644 --- a/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a8cc00d33bfd2864acc67c803f643c08279364e5b8639b6a77f56c7e52fe3cc +size 13899565 diff --git a/8b7178b25b/evaluation/generation/merged.csv b/8b7178b25b/evaluation/generation/merged.csv index c13f36099434047a109db579cbb770fe9e786c34..10c54e4ed18573c2a63acb58ff3e34e0064595f7 100644 --- a/8b7178b25b/evaluation/generation/merged.csv +++ b/8b7178b25b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03598895109620969 gem_xsum,1,median,rouge2_fmeasure,0.03598895109620969 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03728951183116744 gem_xsum,2,median,rouge2_fmeasure,0.03728951183116744 -gem_xsum,2,average,multiple,0.04324025638606534 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03848207108872355 +gem_xsum,3,median,rouge2_fmeasure,0.03848207108872355 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.011276208078698832 +gem_xsum,4,median,rouge2_fmeasure,0.011276208078698832 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005538280315731646 +gem_xsum,5,median,rouge2_fmeasure,0.0005538280315731646 +gem_xsum,5,average,multiple,0.03000547939286526 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04746695901782121 web_nlg_en,0,median,rouge2_fmeasure,0.04746695901782121 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.052665704185835084 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04047796652620078 wiki_lingua_en,0,median,rouge2_fmeasure,0.04047796652620078 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.0622366147541512 wiki_lingua_en,1,median,rouge2_fmeasure,0.0622366147541512 -wiki_lingua_en,1,average,multiple,0.051357290640175995 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06251395339429003 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06251395339429003 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.0523326312861241 +wiki_lingua_en,3,median,rouge2_fmeasure,0.0523326312861241 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.016837586892487218 +wiki_lingua_en,4,median,rouge2_fmeasure,0.016837586892487218 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.00281382806546607 +wiki_lingua_en,5,median,rouge2_fmeasure,0.00281382806546607 +wiki_lingua_en,5,average,multiple,0.0395354301531199 diff --git a/8b7178b25b/evaluation/generation/merged.json b/8b7178b25b/evaluation/generation/merged.json index b8972f9cf3e6c82a86054b68fb33e5df1895f670..661197c633bb6cd8d59d4c4bb7600bf3a40c4ad9 100644 --- a/8b7178b25b/evaluation/generation/merged.json +++ b/8b7178b25b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.31606716251794437, "bleu_stderr": 0.02563944351471059, "rouge1_fmeasure": 0.10246108718545202, "rouge1_fmeasure_stderr": 0.0017992092481067788, "rouge1_precision": 0.06664761750887295, "rouge1_precision_stderr": 0.0013732959148702218, "rouge1_recall": 0.2940457093239289, "rouge1_recall_stderr": 0.004555499438691762, "rouge2_fmeasure": 0.04746695901782121, "rouge2_fmeasure_stderr": 0.0011129470648980045, "rouge2_precision": 0.030425492420252335, "rouge2_precision_stderr": 0.0007598510657945697, "rouge2_recall": 0.1409722690208349, "rouge2_recall_stderr": 0.0031027610869315144, "rougeL_fmeasure": 0.09860897831373452, "rougeL_fmeasure_stderr": 0.00170202692564, "rougeL_precision": 0.06400114097684483, "rougeL_precision_stderr": 0.0012983914696295615, "rougeL_recall": 0.2849573873935468, "rougeL_recall_stderr": 0.004452046846530438, "rougeLsum_fmeasure": 0.09784615421581211, "rougeLsum_fmeasure_stderr": 0.001694890791053129, "rougeLsum_precision": 0.06366499205115346, "rougeLsum_precision_stderr": 0.0013067510671992333, "rougeLsum_recall": 0.281020892101209, "rougeLsum_recall_stderr": 0.004305691903165883}}, "1": {"PALM_prompt": {"bleu": 0.4594356029919803, "bleu_stderr": 0.025125558903473632, "rouge1_fmeasure": 0.10986827089930207, "rouge1_fmeasure_stderr": 0.0017427056384300328, "rouge1_precision": 0.07027188536501679, "rouge1_precision_stderr": 0.0012558026173472138, "rouge1_recall": 0.3450765288308058, "rouge1_recall_stderr": 0.005055990718374951, "rouge2_fmeasure": 0.052665704185835084, "rouge2_fmeasure_stderr": 0.001116219363919427, "rouge2_precision": 0.03347891417115085, "rouge2_precision_stderr": 0.0007753123397897254, "rouge2_recall": 0.1749435024767411, "rouge2_recall_stderr": 0.0036147215578007344, "rougeL_fmeasure": 0.10429693361721804, "rougeL_fmeasure_stderr": 0.0016079944655777501, "rougeL_precision": 0.06659999496172114, "rougeL_precision_stderr": 0.001145948530216064, "rougeL_recall": 0.32753689197885233, "rougeL_recall_stderr": 0.004724892115185279, "rougeLsum_fmeasure": 0.10453836123127251, "rougeLsum_fmeasure_stderr": 0.0016341468252465234, "rougeLsum_precision": 0.06687788385741289, "rougeLsum_precision_stderr": 0.0011787909083336645, "rougeLsum_recall": 0.32738351484636163, "rougeLsum_recall_stderr": 0.004663465991453483}}, "2": {"PALM_prompt": {"bleu": 0.4626917691641033, "bleu_stderr": 0.026701626014007735, "rouge1_fmeasure": 0.11037120367033523, "rouge1_fmeasure_stderr": 0.0016020141226316521, "rouge1_precision": 0.07008037252994849, "rouge1_precision_stderr": 0.0012035993637679488, "rouge1_recall": 0.36892015349821927, "rouge1_recall_stderr": 0.0052030339175267275, "rouge2_fmeasure": 0.05157944416358925, "rouge2_fmeasure_stderr": 0.0010362585811456641, "rouge2_precision": 0.032540593106790174, "rouge2_precision_stderr": 0.000731515411120624, "rouge2_recall": 0.18396134575469794, "rouge2_recall_stderr": 0.0036503252463729982, "rougeL_fmeasure": 0.10359385842660361, "rougeL_fmeasure_stderr": 0.001472076271258242, "rougeL_precision": 0.06586167470031352, "rougeL_precision_stderr": 0.0011239315244069698, "rougeL_recall": 0.3440427226978943, "rougeL_recall_stderr": 0.0046759733900977925, "rougeLsum_fmeasure": 0.1053829136834338, "rougeLsum_fmeasure_stderr": 0.0015312528661206545, "rougeLsum_precision": 0.06702892136385862, "rougeLsum_precision_stderr": 0.0011636056187413796, "rougeLsum_recall": 0.3498407885544722, "rougeLsum_recall_stderr": 0.004802606835414834}}, "3": {"PALM_prompt": {"bleu": 0.5229218783075383, "bleu_stderr": 0.03287750689960854, "rouge1_fmeasure": 0.11129898486093194, "rouge1_fmeasure_stderr": 0.001603697213150543, "rouge1_precision": 0.07078968601889724, "rouge1_precision_stderr": 0.0012530189540338403, "rouge1_recall": 0.370854174943212, "rouge1_recall_stderr": 0.005131257813808184, "rouge2_fmeasure": 0.052270761889885206, "rouge2_fmeasure_stderr": 0.00103207142359725, "rouge2_precision": 0.03298499960390668, "rouge2_precision_stderr": 0.0007381370373430346, "rouge2_recall": 0.18675978429729426, "rouge2_recall_stderr": 0.0037180617775293043, "rougeL_fmeasure": 0.10352172121642274, "rougeL_fmeasure_stderr": 0.001462770227679166, "rougeL_precision": 0.06597257056782777, "rougeL_precision_stderr": 0.0011757840654232027, "rougeL_recall": 0.3435597898095353, "rougeL_recall_stderr": 0.004593814760542546, "rougeLsum_fmeasure": 0.10603634362382153, "rougeLsum_fmeasure_stderr": 0.0015192749230585549, "rougeLsum_precision": 0.06757005923050743, "rougeLsum_precision_stderr": 0.001208590145084183, "rougeLsum_recall": 0.35225955402286013, "rougeLsum_recall_stderr": 0.004772649571059099}}, "4": {"PALM_prompt": {"bleu": 0.6153677621128861, "bleu_stderr": 0.06053705735401149, "rouge1_fmeasure": 0.11264103250558938, "rouge1_fmeasure_stderr": 0.00163118942788917, "rouge1_precision": 0.07071127507433782, "rouge1_precision_stderr": 0.0011520192275434789, "rouge1_recall": 0.38147578757395145, "rouge1_recall_stderr": 0.005244875143584364, "rouge2_fmeasure": 0.05295652860215851, "rouge2_fmeasure_stderr": 0.0010479736263979194, "rouge2_precision": 0.032992739334882344, "rouge2_precision_stderr": 0.0007156556105984286, "rouge2_recall": 0.1932194114953707, "rouge2_recall_stderr": 0.00374859289181986, "rougeL_fmeasure": 0.10299927861932978, "rougeL_fmeasure_stderr": 0.0014284397671676585, "rougeL_precision": 0.06471147959433785, "rougeL_precision_stderr": 0.0010212506145839731, "rougeL_recall": 0.3483975351615344, "rougeL_recall_stderr": 0.004606426897749366, "rougeLsum_fmeasure": 0.10684797875052807, "rougeLsum_fmeasure_stderr": 0.0015286400891968755, "rougeLsum_precision": 0.06714267179992058, "rougeLsum_precision_stderr": 0.0010872321667830566, "rougeLsum_recall": 0.3609556696570317, "rougeLsum_recall_stderr": 0.004821987801086811}}, "5": {"PALM_prompt": {"bleu": 0.6522310718826546, "bleu_stderr": 0.0342543434913143, "rouge1_fmeasure": 0.11496831895443499, "rouge1_fmeasure_stderr": 0.0016131648938140647, "rouge1_precision": 0.07224798328936255, "rouge1_precision_stderr": 0.0011956217365912398, "rouge1_recall": 0.3930127103047457, "rouge1_recall_stderr": 0.005342634817712522, "rouge2_fmeasure": 0.054266401974880724, "rouge2_fmeasure_stderr": 0.0010279738093731765, "rouge2_precision": 0.03383327897010777, "rouge2_precision_stderr": 0.0007155324088112528, "rouge2_recall": 0.20017152367590102, "rouge2_recall_stderr": 0.0038194708260410088, "rougeL_fmeasure": 0.10413378847184503, "rougeL_fmeasure_stderr": 0.001404569572763983, "rougeL_precision": 0.06558923187444043, "rougeL_precision_stderr": 0.0010757591438449354, "rougeL_recall": 0.35557332046269224, "rougeL_recall_stderr": 0.0046352766809419025, "rougeLsum_fmeasure": 0.10879985852497424, "rougeLsum_fmeasure_stderr": 0.001514236323280586, "rougeLsum_precision": 0.0684743244702778, "rougeLsum_precision_stderr": 0.0011401249172099674, "rougeLsum_recall": 0.3715362630204789, "rougeLsum_recall_stderr": 0.004921937595664833}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8695101649478336, "bleu_stderr": 0.0774798723948649, "rouge1_fmeasure": 0.18492809283993106, "rouge1_fmeasure_stderr": 0.0019138467632652185, "rouge1_precision": 0.16049598964045364, "rouge1_precision_stderr": 0.0019462983151563367, "rouge1_recall": 0.2676713790563651, "rouge1_recall_stderr": 0.0028735302834564675, "rouge2_fmeasure": 0.04047796652620078, "rouge2_fmeasure_stderr": 0.0009161392431729087, "rouge2_precision": 0.034697662184252506, "rouge2_precision_stderr": 0.0008205861453711819, "rouge2_recall": 0.06110103740366906, "rouge2_recall_stderr": 0.0015502965185361641, "rougeL_fmeasure": 0.13965276114673036, "rougeL_fmeasure_stderr": 0.0013470236487103265, "rougeL_precision": 0.11976282577213969, "rougeL_precision_stderr": 0.0013342259804847978, "rougeL_recall": 0.20766916878715935, "rougeL_recall_stderr": 0.0023245522996809573, "rougeLsum_fmeasure": 0.17014737831445742, "rougeLsum_fmeasure_stderr": 0.0017471450796099739, "rougeLsum_precision": 0.14751551166205068, "rougeLsum_precision_stderr": 0.001780031941907223, "rougeLsum_recall": 0.24735686937216053, "rougeLsum_recall_stderr": 0.0026833728446082504}}, "1": {"tldr_en": {"bleu": 3.2332562389930515, "bleu_stderr": 0.0799951186296727, "rouge1_fmeasure": 0.23690998702342858, "rouge1_fmeasure_stderr": 0.0019890231968532827, "rouge1_precision": 0.20729964457048256, "rouge1_precision_stderr": 0.002274245844031677, "rouge1_recall": 0.33969139617897154, "rouge1_recall_stderr": 0.002833556197863864, "rouge2_fmeasure": 0.0622366147541512, "rouge2_fmeasure_stderr": 0.0011025942027406593, "rouge2_precision": 0.05460146452130771, "rouge2_precision_stderr": 0.001093799845706477, "rouge2_recall": 0.09225300993047651, "rouge2_recall_stderr": 0.0018248020314785354, "rougeL_fmeasure": 0.16727654774531026, "rougeL_fmeasure_stderr": 0.0013621791164943034, "rougeL_precision": 0.14525073667888475, "rougeL_precision_stderr": 0.0015687845277167008, "rougeL_recall": 0.24628845510672517, "rougeL_recall_stderr": 0.0023082894542053194, "rougeLsum_fmeasure": 0.2230496791129934, "rougeLsum_fmeasure_stderr": 0.0018757377324494942, "rougeLsum_precision": 0.19509659175047353, "rougeLsum_precision_stderr": 0.0021449940715420894, "rougeLsum_recall": 0.3204235648372041, "rougeLsum_recall_stderr": 0.0027128359683571725}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.03454795333510589, "bleu_stderr": 0.010722245919714395, "rouge1_fmeasure": 0.009983083620835689, "rouge1_fmeasure_stderr": 0.0007216517112036717, "rouge1_precision": 0.008330877542768076, "rouge1_precision_stderr": 0.0006789467135202171, "rouge1_recall": 0.01512213229502354, "rouge1_recall_stderr": 0.0010598056749213033, "rouge2_fmeasure": 0.0015194719382464828, "rouge2_fmeasure_stderr": 0.0002186426847142056, "rouge2_precision": 0.0012462152328688286, "rouge2_precision_stderr": 0.0001850740246851996, "rouge2_recall": 0.0021941504736507373, "rouge2_recall_stderr": 0.00031789022057788433, "rougeL_fmeasure": 0.009451610525850281, "rougeL_fmeasure_stderr": 0.0006635438560036569, "rougeL_precision": 0.0076678486076520065, "rougeL_precision_stderr": 0.0005780206165870198, "rougeL_recall": 0.014540667195886358, "rougeL_recall_stderr": 0.0010048165449253396, "rougeLsum_fmeasure": 0.008102826704141256, "rougeLsum_fmeasure_stderr": 0.0005948187054074003, "rougeLsum_precision": 0.006854342833172471, "rougeLsum_precision_stderr": 0.0005808821610184307, "rougeLsum_recall": 0.01222352423290077, "rougeLsum_recall_stderr": 0.0008679529695598302}}, "1": {"generate_text_restaurant": {"bleu": 10.110683183008721, "bleu_stderr": 0.10856580026912326, "rouge1_fmeasure": 0.4261931768500333, "rouge1_fmeasure_stderr": 0.0020338904219177937, "rouge1_precision": 0.4273885051454009, "rouge1_precision_stderr": 0.0023386626945434775, "rouge1_recall": 0.4619168895673238, "rouge1_recall_stderr": 0.002910080537073359, "rouge2_fmeasure": 0.18637115332137155, "rouge2_fmeasure_stderr": 0.0017490252095085136, "rouge2_precision": 0.18621605242973624, "rouge2_precision_stderr": 0.0018420288481743644, "rouge2_recall": 0.20425913452615088, "rouge2_recall_stderr": 0.002171191668790452, "rougeL_fmeasure": 0.30007012360581886, "rougeL_fmeasure_stderr": 0.0017073194851728616, "rougeL_precision": 0.30141579223639636, "rougeL_precision_stderr": 0.0019324093600811055, "rougeL_recall": 0.3256528804455511, "rougeL_recall_stderr": 0.002364609519861813, "rougeLsum_fmeasure": 0.35343380568090416, "rougeLsum_fmeasure_stderr": 0.0020183396806264077, "rougeLsum_precision": 0.3549239657030783, "rougeLsum_precision_stderr": 0.0022624041723470494, "rougeLsum_recall": 0.38280848019973457, "rougeLsum_recall_stderr": 0.002716917275700221}}, "2": {"generate_text_restaurant": {"bleu": 11.76941570816289, "bleu_stderr": 0.15573900245748828, "rouge1_fmeasure": 0.45040818552274897, "rouge1_fmeasure_stderr": 0.001961202454945367, "rouge1_precision": 0.44751016860851195, "rouge1_precision_stderr": 0.002306632595634685, "rouge1_recall": 0.4895112200039268, "rouge1_recall_stderr": 0.0028440252957840422, "rouge2_fmeasure": 0.21309279835498804, "rouge2_fmeasure_stderr": 0.0018084420773651576, "rouge2_precision": 0.21124466241603482, "rouge2_precision_stderr": 0.001920141813437847, "rouge2_recall": 0.23408285111523244, "rouge2_recall_stderr": 0.0022741821628275683, "rougeL_fmeasure": 0.3267435445462492, "rougeL_fmeasure_stderr": 0.0017435336297606837, "rougeL_precision": 0.3250865088115516, "rougeL_precision_stderr": 0.0020090753832284394, "rougeL_recall": 0.35555451083591993, "rougeL_recall_stderr": 0.0024076335160479975, "rougeLsum_fmeasure": 0.37491707249596695, "rougeLsum_fmeasure_stderr": 0.0020146819194002373, "rougeLsum_precision": 0.3727655581276283, "rougeLsum_precision_stderr": 0.002277906383748808, "rougeLsum_recall": 0.40742138628815283, "rougeLsum_recall_stderr": 0.0027125462981769503}}, "3": {"generate_text_restaurant": {"bleu": 12.357057866617781, "bleu_stderr": 0.11622210780282331, "rouge1_fmeasure": 0.4562426842321961, "rouge1_fmeasure_stderr": 0.0019386232886629857, "rouge1_precision": 0.45217337104133665, "rouge1_precision_stderr": 0.002294505554504784, "rouge1_recall": 0.4942792865855869, "rouge1_recall_stderr": 0.002743597811824126, "rouge2_fmeasure": 0.22032333042194527, "rouge2_fmeasure_stderr": 0.001852412446316337, "rouge2_precision": 0.2175811124968712, "rouge2_precision_stderr": 0.0019112105040248436, "rouge2_recall": 0.2409573657644072, "rouge2_recall_stderr": 0.0023044463187969245, "rougeL_fmeasure": 0.3334482257131471, "rougeL_fmeasure_stderr": 0.0017866016624809113, "rougeL_precision": 0.33057426692006453, "rougeL_precision_stderr": 0.0020126164762619228, "rougeL_recall": 0.3619557938558922, "rougeL_recall_stderr": 0.0024177454342242813, "rougeLsum_fmeasure": 0.3816770816558835, "rougeLsum_fmeasure_stderr": 0.002007876237485467, "rougeLsum_precision": 0.37808218718729336, "rougeLsum_precision_stderr": 0.0022431062970265465, "rougeLsum_recall": 0.41390580636859037, "rougeLsum_recall_stderr": 0.0026729564370056775}}, "4": {"generate_text_restaurant": {"bleu": 12.468466502215751, "bleu_stderr": 0.09990648109577639, "rouge1_fmeasure": 0.45529352918386296, "rouge1_fmeasure_stderr": 0.0019729158566691536, "rouge1_precision": 0.45220424528442504, "rouge1_precision_stderr": 0.002337808233031794, "rouge1_recall": 0.4921231264159867, "rouge1_recall_stderr": 0.0027602306430746792, "rouge2_fmeasure": 0.21993594978939215, "rouge2_fmeasure_stderr": 0.001850258705420329, "rouge2_precision": 0.21806635417357498, "rouge2_precision_stderr": 0.0019446195531272607, "rouge2_recall": 0.2401062314536212, "rouge2_recall_stderr": 0.00230192039014965, "rougeL_fmeasure": 0.3320801238250487, "rougeL_fmeasure_stderr": 0.001796774794658876, "rougeL_precision": 0.32979623685018056, "rougeL_precision_stderr": 0.002025955430723721, "rougeL_recall": 0.35968401480469203, "rougeL_recall_stderr": 0.0024104842159546787, "rougeLsum_fmeasure": 0.38073707749695784, "rougeLsum_fmeasure_stderr": 0.0020603619333531575, "rougeLsum_precision": 0.37807324998983866, "rougeLsum_precision_stderr": 0.002305718057682757, "rougeLsum_recall": 0.4118614121463698, "rougeLsum_recall_stderr": 0.0027125157827382973}}, "5": {"generate_text_restaurant": {"bleu": 12.11108163274697, "bleu_stderr": 0.1511086038189987, "rouge1_fmeasure": 0.45448318084710065, "rouge1_fmeasure_stderr": 0.0019502573764627392, "rouge1_precision": 0.4494689335142834, "rouge1_precision_stderr": 0.0023139900407069896, "rouge1_recall": 0.49189724241328564, "rouge1_recall_stderr": 0.002720672072662264, "rouge2_fmeasure": 0.21793438022656353, "rouge2_fmeasure_stderr": 0.0018382910372242867, "rouge2_precision": 0.21502011952369768, "rouge2_precision_stderr": 0.0018993108425412735, "rouge2_recall": 0.23790406650497267, "rouge2_recall_stderr": 0.002257304748049696, "rougeL_fmeasure": 0.33100721144592343, "rougeL_fmeasure_stderr": 0.0017930595424372638, "rougeL_precision": 0.3270263161597272, "rougeL_precision_stderr": 0.001992100365911566, "rougeL_recall": 0.3590215756959229, "rougeL_recall_stderr": 0.0023843408170312035, "rougeLsum_fmeasure": 0.3800458168267657, "rougeLsum_fmeasure_stderr": 0.002019377882054415, "rougeLsum_precision": 0.3757504695199971, "rougeLsum_precision_stderr": 0.00226312608847076, "rougeLsum_recall": 0.4115209951658525, "rougeLsum_recall_stderr": 0.002643403932683506}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.4185935712214466, "bleu_stderr": 0.11565432283117048, "rouge1_fmeasure": 0.22302727328338223, "rouge1_fmeasure_stderr": 0.002543649203320007, "rouge1_precision": 0.16079100538036975, "rouge1_precision_stderr": 0.0019317155011328527, "rouge1_recall": 0.38463169025745175, "rouge1_recall_stderr": 0.004526539026922928, "rouge2_fmeasure": 0.056442306230818876, "rouge2_fmeasure_stderr": 0.0016804520319396248, "rouge2_precision": 0.03999269013558242, "rouge2_precision_stderr": 0.0011971308201383902, "rouge2_recall": 0.10113339981669754, "rouge2_recall_stderr": 0.0031343748451553286, "rougeL_fmeasure": 0.16717619407627404, "rougeL_fmeasure_stderr": 0.0019481211634922503, "rougeL_precision": 0.12033234686017578, "rougeL_precision_stderr": 0.0014585203532609563, "rougeL_recall": 0.2903778922407286, "rougeL_recall_stderr": 0.003692559366146235, "rougeLsum_fmeasure": 0.17608763758164225, "rougeLsum_fmeasure_stderr": 0.002145834312930374, "rougeLsum_precision": 0.12659083153302422, "rougeLsum_precision_stderr": 0.0015911326356590116, "rougeLsum_recall": 0.30622228336959245, "rougeLsum_recall_stderr": 0.004035233836387343}}, "1": {"article_DOC_summary": {"bleu": 1.4753760157360585, "bleu_stderr": 0.053942288193643405, "rouge1_fmeasure": 0.17409533783255507, "rouge1_fmeasure_stderr": 0.002615183917058679, "rouge1_precision": 0.123975541885696, "rouge1_precision_stderr": 0.001937069913810257, "rouge1_recall": 0.3041634033606052, "rouge1_recall_stderr": 0.004462494037919621, "rouge2_fmeasure": 0.03598895109620969, "rouge2_fmeasure_stderr": 0.001496555270960858, "rouge2_precision": 0.02530985857085611, "rouge2_precision_stderr": 0.0010516965631042789, "rouge2_recall": 0.06497657707857857, "rouge2_recall_stderr": 0.0028062229327394097, "rougeL_fmeasure": 0.13719393806212205, "rougeL_fmeasure_stderr": 0.001988792672158033, "rougeL_precision": 0.09739516038141627, "rougeL_precision_stderr": 0.0014498365795769053, "rougeL_recall": 0.24191669859950216, "rougeL_recall_stderr": 0.0036028516933205405, "rougeLsum_fmeasure": 0.1371396960626393, "rougeLsum_fmeasure_stderr": 0.0021254210571197236, "rougeLsum_precision": 0.0973784808310999, "rougeLsum_precision_stderr": 0.0015541782441477088, "rougeLsum_recall": 0.24166767773481146, "rougeLsum_recall_stderr": 0.0037775246270440404}}, "2": {"article_DOC_summary": {"bleu": 1.4658077495541642, "bleu_stderr": 0.05656663077206122, "rouge1_fmeasure": 0.17850901621898715, "rouge1_fmeasure_stderr": 0.002631185065208956, "rouge1_precision": 0.12701574570405383, "rouge1_precision_stderr": 0.0019497483407488042, "rouge1_recall": 0.31209308297568034, "rouge1_recall_stderr": 0.0044823221140013695, "rouge2_fmeasure": 0.03728951183116744, "rouge2_fmeasure_stderr": 0.0014839130164513042, "rouge2_precision": 0.02628489874277305, "rouge2_precision_stderr": 0.0010496767573945315, "rouge2_recall": 0.06678550185926611, "rouge2_recall_stderr": 0.002741050501727463, "rougeL_fmeasure": 0.14040861303541324, "rougeL_fmeasure_stderr": 0.001957888420474033, "rougeL_precision": 0.09965436966997658, "rougeL_precision_stderr": 0.0014375571522234208, "rougeL_recall": 0.24737259975268608, "rougeL_recall_stderr": 0.003487423921250019, "rougeLsum_fmeasure": 0.1402274218021141, "rougeLsum_fmeasure_stderr": 0.00214102737994103, "rougeLsum_precision": 0.0995909554171278, "rougeLsum_precision_stderr": 0.001569684395558083, "rougeLsum_recall": 0.2465747406900749, "rougeLsum_recall_stderr": 0.0037567150545319778}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.31606716251794437, "bleu_stderr": 0.02563944351471059, "rouge1_fmeasure": 0.10246108718545202, "rouge1_fmeasure_stderr": 0.0017992092481067788, "rouge1_precision": 0.06664761750887295, "rouge1_precision_stderr": 0.0013732959148702218, "rouge1_recall": 0.2940457093239289, "rouge1_recall_stderr": 0.004555499438691762, "rouge2_fmeasure": 0.04746695901782121, "rouge2_fmeasure_stderr": 0.0011129470648980045, "rouge2_precision": 0.030425492420252335, "rouge2_precision_stderr": 0.0007598510657945697, "rouge2_recall": 0.1409722690208349, "rouge2_recall_stderr": 0.0031027610869315144, "rougeL_fmeasure": 0.09860897831373452, "rougeL_fmeasure_stderr": 0.00170202692564, "rougeL_precision": 0.06400114097684483, "rougeL_precision_stderr": 0.0012983914696295615, "rougeL_recall": 0.2849573873935468, "rougeL_recall_stderr": 0.004452046846530438, "rougeLsum_fmeasure": 0.09784615421581211, "rougeLsum_fmeasure_stderr": 0.001694890791053129, "rougeLsum_precision": 0.06366499205115346, "rougeLsum_precision_stderr": 0.0013067510671992333, "rougeLsum_recall": 0.281020892101209, "rougeLsum_recall_stderr": 0.004305691903165883}}, "1": {"PALM_prompt": {"bleu": 0.4594356029919803, "bleu_stderr": 0.025125558903473632, "rouge1_fmeasure": 0.10986827089930207, "rouge1_fmeasure_stderr": 0.0017427056384300328, "rouge1_precision": 0.07027188536501679, "rouge1_precision_stderr": 0.0012558026173472138, "rouge1_recall": 0.3450765288308058, "rouge1_recall_stderr": 0.005055990718374951, "rouge2_fmeasure": 0.052665704185835084, "rouge2_fmeasure_stderr": 0.001116219363919427, "rouge2_precision": 0.03347891417115085, "rouge2_precision_stderr": 0.0007753123397897254, "rouge2_recall": 0.1749435024767411, "rouge2_recall_stderr": 0.0036147215578007344, "rougeL_fmeasure": 0.10429693361721804, "rougeL_fmeasure_stderr": 0.0016079944655777501, "rougeL_precision": 0.06659999496172114, "rougeL_precision_stderr": 0.001145948530216064, "rougeL_recall": 0.32753689197885233, "rougeL_recall_stderr": 0.004724892115185279, "rougeLsum_fmeasure": 0.10453836123127251, "rougeLsum_fmeasure_stderr": 0.0016341468252465234, "rougeLsum_precision": 0.06687788385741289, "rougeLsum_precision_stderr": 0.0011787909083336645, "rougeLsum_recall": 0.32738351484636163, "rougeLsum_recall_stderr": 0.004663465991453483}}, "2": {"PALM_prompt": {"bleu": 0.4626917691641033, "bleu_stderr": 0.026701626014007735, "rouge1_fmeasure": 0.11037120367033523, "rouge1_fmeasure_stderr": 0.0016020141226316521, "rouge1_precision": 0.07008037252994849, "rouge1_precision_stderr": 0.0012035993637679488, "rouge1_recall": 0.36892015349821927, "rouge1_recall_stderr": 0.0052030339175267275, "rouge2_fmeasure": 0.05157944416358925, "rouge2_fmeasure_stderr": 0.0010362585811456641, "rouge2_precision": 0.032540593106790174, "rouge2_precision_stderr": 0.000731515411120624, "rouge2_recall": 0.18396134575469794, "rouge2_recall_stderr": 0.0036503252463729982, "rougeL_fmeasure": 0.10359385842660361, "rougeL_fmeasure_stderr": 0.001472076271258242, "rougeL_precision": 0.06586167470031352, "rougeL_precision_stderr": 0.0011239315244069698, "rougeL_recall": 0.3440427226978943, "rougeL_recall_stderr": 0.0046759733900977925, "rougeLsum_fmeasure": 0.1053829136834338, "rougeLsum_fmeasure_stderr": 0.0015312528661206545, "rougeLsum_precision": 0.06702892136385862, "rougeLsum_precision_stderr": 0.0011636056187413796, "rougeLsum_recall": 0.3498407885544722, "rougeLsum_recall_stderr": 0.004802606835414834}}, "3": {"PALM_prompt": {"bleu": 0.5229218783075383, "bleu_stderr": 0.03287750689960854, "rouge1_fmeasure": 0.11129898486093194, "rouge1_fmeasure_stderr": 0.001603697213150543, "rouge1_precision": 0.07078968601889724, "rouge1_precision_stderr": 0.0012530189540338403, "rouge1_recall": 0.370854174943212, "rouge1_recall_stderr": 0.005131257813808184, "rouge2_fmeasure": 0.052270761889885206, "rouge2_fmeasure_stderr": 0.00103207142359725, "rouge2_precision": 0.03298499960390668, "rouge2_precision_stderr": 0.0007381370373430346, "rouge2_recall": 0.18675978429729426, "rouge2_recall_stderr": 0.0037180617775293043, "rougeL_fmeasure": 0.10352172121642274, "rougeL_fmeasure_stderr": 0.001462770227679166, "rougeL_precision": 0.06597257056782777, "rougeL_precision_stderr": 0.0011757840654232027, "rougeL_recall": 0.3435597898095353, "rougeL_recall_stderr": 0.004593814760542546, "rougeLsum_fmeasure": 0.10603634362382153, "rougeLsum_fmeasure_stderr": 0.0015192749230585549, "rougeLsum_precision": 0.06757005923050743, "rougeLsum_precision_stderr": 0.001208590145084183, "rougeLsum_recall": 0.35225955402286013, "rougeLsum_recall_stderr": 0.004772649571059099}}, "4": {"PALM_prompt": {"bleu": 0.6153677621128861, "bleu_stderr": 0.06053705735401149, "rouge1_fmeasure": 0.11264103250558938, "rouge1_fmeasure_stderr": 0.00163118942788917, "rouge1_precision": 0.07071127507433782, "rouge1_precision_stderr": 0.0011520192275434789, "rouge1_recall": 0.38147578757395145, "rouge1_recall_stderr": 0.005244875143584364, "rouge2_fmeasure": 0.05295652860215851, "rouge2_fmeasure_stderr": 0.0010479736263979194, "rouge2_precision": 0.032992739334882344, "rouge2_precision_stderr": 0.0007156556105984286, "rouge2_recall": 0.1932194114953707, "rouge2_recall_stderr": 0.00374859289181986, "rougeL_fmeasure": 0.10299927861932978, "rougeL_fmeasure_stderr": 0.0014284397671676585, "rougeL_precision": 0.06471147959433785, "rougeL_precision_stderr": 0.0010212506145839731, "rougeL_recall": 0.3483975351615344, "rougeL_recall_stderr": 0.004606426897749366, "rougeLsum_fmeasure": 0.10684797875052807, "rougeLsum_fmeasure_stderr": 0.0015286400891968755, "rougeLsum_precision": 0.06714267179992058, "rougeLsum_precision_stderr": 0.0010872321667830566, "rougeLsum_recall": 0.3609556696570317, "rougeLsum_recall_stderr": 0.004821987801086811}}, "5": {"PALM_prompt": {"bleu": 0.6522310718826546, "bleu_stderr": 0.0342543434913143, "rouge1_fmeasure": 0.11496831895443499, "rouge1_fmeasure_stderr": 0.0016131648938140647, "rouge1_precision": 0.07224798328936255, "rouge1_precision_stderr": 0.0011956217365912398, "rouge1_recall": 0.3930127103047457, "rouge1_recall_stderr": 0.005342634817712522, "rouge2_fmeasure": 0.054266401974880724, "rouge2_fmeasure_stderr": 0.0010279738093731765, "rouge2_precision": 0.03383327897010777, "rouge2_precision_stderr": 0.0007155324088112528, "rouge2_recall": 0.20017152367590102, "rouge2_recall_stderr": 0.0038194708260410088, "rougeL_fmeasure": 0.10413378847184503, "rougeL_fmeasure_stderr": 0.001404569572763983, "rougeL_precision": 0.06558923187444043, "rougeL_precision_stderr": 0.0010757591438449354, "rougeL_recall": 0.35557332046269224, "rougeL_recall_stderr": 0.0046352766809419025, "rougeLsum_fmeasure": 0.10879985852497424, "rougeLsum_fmeasure_stderr": 0.001514236323280586, "rougeLsum_precision": 0.0684743244702778, "rougeLsum_precision_stderr": 0.0011401249172099674, "rougeLsum_recall": 0.3715362630204789, "rougeLsum_recall_stderr": 0.004921937595664833}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8695101649478336, "bleu_stderr": 0.0774798723948649, "rouge1_fmeasure": 0.18492809283993106, "rouge1_fmeasure_stderr": 0.0019138467632652185, "rouge1_precision": 0.16049598964045364, "rouge1_precision_stderr": 0.0019462983151563367, "rouge1_recall": 0.2676713790563651, "rouge1_recall_stderr": 0.0028735302834564675, "rouge2_fmeasure": 0.04047796652620078, "rouge2_fmeasure_stderr": 0.0009161392431729087, "rouge2_precision": 0.034697662184252506, "rouge2_precision_stderr": 0.0008205861453711819, "rouge2_recall": 0.06110103740366906, "rouge2_recall_stderr": 0.0015502965185361641, "rougeL_fmeasure": 0.13965276114673036, "rougeL_fmeasure_stderr": 0.0013470236487103265, "rougeL_precision": 0.11976282577213969, "rougeL_precision_stderr": 0.0013342259804847978, "rougeL_recall": 0.20766916878715935, "rougeL_recall_stderr": 0.0023245522996809573, "rougeLsum_fmeasure": 0.17014737831445742, "rougeLsum_fmeasure_stderr": 0.0017471450796099739, "rougeLsum_precision": 0.14751551166205068, "rougeLsum_precision_stderr": 0.001780031941907223, "rougeLsum_recall": 0.24735686937216053, "rougeLsum_recall_stderr": 0.0026833728446082504}}, "1": {"tldr_en": {"bleu": 3.2332562389930515, "bleu_stderr": 0.0799951186296727, "rouge1_fmeasure": 0.23690998702342858, "rouge1_fmeasure_stderr": 0.0019890231968532827, "rouge1_precision": 0.20729964457048256, "rouge1_precision_stderr": 0.002274245844031677, "rouge1_recall": 0.33969139617897154, "rouge1_recall_stderr": 0.002833556197863864, "rouge2_fmeasure": 0.0622366147541512, "rouge2_fmeasure_stderr": 0.0011025942027406593, "rouge2_precision": 0.05460146452130771, "rouge2_precision_stderr": 0.001093799845706477, "rouge2_recall": 0.09225300993047651, "rouge2_recall_stderr": 0.0018248020314785354, "rougeL_fmeasure": 0.16727654774531026, "rougeL_fmeasure_stderr": 0.0013621791164943034, "rougeL_precision": 0.14525073667888475, "rougeL_precision_stderr": 0.0015687845277167008, "rougeL_recall": 0.24628845510672517, "rougeL_recall_stderr": 0.0023082894542053194, "rougeLsum_fmeasure": 0.2230496791129934, "rougeLsum_fmeasure_stderr": 0.0018757377324494942, "rougeLsum_precision": 0.19509659175047353, "rougeLsum_precision_stderr": 0.0021449940715420894, "rougeLsum_recall": 0.3204235648372041, "rougeLsum_recall_stderr": 0.0027128359683571725}}, "2": {"tldr_en": {"bleu": 3.5155467010727994, "bleu_stderr": 0.06141556601214586, "rouge1_fmeasure": 0.235556648636045, "rouge1_fmeasure_stderr": 0.0018848294908913575, "rouge1_precision": 0.20824164042833873, "rouge1_precision_stderr": 0.002234214827494051, "rouge1_recall": 0.3342680558497238, "rouge1_recall_stderr": 0.002719351845027267, "rouge2_fmeasure": 0.06251395339429003, "rouge2_fmeasure_stderr": 0.0011032340747877004, "rouge2_precision": 0.0553447765818299, "rouge2_precision_stderr": 0.001097279257804817, "rouge2_recall": 0.09158152498363732, "rouge2_recall_stderr": 0.00182638815550646, "rougeL_fmeasure": 0.16869690772741758, "rougeL_fmeasure_stderr": 0.0013497024922833358, "rougeL_precision": 0.14804254896591035, "rougeL_precision_stderr": 0.001582109014998804, "rougeL_recall": 0.24506636928994607, "rougeL_recall_stderr": 0.002277815665553641, "rougeLsum_fmeasure": 0.22311024372097865, "rougeLsum_fmeasure_stderr": 0.0017776265096001729, "rougeLsum_precision": 0.19708639575292217, "rougeLsum_precision_stderr": 0.0021089064966209187, "rougeLsum_recall": 0.31720897725992786, "rougeLsum_recall_stderr": 0.002596567797319611}}, "3": {"tldr_en": {"bleu": 3.504980488837104, "bleu_stderr": 0.07033302280883086, "rouge1_fmeasure": 0.1960859652047859, "rouge1_fmeasure_stderr": 0.0023366060464774423, "rouge1_precision": 0.17811439171989488, "rouge1_precision_stderr": 0.002560275363708373, "rouge1_recall": 0.27886090799993984, "rouge1_recall_stderr": 0.0034448901496326586, "rouge2_fmeasure": 0.0523326312861241, "rouge2_fmeasure_stderr": 0.0011285281285534974, "rouge2_precision": 0.04742286059353845, "rouge2_precision_stderr": 0.0011604513875678288, "rouge2_recall": 0.07742946215061461, "rouge2_recall_stderr": 0.001866825934930981, "rougeL_fmeasure": 0.14162844117143206, "rougeL_fmeasure_stderr": 0.0016899728317895791, "rougeL_precision": 0.12852552196126066, "rougeL_precision_stderr": 0.0018964172540042934, "rougeL_recall": 0.20545470726815437, "rougeL_recall_stderr": 0.0027326532184944424, "rougeLsum_fmeasure": 0.18556326066261042, "rougeLsum_fmeasure_stderr": 0.002215627597176747, "rougeLsum_precision": 0.16868014464807685, "rougeLsum_precision_stderr": 0.002438611490893621, "rougeLsum_recall": 0.26411127102133913, "rougeLsum_recall_stderr": 0.003281374640568146}}, "4": {"tldr_en": {"bleu": 0.9086939324156383, "bleu_stderr": 0.05855823137144976, "rouge1_fmeasure": 0.06282501006696624, "rouge1_fmeasure_stderr": 0.0020888213951751034, "rouge1_precision": 0.057686885490487505, "rouge1_precision_stderr": 0.002077970469715233, "rouge1_recall": 0.09385574649347103, "rouge1_recall_stderr": 0.0031890083282469467, "rouge2_fmeasure": 0.016837586892487218, "rouge2_fmeasure_stderr": 0.000790811753626707, "rouge2_precision": 0.015071313264611244, "rouge2_precision_stderr": 0.0007774016714959856, "rouge2_recall": 0.026883790080977472, "rouge2_recall_stderr": 0.0014075436212441558, "rougeL_fmeasure": 0.0461316780279477, "rougeL_fmeasure_stderr": 0.0015235672554804573, "rougeL_precision": 0.042638568215269866, "rougeL_precision_stderr": 0.0015504702364272222, "rougeL_recall": 0.07032908173779572, "rougeL_recall_stderr": 0.0024572632014890276, "rougeLsum_fmeasure": 0.05917832063435797, "rougeLsum_fmeasure_stderr": 0.001970707904332247, "rougeLsum_precision": 0.05433748981297747, "rougeLsum_precision_stderr": 0.0019639155165544035, "rougeLsum_recall": 0.0885417619284403, "rougeLsum_recall_stderr": 0.0030172562108610624}}, "5": {"tldr_en": {"bleu": 6.161453193982374e-06, "bleu_stderr": 1.2644510555241896e-05, "rouge1_fmeasure": 0.010180728590080238, "rouge1_fmeasure_stderr": 0.0009430571774107825, "rouge1_precision": 0.009722491969586005, "rouge1_precision_stderr": 0.0009564982386894138, "rouge1_recall": 0.015357271032115101, "rouge1_recall_stderr": 0.001448960021557378, "rouge2_fmeasure": 0.00281382806546607, "rouge2_fmeasure_stderr": 0.00035214119256937883, "rouge2_precision": 0.0024973101781513925, "rouge2_precision_stderr": 0.000343733403974376, "rouge2_recall": 0.0045816927074040755, "rouge2_recall_stderr": 0.0006153391357680226, "rougeL_fmeasure": 0.00759156476319381, "rougeL_fmeasure_stderr": 0.0007014066070016197, "rougeL_precision": 0.0072997926636164424, "rougeL_precision_stderr": 0.0007272617797612332, "rougeL_recall": 0.011767619522932812, "rougeL_recall_stderr": 0.0011356853451163903, "rougeLsum_fmeasure": 0.00955145315759681, "rougeLsum_fmeasure_stderr": 0.0008912980968449334, "rougeLsum_precision": 0.009129679261559958, "rougeLsum_precision_stderr": 0.0009077175108677888, "rougeLsum_recall": 0.014481417293915742, "rougeLsum_recall_stderr": 0.0013773388496390326}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.03454795333510589, "bleu_stderr": 0.010722245919714395, "rouge1_fmeasure": 0.009983083620835689, "rouge1_fmeasure_stderr": 0.0007216517112036717, "rouge1_precision": 0.008330877542768076, "rouge1_precision_stderr": 0.0006789467135202171, "rouge1_recall": 0.01512213229502354, "rouge1_recall_stderr": 0.0010598056749213033, "rouge2_fmeasure": 0.0015194719382464828, "rouge2_fmeasure_stderr": 0.0002186426847142056, "rouge2_precision": 0.0012462152328688286, "rouge2_precision_stderr": 0.0001850740246851996, "rouge2_recall": 0.0021941504736507373, "rouge2_recall_stderr": 0.00031789022057788433, "rougeL_fmeasure": 0.009451610525850281, "rougeL_fmeasure_stderr": 0.0006635438560036569, "rougeL_precision": 0.0076678486076520065, "rougeL_precision_stderr": 0.0005780206165870198, "rougeL_recall": 0.014540667195886358, "rougeL_recall_stderr": 0.0010048165449253396, "rougeLsum_fmeasure": 0.008102826704141256, "rougeLsum_fmeasure_stderr": 0.0005948187054074003, "rougeLsum_precision": 0.006854342833172471, "rougeLsum_precision_stderr": 0.0005808821610184307, "rougeLsum_recall": 0.01222352423290077, "rougeLsum_recall_stderr": 0.0008679529695598302}}, "1": {"generate_text_restaurant": {"bleu": 10.110683183008721, "bleu_stderr": 0.10856580026912326, "rouge1_fmeasure": 0.4261931768500333, "rouge1_fmeasure_stderr": 0.0020338904219177937, "rouge1_precision": 0.4273885051454009, "rouge1_precision_stderr": 0.0023386626945434775, "rouge1_recall": 0.4619168895673238, "rouge1_recall_stderr": 0.002910080537073359, "rouge2_fmeasure": 0.18637115332137155, "rouge2_fmeasure_stderr": 0.0017490252095085136, "rouge2_precision": 0.18621605242973624, "rouge2_precision_stderr": 0.0018420288481743644, "rouge2_recall": 0.20425913452615088, "rouge2_recall_stderr": 0.002171191668790452, "rougeL_fmeasure": 0.30007012360581886, "rougeL_fmeasure_stderr": 0.0017073194851728616, "rougeL_precision": 0.30141579223639636, "rougeL_precision_stderr": 0.0019324093600811055, "rougeL_recall": 0.3256528804455511, "rougeL_recall_stderr": 0.002364609519861813, "rougeLsum_fmeasure": 0.35343380568090416, "rougeLsum_fmeasure_stderr": 0.0020183396806264077, "rougeLsum_precision": 0.3549239657030783, "rougeLsum_precision_stderr": 0.0022624041723470494, "rougeLsum_recall": 0.38280848019973457, "rougeLsum_recall_stderr": 0.002716917275700221}}, "2": {"generate_text_restaurant": {"bleu": 11.76941570816289, "bleu_stderr": 0.15573900245748828, "rouge1_fmeasure": 0.45040818552274897, "rouge1_fmeasure_stderr": 0.001961202454945367, "rouge1_precision": 0.44751016860851195, "rouge1_precision_stderr": 0.002306632595634685, "rouge1_recall": 0.4895112200039268, "rouge1_recall_stderr": 0.0028440252957840422, "rouge2_fmeasure": 0.21309279835498804, "rouge2_fmeasure_stderr": 0.0018084420773651576, "rouge2_precision": 0.21124466241603482, "rouge2_precision_stderr": 0.001920141813437847, "rouge2_recall": 0.23408285111523244, "rouge2_recall_stderr": 0.0022741821628275683, "rougeL_fmeasure": 0.3267435445462492, "rougeL_fmeasure_stderr": 0.0017435336297606837, "rougeL_precision": 0.3250865088115516, "rougeL_precision_stderr": 0.0020090753832284394, "rougeL_recall": 0.35555451083591993, "rougeL_recall_stderr": 0.0024076335160479975, "rougeLsum_fmeasure": 0.37491707249596695, "rougeLsum_fmeasure_stderr": 0.0020146819194002373, "rougeLsum_precision": 0.3727655581276283, "rougeLsum_precision_stderr": 0.002277906383748808, "rougeLsum_recall": 0.40742138628815283, "rougeLsum_recall_stderr": 0.0027125462981769503}}, "3": {"generate_text_restaurant": {"bleu": 12.357057866617781, "bleu_stderr": 0.11622210780282331, "rouge1_fmeasure": 0.4562426842321961, "rouge1_fmeasure_stderr": 0.0019386232886629857, "rouge1_precision": 0.45217337104133665, "rouge1_precision_stderr": 0.002294505554504784, "rouge1_recall": 0.4942792865855869, "rouge1_recall_stderr": 0.002743597811824126, "rouge2_fmeasure": 0.22032333042194527, "rouge2_fmeasure_stderr": 0.001852412446316337, "rouge2_precision": 0.2175811124968712, "rouge2_precision_stderr": 0.0019112105040248436, "rouge2_recall": 0.2409573657644072, "rouge2_recall_stderr": 0.0023044463187969245, "rougeL_fmeasure": 0.3334482257131471, "rougeL_fmeasure_stderr": 0.0017866016624809113, "rougeL_precision": 0.33057426692006453, "rougeL_precision_stderr": 0.0020126164762619228, "rougeL_recall": 0.3619557938558922, "rougeL_recall_stderr": 0.0024177454342242813, "rougeLsum_fmeasure": 0.3816770816558835, "rougeLsum_fmeasure_stderr": 0.002007876237485467, "rougeLsum_precision": 0.37808218718729336, "rougeLsum_precision_stderr": 0.0022431062970265465, "rougeLsum_recall": 0.41390580636859037, "rougeLsum_recall_stderr": 0.0026729564370056775}}, "4": {"generate_text_restaurant": {"bleu": 12.468466502215751, "bleu_stderr": 0.09990648109577639, "rouge1_fmeasure": 0.45529352918386296, "rouge1_fmeasure_stderr": 0.0019729158566691536, "rouge1_precision": 0.45220424528442504, "rouge1_precision_stderr": 0.002337808233031794, "rouge1_recall": 0.4921231264159867, "rouge1_recall_stderr": 0.0027602306430746792, "rouge2_fmeasure": 0.21993594978939215, "rouge2_fmeasure_stderr": 0.001850258705420329, "rouge2_precision": 0.21806635417357498, "rouge2_precision_stderr": 0.0019446195531272607, "rouge2_recall": 0.2401062314536212, "rouge2_recall_stderr": 0.00230192039014965, "rougeL_fmeasure": 0.3320801238250487, "rougeL_fmeasure_stderr": 0.001796774794658876, "rougeL_precision": 0.32979623685018056, "rougeL_precision_stderr": 0.002025955430723721, "rougeL_recall": 0.35968401480469203, "rougeL_recall_stderr": 0.0024104842159546787, "rougeLsum_fmeasure": 0.38073707749695784, "rougeLsum_fmeasure_stderr": 0.0020603619333531575, "rougeLsum_precision": 0.37807324998983866, "rougeLsum_precision_stderr": 0.002305718057682757, "rougeLsum_recall": 0.4118614121463698, "rougeLsum_recall_stderr": 0.0027125157827382973}}, "5": {"generate_text_restaurant": {"bleu": 12.11108163274697, "bleu_stderr": 0.1511086038189987, "rouge1_fmeasure": 0.45448318084710065, "rouge1_fmeasure_stderr": 0.0019502573764627392, "rouge1_precision": 0.4494689335142834, "rouge1_precision_stderr": 0.0023139900407069896, "rouge1_recall": 0.49189724241328564, "rouge1_recall_stderr": 0.002720672072662264, "rouge2_fmeasure": 0.21793438022656353, "rouge2_fmeasure_stderr": 0.0018382910372242867, "rouge2_precision": 0.21502011952369768, "rouge2_precision_stderr": 0.0018993108425412735, "rouge2_recall": 0.23790406650497267, "rouge2_recall_stderr": 0.002257304748049696, "rougeL_fmeasure": 0.33100721144592343, "rougeL_fmeasure_stderr": 0.0017930595424372638, "rougeL_precision": 0.3270263161597272, "rougeL_precision_stderr": 0.001992100365911566, "rougeL_recall": 0.3590215756959229, "rougeL_recall_stderr": 0.0023843408170312035, "rougeLsum_fmeasure": 0.3800458168267657, "rougeLsum_fmeasure_stderr": 0.002019377882054415, "rougeLsum_precision": 0.3757504695199971, "rougeLsum_precision_stderr": 0.00226312608847076, "rougeLsum_recall": 0.4115209951658525, "rougeLsum_recall_stderr": 0.002643403932683506}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.4185935712214466, "bleu_stderr": 0.11565432283117048, "rouge1_fmeasure": 0.22302727328338223, "rouge1_fmeasure_stderr": 0.002543649203320007, "rouge1_precision": 0.16079100538036975, "rouge1_precision_stderr": 0.0019317155011328527, "rouge1_recall": 0.38463169025745175, "rouge1_recall_stderr": 0.004526539026922928, "rouge2_fmeasure": 0.056442306230818876, "rouge2_fmeasure_stderr": 0.0016804520319396248, "rouge2_precision": 0.03999269013558242, "rouge2_precision_stderr": 0.0011971308201383902, "rouge2_recall": 0.10113339981669754, "rouge2_recall_stderr": 0.0031343748451553286, "rougeL_fmeasure": 0.16717619407627404, "rougeL_fmeasure_stderr": 0.0019481211634922503, "rougeL_precision": 0.12033234686017578, "rougeL_precision_stderr": 0.0014585203532609563, "rougeL_recall": 0.2903778922407286, "rougeL_recall_stderr": 0.003692559366146235, "rougeLsum_fmeasure": 0.17608763758164225, "rougeLsum_fmeasure_stderr": 0.002145834312930374, "rougeLsum_precision": 0.12659083153302422, "rougeLsum_precision_stderr": 0.0015911326356590116, "rougeLsum_recall": 0.30622228336959245, "rougeLsum_recall_stderr": 0.004035233836387343}}, "1": {"article_DOC_summary": {"bleu": 1.4753760157360585, "bleu_stderr": 0.053942288193643405, "rouge1_fmeasure": 0.17409533783255507, "rouge1_fmeasure_stderr": 0.002615183917058679, "rouge1_precision": 0.123975541885696, "rouge1_precision_stderr": 0.001937069913810257, "rouge1_recall": 0.3041634033606052, "rouge1_recall_stderr": 0.004462494037919621, "rouge2_fmeasure": 0.03598895109620969, "rouge2_fmeasure_stderr": 0.001496555270960858, "rouge2_precision": 0.02530985857085611, "rouge2_precision_stderr": 0.0010516965631042789, "rouge2_recall": 0.06497657707857857, "rouge2_recall_stderr": 0.0028062229327394097, "rougeL_fmeasure": 0.13719393806212205, "rougeL_fmeasure_stderr": 0.001988792672158033, "rougeL_precision": 0.09739516038141627, "rougeL_precision_stderr": 0.0014498365795769053, "rougeL_recall": 0.24191669859950216, "rougeL_recall_stderr": 0.0036028516933205405, "rougeLsum_fmeasure": 0.1371396960626393, "rougeLsum_fmeasure_stderr": 0.0021254210571197236, "rougeLsum_precision": 0.0973784808310999, "rougeLsum_precision_stderr": 0.0015541782441477088, "rougeLsum_recall": 0.24166767773481146, "rougeLsum_recall_stderr": 0.0037775246270440404}}, "2": {"article_DOC_summary": {"bleu": 1.4658077495541642, "bleu_stderr": 0.05656663077206122, "rouge1_fmeasure": 0.17850901621898715, "rouge1_fmeasure_stderr": 0.002631185065208956, "rouge1_precision": 0.12701574570405383, "rouge1_precision_stderr": 0.0019497483407488042, "rouge1_recall": 0.31209308297568034, "rouge1_recall_stderr": 0.0044823221140013695, "rouge2_fmeasure": 0.03728951183116744, "rouge2_fmeasure_stderr": 0.0014839130164513042, "rouge2_precision": 0.02628489874277305, "rouge2_precision_stderr": 0.0010496767573945315, "rouge2_recall": 0.06678550185926611, "rouge2_recall_stderr": 0.002741050501727463, "rougeL_fmeasure": 0.14040861303541324, "rougeL_fmeasure_stderr": 0.001957888420474033, "rougeL_precision": 0.09965436966997658, "rougeL_precision_stderr": 0.0014375571522234208, "rougeL_recall": 0.24737259975268608, "rougeL_recall_stderr": 0.003487423921250019, "rougeLsum_fmeasure": 0.1402274218021141, "rougeLsum_fmeasure_stderr": 0.00214102737994103, "rougeLsum_precision": 0.0995909554171278, "rougeLsum_precision_stderr": 0.001569684395558083, "rougeLsum_recall": 0.2465747406900749, "rougeLsum_recall_stderr": 0.0037567150545319778}}, "3": {"article_DOC_summary": {"bleu": 1.6273581251014975, "bleu_stderr": 0.08297152976943713, "rouge1_fmeasure": 0.1741691220448831, "rouge1_fmeasure_stderr": 0.0027529808698691072, "rouge1_precision": 0.12661685012408697, "rouge1_precision_stderr": 0.0021093732895884356, "rouge1_recall": 0.30042742106922904, "rouge1_recall_stderr": 0.004793262475513874, "rouge2_fmeasure": 0.03848207108872355, "rouge2_fmeasure_stderr": 0.0015349514843449124, "rouge2_precision": 0.027457383236278155, "rouge2_precision_stderr": 0.0011005894460101174, "rouge2_recall": 0.06872339351563556, "rouge2_recall_stderr": 0.0028264005600779733, "rougeL_fmeasure": 0.13848485640839917, "rougeL_fmeasure_stderr": 0.002136410452186076, "rougeL_precision": 0.10054376826357136, "rougeL_precision_stderr": 0.0016355832838556733, "rougeL_recall": 0.24044972226758587, "rougeL_recall_stderr": 0.0038558430038978684, "rougeLsum_fmeasure": 0.13794850154717367, "rougeLsum_fmeasure_stderr": 0.002276629118568194, "rougeLsum_precision": 0.10022509495467177, "rougeLsum_precision_stderr": 0.001733799927568896, "rougeLsum_recall": 0.2391221551244155, "rougeLsum_recall_stderr": 0.0040668979594426224}}, "4": {"article_DOC_summary": {"bleu": 0.9297944407512662, "bleu_stderr": 0.10452568299951195, "rouge1_fmeasure": 0.049025842019891894, "rouge1_fmeasure_stderr": 0.002761462686946734, "rouge1_precision": 0.03984641222755382, "rouge1_precision_stderr": 0.0023583006672399276, "rouge1_recall": 0.07781373363270898, "rouge1_recall_stderr": 0.0044678999283725125, "rouge2_fmeasure": 0.011276208078698832, "rouge2_fmeasure_stderr": 0.0010882501335885415, "rouge2_precision": 0.008839770813015032, "rouge2_precision_stderr": 0.0009132501416122141, "rouge2_recall": 0.018595501088417683, "rouge2_recall_stderr": 0.0018046371518142589, "rougeL_fmeasure": 0.03899309107571578, "rougeL_fmeasure_stderr": 0.0022095258634566655, "rougeL_precision": 0.03193279708670165, "rougeL_precision_stderr": 0.0019219705874772618, "rougeL_recall": 0.06201212154363477, "rougeL_recall_stderr": 0.0036181165069829016, "rougeLsum_fmeasure": 0.03871502132726552, "rougeLsum_fmeasure_stderr": 0.0021963924186294845, "rougeLsum_precision": 0.03172097901935682, "rougeLsum_precision_stderr": 0.0019113003105201993, "rougeLsum_recall": 0.06162603968016436, "rougeLsum_recall_stderr": 0.003613145314591428}}, "5": {"article_DOC_summary": {"bleu": 1.3519576766092788e-16, "bleu_stderr": 1.7148754157361587e-13, "rouge1_fmeasure": 0.002273937140206459, "rouge1_fmeasure_stderr": 0.0006514011124893405, "rouge1_precision": 0.0018747157977051885, "rouge1_precision_stderr": 0.0005217271715436668, "rouge1_recall": 0.003549283577549333, "rouge1_recall_stderr": 0.0010767599221403455, "rouge2_fmeasure": 0.0005538280315731646, "rouge2_fmeasure_stderr": 0.00021055797770595916, "rouge2_precision": 0.00047917769370046107, "rouge2_precision_stderr": 0.00017934012188651556, "rouge2_recall": 0.0007855487711203583, "rouge2_recall_stderr": 0.00030506211146057416, "rougeL_fmeasure": 0.0019019709421522583, "rougeL_fmeasure_stderr": 0.0005434789167320575, "rougeL_precision": 0.001591076043425948, "rougeL_precision_stderr": 0.00044966429790604164, "rougeL_recall": 0.002863749112737054, "rougeL_recall_stderr": 0.0008323073183414411, "rougeLsum_fmeasure": 0.0018351130157390045, "rougeLsum_fmeasure_stderr": 0.0005055637602691865, "rougeLsum_precision": 0.00156298958828556, "rougeLsum_precision_stderr": 0.00043239685994550666, "rougeLsum_recall": 0.0027767015907513405, "rougeLsum_recall_stderr": 0.0007833271709551212}}}} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..19edb35db07b8ef0c855e7f07156f67a64c0789b --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.20824164042833873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002234214827494051 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3342680558497238, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002719351845027267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.235556648636045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018848294908913575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0553447765818299, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001097279257804817 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.09158152498363732, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00182638815550646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06251395339429003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011032340747877004 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14804254896591035, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001582109014998804 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.24506636928994607, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002277815665553641 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.16869690772741758, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013497024922833358 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.19708639575292217, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021089064966209187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.31720897725992786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002596567797319611 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.22311024372097865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017776265096001729 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.5155467010727994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06141556601214586 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..73d9d6f50e150e3fe309a86c1d0656e4eb86b780 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17811439171989488, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002560275363708373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.27886090799993984, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0034448901496326586 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1960859652047859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023366060464774423 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04742286059353845, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011604513875678288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07742946215061461, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001866825934930981 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0523326312861241, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011285281285534974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12852552196126066, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018964172540042934 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20545470726815437, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0027326532184944424 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14162844117143206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016899728317895791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.16868014464807685, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002438611490893621 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.26411127102133913, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003281374640568146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18556326066261042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002215627597176747 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.504980488837104, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07033302280883086 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4989d1950bfaeb36840b3fa5876da3b004664426 --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.057686885490487505, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002077970469715233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.09385574649347103, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0031890083282469467 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06282501006696624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020888213951751034 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.015071313264611244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007774016714959856 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.026883790080977472, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014075436212441558 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.016837586892487218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000790811753626707 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.042638568215269866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015504702364272222 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.07032908173779572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024572632014890276 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0461316780279477, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015235672554804573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.05433748981297747, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019639155165544035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.0885417619284403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030172562108610624 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05917832063435797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001970707904332247 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.9086939324156383, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05855823137144976 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2f0f17baf4a6eaefd60d012e9f6875d3465a451c --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.009722491969586005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009564982386894138 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.015357271032115101, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001448960021557378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010180728590080238, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009430571774107825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0024973101781513925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000343733403974376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0045816927074040755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006153391357680226 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.00281382806546607, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00035214119256937883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.0072997926636164424, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007272617797612332 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.011767619522932812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011356853451163903 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.00759156476319381, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007014066070016197 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.009129679261559958, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009077175108677888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.014481417293915742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0013773388496390326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.00955145315759681, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008912980968449334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 6.161453193982374e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.2644510555241896e-05 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..83b0ba41c774c425e99748653f28a452b58d349b --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.12661685012408697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021093732895884356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.30042742106922904, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004793262475513874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.1741691220448831, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027529808698691072 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.027457383236278155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011005894460101174 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06872339351563556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028264005600779733 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03848207108872355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015349514843449124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10054376826357136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016355832838556733 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.24044972226758587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038558430038978684 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.13848485640839917, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002136410452186076 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10022509495467177, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001733799927568896 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2391221551244155, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0040668979594426224 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.13794850154717367, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002276629118568194 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.6273581251014975, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08297152976943713 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0a70853228438d58b057d7a135798c4b9184222c --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.03984641222755382, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0023583006672399276 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.07781373363270898, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0044678999283725125 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.049025842019891894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002761462686946734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.008839770813015032, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009132501416122141 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.018595501088417683, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018046371518142589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.011276208078698832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010882501335885415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03193279708670165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0019219705874772618 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06201212154363477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036181165069829016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.03899309107571578, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022095258634566655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03172097901935682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019113003105201993 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.06162603968016436, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003613145314591428 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.03871502132726552, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021963924186294845 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.9297944407512662, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10452568299951195 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c27c2401753e0fd6c2e87d11514d9f47829d81dd --- /dev/null +++ b/8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0018747157977051885, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0005217271715436668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.003549283577549333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0010767599221403455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002273937140206459, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006514011124893405 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00047917769370046107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00017934012188651556 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0007855487711203583, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00030506211146057416 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0005538280315731646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00021055797770595916 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.001591076043425948, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00044966429790604164 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.002863749112737054, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0008323073183414411 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0019019709421522583, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005434789167320575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.00156298958828556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00043239685994550666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0027767015907513405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0007833271709551212 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0018351130157390045, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005055637602691865 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.3519576766092788e-16, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.7148754157361587e-13 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c92ad25fb5f576500a9c23b085c1edf8d4603b --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.22135115607796563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002466762662359004}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3499365920126553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029103670069339765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24678912656203086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002037101161511086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06270201642870808, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001276661389687198}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.10236690363201062, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019960722912629107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06998330324965386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012429781942009475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.15820160856971907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001803134257764531}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.25667905638991584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002404233442497606}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17719154028942327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014768440178427862}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.20961741686683294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023343675964983485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.332399007963623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027852830969350697}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2338283238156954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019162970817694011}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.9658166802535484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07925927931568665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c4b2201f86132a32849f618998c83cba82cfe069 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1877914020690942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002780603639421943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28469073541532774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003552554685317243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2013934895923683, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024101386411320806}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05231468260770844, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00130393473012376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08172649028680513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018880261158435462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05612430703445463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011944949124968374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1363160410891958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021057536768160825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21040942089474482, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002825298497100027}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14593849406958256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017576382763014175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.17789698988425304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026448838544979162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2699836129539428, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003393162716447727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19069625305891352, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002278271153942546}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.971856308559798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09289145672499902}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..aa7c82700416de8337fe3691bfdecb78f5773d55 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05986373469352374, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022413247620520016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09299451528533728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003182662484214728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06294177276794305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021234318631902995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.01607251699061772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008627575848833899}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.026217246289428207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013573347317069598}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01701967942257221, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008072426334728445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04416852187112644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016929957101032731}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06977139563193656, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002453474360518891}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04627660806628133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001562406165753214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.056539524010580496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002122275707889583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.08802275906234883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00301921948656467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05949644164272974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002009657736950104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.8216828826403728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06116206229347704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab5a308eeaabba788fdcab40473a9eb91e2ae9a --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.010026792511549231, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009722843833023145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.014997054249515373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014200291615747175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010348132353778797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009643798928416376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.002614111292262381, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003582186570242107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0043272321523977674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000556694180882867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.002947850654970828, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00038101201967536574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.007696209279533862, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007541223407519663}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.01149712833393674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001106103576275103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007846213305722558, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007330941188437008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.009502293292635411, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009264781441363666}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.01418906709799213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013478881437725126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009790499222718256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009156513115428641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.939501013520158e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.1416042954126946e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..941f6473fe22065d144213985d2b6273d3ab5025 --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1451633530537778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022577436785926417}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3454037481783916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005024002677948197}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2005996992165322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002922307185273832}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03399383589660795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011970297160356436}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08515533208064306, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00305750154560904}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.047805324803931785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001665552432783648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1082275772002201, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016303512277000601}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26018618860799503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038502171494847245}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15011348459355398, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021656916632942095}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1163681459054013, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018244541208628394}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.27969335970329556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004315552235909818}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1614133142157355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00243279237750366}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.0101663995130106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13035938633140529}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bc810924074a1b9d9700d4b8abbfc2bda0c1f65d --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04604504358511682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026703224008911334}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08906761715367195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050274498862542885}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05612514432374489, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030731828239370837}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.011276377923656494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012571358140648242}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.023708158418892494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002002425619950796}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013937278041326827, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011465942789645456}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03500836518053345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021377284451202998}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06645887444884206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037898342177074525}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04184105680154411, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002285547204915818}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0378191473406664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002272186759930814}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.07247078079866072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004137833255697601}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04557476167159472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002503478149457618}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.1210416443624807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18886560236043304}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cb9c274f331fe4dd0808d0c8a9a35e1b9be68e5a --- /dev/null +++ b/8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0022472741378328047, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006307356242527256}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.004133903246439151, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011668769662793363}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0026984759176508304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000738763092128848}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00044268370843308837, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00017799352097082119}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0008499613285961788, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003602171813535447}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.000551357929598578, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00022851833366345053}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0016336554354190846, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004503176195113186}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.003027477992470196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008730946685903953}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001991063584250406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005515138571257219}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0017629252737840803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005014954502254297}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0031744555131817086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008937483641872015}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.002097477776387118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005739873846094995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7628232823336373e-17, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.8521803760603906e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e82a862e016faad30f8bf35ca59ff924daee9a3c 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:633b67c477b50749ee0df318352f9c26b8246632303e42a04130096db86134b3 +size 18901229 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7949aa3cfdbd9d1efb4a4b64a3158753c0d536e0 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0375c0b10c4c1729c1811d0079a86d3b6f67ab5947b77f4d83c597606dc8a1 +size 24315240 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..48584829a9ffd9eb07200355b91237fb3e6837ae 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd549626f85f15b551ff61f922588c672e9dea8045df9f459c0b25c747f712b4 +size 29469414 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6d83b922644b250b273750c3552368c361da7ef0 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f21fa77bb6a64f6ca3dc23d6cb94b8078265eb91430652eedcf44cb11c30b0 +size 34800828 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..19aeba82ea07c740dc8469fb72e722079b465694 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c98fd50bf29300b85ea2c8c42cc1dac19e47a5deddd7ab319507ef9553809abe +size 9647374 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fd6d2f9a6471e4ba452fe8e1aff34504e4656efe 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8140c5d43dadd582c3d79c1c06d9ee99ca459810a3b1e56368fb4f12a2b4cb63 +size 11673693 diff --git a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..faf7f702b409426519b79cf0d9bad89f2d2e1b84 100644 --- a/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04979a4e0509f41ac7b144862e4b7aded18e0e4e2e69fdcbff4d50eb8aed3dc2 +size 13899458 diff --git a/8b7178b35b/evaluation/generation/merged.csv b/8b7178b35b/evaluation/generation/merged.csv index a283cd0aac210b52330af608f5a8fec5c07d89a2..55c02ebcf1988ac012c8bad456207d7a6b2943d2 100644 --- a/8b7178b35b/evaluation/generation/merged.csv +++ b/8b7178b35b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04451173090845151 gem_xsum,1,median,rouge2_fmeasure,0.04451173090845151 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05083267238349886 gem_xsum,2,median,rouge2_fmeasure,0.05083267238349886 -gem_xsum,2,average,multiple,0.04955506903105885 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.047805324803931785 +gem_xsum,3,median,rouge2_fmeasure,0.047805324803931785 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013937278041326827 +gem_xsum,4,median,rouge2_fmeasure,0.013937278041326827 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.000551357929598578 +gem_xsum,5,median,rouge2_fmeasure,0.000551357929598578 +gem_xsum,5,average,multiple,0.035159861311338955 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05212321093989505 web_nlg_en,0,median,rouge2_fmeasure,0.05212321093989505 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.058822699744876736 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03900064299554168 wiki_lingua_en,0,median,rouge2_fmeasure,0.03900064299554168 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06561109570973458 wiki_lingua_en,1,median,rouge2_fmeasure,0.06561109570973458 -wiki_lingua_en,1,average,multiple,0.05230586935263813 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06998330324965386 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06998330324965386 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05612430703445463 +wiki_lingua_en,3,median,rouge2_fmeasure,0.05612430703445463 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01701967942257221 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01701967942257221 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002947850654970828 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002947850654970828 +wiki_lingua_en,5,average,multiple,0.04178114651115463 diff --git a/8b7178b35b/evaluation/generation/merged.json b/8b7178b35b/evaluation/generation/merged.json index 967747d9cf1c2ec22e0e6b99a07c41b10ab6eb34..c3082fb73a27ac54b74213e465632f554a0fe56d 100644 --- a/8b7178b35b/evaluation/generation/merged.json +++ b/8b7178b35b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32917980943922837, "bleu_stderr": 0.028436979522399264, "rouge1_fmeasure": 0.11092359878172349, "rouge1_fmeasure_stderr": 0.0021083743244835647, "rouge1_precision": 0.07454038083418474, "rouge1_precision_stderr": 0.001766454284632946, "rouge1_recall": 0.30290353802392916, "rouge1_recall_stderr": 0.004835002819067963, "rouge2_fmeasure": 0.05212321093989505, "rouge2_fmeasure_stderr": 0.0013412640798549406, "rouge2_precision": 0.03405534335780326, "rouge2_precision_stderr": 0.0009795255375204467, "rouge2_recall": 0.14725829338052365, "rouge2_recall_stderr": 0.0033496237842817397, "rougeL_fmeasure": 0.1059469350929842, "rougeL_fmeasure_stderr": 0.0019397934888169463, "rougeL_precision": 0.07086594439969038, "rougeL_precision_stderr": 0.0016082892660627373, "rougeL_recall": 0.29226896530687846, "rougeL_recall_stderr": 0.00466575095416219, "rougeLsum_fmeasure": 0.1060475373688485, "rougeLsum_fmeasure_stderr": 0.0019809059744097615, "rougeLsum_precision": 0.0711343154862909, "rougeLsum_precision_stderr": 0.001649638611842115, "rougeLsum_recall": 0.2903231316909427, "rougeLsum_recall_stderr": 0.004582555178091478}}, "1": {"PALM_prompt": {"bleu": 0.5285748263104777, "bleu_stderr": 0.05154539253732287, "rouge1_fmeasure": 0.1218665522358165, "rouge1_fmeasure_stderr": 0.0019158434392141466, "rouge1_precision": 0.07826126347996014, "rouge1_precision_stderr": 0.001412323883175065, "rouge1_recall": 0.38231102690350377, "rouge1_recall_stderr": 0.0053881864820272286, "rouge2_fmeasure": 0.058822699744876736, "rouge2_fmeasure_stderr": 0.0012405607498159584, "rouge2_precision": 0.037391127717872924, "rouge2_precision_stderr": 0.0008664479676328054, "rouge2_recall": 0.19839458948033734, "rouge2_recall_stderr": 0.004010973785657583, "rougeL_fmeasure": 0.11559328292690177, "rougeL_fmeasure_stderr": 0.001764131335387481, "rougeL_precision": 0.07407975935897594, "rougeL_precision_stderr": 0.0012854765812861943, "rougeL_recall": 0.3629315958992141, "rougeL_recall_stderr": 0.005069344323262684, "rougeLsum_fmeasure": 0.11544948693652962, "rougeLsum_fmeasure_stderr": 0.0017842924340211324, "rougeLsum_precision": 0.07415121638849131, "rougeLsum_precision_stderr": 0.001318142684082723, "rougeLsum_recall": 0.3614370946625198, "rougeLsum_recall_stderr": 0.004964006794864235}}, "2": {"PALM_prompt": {"bleu": 0.6066453441760302, "bleu_stderr": 0.02113523644793951, "rouge1_fmeasure": 0.12529215862638043, "rouge1_fmeasure_stderr": 0.0018259072915703836, "rouge1_precision": 0.08017864305448325, "rouge1_precision_stderr": 0.0014244751923494794, "rouge1_recall": 0.40480204267227765, "rouge1_recall_stderr": 0.005138963278066339, "rouge2_fmeasure": 0.059482069465634994, "rouge2_fmeasure_stderr": 0.0011595782202414246, "rouge2_precision": 0.03746116833011539, "rouge2_precision_stderr": 0.0008054382500683709, "rouge2_recall": 0.2078327469176558, "rouge2_recall_stderr": 0.003905498610846629, "rougeL_fmeasure": 0.11765794068577944, "rougeL_fmeasure_stderr": 0.0016600536639441099, "rougeL_precision": 0.0752258255540516, "rougeL_precision_stderr": 0.0013017426600869019, "rougeL_recall": 0.37921672738638584, "rougeL_recall_stderr": 0.004724125031038082, "rougeLsum_fmeasure": 0.11886578443137756, "rougeLsum_fmeasure_stderr": 0.001710265056607144, "rougeLsum_precision": 0.07612402487838264, "rougeLsum_precision_stderr": 0.0013499817758363393, "rougeLsum_recall": 0.3833520840989382, "rougeLsum_recall_stderr": 0.004787520387833171}}, "3": {"PALM_prompt": {"bleu": 0.6647933502215259, "bleu_stderr": 0.04923433986433192, "rouge1_fmeasure": 0.12320395767637426, "rouge1_fmeasure_stderr": 0.0017848979409527468, "rouge1_precision": 0.0781723421483203, "rouge1_precision_stderr": 0.0013023299811420648, "rouge1_recall": 0.4046567231900049, "rouge1_recall_stderr": 0.005137890673994109, "rouge2_fmeasure": 0.05931767519712951, "rouge2_fmeasure_stderr": 0.0011649351056668448, "rouge2_precision": 0.037339890555931926, "rouge2_precision_stderr": 0.0008144074272058679, "rouge2_recall": 0.2106193724526676, "rouge2_recall_stderr": 0.003927640735394401, "rougeL_fmeasure": 0.1150473458151772, "rougeL_fmeasure_stderr": 0.001619592187307252, "rougeL_precision": 0.07298298160971306, "rougeL_precision_stderr": 0.0011729650830812643, "rougeL_recall": 0.37541756571956564, "rougeL_recall_stderr": 0.004598099062061018, "rougeLsum_fmeasure": 0.1167920444605632, "rougeLsum_fmeasure_stderr": 0.0016685972133379994, "rougeLsum_precision": 0.0741403065968848, "rougeLsum_precision_stderr": 0.0012209548996141538, "rougeLsum_recall": 0.38296199684709586, "rougeLsum_recall_stderr": 0.004753402389332228}}, "4": {"PALM_prompt": {"bleu": 0.6566656848041873, "bleu_stderr": 0.03433459283803238, "rouge1_fmeasure": 0.12360753221007545, "rouge1_fmeasure_stderr": 0.001737583890202225, "rouge1_precision": 0.07771155841369212, "rouge1_precision_stderr": 0.0012374008770267796, "rouge1_recall": 0.4160226119034251, "rouge1_recall_stderr": 0.005178203274221526, "rouge2_fmeasure": 0.05886517252410502, "rouge2_fmeasure_stderr": 0.0011242425541824303, "rouge2_precision": 0.036708952345678336, "rouge2_precision_stderr": 0.0007671406759682118, "rouge2_recall": 0.2142530872032143, "rouge2_recall_stderr": 0.003938344188890309, "rougeL_fmeasure": 0.11421873690833906, "rougeL_fmeasure_stderr": 0.0015701261033135242, "rougeL_precision": 0.07189004426671647, "rougeL_precision_stderr": 0.001119612418604389, "rougeL_recall": 0.381893221081078, "rougeL_recall_stderr": 0.004577229345577291, "rougeLsum_fmeasure": 0.11777877062717519, "rougeLsum_fmeasure_stderr": 0.0016474868402205588, "rougeLsum_precision": 0.0741079421040528, "rougeLsum_precision_stderr": 0.0011766632070640108, "rougeLsum_recall": 0.39536320623753024, "rougeLsum_recall_stderr": 0.004834163190323313}}, "5": {"PALM_prompt": {"bleu": 0.7608515369903156, "bleu_stderr": 0.05544338902785279, "rouge1_fmeasure": 0.12542945154484628, "rouge1_fmeasure_stderr": 0.0017246642532628591, "rouge1_precision": 0.07874251282939974, "rouge1_precision_stderr": 0.0012390781355337446, "rouge1_recall": 0.4274915903037139, "rouge1_recall_stderr": 0.0052178214889688, "rouge2_fmeasure": 0.059919950381812144, "rouge2_fmeasure_stderr": 0.0011224239943820847, "rouge2_precision": 0.03725892588478996, "rouge2_precision_stderr": 0.0007691554935322187, "rouge2_recall": 0.22193078536394883, "rouge2_recall_stderr": 0.003999018222823969, "rougeL_fmeasure": 0.11503193912763204, "rougeL_fmeasure_stderr": 0.0015332329154476773, "rougeL_precision": 0.07225670171843512, "rougeL_precision_stderr": 0.001101933354024317, "rougeL_recall": 0.390252551254915, "rougeL_recall_stderr": 0.004591599013709285, "rougeLsum_fmeasure": 0.11871364855558929, "rougeLsum_fmeasure_stderr": 0.0016202511632686142, "rougeLsum_precision": 0.07457349456330893, "rougeLsum_precision_stderr": 0.0011662890645907052, "rougeLsum_recall": 0.40390693081462914, "rougeLsum_recall_stderr": 0.004850186326089204}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.7311283501769414, "bleu_stderr": 0.07292848467922458, "rouge1_fmeasure": 0.18112864366064585, "rouge1_fmeasure_stderr": 0.0019261005266957636, "rouge1_precision": 0.15412231045043578, "rouge1_precision_stderr": 0.0019133423603737852, "rouge1_recall": 0.26596610436897833, "rouge1_recall_stderr": 0.0029078617781084343, "rouge2_fmeasure": 0.03900064299554168, "rouge2_fmeasure_stderr": 0.0009076451372655918, "rouge2_precision": 0.032745788295643126, "rouge2_precision_stderr": 0.000790295587407252, "rouge2_recall": 0.06015567293207339, "rouge2_recall_stderr": 0.0015850260882465787, "rougeL_fmeasure": 0.13940156755327857, "rougeL_fmeasure_stderr": 0.0013668263941440067, "rougeL_precision": 0.1171936085278983, "rougeL_precision_stderr": 0.00132039683220814, "rougeL_recall": 0.21005905052835785, "rougeL_recall_stderr": 0.002357155817310683, "rougeLsum_fmeasure": 0.16698321861271978, "rougeLsum_fmeasure_stderr": 0.0017578260329471065, "rougeLsum_precision": 0.14187748986714835, "rougeLsum_precision_stderr": 0.0017420221740131032, "rougeLsum_recall": 0.2462694168340833, "rougeLsum_recall_stderr": 0.002709379098155295}}, "1": {"tldr_en": {"bleu": 3.5922679202027847, "bleu_stderr": 0.05984463327930867, "rouge1_fmeasure": 0.24327622952771533, "rouge1_fmeasure_stderr": 0.002037288100861591, "rouge1_precision": 0.2123430992404537, "rouge1_precision_stderr": 0.002316190617323844, "rouge1_recall": 0.349869999562048, "rouge1_recall_stderr": 0.0028776097446615145, "rouge2_fmeasure": 0.06561109570973458, "rouge2_fmeasure_stderr": 0.0011754883309790891, "rouge2_precision": 0.05726169522995292, "rouge2_precision_stderr": 0.0011305457211571272, "rouge2_recall": 0.09761319314398223, "rouge2_recall_stderr": 0.001931346687472591, "rougeL_fmeasure": 0.17044218669835878, "rougeL_fmeasure_stderr": 0.001393233918351284, "rougeL_precision": 0.147593790999516, "rougeL_precision_stderr": 0.0015843013179494662, "rougeL_recall": 0.25163325238017475, "rougeL_recall_stderr": 0.0023185700212327545, "rougeLsum_fmeasure": 0.22970291581408894, "rougeLsum_fmeasure_stderr": 0.0019241182984497916, "rougeLsum_precision": 0.20040929059968782, "rougeLsum_precision_stderr": 0.0021906873980096493, "rougeLsum_recall": 0.33105684407128727, "rougeLsum_recall_stderr": 0.0027539492668926546}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.17854969577688523, "bleu_stderr": 0.02350515901837014, "rouge1_fmeasure": 0.14982661873128114, "rouge1_fmeasure_stderr": 0.0010486948402501617, "rouge1_precision": 0.34481003507654684, "rouge1_precision_stderr": 0.0024960642876163215, "rouge1_recall": 0.11140479511426793, "rouge1_recall_stderr": 0.0012326825192403048, "rouge2_fmeasure": 0.010042498177274786, "rouge2_fmeasure_stderr": 0.0004526496809829142, "rouge2_precision": 0.02652033223020393, "rouge2_precision_stderr": 0.0012104924394046484, "rouge2_recall": 0.007738642998342415, "rouge2_recall_stderr": 0.00040331722748865453, "rougeL_fmeasure": 0.13556698518148783, "rougeL_fmeasure_stderr": 0.0010029610186637107, "rougeL_precision": 0.31148977629184244, "rougeL_precision_stderr": 0.0024243327495898086, "rougeL_recall": 0.10191142585459571, "rougeL_recall_stderr": 0.0012109507603402815, "rougeLsum_fmeasure": 0.1364054799839495, "rougeLsum_fmeasure_stderr": 0.0010086900095262318, "rougeLsum_precision": 0.3173006007013496, "rougeLsum_precision_stderr": 0.002513826319580817, "rougeLsum_recall": 0.10030998557950792, "rougeLsum_recall_stderr": 0.0010758913304541808}}, "1": {"generate_text_restaurant": {"bleu": 8.550714951198435, "bleu_stderr": 0.06794955954281734, "rouge1_fmeasure": 0.3940891272584714, "rouge1_fmeasure_stderr": 0.00233253448293986, "rouge1_precision": 0.37441894592411723, "rouge1_precision_stderr": 0.00285043399066679, "rouge1_recall": 0.4681809355112701, "rouge1_recall_stderr": 0.0029387631347472464, "rouge2_fmeasure": 0.17310173953900088, "rouge2_fmeasure_stderr": 0.0017673218319637287, "rouge2_precision": 0.16514863563264978, "rouge2_precision_stderr": 0.0019360690941877992, "rouge2_recall": 0.20649495583013436, "rouge2_recall_stderr": 0.0021624061718912763, "rougeL_fmeasure": 0.2901588169021053, "rougeL_fmeasure_stderr": 0.0017017464729554115, "rougeL_precision": 0.2743117208263101, "rougeL_precision_stderr": 0.0021002662608094564, "rougeL_recall": 0.34924701312209255, "rougeL_recall_stderr": 0.0023705538319379695, "rougeLsum_fmeasure": 0.327671306182913, "rougeLsum_fmeasure_stderr": 0.0022171207606254796, "rougeLsum_precision": 0.31201556645452433, "rougeLsum_precision_stderr": 0.002622342685847327, "rougeLsum_recall": 0.3885350242195471, "rougeLsum_recall_stderr": 0.0027432106106179788}}, "2": {"generate_text_restaurant": {"bleu": 11.867888027230435, "bleu_stderr": 0.15989930331829488, "rouge1_fmeasure": 0.4442817836593718, "rouge1_fmeasure_stderr": 0.0019988950330102873, "rouge1_precision": 0.44246406218448636, "rouge1_precision_stderr": 0.002320846131050864, "rouge1_recall": 0.4822237492014395, "rouge1_recall_stderr": 0.0028480697292934715, "rouge2_fmeasure": 0.20571413885055867, "rouge2_fmeasure_stderr": 0.001815833869403785, "rouge2_precision": 0.20447987966953268, "rouge2_precision_stderr": 0.0019087131351150518, "rouge2_recall": 0.22575098541036898, "rouge2_recall_stderr": 0.0022679440038537117, "rougeL_fmeasure": 0.3232871465388061, "rougeL_fmeasure_stderr": 0.0017540023019821938, "rougeL_precision": 0.3220661789895318, "rougeL_precision_stderr": 0.0019807481977887516, "rougeL_recall": 0.35167106294020406, "rougeL_recall_stderr": 0.002412719701186727, "rougeLsum_fmeasure": 0.37187691948404467, "rougeLsum_fmeasure_stderr": 0.0020334507090512886, "rougeLsum_precision": 0.37040704761920407, "rougeLsum_precision_stderr": 0.002271535999762247, "rougeLsum_recall": 0.4036635693660681, "rougeLsum_recall_stderr": 0.0027100283713887775}}, "3": {"generate_text_restaurant": {"bleu": 12.176808245577714, "bleu_stderr": 0.1522141950985559, "rouge1_fmeasure": 0.4492578440402226, "rouge1_fmeasure_stderr": 0.0019667066974463896, "rouge1_precision": 0.44578562677217626, "rouge1_precision_stderr": 0.0022755220565382653, "rouge1_recall": 0.4889266631673112, "rouge1_recall_stderr": 0.0028806835048615135, "rouge2_fmeasure": 0.21198726153120898, "rouge2_fmeasure_stderr": 0.0018195540133290194, "rouge2_precision": 0.20934965847705986, "rouge2_precision_stderr": 0.0018748523719357, "rouge2_recall": 0.23374791215018093, "rouge2_recall_stderr": 0.0023263783868214196, "rougeL_fmeasure": 0.3276087887468011, "rougeL_fmeasure_stderr": 0.0017546007227396715, "rougeL_precision": 0.32511799875091574, "rougeL_precision_stderr": 0.001967046506806585, "rougeL_recall": 0.35739711489377646, "rougeL_recall_stderr": 0.002465697341506471, "rougeLsum_fmeasure": 0.3771411500357944, "rougeLsum_fmeasure_stderr": 0.0020137589520510426, "rougeLsum_precision": 0.3741382823179302, "rougeLsum_precision_stderr": 0.002234766274837387, "rougeLsum_recall": 0.4108171966535684, "rougeLsum_recall_stderr": 0.0027621282248548955}}, "4": {"generate_text_restaurant": {"bleu": 12.402384292539924, "bleu_stderr": 0.135024423037405, "rouge1_fmeasure": 0.4552455893734124, "rouge1_fmeasure_stderr": 0.0019583919126399917, "rouge1_precision": 0.4500297698600483, "rouge1_precision_stderr": 0.002279184782036985, "rouge1_recall": 0.49411520793728253, "rouge1_recall_stderr": 0.0028065754284846934, "rouge2_fmeasure": 0.2155975402534293, "rouge2_fmeasure_stderr": 0.001873826925854458, "rouge2_precision": 0.2124489885242478, "rouge2_precision_stderr": 0.001924872387744921, "rouge2_recall": 0.23653217550233827, "rouge2_recall_stderr": 0.0023355573505469554, "rougeL_fmeasure": 0.33116918396735845, "rougeL_fmeasure_stderr": 0.0017792810189211944, "rougeL_precision": 0.32709295569865143, "rougeL_precision_stderr": 0.001956608790763274, "rougeL_recall": 0.3602236836443227, "rougeL_recall_stderr": 0.002427496751180958, "rougeLsum_fmeasure": 0.3808847920405725, "rougeLsum_fmeasure_stderr": 0.002065156905180787, "rougeLsum_precision": 0.37596847671524797, "rougeLsum_precision_stderr": 0.002251960910942368, "rougeLsum_recall": 0.41399734380099557, "rougeLsum_recall_stderr": 0.0027661212857413163}}, "5": {"generate_text_restaurant": {"bleu": 12.262420161138401, "bleu_stderr": 0.18903914838075944, "rouge1_fmeasure": 0.45590768378496416, "rouge1_fmeasure_stderr": 0.001962173998693794, "rouge1_precision": 0.4494732628845236, "rouge1_precision_stderr": 0.0023023795088449608, "rouge1_recall": 0.49537684561519607, "rouge1_recall_stderr": 0.002767197249479356, "rouge2_fmeasure": 0.21627498227149344, "rouge2_fmeasure_stderr": 0.0018499821709783027, "rouge2_precision": 0.21297442064638186, "rouge2_precision_stderr": 0.001926070075686657, "rouge2_recall": 0.23703970881860875, "rouge2_recall_stderr": 0.0022735756287929677, "rougeL_fmeasure": 0.3331238951307013, "rougeL_fmeasure_stderr": 0.0017799650141595914, "rougeL_precision": 0.3281425210661801, "rougeL_precision_stderr": 0.001972901049188069, "rougeL_recall": 0.3628730729029585, "rougeL_recall_stderr": 0.0024176721948621425, "rougeLsum_fmeasure": 0.38246870938641964, "rougeLsum_fmeasure_stderr": 0.0020417804934292076, "rougeLsum_precision": 0.3768630011135112, "rougeLsum_precision_stderr": 0.0022664490914563948, "rougeLsum_recall": 0.41605530731114854, "rougeLsum_recall_stderr": 0.0027097026120309876}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.24608084382355, "bleu_stderr": 0.11799890497604383, "rouge1_fmeasure": 0.22178670977262957, "rouge1_fmeasure_stderr": 0.0026603762671440385, "rouge1_precision": 0.17400888678487844, "rouge1_precision_stderr": 0.002464330237468343, "rouge1_recall": 0.3502023417511915, "rouge1_recall_stderr": 0.004626152702942569, "rouge2_fmeasure": 0.05332080380122619, "rouge2_fmeasure_stderr": 0.0017188362970072153, "rouge2_precision": 0.040704743736010573, "rouge2_precision_stderr": 0.001393829442555013, "rouge2_recall": 0.08799878519146338, "rouge2_recall_stderr": 0.0029196986234636744, "rougeL_fmeasure": 0.16609949129334142, "rougeL_fmeasure_stderr": 0.0020543947028805716, "rougeL_precision": 0.12992137120722907, "rougeL_precision_stderr": 0.0018781656373678345, "rougeL_recall": 0.2641999836224796, "rougeL_recall_stderr": 0.0036812097901456225, "rougeLsum_fmeasure": 0.17331928576093303, "rougeLsum_fmeasure_stderr": 0.0022434557390865763, "rougeLsum_precision": 0.13526811734802355, "rougeLsum_precision_stderr": 0.001975414967835506, "rougeLsum_recall": 0.27591607890974656, "rougeLsum_recall_stderr": 0.004020504841332198}}, "1": {"article_DOC_summary": {"bleu": 1.8189118554354626, "bleu_stderr": 0.10029587869243096, "rouge1_fmeasure": 0.19671334605636295, "rouge1_fmeasure_stderr": 0.0027188446769093686, "rouge1_precision": 0.14019002697714425, "rouge1_precision_stderr": 0.002033439876939429, "rouge1_recall": 0.3437328928644267, "rouge1_recall_stderr": 0.004622184360550014, "rouge2_fmeasure": 0.04451173090845151, "rouge2_fmeasure_stderr": 0.0016237255917685457, "rouge2_precision": 0.031342583459769556, "rouge2_precision_stderr": 0.0011487670391079187, "rouge2_recall": 0.0803191311481453, "rouge2_recall_stderr": 0.003010334414958687, "rougeL_fmeasure": 0.14825510655652663, "rougeL_fmeasure_stderr": 0.0020028300424767575, "rougeL_precision": 0.1053445488770993, "rougeL_precision_stderr": 0.001476955053927121, "rougeL_recall": 0.2613986979531857, "rougeL_recall_stderr": 0.0035961912243557873, "rougeLsum_fmeasure": 0.1592072439127538, "rougeLsum_fmeasure_stderr": 0.002270863529627419, "rougeLsum_precision": 0.11322117707364136, "rougeLsum_precision_stderr": 0.001678631461460253, "rougeLsum_recall": 0.27993778037490374, "rougeLsum_recall_stderr": 0.003988822898958396}}, "2": {"article_DOC_summary": {"bleu": 2.0454443359589503, "bleu_stderr": 0.08368970692051826, "rouge1_fmeasure": 0.20889368966335237, "rouge1_fmeasure_stderr": 0.0026887281040736378, "rouge1_precision": 0.1488455450923004, "rouge1_precision_stderr": 0.002009481565468108, "rouge1_recall": 0.3642751817540943, "rouge1_recall_stderr": 0.004565498697439836, "rouge2_fmeasure": 0.05083267238349886, "rouge2_fmeasure_stderr": 0.001670794310427006, "rouge2_precision": 0.03580310577134537, "rouge2_precision_stderr": 0.0011818056791973873, "rouge2_recall": 0.09140711571916593, "rouge2_recall_stderr": 0.003096432205185466, "rougeL_fmeasure": 0.15708491403271296, "rougeL_fmeasure_stderr": 0.0019982552541995204, "rougeL_precision": 0.11164126156649677, "rougeL_precision_stderr": 0.001473760236619727, "rougeL_recall": 0.27611637516436727, "rougeL_recall_stderr": 0.003575935893484929, "rougeLsum_fmeasure": 0.1672065849237029, "rougeLsum_fmeasure_stderr": 0.002247088064282535, "rougeLsum_precision": 0.1188385887857819, "rougeLsum_precision_stderr": 0.0016510150185409913, "rougeLsum_recall": 0.2937542835590118, "rougeLsum_recall_stderr": 0.003999046140359821}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32917980943922837, "bleu_stderr": 0.028436979522399264, "rouge1_fmeasure": 0.11092359878172349, "rouge1_fmeasure_stderr": 0.0021083743244835647, "rouge1_precision": 0.07454038083418474, "rouge1_precision_stderr": 0.001766454284632946, "rouge1_recall": 0.30290353802392916, "rouge1_recall_stderr": 0.004835002819067963, "rouge2_fmeasure": 0.05212321093989505, "rouge2_fmeasure_stderr": 0.0013412640798549406, "rouge2_precision": 0.03405534335780326, "rouge2_precision_stderr": 0.0009795255375204467, "rouge2_recall": 0.14725829338052365, "rouge2_recall_stderr": 0.0033496237842817397, "rougeL_fmeasure": 0.1059469350929842, "rougeL_fmeasure_stderr": 0.0019397934888169463, "rougeL_precision": 0.07086594439969038, "rougeL_precision_stderr": 0.0016082892660627373, "rougeL_recall": 0.29226896530687846, "rougeL_recall_stderr": 0.00466575095416219, "rougeLsum_fmeasure": 0.1060475373688485, "rougeLsum_fmeasure_stderr": 0.0019809059744097615, "rougeLsum_precision": 0.0711343154862909, "rougeLsum_precision_stderr": 0.001649638611842115, "rougeLsum_recall": 0.2903231316909427, "rougeLsum_recall_stderr": 0.004582555178091478}}, "1": {"PALM_prompt": {"bleu": 0.5285748263104777, "bleu_stderr": 0.05154539253732287, "rouge1_fmeasure": 0.1218665522358165, "rouge1_fmeasure_stderr": 0.0019158434392141466, "rouge1_precision": 0.07826126347996014, "rouge1_precision_stderr": 0.001412323883175065, "rouge1_recall": 0.38231102690350377, "rouge1_recall_stderr": 0.0053881864820272286, "rouge2_fmeasure": 0.058822699744876736, "rouge2_fmeasure_stderr": 0.0012405607498159584, "rouge2_precision": 0.037391127717872924, "rouge2_precision_stderr": 0.0008664479676328054, "rouge2_recall": 0.19839458948033734, "rouge2_recall_stderr": 0.004010973785657583, "rougeL_fmeasure": 0.11559328292690177, "rougeL_fmeasure_stderr": 0.001764131335387481, "rougeL_precision": 0.07407975935897594, "rougeL_precision_stderr": 0.0012854765812861943, "rougeL_recall": 0.3629315958992141, "rougeL_recall_stderr": 0.005069344323262684, "rougeLsum_fmeasure": 0.11544948693652962, "rougeLsum_fmeasure_stderr": 0.0017842924340211324, "rougeLsum_precision": 0.07415121638849131, "rougeLsum_precision_stderr": 0.001318142684082723, "rougeLsum_recall": 0.3614370946625198, "rougeLsum_recall_stderr": 0.004964006794864235}}, "2": {"PALM_prompt": {"bleu": 0.6066453441760302, "bleu_stderr": 0.02113523644793951, "rouge1_fmeasure": 0.12529215862638043, "rouge1_fmeasure_stderr": 0.0018259072915703836, "rouge1_precision": 0.08017864305448325, "rouge1_precision_stderr": 0.0014244751923494794, "rouge1_recall": 0.40480204267227765, "rouge1_recall_stderr": 0.005138963278066339, "rouge2_fmeasure": 0.059482069465634994, "rouge2_fmeasure_stderr": 0.0011595782202414246, "rouge2_precision": 0.03746116833011539, "rouge2_precision_stderr": 0.0008054382500683709, "rouge2_recall": 0.2078327469176558, "rouge2_recall_stderr": 0.003905498610846629, "rougeL_fmeasure": 0.11765794068577944, "rougeL_fmeasure_stderr": 0.0016600536639441099, "rougeL_precision": 0.0752258255540516, "rougeL_precision_stderr": 0.0013017426600869019, "rougeL_recall": 0.37921672738638584, "rougeL_recall_stderr": 0.004724125031038082, "rougeLsum_fmeasure": 0.11886578443137756, "rougeLsum_fmeasure_stderr": 0.001710265056607144, "rougeLsum_precision": 0.07612402487838264, "rougeLsum_precision_stderr": 0.0013499817758363393, "rougeLsum_recall": 0.3833520840989382, "rougeLsum_recall_stderr": 0.004787520387833171}}, "3": {"PALM_prompt": {"bleu": 0.6647933502215259, "bleu_stderr": 0.04923433986433192, "rouge1_fmeasure": 0.12320395767637426, "rouge1_fmeasure_stderr": 0.0017848979409527468, "rouge1_precision": 0.0781723421483203, "rouge1_precision_stderr": 0.0013023299811420648, "rouge1_recall": 0.4046567231900049, "rouge1_recall_stderr": 0.005137890673994109, "rouge2_fmeasure": 0.05931767519712951, "rouge2_fmeasure_stderr": 0.0011649351056668448, "rouge2_precision": 0.037339890555931926, "rouge2_precision_stderr": 0.0008144074272058679, "rouge2_recall": 0.2106193724526676, "rouge2_recall_stderr": 0.003927640735394401, "rougeL_fmeasure": 0.1150473458151772, "rougeL_fmeasure_stderr": 0.001619592187307252, "rougeL_precision": 0.07298298160971306, "rougeL_precision_stderr": 0.0011729650830812643, "rougeL_recall": 0.37541756571956564, "rougeL_recall_stderr": 0.004598099062061018, "rougeLsum_fmeasure": 0.1167920444605632, "rougeLsum_fmeasure_stderr": 0.0016685972133379994, "rougeLsum_precision": 0.0741403065968848, "rougeLsum_precision_stderr": 0.0012209548996141538, "rougeLsum_recall": 0.38296199684709586, "rougeLsum_recall_stderr": 0.004753402389332228}}, "4": {"PALM_prompt": {"bleu": 0.6566656848041873, "bleu_stderr": 0.03433459283803238, "rouge1_fmeasure": 0.12360753221007545, "rouge1_fmeasure_stderr": 0.001737583890202225, "rouge1_precision": 0.07771155841369212, "rouge1_precision_stderr": 0.0012374008770267796, "rouge1_recall": 0.4160226119034251, "rouge1_recall_stderr": 0.005178203274221526, "rouge2_fmeasure": 0.05886517252410502, "rouge2_fmeasure_stderr": 0.0011242425541824303, "rouge2_precision": 0.036708952345678336, "rouge2_precision_stderr": 0.0007671406759682118, "rouge2_recall": 0.2142530872032143, "rouge2_recall_stderr": 0.003938344188890309, "rougeL_fmeasure": 0.11421873690833906, "rougeL_fmeasure_stderr": 0.0015701261033135242, "rougeL_precision": 0.07189004426671647, "rougeL_precision_stderr": 0.001119612418604389, "rougeL_recall": 0.381893221081078, "rougeL_recall_stderr": 0.004577229345577291, "rougeLsum_fmeasure": 0.11777877062717519, "rougeLsum_fmeasure_stderr": 0.0016474868402205588, "rougeLsum_precision": 0.0741079421040528, "rougeLsum_precision_stderr": 0.0011766632070640108, "rougeLsum_recall": 0.39536320623753024, "rougeLsum_recall_stderr": 0.004834163190323313}}, "5": {"PALM_prompt": {"bleu": 0.7608515369903156, "bleu_stderr": 0.05544338902785279, "rouge1_fmeasure": 0.12542945154484628, "rouge1_fmeasure_stderr": 0.0017246642532628591, "rouge1_precision": 0.07874251282939974, "rouge1_precision_stderr": 0.0012390781355337446, "rouge1_recall": 0.4274915903037139, "rouge1_recall_stderr": 0.0052178214889688, "rouge2_fmeasure": 0.059919950381812144, "rouge2_fmeasure_stderr": 0.0011224239943820847, "rouge2_precision": 0.03725892588478996, "rouge2_precision_stderr": 0.0007691554935322187, "rouge2_recall": 0.22193078536394883, "rouge2_recall_stderr": 0.003999018222823969, "rougeL_fmeasure": 0.11503193912763204, "rougeL_fmeasure_stderr": 0.0015332329154476773, "rougeL_precision": 0.07225670171843512, "rougeL_precision_stderr": 0.001101933354024317, "rougeL_recall": 0.390252551254915, "rougeL_recall_stderr": 0.004591599013709285, "rougeLsum_fmeasure": 0.11871364855558929, "rougeLsum_fmeasure_stderr": 0.0016202511632686142, "rougeLsum_precision": 0.07457349456330893, "rougeLsum_precision_stderr": 0.0011662890645907052, "rougeLsum_recall": 0.40390693081462914, "rougeLsum_recall_stderr": 0.004850186326089204}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.7311283501769414, "bleu_stderr": 0.07292848467922458, "rouge1_fmeasure": 0.18112864366064585, "rouge1_fmeasure_stderr": 0.0019261005266957636, "rouge1_precision": 0.15412231045043578, "rouge1_precision_stderr": 0.0019133423603737852, "rouge1_recall": 0.26596610436897833, "rouge1_recall_stderr": 0.0029078617781084343, "rouge2_fmeasure": 0.03900064299554168, "rouge2_fmeasure_stderr": 0.0009076451372655918, "rouge2_precision": 0.032745788295643126, "rouge2_precision_stderr": 0.000790295587407252, "rouge2_recall": 0.06015567293207339, "rouge2_recall_stderr": 0.0015850260882465787, "rougeL_fmeasure": 0.13940156755327857, "rougeL_fmeasure_stderr": 0.0013668263941440067, "rougeL_precision": 0.1171936085278983, "rougeL_precision_stderr": 0.00132039683220814, "rougeL_recall": 0.21005905052835785, "rougeL_recall_stderr": 0.002357155817310683, "rougeLsum_fmeasure": 0.16698321861271978, "rougeLsum_fmeasure_stderr": 0.0017578260329471065, "rougeLsum_precision": 0.14187748986714835, "rougeLsum_precision_stderr": 0.0017420221740131032, "rougeLsum_recall": 0.2462694168340833, "rougeLsum_recall_stderr": 0.002709379098155295}}, "1": {"tldr_en": {"bleu": 3.5922679202027847, "bleu_stderr": 0.05984463327930867, "rouge1_fmeasure": 0.24327622952771533, "rouge1_fmeasure_stderr": 0.002037288100861591, "rouge1_precision": 0.2123430992404537, "rouge1_precision_stderr": 0.002316190617323844, "rouge1_recall": 0.349869999562048, "rouge1_recall_stderr": 0.0028776097446615145, "rouge2_fmeasure": 0.06561109570973458, "rouge2_fmeasure_stderr": 0.0011754883309790891, "rouge2_precision": 0.05726169522995292, "rouge2_precision_stderr": 0.0011305457211571272, "rouge2_recall": 0.09761319314398223, "rouge2_recall_stderr": 0.001931346687472591, "rougeL_fmeasure": 0.17044218669835878, "rougeL_fmeasure_stderr": 0.001393233918351284, "rougeL_precision": 0.147593790999516, "rougeL_precision_stderr": 0.0015843013179494662, "rougeL_recall": 0.25163325238017475, "rougeL_recall_stderr": 0.0023185700212327545, "rougeLsum_fmeasure": 0.22970291581408894, "rougeLsum_fmeasure_stderr": 0.0019241182984497916, "rougeLsum_precision": 0.20040929059968782, "rougeLsum_precision_stderr": 0.0021906873980096493, "rougeLsum_recall": 0.33105684407128727, "rougeLsum_recall_stderr": 0.0027539492668926546}}, "2": {"tldr_en": {"bleu": 3.9658166802535484, "bleu_stderr": 0.07925927931568665, "rouge1_fmeasure": 0.24678912656203086, "rouge1_fmeasure_stderr": 0.002037101161511086, "rouge1_precision": 0.22135115607796563, "rouge1_precision_stderr": 0.002466762662359004, "rouge1_recall": 0.3499365920126553, "rouge1_recall_stderr": 0.0029103670069339765, "rouge2_fmeasure": 0.06998330324965386, "rouge2_fmeasure_stderr": 0.0012429781942009475, "rouge2_precision": 0.06270201642870808, "rouge2_precision_stderr": 0.001276661389687198, "rouge2_recall": 0.10236690363201062, "rouge2_recall_stderr": 0.0019960722912629107, "rougeL_fmeasure": 0.17719154028942327, "rougeL_fmeasure_stderr": 0.0014768440178427862, "rougeL_precision": 0.15820160856971907, "rougeL_precision_stderr": 0.001803134257764531, "rougeL_recall": 0.25667905638991584, "rougeL_recall_stderr": 0.002404233442497606, "rougeLsum_fmeasure": 0.2338283238156954, "rougeLsum_fmeasure_stderr": 0.0019162970817694011, "rougeLsum_precision": 0.20961741686683294, "rougeLsum_precision_stderr": 0.0023343675964983485, "rougeLsum_recall": 0.332399007963623, "rougeLsum_recall_stderr": 0.0027852830969350697}}, "3": {"tldr_en": {"bleu": 3.971856308559798, "bleu_stderr": 0.09289145672499902, "rouge1_fmeasure": 0.2013934895923683, "rouge1_fmeasure_stderr": 0.0024101386411320806, "rouge1_precision": 0.1877914020690942, "rouge1_precision_stderr": 0.002780603639421943, "rouge1_recall": 0.28469073541532774, "rouge1_recall_stderr": 0.003552554685317243, "rouge2_fmeasure": 0.05612430703445463, "rouge2_fmeasure_stderr": 0.0011944949124968374, "rouge2_precision": 0.05231468260770844, "rouge2_precision_stderr": 0.00130393473012376, "rouge2_recall": 0.08172649028680513, "rouge2_recall_stderr": 0.0018880261158435462, "rougeL_fmeasure": 0.14593849406958256, "rougeL_fmeasure_stderr": 0.0017576382763014175, "rougeL_precision": 0.1363160410891958, "rougeL_precision_stderr": 0.0021057536768160825, "rougeL_recall": 0.21040942089474482, "rougeL_recall_stderr": 0.002825298497100027, "rougeLsum_fmeasure": 0.19069625305891352, "rougeLsum_fmeasure_stderr": 0.002278271153942546, "rougeLsum_precision": 0.17789698988425304, "rougeLsum_precision_stderr": 0.0026448838544979162, "rougeLsum_recall": 0.2699836129539428, "rougeLsum_recall_stderr": 0.003393162716447727}}, "4": {"tldr_en": {"bleu": 0.8216828826403728, "bleu_stderr": 0.06116206229347704, "rouge1_fmeasure": 0.06294177276794305, "rouge1_fmeasure_stderr": 0.0021234318631902995, "rouge1_precision": 0.05986373469352374, "rouge1_precision_stderr": 0.0022413247620520016, "rouge1_recall": 0.09299451528533728, "rouge1_recall_stderr": 0.003182662484214728, "rouge2_fmeasure": 0.01701967942257221, "rouge2_fmeasure_stderr": 0.0008072426334728445, "rouge2_precision": 0.01607251699061772, "rouge2_precision_stderr": 0.0008627575848833899, "rouge2_recall": 0.026217246289428207, "rouge2_recall_stderr": 0.0013573347317069598, "rougeL_fmeasure": 0.04627660806628133, "rougeL_fmeasure_stderr": 0.001562406165753214, "rougeL_precision": 0.04416852187112644, "rougeL_precision_stderr": 0.0016929957101032731, "rougeL_recall": 0.06977139563193656, "rougeL_recall_stderr": 0.002453474360518891, "rougeLsum_fmeasure": 0.05949644164272974, "rougeLsum_fmeasure_stderr": 0.002009657736950104, "rougeLsum_precision": 0.056539524010580496, "rougeLsum_precision_stderr": 0.002122275707889583, "rougeLsum_recall": 0.08802275906234883, "rougeLsum_recall_stderr": 0.00301921948656467}}, "5": {"tldr_en": {"bleu": 2.939501013520158e-06, "bleu_stderr": 6.1416042954126946e-06, "rouge1_fmeasure": 0.010348132353778797, "rouge1_fmeasure_stderr": 0.0009643798928416376, "rouge1_precision": 0.010026792511549231, "rouge1_precision_stderr": 0.0009722843833023145, "rouge1_recall": 0.014997054249515373, "rouge1_recall_stderr": 0.0014200291615747175, "rouge2_fmeasure": 0.002947850654970828, "rouge2_fmeasure_stderr": 0.00038101201967536574, "rouge2_precision": 0.002614111292262381, "rouge2_precision_stderr": 0.0003582186570242107, "rouge2_recall": 0.0043272321523977674, "rouge2_recall_stderr": 0.000556694180882867, "rougeL_fmeasure": 0.007846213305722558, "rougeL_fmeasure_stderr": 0.0007330941188437008, "rougeL_precision": 0.007696209279533862, "rougeL_precision_stderr": 0.0007541223407519663, "rougeL_recall": 0.01149712833393674, "rougeL_recall_stderr": 0.001106103576275103, "rougeLsum_fmeasure": 0.009790499222718256, "rougeLsum_fmeasure_stderr": 0.0009156513115428641, "rougeLsum_precision": 0.009502293292635411, "rougeLsum_precision_stderr": 0.0009264781441363666, "rougeLsum_recall": 0.01418906709799213, "rougeLsum_recall_stderr": 0.0013478881437725126}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.17854969577688523, "bleu_stderr": 0.02350515901837014, "rouge1_fmeasure": 0.14982661873128114, "rouge1_fmeasure_stderr": 0.0010486948402501617, "rouge1_precision": 0.34481003507654684, "rouge1_precision_stderr": 0.0024960642876163215, "rouge1_recall": 0.11140479511426793, "rouge1_recall_stderr": 0.0012326825192403048, "rouge2_fmeasure": 0.010042498177274786, "rouge2_fmeasure_stderr": 0.0004526496809829142, "rouge2_precision": 0.02652033223020393, "rouge2_precision_stderr": 0.0012104924394046484, "rouge2_recall": 0.007738642998342415, "rouge2_recall_stderr": 0.00040331722748865453, "rougeL_fmeasure": 0.13556698518148783, "rougeL_fmeasure_stderr": 0.0010029610186637107, "rougeL_precision": 0.31148977629184244, "rougeL_precision_stderr": 0.0024243327495898086, "rougeL_recall": 0.10191142585459571, "rougeL_recall_stderr": 0.0012109507603402815, "rougeLsum_fmeasure": 0.1364054799839495, "rougeLsum_fmeasure_stderr": 0.0010086900095262318, "rougeLsum_precision": 0.3173006007013496, "rougeLsum_precision_stderr": 0.002513826319580817, "rougeLsum_recall": 0.10030998557950792, "rougeLsum_recall_stderr": 0.0010758913304541808}}, "1": {"generate_text_restaurant": {"bleu": 8.550714951198435, "bleu_stderr": 0.06794955954281734, "rouge1_fmeasure": 0.3940891272584714, "rouge1_fmeasure_stderr": 0.00233253448293986, "rouge1_precision": 0.37441894592411723, "rouge1_precision_stderr": 0.00285043399066679, "rouge1_recall": 0.4681809355112701, "rouge1_recall_stderr": 0.0029387631347472464, "rouge2_fmeasure": 0.17310173953900088, "rouge2_fmeasure_stderr": 0.0017673218319637287, "rouge2_precision": 0.16514863563264978, "rouge2_precision_stderr": 0.0019360690941877992, "rouge2_recall": 0.20649495583013436, "rouge2_recall_stderr": 0.0021624061718912763, "rougeL_fmeasure": 0.2901588169021053, "rougeL_fmeasure_stderr": 0.0017017464729554115, "rougeL_precision": 0.2743117208263101, "rougeL_precision_stderr": 0.0021002662608094564, "rougeL_recall": 0.34924701312209255, "rougeL_recall_stderr": 0.0023705538319379695, "rougeLsum_fmeasure": 0.327671306182913, "rougeLsum_fmeasure_stderr": 0.0022171207606254796, "rougeLsum_precision": 0.31201556645452433, "rougeLsum_precision_stderr": 0.002622342685847327, "rougeLsum_recall": 0.3885350242195471, "rougeLsum_recall_stderr": 0.0027432106106179788}}, "2": {"generate_text_restaurant": {"bleu": 11.867888027230435, "bleu_stderr": 0.15989930331829488, "rouge1_fmeasure": 0.4442817836593718, "rouge1_fmeasure_stderr": 0.0019988950330102873, "rouge1_precision": 0.44246406218448636, "rouge1_precision_stderr": 0.002320846131050864, "rouge1_recall": 0.4822237492014395, "rouge1_recall_stderr": 0.0028480697292934715, "rouge2_fmeasure": 0.20571413885055867, "rouge2_fmeasure_stderr": 0.001815833869403785, "rouge2_precision": 0.20447987966953268, "rouge2_precision_stderr": 0.0019087131351150518, "rouge2_recall": 0.22575098541036898, "rouge2_recall_stderr": 0.0022679440038537117, "rougeL_fmeasure": 0.3232871465388061, "rougeL_fmeasure_stderr": 0.0017540023019821938, "rougeL_precision": 0.3220661789895318, "rougeL_precision_stderr": 0.0019807481977887516, "rougeL_recall": 0.35167106294020406, "rougeL_recall_stderr": 0.002412719701186727, "rougeLsum_fmeasure": 0.37187691948404467, "rougeLsum_fmeasure_stderr": 0.0020334507090512886, "rougeLsum_precision": 0.37040704761920407, "rougeLsum_precision_stderr": 0.002271535999762247, "rougeLsum_recall": 0.4036635693660681, "rougeLsum_recall_stderr": 0.0027100283713887775}}, "3": {"generate_text_restaurant": {"bleu": 12.176808245577714, "bleu_stderr": 0.1522141950985559, "rouge1_fmeasure": 0.4492578440402226, "rouge1_fmeasure_stderr": 0.0019667066974463896, "rouge1_precision": 0.44578562677217626, "rouge1_precision_stderr": 0.0022755220565382653, "rouge1_recall": 0.4889266631673112, "rouge1_recall_stderr": 0.0028806835048615135, "rouge2_fmeasure": 0.21198726153120898, "rouge2_fmeasure_stderr": 0.0018195540133290194, "rouge2_precision": 0.20934965847705986, "rouge2_precision_stderr": 0.0018748523719357, "rouge2_recall": 0.23374791215018093, "rouge2_recall_stderr": 0.0023263783868214196, "rougeL_fmeasure": 0.3276087887468011, "rougeL_fmeasure_stderr": 0.0017546007227396715, "rougeL_precision": 0.32511799875091574, "rougeL_precision_stderr": 0.001967046506806585, "rougeL_recall": 0.35739711489377646, "rougeL_recall_stderr": 0.002465697341506471, "rougeLsum_fmeasure": 0.3771411500357944, "rougeLsum_fmeasure_stderr": 0.0020137589520510426, "rougeLsum_precision": 0.3741382823179302, "rougeLsum_precision_stderr": 0.002234766274837387, "rougeLsum_recall": 0.4108171966535684, "rougeLsum_recall_stderr": 0.0027621282248548955}}, "4": {"generate_text_restaurant": {"bleu": 12.402384292539924, "bleu_stderr": 0.135024423037405, "rouge1_fmeasure": 0.4552455893734124, "rouge1_fmeasure_stderr": 0.0019583919126399917, "rouge1_precision": 0.4500297698600483, "rouge1_precision_stderr": 0.002279184782036985, "rouge1_recall": 0.49411520793728253, "rouge1_recall_stderr": 0.0028065754284846934, "rouge2_fmeasure": 0.2155975402534293, "rouge2_fmeasure_stderr": 0.001873826925854458, "rouge2_precision": 0.2124489885242478, "rouge2_precision_stderr": 0.001924872387744921, "rouge2_recall": 0.23653217550233827, "rouge2_recall_stderr": 0.0023355573505469554, "rougeL_fmeasure": 0.33116918396735845, "rougeL_fmeasure_stderr": 0.0017792810189211944, "rougeL_precision": 0.32709295569865143, "rougeL_precision_stderr": 0.001956608790763274, "rougeL_recall": 0.3602236836443227, "rougeL_recall_stderr": 0.002427496751180958, "rougeLsum_fmeasure": 0.3808847920405725, "rougeLsum_fmeasure_stderr": 0.002065156905180787, "rougeLsum_precision": 0.37596847671524797, "rougeLsum_precision_stderr": 0.002251960910942368, "rougeLsum_recall": 0.41399734380099557, "rougeLsum_recall_stderr": 0.0027661212857413163}}, "5": {"generate_text_restaurant": {"bleu": 12.262420161138401, "bleu_stderr": 0.18903914838075944, "rouge1_fmeasure": 0.45590768378496416, "rouge1_fmeasure_stderr": 0.001962173998693794, "rouge1_precision": 0.4494732628845236, "rouge1_precision_stderr": 0.0023023795088449608, "rouge1_recall": 0.49537684561519607, "rouge1_recall_stderr": 0.002767197249479356, "rouge2_fmeasure": 0.21627498227149344, "rouge2_fmeasure_stderr": 0.0018499821709783027, "rouge2_precision": 0.21297442064638186, "rouge2_precision_stderr": 0.001926070075686657, "rouge2_recall": 0.23703970881860875, "rouge2_recall_stderr": 0.0022735756287929677, "rougeL_fmeasure": 0.3331238951307013, "rougeL_fmeasure_stderr": 0.0017799650141595914, "rougeL_precision": 0.3281425210661801, "rougeL_precision_stderr": 0.001972901049188069, "rougeL_recall": 0.3628730729029585, "rougeL_recall_stderr": 0.0024176721948621425, "rougeLsum_fmeasure": 0.38246870938641964, "rougeLsum_fmeasure_stderr": 0.0020417804934292076, "rougeLsum_precision": 0.3768630011135112, "rougeLsum_precision_stderr": 0.0022664490914563948, "rougeLsum_recall": 0.41605530731114854, "rougeLsum_recall_stderr": 0.0027097026120309876}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.24608084382355, "bleu_stderr": 0.11799890497604383, "rouge1_fmeasure": 0.22178670977262957, "rouge1_fmeasure_stderr": 0.0026603762671440385, "rouge1_precision": 0.17400888678487844, "rouge1_precision_stderr": 0.002464330237468343, "rouge1_recall": 0.3502023417511915, "rouge1_recall_stderr": 0.004626152702942569, "rouge2_fmeasure": 0.05332080380122619, "rouge2_fmeasure_stderr": 0.0017188362970072153, "rouge2_precision": 0.040704743736010573, "rouge2_precision_stderr": 0.001393829442555013, "rouge2_recall": 0.08799878519146338, "rouge2_recall_stderr": 0.0029196986234636744, "rougeL_fmeasure": 0.16609949129334142, "rougeL_fmeasure_stderr": 0.0020543947028805716, "rougeL_precision": 0.12992137120722907, "rougeL_precision_stderr": 0.0018781656373678345, "rougeL_recall": 0.2641999836224796, "rougeL_recall_stderr": 0.0036812097901456225, "rougeLsum_fmeasure": 0.17331928576093303, "rougeLsum_fmeasure_stderr": 0.0022434557390865763, "rougeLsum_precision": 0.13526811734802355, "rougeLsum_precision_stderr": 0.001975414967835506, "rougeLsum_recall": 0.27591607890974656, "rougeLsum_recall_stderr": 0.004020504841332198}}, "1": {"article_DOC_summary": {"bleu": 1.8189118554354626, "bleu_stderr": 0.10029587869243096, "rouge1_fmeasure": 0.19671334605636295, "rouge1_fmeasure_stderr": 0.0027188446769093686, "rouge1_precision": 0.14019002697714425, "rouge1_precision_stderr": 0.002033439876939429, "rouge1_recall": 0.3437328928644267, "rouge1_recall_stderr": 0.004622184360550014, "rouge2_fmeasure": 0.04451173090845151, "rouge2_fmeasure_stderr": 0.0016237255917685457, "rouge2_precision": 0.031342583459769556, "rouge2_precision_stderr": 0.0011487670391079187, "rouge2_recall": 0.0803191311481453, "rouge2_recall_stderr": 0.003010334414958687, "rougeL_fmeasure": 0.14825510655652663, "rougeL_fmeasure_stderr": 0.0020028300424767575, "rougeL_precision": 0.1053445488770993, "rougeL_precision_stderr": 0.001476955053927121, "rougeL_recall": 0.2613986979531857, "rougeL_recall_stderr": 0.0035961912243557873, "rougeLsum_fmeasure": 0.1592072439127538, "rougeLsum_fmeasure_stderr": 0.002270863529627419, "rougeLsum_precision": 0.11322117707364136, "rougeLsum_precision_stderr": 0.001678631461460253, "rougeLsum_recall": 0.27993778037490374, "rougeLsum_recall_stderr": 0.003988822898958396}}, "2": {"article_DOC_summary": {"bleu": 2.0454443359589503, "bleu_stderr": 0.08368970692051826, "rouge1_fmeasure": 0.20889368966335237, "rouge1_fmeasure_stderr": 0.0026887281040736378, "rouge1_precision": 0.1488455450923004, "rouge1_precision_stderr": 0.002009481565468108, "rouge1_recall": 0.3642751817540943, "rouge1_recall_stderr": 0.004565498697439836, "rouge2_fmeasure": 0.05083267238349886, "rouge2_fmeasure_stderr": 0.001670794310427006, "rouge2_precision": 0.03580310577134537, "rouge2_precision_stderr": 0.0011818056791973873, "rouge2_recall": 0.09140711571916593, "rouge2_recall_stderr": 0.003096432205185466, "rougeL_fmeasure": 0.15708491403271296, "rougeL_fmeasure_stderr": 0.0019982552541995204, "rougeL_precision": 0.11164126156649677, "rougeL_precision_stderr": 0.001473760236619727, "rougeL_recall": 0.27611637516436727, "rougeL_recall_stderr": 0.003575935893484929, "rougeLsum_fmeasure": 0.1672065849237029, "rougeLsum_fmeasure_stderr": 0.002247088064282535, "rougeLsum_precision": 0.1188385887857819, "rougeLsum_precision_stderr": 0.0016510150185409913, "rougeLsum_recall": 0.2937542835590118, "rougeLsum_recall_stderr": 0.003999046140359821}}, "3": {"article_DOC_summary": {"bleu": 2.0101663995130106, "bleu_stderr": 0.13035938633140529, "rouge1_fmeasure": 0.2005996992165322, "rouge1_fmeasure_stderr": 0.002922307185273832, "rouge1_precision": 0.1451633530537778, "rouge1_precision_stderr": 0.0022577436785926417, "rouge1_recall": 0.3454037481783916, "rouge1_recall_stderr": 0.005024002677948197, "rouge2_fmeasure": 0.047805324803931785, "rouge2_fmeasure_stderr": 0.001665552432783648, "rouge2_precision": 0.03399383589660795, "rouge2_precision_stderr": 0.0011970297160356436, "rouge2_recall": 0.08515533208064306, "rouge2_recall_stderr": 0.00305750154560904, "rougeL_fmeasure": 0.15011348459355398, "rougeL_fmeasure_stderr": 0.0021656916632942095, "rougeL_precision": 0.1082275772002201, "rougeL_precision_stderr": 0.0016303512277000601, "rougeL_recall": 0.26018618860799503, "rougeL_recall_stderr": 0.0038502171494847245, "rougeLsum_fmeasure": 0.1614133142157355, "rougeLsum_fmeasure_stderr": 0.00243279237750366, "rougeLsum_precision": 0.1163681459054013, "rougeLsum_precision_stderr": 0.0018244541208628394, "rougeLsum_recall": 0.27969335970329556, "rougeLsum_recall_stderr": 0.004315552235909818}}, "4": {"article_DOC_summary": {"bleu": 1.1210416443624807, "bleu_stderr": 0.18886560236043304, "rouge1_fmeasure": 0.05612514432374489, "rouge1_fmeasure_stderr": 0.0030731828239370837, "rouge1_precision": 0.04604504358511682, "rouge1_precision_stderr": 0.0026703224008911334, "rouge1_recall": 0.08906761715367195, "rouge1_recall_stderr": 0.0050274498862542885, "rouge2_fmeasure": 0.013937278041326827, "rouge2_fmeasure_stderr": 0.0011465942789645456, "rouge2_precision": 0.011276377923656494, "rouge2_precision_stderr": 0.0012571358140648242, "rouge2_recall": 0.023708158418892494, "rouge2_recall_stderr": 0.002002425619950796, "rougeL_fmeasure": 0.04184105680154411, "rougeL_fmeasure_stderr": 0.002285547204915818, "rougeL_precision": 0.03500836518053345, "rougeL_precision_stderr": 0.0021377284451202998, "rougeL_recall": 0.06645887444884206, "rougeL_recall_stderr": 0.0037898342177074525, "rougeLsum_fmeasure": 0.04557476167159472, "rougeLsum_fmeasure_stderr": 0.002503478149457618, "rougeLsum_precision": 0.0378191473406664, "rougeLsum_precision_stderr": 0.002272186759930814, "rougeLsum_recall": 0.07247078079866072, "rougeLsum_recall_stderr": 0.004137833255697601}}, "5": {"article_DOC_summary": {"bleu": 1.7628232823336373e-17, "bleu_stderr": 2.8521803760603906e-14, "rouge1_fmeasure": 0.0026984759176508304, "rouge1_fmeasure_stderr": 0.000738763092128848, "rouge1_precision": 0.0022472741378328047, "rouge1_precision_stderr": 0.0006307356242527256, "rouge1_recall": 0.004133903246439151, "rouge1_recall_stderr": 0.0011668769662793363, "rouge2_fmeasure": 0.000551357929598578, "rouge2_fmeasure_stderr": 0.00022851833366345053, "rouge2_precision": 0.00044268370843308837, "rouge2_precision_stderr": 0.00017799352097082119, "rouge2_recall": 0.0008499613285961788, "rouge2_recall_stderr": 0.0003602171813535447, "rougeL_fmeasure": 0.001991063584250406, "rougeL_fmeasure_stderr": 0.0005515138571257219, "rougeL_precision": 0.0016336554354190846, "rougeL_precision_stderr": 0.0004503176195113186, "rougeL_recall": 0.003027477992470196, "rougeL_recall_stderr": 0.0008730946685903953, "rougeLsum_fmeasure": 0.002097477776387118, "rougeLsum_fmeasure_stderr": 0.0005739873846094995, "rougeLsum_precision": 0.0017629252737840803, "rougeLsum_precision_stderr": 0.0005014954502254297, "rougeLsum_recall": 0.0031744555131817086, "rougeLsum_recall_stderr": 0.0008937483641872015}}}} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f3195793a3288dcc98140930f178a56657911f2b --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.22135115607796563, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002466762662359004 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3499365920126553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029103670069339765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.24678912656203086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002037101161511086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06270201642870808, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001276661389687198 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.10236690363201062, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0019960722912629107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06998330324965386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012429781942009475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.15820160856971907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001803134257764531 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.25667905638991584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002404233442497606 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17719154028942327, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014768440178427862 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.20961741686683294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0023343675964983485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.332399007963623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027852830969350697 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2338283238156954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019162970817694011 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.9658166802535484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07925927931568665 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..be295181671db1c37b6520d844e3f815ad9769c2 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1877914020690942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002780603639421943 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.28469073541532774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003552554685317243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2013934895923683, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024101386411320806 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05231468260770844, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00130393473012376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08172649028680513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0018880261158435462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05612430703445463, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011944949124968374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1363160410891958, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021057536768160825 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21040942089474482, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002825298497100027 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14593849406958256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017576382763014175 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.17789698988425304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026448838544979162 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2699836129539428, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003393162716447727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19069625305891352, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002278271153942546 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.971856308559798, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09289145672499902 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a99c442c8f10d83dc88243cd1f30dfd3f75c3ff8 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.05986373469352374, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022413247620520016 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.09299451528533728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003182662484214728 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06294177276794305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021234318631902995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.01607251699061772, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008627575848833899 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.026217246289428207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013573347317069598 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01701967942257221, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008072426334728445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04416852187112644, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016929957101032731 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.06977139563193656, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002453474360518891 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04627660806628133, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001562406165753214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.056539524010580496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002122275707889583 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.08802275906234883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00301921948656467 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05949644164272974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002009657736950104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.8216828826403728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06116206229347704 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1168b5608d100c2b3a8291c2b67eee66dfe03469 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.010026792511549231, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009722843833023145 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.014997054249515373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014200291615747175 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010348132353778797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009643798928416376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002614111292262381, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003582186570242107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0043272321523977674, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000556694180882867 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.002947850654970828, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00038101201967536574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.007696209279533862, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007541223407519663 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.01149712833393674, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001106103576275103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.007846213305722558, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007330941188437008 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.009502293292635411, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009264781441363666 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.01418906709799213, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0013478881437725126 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.009790499222718256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009156513115428641 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.939501013520158e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.1416042954126946e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1695d0b7a1509cdc347f92444861157b44b46843 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1451633530537778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022577436785926417 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3454037481783916, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.005024002677948197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2005996992165322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002922307185273832 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03399383589660795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011970297160356436 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08515533208064306, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00305750154560904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.047805324803931785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001665552432783648 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1082275772002201, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016303512277000601 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26018618860799503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038502171494847245 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15011348459355398, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0021656916632942095 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1163681459054013, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018244541208628394 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.27969335970329556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004315552235909818 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1614133142157355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00243279237750366 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0101663995130106, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13035938633140529 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0f8d6a1bd073f1a1e245380607ff7df85d5a7ff2 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04604504358511682, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0026703224008911334 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08906761715367195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0050274498862542885 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05612514432374489, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030731828239370837 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.011276377923656494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012571358140648242 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.023708158418892494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002002425619950796 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013937278041326827, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011465942789645456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03500836518053345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021377284451202998 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06645887444884206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037898342177074525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04184105680154411, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002285547204915818 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0378191473406664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002272186759930814 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.07247078079866072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004137833255697601 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04557476167159472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002503478149457618 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.1210416443624807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.18886560236043304 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2e6f5d3594f46551a1f1c4bd6cd66a2029f34625 --- /dev/null +++ b/8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0022472741378328047, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006307356242527256 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.004133903246439151, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0011668769662793363 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0026984759176508304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.000738763092128848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00044268370843308837, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00017799352097082119 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0008499613285961788, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0003602171813535447 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.000551357929598578, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00022851833366345053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0016336554354190846, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0004503176195113186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.003027477992470196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0008730946685903953 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.001991063584250406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005515138571257219 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0017629252737840803, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005014954502254297 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0031744555131817086, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0008937483641872015 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.002097477776387118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005739873846094995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7628232823336373e-17, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 2.8521803760603906e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cd37248ad7e4d0fb70701a752cbb524b23ddc434 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.24807700870805302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032686331426239274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2993366704587394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028897035935155396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.23240946839912788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020535331913193833}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07542612135833042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019732499800712644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08771578704686069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001852341009446473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06722787106866046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001306015352926357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19005839255632967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002778132688091869}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.22871074992869012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002406566174307835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17544324351165277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001595587702286189}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.23648500820691407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031596714430369476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.285430067349801, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027751440475442097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2213635199796785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019597819111732734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.6209568989964964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05693014182524011}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..eb16729ccc96dad8ecf487ff4f5f89dd8a0bf40d --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.23084151108139417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003999536576630713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22564401884098545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00322268288462845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18788478661055097, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002472024769674421}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07199814947709587, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002257359657010455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0658546131120379, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016996600071196948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0547277065977853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001316330403077771}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1812422041327713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033598941031452102}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.17549664105843907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00262690640098719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1450771150583464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019574386379697857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.21961824449300582, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003836248551627411}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21459589482318636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003079841928060846}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17866009428807192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023640874867847488}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.2600101837197197, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04841341881147457}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1cacc3c8c33082aa8b241ffae2befe5784648e34 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.07762612304263315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032117324014510297}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07075067661625373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00266141134016641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05902884784226779, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021304994672767174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.024715265794116245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015375396891910912}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02194236604474347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001230091261779581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01811109085502527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009535025790089693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06261195931905689, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002684073457127978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0564250650958304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021744446112801413}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04687343959092794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00171937320425508}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07398653730135701, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003081234652717845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06720084909348248, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002527336091137797}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.056127739240012624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002031189471014326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.6197416585249048, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.050726126214537225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d94d6424799fc04d434dfaff8cd2fc210ba945a5 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.012749197720849256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013678224737624083}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011454983373776283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001152103487332925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.01025088836822709, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010237064667092625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004106263613998532, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006621811206101381}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003568369407358366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00047406732675066786}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.003230399473756007, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004373491168596821}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.01021272837958567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001137838670735105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009264383578933708, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009514107080117658}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008151883997164445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008219787823794978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.012094227953101243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013067091712138548}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010971397408480117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011066936947192276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.00977974159773322, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009796733551025018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.394475685873892e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.020668525155571e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_3.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f73eb582def3f8b6b46f61b85208612ab4be3f6b --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.15862259061034525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022705733916664897}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.37475206189355903, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004959725343264155}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.21823856021867855, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00287509423652189}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.040777801871425406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012722722586737197}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.1013174193884724, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032107881395706537}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05707472809987463, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017670032180951126}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.11900917936528578, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017595863454497236}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2823031773424627, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00395533580561216}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16363168956728244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002209845051514402}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1267437227686215, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019625874757798435}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.3009970402211979, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00444855277167097}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1744236573082813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025092608797292787}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.5396858265074442, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08867651581985392}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_4.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6b235fbf447cc78db95f9a8fd701038e7c195082 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.047304211697680565, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026448858289027655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.09474278252615058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005308877220848683}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05899878787999313, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0031984402791436907}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.011996087771023494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001099214076178898}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0263417241337705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002220861663283421}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0155308783397458, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001299471111442663}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03579263906355918, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00203840434793067}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.07128613276076894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0040405231404290805}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.044337528627898845, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002409584552086278}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03891697941827118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022203695026481044}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.07811802978970729, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004450967311039029}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04848597226747422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026607333825415256}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.2606242149604632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14038396719045979}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_5.json b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e461b08c8b9a668bf188628826168dae0b8a7ee4 --- /dev/null +++ b/8b7178b44b/evaluation/generation/agg.8b7178b44b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002575838186588936, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007400165754621803}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.004319762469932286, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011903191633291926}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0029056409403733605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007789369963578187}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0006086503070847952, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002950028606554242}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.000972008751741797, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003567899989116429}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0006473343958914281, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002465721330155313}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0017056464200092727, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004727453480810513}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.002922861912775516, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008030319091531282}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001934058917849306, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005094916809588088}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.001992523091403346, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000590292247129764}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0032955304150390267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009334494775770981}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.002209255524001538, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006016088430140866}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 5.904200875747917e-17, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.845384228352431e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3ac5d3bfefd9865dfdb8b0bdc1b7f0a3bd60e3b8 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1eb702335e183a82ead97f84299be0c619eb8dcec4a0388a17ec1ac434bec75 +size 18785667 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..35c405a27854c272b6e8dfb162db4f7a506f7730 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5b9d71bb0f55514f71803ba161aabcbea7dc68d8860b6f3eedcef14527394c +size 24184575 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2d52a6326d4cf031ea9b8fe336f50451dca1c64b 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561658b2ae505abc09d12593bd99986ef1097e923415230bda613a2becc73126 +size 29415598 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a2047f23c65af3007e21d72d571c9b7caa3c9b09 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3af194c69d4eca87f36ac18894f687f53fe30c5ec4ba92939d9cf531746089b +size 34791765 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4f9405bd7c8d99641f5abf2c906f15abc9bb14bc 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d080da64289343c8fe11ba84d3a0fd17811c0d228af95450c46d238db8c06357 +size 9646747 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b6f8f6b58ddce0db5361d2b87be0a81e522be013 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02d1a0216cd6cf118808c15ada39317c4b51a4f4599441748b7ad9fd8b0d88bb +size 11674442 diff --git a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a16c651978598f70a4495e05d8d97ee20b915575 100644 --- a/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b44b/evaluation/generation/examples.8b7178b44b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f54dfe14bffe045d75ee7a1bb4956ac7f0aaa4f56b5dae5ed0df245d917266ee +size 13899538 diff --git a/8b7178b44b/evaluation/generation/merged.csv b/8b7178b44b/evaluation/generation/merged.csv index ddeea4abfe4d2b04416a0ab0fe779ed79f303a62..c272697ce2231a15a000bc8f8f1d2f7f0cadf40b 100644 --- a/8b7178b44b/evaluation/generation/merged.csv +++ b/8b7178b44b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04598183981683866 gem_xsum,1,median,rouge2_fmeasure,0.04598183981683866 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05795467773425469 gem_xsum,2,median,rouge2_fmeasure,0.05795467773425469 -gem_xsum,2,average,multiple,0.05254585450465313 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05707472809987463 +gem_xsum,3,median,rouge2_fmeasure,0.05707472809987463 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0155308783397458 +gem_xsum,4,median,rouge2_fmeasure,0.0155308783397458 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0006473343958914281 +gem_xsum,5,median,rouge2_fmeasure,0.0006473343958914281 +gem_xsum,5,average,multiple,0.038481750724911876 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04887042562262386 web_nlg_en,0,median,rouge2_fmeasure,0.04887042562262386 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05575099241882265 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04216588190254503 wiki_lingua_en,0,median,rouge2_fmeasure,0.04216588190254503 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06178715930121975 wiki_lingua_en,1,median,rouge2_fmeasure,0.06178715930121975 -wiki_lingua_en,1,average,multiple,0.05197652060188239 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06722787106866046 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06722787106866046 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.0547277065977853 +wiki_lingua_en,3,median,rouge2_fmeasure,0.0547277065977853 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01811109085502527 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01811109085502527 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003230399473756007 +wiki_lingua_en,5,median,rouge2_fmeasure,0.003230399473756007 +wiki_lingua_en,5,average,multiple,0.0412083515331653 diff --git a/8b7178b44b/evaluation/generation/merged.json b/8b7178b44b/evaluation/generation/merged.json index 679c8d07a26c617341535d1c532167b976738d68..f3b25f64bbf36f179caf8d9c21fb64e4c2577b7c 100644 --- a/8b7178b44b/evaluation/generation/merged.json +++ b/8b7178b44b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2975989706476116, "bleu_stderr": 0.04003039263082336, "rouge1_fmeasure": 0.10454253577265941, "rouge1_fmeasure_stderr": 0.0020033273254233754, "rouge1_precision": 0.07036951453310922, "rouge1_precision_stderr": 0.001708454486390292, "rouge1_recall": 0.28391423378552716, "rouge1_recall_stderr": 0.004406605071545494, "rouge2_fmeasure": 0.04887042562262386, "rouge2_fmeasure_stderr": 0.0012367653733248375, "rouge2_precision": 0.03205700906292, "rouge2_precision_stderr": 0.0008904068747417852, "rouge2_recall": 0.1344860970658841, "rouge2_recall_stderr": 0.0029699527724654008, "rougeL_fmeasure": 0.09984712791182629, "rougeL_fmeasure_stderr": 0.001864385700385995, "rougeL_precision": 0.06700444340435573, "rougeL_precision_stderr": 0.001577635401560107, "rougeL_recall": 0.2723165283491304, "rougeL_recall_stderr": 0.00420334764816495, "rougeLsum_fmeasure": 0.09986723049046536, "rougeLsum_fmeasure_stderr": 0.0018921794146755414, "rougeLsum_precision": 0.06714085166592883, "rougeLsum_precision_stderr": 0.0016058922247288731, "rougeLsum_recall": 0.2710598887897389, "rougeLsum_recall_stderr": 0.004143182286682885}}, "1": {"PALM_prompt": {"bleu": 0.5399102183044338, "bleu_stderr": 0.017964459975753112, "rouge1_fmeasure": 0.1179994472699294, "rouge1_fmeasure_stderr": 0.001898574018978891, "rouge1_precision": 0.07582800253640251, "rouge1_precision_stderr": 0.0014775645988873703, "rouge1_recall": 0.3887660352664638, "rouge1_recall_stderr": 0.005510916338178581, "rouge2_fmeasure": 0.05575099241882265, "rouge2_fmeasure_stderr": 0.0011943826599908319, "rouge2_precision": 0.03547782475927522, "rouge2_precision_stderr": 0.000840851630406, "rouge2_recall": 0.19434244564750444, "rouge2_recall_stderr": 0.0039041652957777923, "rougeL_fmeasure": 0.10987367600038657, "rougeL_fmeasure_stderr": 0.0017019944711869266, "rougeL_precision": 0.07062905559753234, "rougeL_precision_stderr": 0.0013389008252074547, "rougeL_recall": 0.3594853678419995, "rougeL_recall_stderr": 0.004899885542632218, "rougeLsum_fmeasure": 0.11246764238131458, "rougeLsum_fmeasure_stderr": 0.001788765887772429, "rougeLsum_precision": 0.07235075590060619, "rougeLsum_precision_stderr": 0.0014035297849105085, "rougeLsum_recall": 0.36849778247126685, "rougeLsum_recall_stderr": 0.005075094016447408}}, "2": {"PALM_prompt": {"bleu": 0.6132285241911561, "bleu_stderr": 0.021744492816193794, "rouge1_fmeasure": 0.11750948362933708, "rouge1_fmeasure_stderr": 0.0017443663495669947, "rouge1_precision": 0.07454923688966983, "rouge1_precision_stderr": 0.0013402782053761812, "rouge1_recall": 0.4060696737056673, "rouge1_recall_stderr": 0.005408951548341353, "rouge2_fmeasure": 0.055072070164922673, "rouge2_fmeasure_stderr": 0.0011249794411084804, "rouge2_precision": 0.034783751809391424, "rouge2_precision_stderr": 0.0008624538099408842, "rouge2_recall": 0.20529500851088617, "rouge2_recall_stderr": 0.004029599291312284, "rougeL_fmeasure": 0.1081453556034095, "rougeL_fmeasure_stderr": 0.0015836947383010406, "rougeL_precision": 0.06870503701115688, "rougeL_precision_stderr": 0.001214108724498691, "rougeL_recall": 0.36958240896722416, "rougeL_recall_stderr": 0.004690608909122334, "rougeLsum_fmeasure": 0.11165202694972098, "rougeLsum_fmeasure_stderr": 0.0016570784342064698, "rougeLsum_precision": 0.07091032443010714, "rougeLsum_precision_stderr": 0.0012648199327180544, "rougeLsum_recall": 0.383427293902694, "rougeLsum_recall_stderr": 0.004986353289044804}}, "3": {"PALM_prompt": {"bleu": 0.6689911149446237, "bleu_stderr": 0.03744306343722813, "rouge1_fmeasure": 0.1180307283191865, "rouge1_fmeasure_stderr": 0.0017866703950125558, "rouge1_precision": 0.07481052377527687, "rouge1_precision_stderr": 0.0013081535798312271, "rouge1_recall": 0.4066681705698085, "rouge1_recall_stderr": 0.0054019768500427, "rouge2_fmeasure": 0.05572250969924929, "rouge2_fmeasure_stderr": 0.0011431197746402597, "rouge2_precision": 0.03499189343685807, "rouge2_precision_stderr": 0.0007904982370484803, "rouge2_recall": 0.20736527087710988, "rouge2_recall_stderr": 0.0040148188536021905, "rougeL_fmeasure": 0.10739194121096401, "rougeL_fmeasure_stderr": 0.0015945105292757362, "rougeL_precision": 0.06809194203501431, "rougeL_precision_stderr": 0.0011605273645199663, "rougeL_recall": 0.36740563590690617, "rougeL_recall_stderr": 0.004685196144542859, "rougeLsum_fmeasure": 0.11166569262354312, "rougeLsum_fmeasure_stderr": 0.001692220078605611, "rougeLsum_precision": 0.07085369656177957, "rougeLsum_precision_stderr": 0.0012407868939625576, "rougeLsum_recall": 0.38310128345463584, "rougeLsum_recall_stderr": 0.004987172857446567}}, "4": {"PALM_prompt": {"bleu": 0.7331295013237835, "bleu_stderr": 0.0376308794527232, "rouge1_fmeasure": 0.12533762675953772, "rouge1_fmeasure_stderr": 0.0018878485389189842, "rouge1_precision": 0.07979502355783509, "rouge1_precision_stderr": 0.0014030109393831616, "rouge1_recall": 0.4235816766130294, "rouge1_recall_stderr": 0.005421570401845218, "rouge2_fmeasure": 0.05965148393261353, "rouge2_fmeasure_stderr": 0.0011811051795035776, "rouge2_precision": 0.037526756160207154, "rouge2_precision_stderr": 0.0008206560892571376, "rouge2_recall": 0.21978995547688931, "rouge2_recall_stderr": 0.004158496007987482, "rougeL_fmeasure": 0.1127973185683369, "rougeL_fmeasure_stderr": 0.0016137599087464715, "rougeL_precision": 0.07173000592003134, "rougeL_precision_stderr": 0.001189659942487538, "rougeL_recall": 0.3800311917113359, "rougeL_recall_stderr": 0.004634022951095827, "rougeLsum_fmeasure": 0.1188731408826614, "rougeLsum_fmeasure_stderr": 0.001769238872468489, "rougeLsum_precision": 0.07571045481706781, "rougeLsum_precision_stderr": 0.0013130071112336225, "rougeLsum_recall": 0.3998599380714254, "rougeLsum_recall_stderr": 0.004996013261269132}}, "5": {"PALM_prompt": {"bleu": 0.7199803964010689, "bleu_stderr": 0.0335341295220522, "rouge1_fmeasure": 0.12957731808842543, "rouge1_fmeasure_stderr": 0.001950385050411022, "rouge1_precision": 0.0831998254284886, "rouge1_precision_stderr": 0.0015598094416264543, "rouge1_recall": 0.43497959954822596, "rouge1_recall_stderr": 0.005519903741092491, "rouge2_fmeasure": 0.061471018231031564, "rouge2_fmeasure_stderr": 0.0011954291401081725, "rouge2_precision": 0.038915148561977614, "rouge2_precision_stderr": 0.0008746684971485049, "rouge2_recall": 0.22682184765709848, "rouge2_recall_stderr": 0.004248572295542693, "rougeL_fmeasure": 0.11444599253148456, "rougeL_fmeasure_stderr": 0.0016146327759809166, "rougeL_precision": 0.07313294781649872, "rougeL_precision_stderr": 0.0012487980616292505, "rougeL_recall": 0.3869620252624607, "rougeL_recall_stderr": 0.00472547888254755, "rougeLsum_fmeasure": 0.12182978494965954, "rougeLsum_fmeasure_stderr": 0.0017943723258623825, "rougeLsum_precision": 0.07806446348136394, "rougeLsum_precision_stderr": 0.0013916456734416912, "rougeLsum_recall": 0.4091496557022353, "rougeLsum_recall_stderr": 0.005082415980961682}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.953242244453973, "bleu_stderr": 0.05731680211281534, "rouge1_fmeasure": 0.19037826199272892, "rouge1_fmeasure_stderr": 0.0019330255590741413, "rouge1_precision": 0.16219138852188794, "rouge1_precision_stderr": 0.001952170268256085, "rouge1_recall": 0.27855214789998106, "rouge1_recall_stderr": 0.0028661354548397203, "rouge2_fmeasure": 0.04216588190254503, "rouge2_fmeasure_stderr": 0.0009553783013135788, "rouge2_precision": 0.0354263621636146, "rouge2_precision_stderr": 0.0008399737677670994, "rouge2_recall": 0.06484436785581019, "rouge2_recall_stderr": 0.001614050776584332, "rougeL_fmeasure": 0.1448489434443541, "rougeL_fmeasure_stderr": 0.001361425790826035, "rougeL_precision": 0.12175563950858005, "rougeL_precision_stderr": 0.0013221666940557008, "rougeL_recall": 0.21752194752302395, "rougeL_recall_stderr": 0.0023088753932079594, "rougeLsum_fmeasure": 0.1758237056132223, "rougeLsum_fmeasure_stderr": 0.0017714351825610913, "rougeLsum_precision": 0.14956126999475367, "rougeLsum_precision_stderr": 0.0017821025582316077, "rougeLsum_recall": 0.2582733333610139, "rougeLsum_recall_stderr": 0.0026810100132315094}}, "1": {"tldr_en": {"bleu": 3.1938176079049834, "bleu_stderr": 0.06758802228266159, "rouge1_fmeasure": 0.23210095477457815, "rouge1_fmeasure_stderr": 0.0019831122653105916, "rouge1_precision": 0.21449004123956839, "rouge1_precision_stderr": 0.002565225956613692, "rouge1_recall": 0.3240170155957018, "rouge1_recall_stderr": 0.002854256640412515, "rouge2_fmeasure": 0.06178715930121975, "rouge2_fmeasure_stderr": 0.0011277206664131291, "rouge2_precision": 0.05838461705053636, "rouge2_precision_stderr": 0.0014235851964682594, "rouge2_recall": 0.08878615660224992, "rouge2_recall_stderr": 0.0017942884481548583, "rougeL_fmeasure": 0.16714787573615286, "rougeL_fmeasure_stderr": 0.001398161530856219, "rougeL_precision": 0.15474204497973576, "rougeL_precision_stderr": 0.001972497669394072, "rougeL_recall": 0.23823572295131598, "rougeL_recall_stderr": 0.0022819725517357744, "rougeLsum_fmeasure": 0.2194764899179448, "rougeLsum_fmeasure_stderr": 0.0018724396931342306, "rougeLsum_precision": 0.20262040584770197, "rougeLsum_precision_stderr": 0.0024273351077249345, "rougeLsum_recall": 0.3070782317130203, "rougeLsum_recall_stderr": 0.0027244549656214464}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 5.041894180834452, "bleu_stderr": 0.13033007436218397, "rouge1_fmeasure": 0.2602520157352752, "rouge1_fmeasure_stderr": 0.0019814546233520676, "rouge1_precision": 0.2562147385675977, "rouge1_precision_stderr": 0.0023058315204108905, "rouge1_recall": 0.30315886913634216, "rouge1_recall_stderr": 0.0026139295974695565, "rouge2_fmeasure": 0.08140776012327813, "rouge2_fmeasure_stderr": 0.001311313047394141, "rouge2_precision": 0.07721561798611194, "rouge2_precision_stderr": 0.0012074751318368925, "rouge2_recall": 0.09865589414861335, "rouge2_recall_stderr": 0.0018266381810205243, "rougeL_fmeasure": 0.2348805493930538, "rougeL_fmeasure_stderr": 0.0016535155281064092, "rougeL_precision": 0.23076758584429627, "rougeL_precision_stderr": 0.0019576805274118237, "rougeL_recall": 0.27459675434984065, "rougeL_recall_stderr": 0.002244077056891058, "rougeLsum_fmeasure": 0.2231769159790997, "rougeLsum_fmeasure_stderr": 0.0018450572724071957, "rougeLsum_precision": 0.2179856280727458, "rougeLsum_precision_stderr": 0.0019996113997260653, "rougeLsum_recall": 0.26255348104591614, "rougeLsum_recall_stderr": 0.002535270034294405}}, "1": {"generate_text_restaurant": {"bleu": 11.374090854418796, "bleu_stderr": 0.16642693339608366, "rouge1_fmeasure": 0.44441293942301124, "rouge1_fmeasure_stderr": 0.0019990822428693255, "rouge1_precision": 0.4562660071544534, "rouge1_precision_stderr": 0.0023247072186540372, "rouge1_recall": 0.47023523973208886, "rouge1_recall_stderr": 0.0029204957902967815, "rouge2_fmeasure": 0.19963881437447786, "rouge2_fmeasure_stderr": 0.001763576201025895, "rouge2_precision": 0.20466887595780073, "rouge2_precision_stderr": 0.0018785112726430648, "rouge2_recall": 0.21306332737215863, "rouge2_recall_stderr": 0.0021659970991227857, "rougeL_fmeasure": 0.31715680141114483, "rougeL_fmeasure_stderr": 0.0017221275024008205, "rougeL_precision": 0.32653768131528194, "rougeL_precision_stderr": 0.001990750084311469, "rougeL_recall": 0.3355433404446685, "rougeL_recall_stderr": 0.0023788838500677285, "rougeLsum_fmeasure": 0.37193579102313756, "rougeLsum_fmeasure_stderr": 0.0020241065034827393, "rougeLsum_precision": 0.3824555027115406, "rougeLsum_precision_stderr": 0.0023079179899071546, "rougeLsum_recall": 0.39314046484640197, "rougeLsum_recall_stderr": 0.0027333955271206228}}, "2": {"generate_text_restaurant": {"bleu": 12.560644285364408, "bleu_stderr": 0.16815632193227215, "rouge1_fmeasure": 0.471505067248124, "rouge1_fmeasure_stderr": 0.0019357830388836816, "rouge1_precision": 0.4693114735453705, "rouge1_precision_stderr": 0.002281004588256248, "rouge1_recall": 0.5083374783023907, "rouge1_recall_stderr": 0.0027945657189866707, "rouge2_fmeasure": 0.22343026823022358, "rouge2_fmeasure_stderr": 0.0018374594088211456, "rouge2_precision": 0.2221316806859365, "rouge2_precision_stderr": 0.0019435645476744553, "rouge2_recall": 0.24313554807879642, "rouge2_recall_stderr": 0.002262346807934513, "rougeL_fmeasure": 0.34007419588321747, "rougeL_fmeasure_stderr": 0.0017661692599286885, "rougeL_precision": 0.33868884919458336, "rougeL_precision_stderr": 0.0020005248626700075, "rougeL_recall": 0.36715355671289307, "rougeL_recall_stderr": 0.002393826810405225, "rougeLsum_fmeasure": 0.3992367697974597, "rougeLsum_fmeasure_stderr": 0.002047758775999973, "rougeLsum_precision": 0.39764801002400024, "rougeLsum_precision_stderr": 0.0023126735358181896, "rougeLsum_recall": 0.4301763419894379, "rougeLsum_recall_stderr": 0.002704714026602679}}, "3": {"generate_text_restaurant": {"bleu": 12.935887027476703, "bleu_stderr": 0.1413974704174758, "rouge1_fmeasure": 0.47748617274088134, "rouge1_fmeasure_stderr": 0.0019432464320825518, "rouge1_precision": 0.46877980581573947, "rouge1_precision_stderr": 0.002306244616020727, "rouge1_recall": 0.5201076534915446, "rouge1_recall_stderr": 0.0027589098635390622, "rouge2_fmeasure": 0.2299932736088538, "rouge2_fmeasure_stderr": 0.0018542116692296976, "rouge2_precision": 0.22504131267180502, "rouge2_precision_stderr": 0.0019096188553057005, "rouge2_recall": 0.25334102505668565, "rouge2_recall_stderr": 0.0023303517782663806, "rougeL_fmeasure": 0.3453328850410215, "rougeL_fmeasure_stderr": 0.0017788674272907377, "rougeL_precision": 0.33848754169654177, "rougeL_precision_stderr": 0.0019586798135973788, "rougeL_recall": 0.3774849420782741, "rougeL_recall_stderr": 0.002448077823046622, "rougeLsum_fmeasure": 0.4054179095581331, "rougeLsum_fmeasure_stderr": 0.002057211964943938, "rougeLsum_precision": 0.398001467577397, "rougeLsum_precision_stderr": 0.002302044463314153, "rougeLsum_recall": 0.44176973604863884, "rougeLsum_recall_stderr": 0.002731272765488879}}, "4": {"generate_text_restaurant": {"bleu": 13.062203352639571, "bleu_stderr": 0.1443635676836731, "rouge1_fmeasure": 0.480197984035446, "rouge1_fmeasure_stderr": 0.0019374104232555646, "rouge1_precision": 0.4681188396235994, "rouge1_precision_stderr": 0.0022953424581571586, "rouge1_recall": 0.5257893103591965, "rouge1_recall_stderr": 0.0027113760261248936, "rouge2_fmeasure": 0.231134956488486, "rouge2_fmeasure_stderr": 0.0018720751391330711, "rouge2_precision": 0.2248618704612403, "rouge2_precision_stderr": 0.001945938539895268, "rouge2_recall": 0.2557332161082852, "rouge2_recall_stderr": 0.0023207877819305836, "rougeL_fmeasure": 0.34575847531397613, "rougeL_fmeasure_stderr": 0.0017883470089669242, "rougeL_precision": 0.33632345011254977, "rougeL_precision_stderr": 0.0019454027156019396, "rougeL_recall": 0.38013195807046607, "rougeL_recall_stderr": 0.00244327641775733, "rougeLsum_fmeasure": 0.4073959730873966, "rougeLsum_fmeasure_stderr": 0.002067697322138944, "rougeLsum_precision": 0.39692021210935596, "rougeLsum_precision_stderr": 0.0022989702493060767, "rougeLsum_recall": 0.44638101481881887, "rougeLsum_recall_stderr": 0.002709781160431362}}, "5": {"generate_text_restaurant": {"bleu": 13.007507775940796, "bleu_stderr": 0.15953149423790455, "rouge1_fmeasure": 0.4808919059561077, "rouge1_fmeasure_stderr": 0.001952787403665255, "rouge1_precision": 0.46770150316298764, "rouge1_precision_stderr": 0.0022797897877814698, "rouge1_recall": 0.5268279632857007, "rouge1_recall_stderr": 0.0027460682371807316, "rouge2_fmeasure": 0.23283624336459657, "rouge2_fmeasure_stderr": 0.0018731934526011642, "rouge2_precision": 0.22559527103944196, "rouge2_precision_stderr": 0.0019147802508556583, "rouge2_recall": 0.2579313360551791, "rouge2_recall_stderr": 0.00233240570767885, "rougeL_fmeasure": 0.3483900918258692, "rougeL_fmeasure_stderr": 0.0018126238683006549, "rougeL_precision": 0.33809291109163425, "rougeL_precision_stderr": 0.0019479891358002816, "rougeL_recall": 0.3830487611318609, "rougeL_recall_stderr": 0.0024664916023592687, "rougeLsum_fmeasure": 0.40962759438618196, "rougeLsum_fmeasure_stderr": 0.002070580879609581, "rougeLsum_precision": 0.3985001878422045, "rougeLsum_precision_stderr": 0.002294921272754354, "rougeLsum_recall": 0.4486089837026324, "rougeLsum_recall_stderr": 0.002709717530773792}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.202159785681064, "bleu_stderr": 0.1019905127598651, "rouge1_fmeasure": 0.21747292414186656, "rouge1_fmeasure_stderr": 0.0025362641546465615, "rouge1_precision": 0.15620442016026878, "rouge1_precision_stderr": 0.0019393097407504807, "rouge1_recall": 0.37637904496982905, "rouge1_recall_stderr": 0.004428769631922407, "rouge2_fmeasure": 0.05370104596286604, "rouge2_fmeasure_stderr": 0.001644696030213316, "rouge2_precision": 0.03798752030386553, "rouge2_precision_stderr": 0.001174899351649945, "rouge2_recall": 0.09645314548431894, "rouge2_recall_stderr": 0.0030269539528850957, "rougeL_fmeasure": 0.16392696498450032, "rougeL_fmeasure_stderr": 0.0019387441418856352, "rougeL_precision": 0.11750264664821597, "rougeL_precision_stderr": 0.001455192782942557, "rougeL_recall": 0.28553565213661397, "rougeL_recall_stderr": 0.003571545700265264, "rougeLsum_fmeasure": 0.1729737886084137, "rougeLsum_fmeasure_stderr": 0.0021734952347434375, "rougeLsum_precision": 0.12392072701085288, "rougeLsum_precision_stderr": 0.0016211314235083207, "rougeLsum_recall": 0.30146747797091933, "rougeLsum_recall_stderr": 0.003959575791577483}}, "1": {"article_DOC_summary": {"bleu": 1.9249412744961178, "bleu_stderr": 0.12178209771806298, "rouge1_fmeasure": 0.20067154816406912, "rouge1_fmeasure_stderr": 0.0026898597422456614, "rouge1_precision": 0.14293573080985697, "rouge1_precision_stderr": 0.0020020715732026848, "rouge1_recall": 0.3507450257481026, "rouge1_recall_stderr": 0.004639905013718798, "rouge2_fmeasure": 0.04598183981683866, "rouge2_fmeasure_stderr": 0.001649351483193518, "rouge2_precision": 0.0324308871516594, "rouge2_precision_stderr": 0.0011682005693517872, "rouge2_recall": 0.08264816662579652, "rouge2_recall_stderr": 0.003080246882800617, "rougeL_fmeasure": 0.15118274474995874, "rougeL_fmeasure_stderr": 0.001994655397896864, "rougeL_precision": 0.10746501641700831, "rougeL_precision_stderr": 0.0014694032015050793, "rougeL_recall": 0.2659219914279442, "rougeL_recall_stderr": 0.0035904663684219085, "rougeLsum_fmeasure": 0.160999424649348, "rougeLsum_fmeasure_stderr": 0.002240087025674008, "rougeLsum_precision": 0.11442553643678831, "rougeLsum_precision_stderr": 0.0016427072257106192, "rougeLsum_recall": 0.28320972201688277, "rougeLsum_recall_stderr": 0.00404579631504026}}, "2": {"article_DOC_summary": {"bleu": 2.354378541611583, "bleu_stderr": 0.11647407253425891, "rouge1_fmeasure": 0.22162196546785315, "rouge1_fmeasure_stderr": 0.0026821013380755065, "rouge1_precision": 0.15803061677869468, "rouge1_precision_stderr": 0.0020134830696427186, "rouge1_recall": 0.38606737885839715, "rouge1_recall_stderr": 0.0045727909552794035, "rouge2_fmeasure": 0.05795467773425469, "rouge2_fmeasure_stderr": 0.001726378828967632, "rouge2_precision": 0.04089910513966374, "rouge2_precision_stderr": 0.0012276135124666848, "rouge2_recall": 0.10377696105882783, "rouge2_recall_stderr": 0.0031669949816395633, "rougeL_fmeasure": 0.16710610962820924, "rougeL_fmeasure_stderr": 0.0020561292357138584, "rougeL_precision": 0.11895190107092266, "rougeL_precision_stderr": 0.001522240640457265, "rougeL_recall": 0.29275028181844964, "rougeL_recall_stderr": 0.0036724085051555028, "rougeLsum_fmeasure": 0.17719218883183147, "rougeLsum_fmeasure_stderr": 0.002316006750830736, "rougeLsum_precision": 0.12607152034389388, "rougeLsum_precision_stderr": 0.0017074031321102738, "rougeLsum_recall": 0.31057163498399204, "rougeLsum_recall_stderr": 0.0041042123336552}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2975989706476116, "bleu_stderr": 0.04003039263082336, "rouge1_fmeasure": 0.10454253577265941, "rouge1_fmeasure_stderr": 0.0020033273254233754, "rouge1_precision": 0.07036951453310922, "rouge1_precision_stderr": 0.001708454486390292, "rouge1_recall": 0.28391423378552716, "rouge1_recall_stderr": 0.004406605071545494, "rouge2_fmeasure": 0.04887042562262386, "rouge2_fmeasure_stderr": 0.0012367653733248375, "rouge2_precision": 0.03205700906292, "rouge2_precision_stderr": 0.0008904068747417852, "rouge2_recall": 0.1344860970658841, "rouge2_recall_stderr": 0.0029699527724654008, "rougeL_fmeasure": 0.09984712791182629, "rougeL_fmeasure_stderr": 0.001864385700385995, "rougeL_precision": 0.06700444340435573, "rougeL_precision_stderr": 0.001577635401560107, "rougeL_recall": 0.2723165283491304, "rougeL_recall_stderr": 0.00420334764816495, "rougeLsum_fmeasure": 0.09986723049046536, "rougeLsum_fmeasure_stderr": 0.0018921794146755414, "rougeLsum_precision": 0.06714085166592883, "rougeLsum_precision_stderr": 0.0016058922247288731, "rougeLsum_recall": 0.2710598887897389, "rougeLsum_recall_stderr": 0.004143182286682885}}, "1": {"PALM_prompt": {"bleu": 0.5399102183044338, "bleu_stderr": 0.017964459975753112, "rouge1_fmeasure": 0.1179994472699294, "rouge1_fmeasure_stderr": 0.001898574018978891, "rouge1_precision": 0.07582800253640251, "rouge1_precision_stderr": 0.0014775645988873703, "rouge1_recall": 0.3887660352664638, "rouge1_recall_stderr": 0.005510916338178581, "rouge2_fmeasure": 0.05575099241882265, "rouge2_fmeasure_stderr": 0.0011943826599908319, "rouge2_precision": 0.03547782475927522, "rouge2_precision_stderr": 0.000840851630406, "rouge2_recall": 0.19434244564750444, "rouge2_recall_stderr": 0.0039041652957777923, "rougeL_fmeasure": 0.10987367600038657, "rougeL_fmeasure_stderr": 0.0017019944711869266, "rougeL_precision": 0.07062905559753234, "rougeL_precision_stderr": 0.0013389008252074547, "rougeL_recall": 0.3594853678419995, "rougeL_recall_stderr": 0.004899885542632218, "rougeLsum_fmeasure": 0.11246764238131458, "rougeLsum_fmeasure_stderr": 0.001788765887772429, "rougeLsum_precision": 0.07235075590060619, "rougeLsum_precision_stderr": 0.0014035297849105085, "rougeLsum_recall": 0.36849778247126685, "rougeLsum_recall_stderr": 0.005075094016447408}}, "2": {"PALM_prompt": {"bleu": 0.6132285241911561, "bleu_stderr": 0.021744492816193794, "rouge1_fmeasure": 0.11750948362933708, "rouge1_fmeasure_stderr": 0.0017443663495669947, "rouge1_precision": 0.07454923688966983, "rouge1_precision_stderr": 0.0013402782053761812, "rouge1_recall": 0.4060696737056673, "rouge1_recall_stderr": 0.005408951548341353, "rouge2_fmeasure": 0.055072070164922673, "rouge2_fmeasure_stderr": 0.0011249794411084804, "rouge2_precision": 0.034783751809391424, "rouge2_precision_stderr": 0.0008624538099408842, "rouge2_recall": 0.20529500851088617, "rouge2_recall_stderr": 0.004029599291312284, "rougeL_fmeasure": 0.1081453556034095, "rougeL_fmeasure_stderr": 0.0015836947383010406, "rougeL_precision": 0.06870503701115688, "rougeL_precision_stderr": 0.001214108724498691, "rougeL_recall": 0.36958240896722416, "rougeL_recall_stderr": 0.004690608909122334, "rougeLsum_fmeasure": 0.11165202694972098, "rougeLsum_fmeasure_stderr": 0.0016570784342064698, "rougeLsum_precision": 0.07091032443010714, "rougeLsum_precision_stderr": 0.0012648199327180544, "rougeLsum_recall": 0.383427293902694, "rougeLsum_recall_stderr": 0.004986353289044804}}, "3": {"PALM_prompt": {"bleu": 0.6689911149446237, "bleu_stderr": 0.03744306343722813, "rouge1_fmeasure": 0.1180307283191865, "rouge1_fmeasure_stderr": 0.0017866703950125558, "rouge1_precision": 0.07481052377527687, "rouge1_precision_stderr": 0.0013081535798312271, "rouge1_recall": 0.4066681705698085, "rouge1_recall_stderr": 0.0054019768500427, "rouge2_fmeasure": 0.05572250969924929, "rouge2_fmeasure_stderr": 0.0011431197746402597, "rouge2_precision": 0.03499189343685807, "rouge2_precision_stderr": 0.0007904982370484803, "rouge2_recall": 0.20736527087710988, "rouge2_recall_stderr": 0.0040148188536021905, "rougeL_fmeasure": 0.10739194121096401, "rougeL_fmeasure_stderr": 0.0015945105292757362, "rougeL_precision": 0.06809194203501431, "rougeL_precision_stderr": 0.0011605273645199663, "rougeL_recall": 0.36740563590690617, "rougeL_recall_stderr": 0.004685196144542859, "rougeLsum_fmeasure": 0.11166569262354312, "rougeLsum_fmeasure_stderr": 0.001692220078605611, "rougeLsum_precision": 0.07085369656177957, "rougeLsum_precision_stderr": 0.0012407868939625576, "rougeLsum_recall": 0.38310128345463584, "rougeLsum_recall_stderr": 0.004987172857446567}}, "4": {"PALM_prompt": {"bleu": 0.7331295013237835, "bleu_stderr": 0.0376308794527232, "rouge1_fmeasure": 0.12533762675953772, "rouge1_fmeasure_stderr": 0.0018878485389189842, "rouge1_precision": 0.07979502355783509, "rouge1_precision_stderr": 0.0014030109393831616, "rouge1_recall": 0.4235816766130294, "rouge1_recall_stderr": 0.005421570401845218, "rouge2_fmeasure": 0.05965148393261353, "rouge2_fmeasure_stderr": 0.0011811051795035776, "rouge2_precision": 0.037526756160207154, "rouge2_precision_stderr": 0.0008206560892571376, "rouge2_recall": 0.21978995547688931, "rouge2_recall_stderr": 0.004158496007987482, "rougeL_fmeasure": 0.1127973185683369, "rougeL_fmeasure_stderr": 0.0016137599087464715, "rougeL_precision": 0.07173000592003134, "rougeL_precision_stderr": 0.001189659942487538, "rougeL_recall": 0.3800311917113359, "rougeL_recall_stderr": 0.004634022951095827, "rougeLsum_fmeasure": 0.1188731408826614, "rougeLsum_fmeasure_stderr": 0.001769238872468489, "rougeLsum_precision": 0.07571045481706781, "rougeLsum_precision_stderr": 0.0013130071112336225, "rougeLsum_recall": 0.3998599380714254, "rougeLsum_recall_stderr": 0.004996013261269132}}, "5": {"PALM_prompt": {"bleu": 0.7199803964010689, "bleu_stderr": 0.0335341295220522, "rouge1_fmeasure": 0.12957731808842543, "rouge1_fmeasure_stderr": 0.001950385050411022, "rouge1_precision": 0.0831998254284886, "rouge1_precision_stderr": 0.0015598094416264543, "rouge1_recall": 0.43497959954822596, "rouge1_recall_stderr": 0.005519903741092491, "rouge2_fmeasure": 0.061471018231031564, "rouge2_fmeasure_stderr": 0.0011954291401081725, "rouge2_precision": 0.038915148561977614, "rouge2_precision_stderr": 0.0008746684971485049, "rouge2_recall": 0.22682184765709848, "rouge2_recall_stderr": 0.004248572295542693, "rougeL_fmeasure": 0.11444599253148456, "rougeL_fmeasure_stderr": 0.0016146327759809166, "rougeL_precision": 0.07313294781649872, "rougeL_precision_stderr": 0.0012487980616292505, "rougeL_recall": 0.3869620252624607, "rougeL_recall_stderr": 0.00472547888254755, "rougeLsum_fmeasure": 0.12182978494965954, "rougeLsum_fmeasure_stderr": 0.0017943723258623825, "rougeLsum_precision": 0.07806446348136394, "rougeLsum_precision_stderr": 0.0013916456734416912, "rougeLsum_recall": 0.4091496557022353, "rougeLsum_recall_stderr": 0.005082415980961682}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.953242244453973, "bleu_stderr": 0.05731680211281534, "rouge1_fmeasure": 0.19037826199272892, "rouge1_fmeasure_stderr": 0.0019330255590741413, "rouge1_precision": 0.16219138852188794, "rouge1_precision_stderr": 0.001952170268256085, "rouge1_recall": 0.27855214789998106, "rouge1_recall_stderr": 0.0028661354548397203, "rouge2_fmeasure": 0.04216588190254503, "rouge2_fmeasure_stderr": 0.0009553783013135788, "rouge2_precision": 0.0354263621636146, "rouge2_precision_stderr": 0.0008399737677670994, "rouge2_recall": 0.06484436785581019, "rouge2_recall_stderr": 0.001614050776584332, "rougeL_fmeasure": 0.1448489434443541, "rougeL_fmeasure_stderr": 0.001361425790826035, "rougeL_precision": 0.12175563950858005, "rougeL_precision_stderr": 0.0013221666940557008, "rougeL_recall": 0.21752194752302395, "rougeL_recall_stderr": 0.0023088753932079594, "rougeLsum_fmeasure": 0.1758237056132223, "rougeLsum_fmeasure_stderr": 0.0017714351825610913, "rougeLsum_precision": 0.14956126999475367, "rougeLsum_precision_stderr": 0.0017821025582316077, "rougeLsum_recall": 0.2582733333610139, "rougeLsum_recall_stderr": 0.0026810100132315094}}, "1": {"tldr_en": {"bleu": 3.1938176079049834, "bleu_stderr": 0.06758802228266159, "rouge1_fmeasure": 0.23210095477457815, "rouge1_fmeasure_stderr": 0.0019831122653105916, "rouge1_precision": 0.21449004123956839, "rouge1_precision_stderr": 0.002565225956613692, "rouge1_recall": 0.3240170155957018, "rouge1_recall_stderr": 0.002854256640412515, "rouge2_fmeasure": 0.06178715930121975, "rouge2_fmeasure_stderr": 0.0011277206664131291, "rouge2_precision": 0.05838461705053636, "rouge2_precision_stderr": 0.0014235851964682594, "rouge2_recall": 0.08878615660224992, "rouge2_recall_stderr": 0.0017942884481548583, "rougeL_fmeasure": 0.16714787573615286, "rougeL_fmeasure_stderr": 0.001398161530856219, "rougeL_precision": 0.15474204497973576, "rougeL_precision_stderr": 0.001972497669394072, "rougeL_recall": 0.23823572295131598, "rougeL_recall_stderr": 0.0022819725517357744, "rougeLsum_fmeasure": 0.2194764899179448, "rougeLsum_fmeasure_stderr": 0.0018724396931342306, "rougeLsum_precision": 0.20262040584770197, "rougeLsum_precision_stderr": 0.0024273351077249345, "rougeLsum_recall": 0.3070782317130203, "rougeLsum_recall_stderr": 0.0027244549656214464}}, "2": {"tldr_en": {"bleu": 3.6209568989964964, "bleu_stderr": 0.05693014182524011, "rouge1_fmeasure": 0.23240946839912788, "rouge1_fmeasure_stderr": 0.0020535331913193833, "rouge1_precision": 0.24807700870805302, "rouge1_precision_stderr": 0.0032686331426239274, "rouge1_recall": 0.2993366704587394, "rouge1_recall_stderr": 0.0028897035935155396, "rouge2_fmeasure": 0.06722787106866046, "rouge2_fmeasure_stderr": 0.001306015352926357, "rouge2_precision": 0.07542612135833042, "rouge2_precision_stderr": 0.0019732499800712644, "rouge2_recall": 0.08771578704686069, "rouge2_recall_stderr": 0.001852341009446473, "rougeL_fmeasure": 0.17544324351165277, "rougeL_fmeasure_stderr": 0.001595587702286189, "rougeL_precision": 0.19005839255632967, "rougeL_precision_stderr": 0.002778132688091869, "rougeL_recall": 0.22871074992869012, "rougeL_recall_stderr": 0.002406566174307835, "rougeLsum_fmeasure": 0.2213635199796785, "rougeLsum_fmeasure_stderr": 0.0019597819111732734, "rougeLsum_precision": 0.23648500820691407, "rougeLsum_precision_stderr": 0.0031596714430369476, "rougeLsum_recall": 0.285430067349801, "rougeLsum_recall_stderr": 0.0027751440475442097}}, "3": {"tldr_en": {"bleu": 3.2600101837197197, "bleu_stderr": 0.04841341881147457, "rouge1_fmeasure": 0.18788478661055097, "rouge1_fmeasure_stderr": 0.002472024769674421, "rouge1_precision": 0.23084151108139417, "rouge1_precision_stderr": 0.003999536576630713, "rouge1_recall": 0.22564401884098545, "rouge1_recall_stderr": 0.00322268288462845, "rouge2_fmeasure": 0.0547277065977853, "rouge2_fmeasure_stderr": 0.001316330403077771, "rouge2_precision": 0.07199814947709587, "rouge2_precision_stderr": 0.002257359657010455, "rouge2_recall": 0.0658546131120379, "rouge2_recall_stderr": 0.0016996600071196948, "rougeL_fmeasure": 0.1450771150583464, "rougeL_fmeasure_stderr": 0.0019574386379697857, "rougeL_precision": 0.1812422041327713, "rougeL_precision_stderr": 0.0033598941031452102, "rougeL_recall": 0.17549664105843907, "rougeL_recall_stderr": 0.00262690640098719, "rougeLsum_fmeasure": 0.17866009428807192, "rougeLsum_fmeasure_stderr": 0.0023640874867847488, "rougeLsum_precision": 0.21961824449300582, "rougeLsum_precision_stderr": 0.003836248551627411, "rougeLsum_recall": 0.21459589482318636, "rougeLsum_recall_stderr": 0.003079841928060846}}, "4": {"tldr_en": {"bleu": 0.6197416585249048, "bleu_stderr": 0.050726126214537225, "rouge1_fmeasure": 0.05902884784226779, "rouge1_fmeasure_stderr": 0.0021304994672767174, "rouge1_precision": 0.07762612304263315, "rouge1_precision_stderr": 0.0032117324014510297, "rouge1_recall": 0.07075067661625373, "rouge1_recall_stderr": 0.00266141134016641, "rouge2_fmeasure": 0.01811109085502527, "rouge2_fmeasure_stderr": 0.0009535025790089693, "rouge2_precision": 0.024715265794116245, "rouge2_precision_stderr": 0.0015375396891910912, "rouge2_recall": 0.02194236604474347, "rouge2_recall_stderr": 0.001230091261779581, "rougeL_fmeasure": 0.04687343959092794, "rougeL_fmeasure_stderr": 0.00171937320425508, "rougeL_precision": 0.06261195931905689, "rougeL_precision_stderr": 0.002684073457127978, "rougeL_recall": 0.0564250650958304, "rougeL_recall_stderr": 0.0021744446112801413, "rougeLsum_fmeasure": 0.056127739240012624, "rougeLsum_fmeasure_stderr": 0.002031189471014326, "rougeLsum_precision": 0.07398653730135701, "rougeLsum_precision_stderr": 0.003081234652717845, "rougeLsum_recall": 0.06720084909348248, "rougeLsum_recall_stderr": 0.002527336091137797}}, "5": {"tldr_en": {"bleu": 4.394475685873892e-07, "bleu_stderr": 1.020668525155571e-06, "rouge1_fmeasure": 0.01025088836822709, "rouge1_fmeasure_stderr": 0.0010237064667092625, "rouge1_precision": 0.012749197720849256, "rouge1_precision_stderr": 0.0013678224737624083, "rouge1_recall": 0.011454983373776283, "rouge1_recall_stderr": 0.001152103487332925, "rouge2_fmeasure": 0.003230399473756007, "rouge2_fmeasure_stderr": 0.0004373491168596821, "rouge2_precision": 0.004106263613998532, "rouge2_precision_stderr": 0.0006621811206101381, "rouge2_recall": 0.003568369407358366, "rouge2_recall_stderr": 0.00047406732675066786, "rougeL_fmeasure": 0.008151883997164445, "rougeL_fmeasure_stderr": 0.0008219787823794978, "rougeL_precision": 0.01021272837958567, "rougeL_precision_stderr": 0.001137838670735105, "rougeL_recall": 0.009264383578933708, "rougeL_recall_stderr": 0.0009514107080117658, "rougeLsum_fmeasure": 0.00977974159773322, "rougeLsum_fmeasure_stderr": 0.0009796733551025018, "rougeLsum_precision": 0.012094227953101243, "rougeLsum_precision_stderr": 0.0013067091712138548, "rougeLsum_recall": 0.010971397408480117, "rougeLsum_recall_stderr": 0.0011066936947192276}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 5.041894180834452, "bleu_stderr": 0.13033007436218397, "rouge1_fmeasure": 0.2602520157352752, "rouge1_fmeasure_stderr": 0.0019814546233520676, "rouge1_precision": 0.2562147385675977, "rouge1_precision_stderr": 0.0023058315204108905, "rouge1_recall": 0.30315886913634216, "rouge1_recall_stderr": 0.0026139295974695565, "rouge2_fmeasure": 0.08140776012327813, "rouge2_fmeasure_stderr": 0.001311313047394141, "rouge2_precision": 0.07721561798611194, "rouge2_precision_stderr": 0.0012074751318368925, "rouge2_recall": 0.09865589414861335, "rouge2_recall_stderr": 0.0018266381810205243, "rougeL_fmeasure": 0.2348805493930538, "rougeL_fmeasure_stderr": 0.0016535155281064092, "rougeL_precision": 0.23076758584429627, "rougeL_precision_stderr": 0.0019576805274118237, "rougeL_recall": 0.27459675434984065, "rougeL_recall_stderr": 0.002244077056891058, "rougeLsum_fmeasure": 0.2231769159790997, "rougeLsum_fmeasure_stderr": 0.0018450572724071957, "rougeLsum_precision": 0.2179856280727458, "rougeLsum_precision_stderr": 0.0019996113997260653, "rougeLsum_recall": 0.26255348104591614, "rougeLsum_recall_stderr": 0.002535270034294405}}, "1": {"generate_text_restaurant": {"bleu": 11.374090854418796, "bleu_stderr": 0.16642693339608366, "rouge1_fmeasure": 0.44441293942301124, "rouge1_fmeasure_stderr": 0.0019990822428693255, "rouge1_precision": 0.4562660071544534, "rouge1_precision_stderr": 0.0023247072186540372, "rouge1_recall": 0.47023523973208886, "rouge1_recall_stderr": 0.0029204957902967815, "rouge2_fmeasure": 0.19963881437447786, "rouge2_fmeasure_stderr": 0.001763576201025895, "rouge2_precision": 0.20466887595780073, "rouge2_precision_stderr": 0.0018785112726430648, "rouge2_recall": 0.21306332737215863, "rouge2_recall_stderr": 0.0021659970991227857, "rougeL_fmeasure": 0.31715680141114483, "rougeL_fmeasure_stderr": 0.0017221275024008205, "rougeL_precision": 0.32653768131528194, "rougeL_precision_stderr": 0.001990750084311469, "rougeL_recall": 0.3355433404446685, "rougeL_recall_stderr": 0.0023788838500677285, "rougeLsum_fmeasure": 0.37193579102313756, "rougeLsum_fmeasure_stderr": 0.0020241065034827393, "rougeLsum_precision": 0.3824555027115406, "rougeLsum_precision_stderr": 0.0023079179899071546, "rougeLsum_recall": 0.39314046484640197, "rougeLsum_recall_stderr": 0.0027333955271206228}}, "2": {"generate_text_restaurant": {"bleu": 12.560644285364408, "bleu_stderr": 0.16815632193227215, "rouge1_fmeasure": 0.471505067248124, "rouge1_fmeasure_stderr": 0.0019357830388836816, "rouge1_precision": 0.4693114735453705, "rouge1_precision_stderr": 0.002281004588256248, "rouge1_recall": 0.5083374783023907, "rouge1_recall_stderr": 0.0027945657189866707, "rouge2_fmeasure": 0.22343026823022358, "rouge2_fmeasure_stderr": 0.0018374594088211456, "rouge2_precision": 0.2221316806859365, "rouge2_precision_stderr": 0.0019435645476744553, "rouge2_recall": 0.24313554807879642, "rouge2_recall_stderr": 0.002262346807934513, "rougeL_fmeasure": 0.34007419588321747, "rougeL_fmeasure_stderr": 0.0017661692599286885, "rougeL_precision": 0.33868884919458336, "rougeL_precision_stderr": 0.0020005248626700075, "rougeL_recall": 0.36715355671289307, "rougeL_recall_stderr": 0.002393826810405225, "rougeLsum_fmeasure": 0.3992367697974597, "rougeLsum_fmeasure_stderr": 0.002047758775999973, "rougeLsum_precision": 0.39764801002400024, "rougeLsum_precision_stderr": 0.0023126735358181896, "rougeLsum_recall": 0.4301763419894379, "rougeLsum_recall_stderr": 0.002704714026602679}}, "3": {"generate_text_restaurant": {"bleu": 12.935887027476703, "bleu_stderr": 0.1413974704174758, "rouge1_fmeasure": 0.47748617274088134, "rouge1_fmeasure_stderr": 0.0019432464320825518, "rouge1_precision": 0.46877980581573947, "rouge1_precision_stderr": 0.002306244616020727, "rouge1_recall": 0.5201076534915446, "rouge1_recall_stderr": 0.0027589098635390622, "rouge2_fmeasure": 0.2299932736088538, "rouge2_fmeasure_stderr": 0.0018542116692296976, "rouge2_precision": 0.22504131267180502, "rouge2_precision_stderr": 0.0019096188553057005, "rouge2_recall": 0.25334102505668565, "rouge2_recall_stderr": 0.0023303517782663806, "rougeL_fmeasure": 0.3453328850410215, "rougeL_fmeasure_stderr": 0.0017788674272907377, "rougeL_precision": 0.33848754169654177, "rougeL_precision_stderr": 0.0019586798135973788, "rougeL_recall": 0.3774849420782741, "rougeL_recall_stderr": 0.002448077823046622, "rougeLsum_fmeasure": 0.4054179095581331, "rougeLsum_fmeasure_stderr": 0.002057211964943938, "rougeLsum_precision": 0.398001467577397, "rougeLsum_precision_stderr": 0.002302044463314153, "rougeLsum_recall": 0.44176973604863884, "rougeLsum_recall_stderr": 0.002731272765488879}}, "4": {"generate_text_restaurant": {"bleu": 13.062203352639571, "bleu_stderr": 0.1443635676836731, "rouge1_fmeasure": 0.480197984035446, "rouge1_fmeasure_stderr": 0.0019374104232555646, "rouge1_precision": 0.4681188396235994, "rouge1_precision_stderr": 0.0022953424581571586, "rouge1_recall": 0.5257893103591965, "rouge1_recall_stderr": 0.0027113760261248936, "rouge2_fmeasure": 0.231134956488486, "rouge2_fmeasure_stderr": 0.0018720751391330711, "rouge2_precision": 0.2248618704612403, "rouge2_precision_stderr": 0.001945938539895268, "rouge2_recall": 0.2557332161082852, "rouge2_recall_stderr": 0.0023207877819305836, "rougeL_fmeasure": 0.34575847531397613, "rougeL_fmeasure_stderr": 0.0017883470089669242, "rougeL_precision": 0.33632345011254977, "rougeL_precision_stderr": 0.0019454027156019396, "rougeL_recall": 0.38013195807046607, "rougeL_recall_stderr": 0.00244327641775733, "rougeLsum_fmeasure": 0.4073959730873966, "rougeLsum_fmeasure_stderr": 0.002067697322138944, "rougeLsum_precision": 0.39692021210935596, "rougeLsum_precision_stderr": 0.0022989702493060767, "rougeLsum_recall": 0.44638101481881887, "rougeLsum_recall_stderr": 0.002709781160431362}}, "5": {"generate_text_restaurant": {"bleu": 13.007507775940796, "bleu_stderr": 0.15953149423790455, "rouge1_fmeasure": 0.4808919059561077, "rouge1_fmeasure_stderr": 0.001952787403665255, "rouge1_precision": 0.46770150316298764, "rouge1_precision_stderr": 0.0022797897877814698, "rouge1_recall": 0.5268279632857007, "rouge1_recall_stderr": 0.0027460682371807316, "rouge2_fmeasure": 0.23283624336459657, "rouge2_fmeasure_stderr": 0.0018731934526011642, "rouge2_precision": 0.22559527103944196, "rouge2_precision_stderr": 0.0019147802508556583, "rouge2_recall": 0.2579313360551791, "rouge2_recall_stderr": 0.00233240570767885, "rougeL_fmeasure": 0.3483900918258692, "rougeL_fmeasure_stderr": 0.0018126238683006549, "rougeL_precision": 0.33809291109163425, "rougeL_precision_stderr": 0.0019479891358002816, "rougeL_recall": 0.3830487611318609, "rougeL_recall_stderr": 0.0024664916023592687, "rougeLsum_fmeasure": 0.40962759438618196, "rougeLsum_fmeasure_stderr": 0.002070580879609581, "rougeLsum_precision": 0.3985001878422045, "rougeLsum_precision_stderr": 0.002294921272754354, "rougeLsum_recall": 0.4486089837026324, "rougeLsum_recall_stderr": 0.002709717530773792}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.202159785681064, "bleu_stderr": 0.1019905127598651, "rouge1_fmeasure": 0.21747292414186656, "rouge1_fmeasure_stderr": 0.0025362641546465615, "rouge1_precision": 0.15620442016026878, "rouge1_precision_stderr": 0.0019393097407504807, "rouge1_recall": 0.37637904496982905, "rouge1_recall_stderr": 0.004428769631922407, "rouge2_fmeasure": 0.05370104596286604, "rouge2_fmeasure_stderr": 0.001644696030213316, "rouge2_precision": 0.03798752030386553, "rouge2_precision_stderr": 0.001174899351649945, "rouge2_recall": 0.09645314548431894, "rouge2_recall_stderr": 0.0030269539528850957, "rougeL_fmeasure": 0.16392696498450032, "rougeL_fmeasure_stderr": 0.0019387441418856352, "rougeL_precision": 0.11750264664821597, "rougeL_precision_stderr": 0.001455192782942557, "rougeL_recall": 0.28553565213661397, "rougeL_recall_stderr": 0.003571545700265264, "rougeLsum_fmeasure": 0.1729737886084137, "rougeLsum_fmeasure_stderr": 0.0021734952347434375, "rougeLsum_precision": 0.12392072701085288, "rougeLsum_precision_stderr": 0.0016211314235083207, "rougeLsum_recall": 0.30146747797091933, "rougeLsum_recall_stderr": 0.003959575791577483}}, "1": {"article_DOC_summary": {"bleu": 1.9249412744961178, "bleu_stderr": 0.12178209771806298, "rouge1_fmeasure": 0.20067154816406912, "rouge1_fmeasure_stderr": 0.0026898597422456614, "rouge1_precision": 0.14293573080985697, "rouge1_precision_stderr": 0.0020020715732026848, "rouge1_recall": 0.3507450257481026, "rouge1_recall_stderr": 0.004639905013718798, "rouge2_fmeasure": 0.04598183981683866, "rouge2_fmeasure_stderr": 0.001649351483193518, "rouge2_precision": 0.0324308871516594, "rouge2_precision_stderr": 0.0011682005693517872, "rouge2_recall": 0.08264816662579652, "rouge2_recall_stderr": 0.003080246882800617, "rougeL_fmeasure": 0.15118274474995874, "rougeL_fmeasure_stderr": 0.001994655397896864, "rougeL_precision": 0.10746501641700831, "rougeL_precision_stderr": 0.0014694032015050793, "rougeL_recall": 0.2659219914279442, "rougeL_recall_stderr": 0.0035904663684219085, "rougeLsum_fmeasure": 0.160999424649348, "rougeLsum_fmeasure_stderr": 0.002240087025674008, "rougeLsum_precision": 0.11442553643678831, "rougeLsum_precision_stderr": 0.0016427072257106192, "rougeLsum_recall": 0.28320972201688277, "rougeLsum_recall_stderr": 0.00404579631504026}}, "2": {"article_DOC_summary": {"bleu": 2.354378541611583, "bleu_stderr": 0.11647407253425891, "rouge1_fmeasure": 0.22162196546785315, "rouge1_fmeasure_stderr": 0.0026821013380755065, "rouge1_precision": 0.15803061677869468, "rouge1_precision_stderr": 0.0020134830696427186, "rouge1_recall": 0.38606737885839715, "rouge1_recall_stderr": 0.0045727909552794035, "rouge2_fmeasure": 0.05795467773425469, "rouge2_fmeasure_stderr": 0.001726378828967632, "rouge2_precision": 0.04089910513966374, "rouge2_precision_stderr": 0.0012276135124666848, "rouge2_recall": 0.10377696105882783, "rouge2_recall_stderr": 0.0031669949816395633, "rougeL_fmeasure": 0.16710610962820924, "rougeL_fmeasure_stderr": 0.0020561292357138584, "rougeL_precision": 0.11895190107092266, "rougeL_precision_stderr": 0.001522240640457265, "rougeL_recall": 0.29275028181844964, "rougeL_recall_stderr": 0.0036724085051555028, "rougeLsum_fmeasure": 0.17719218883183147, "rougeLsum_fmeasure_stderr": 0.002316006750830736, "rougeLsum_precision": 0.12607152034389388, "rougeLsum_precision_stderr": 0.0017074031321102738, "rougeLsum_recall": 0.31057163498399204, "rougeLsum_recall_stderr": 0.0041042123336552}}, "3": {"article_DOC_summary": {"bleu": 2.5396858265074442, "bleu_stderr": 0.08867651581985392, "rouge1_fmeasure": 0.21823856021867855, "rouge1_fmeasure_stderr": 0.00287509423652189, "rouge1_precision": 0.15862259061034525, "rouge1_precision_stderr": 0.0022705733916664897, "rouge1_recall": 0.37475206189355903, "rouge1_recall_stderr": 0.004959725343264155, "rouge2_fmeasure": 0.05707472809987463, "rouge2_fmeasure_stderr": 0.0017670032180951126, "rouge2_precision": 0.040777801871425406, "rouge2_precision_stderr": 0.0012722722586737197, "rouge2_recall": 0.1013174193884724, "rouge2_recall_stderr": 0.0032107881395706537, "rougeL_fmeasure": 0.16363168956728244, "rougeL_fmeasure_stderr": 0.002209845051514402, "rougeL_precision": 0.11900917936528578, "rougeL_precision_stderr": 0.0017595863454497236, "rougeL_recall": 0.2823031773424627, "rougeL_recall_stderr": 0.00395533580561216, "rougeLsum_fmeasure": 0.1744236573082813, "rougeLsum_fmeasure_stderr": 0.0025092608797292787, "rougeLsum_precision": 0.1267437227686215, "rougeLsum_precision_stderr": 0.0019625874757798435, "rougeLsum_recall": 0.3009970402211979, "rougeLsum_recall_stderr": 0.00444855277167097}}, "4": {"article_DOC_summary": {"bleu": 1.2606242149604632, "bleu_stderr": 0.14038396719045979, "rouge1_fmeasure": 0.05899878787999313, "rouge1_fmeasure_stderr": 0.0031984402791436907, "rouge1_precision": 0.047304211697680565, "rouge1_precision_stderr": 0.0026448858289027655, "rouge1_recall": 0.09474278252615058, "rouge1_recall_stderr": 0.005308877220848683, "rouge2_fmeasure": 0.0155308783397458, "rouge2_fmeasure_stderr": 0.001299471111442663, "rouge2_precision": 0.011996087771023494, "rouge2_precision_stderr": 0.001099214076178898, "rouge2_recall": 0.0263417241337705, "rouge2_recall_stderr": 0.002220861663283421, "rougeL_fmeasure": 0.044337528627898845, "rougeL_fmeasure_stderr": 0.002409584552086278, "rougeL_precision": 0.03579263906355918, "rougeL_precision_stderr": 0.00203840434793067, "rougeL_recall": 0.07128613276076894, "rougeL_recall_stderr": 0.0040405231404290805, "rougeLsum_fmeasure": 0.04848597226747422, "rougeLsum_fmeasure_stderr": 0.0026607333825415256, "rougeLsum_precision": 0.03891697941827118, "rougeLsum_precision_stderr": 0.0022203695026481044, "rougeLsum_recall": 0.07811802978970729, "rougeLsum_recall_stderr": 0.004450967311039029}}, "5": {"article_DOC_summary": {"bleu": 5.904200875747917e-17, "bleu_stderr": 6.845384228352431e-14, "rouge1_fmeasure": 0.0029056409403733605, "rouge1_fmeasure_stderr": 0.0007789369963578187, "rouge1_precision": 0.002575838186588936, "rouge1_precision_stderr": 0.0007400165754621803, "rouge1_recall": 0.004319762469932286, "rouge1_recall_stderr": 0.0011903191633291926, "rouge2_fmeasure": 0.0006473343958914281, "rouge2_fmeasure_stderr": 0.0002465721330155313, "rouge2_precision": 0.0006086503070847952, "rouge2_precision_stderr": 0.0002950028606554242, "rouge2_recall": 0.000972008751741797, "rouge2_recall_stderr": 0.0003567899989116429, "rougeL_fmeasure": 0.001934058917849306, "rougeL_fmeasure_stderr": 0.0005094916809588088, "rougeL_precision": 0.0017056464200092727, "rougeL_precision_stderr": 0.0004727453480810513, "rougeL_recall": 0.002922861912775516, "rougeL_recall_stderr": 0.0008030319091531282, "rougeLsum_fmeasure": 0.002209255524001538, "rougeLsum_fmeasure_stderr": 0.0006016088430140866, "rougeLsum_precision": 0.001992523091403346, "rougeLsum_precision_stderr": 0.000590292247129764, "rougeLsum_recall": 0.0032955304150390267, "rougeLsum_recall_stderr": 0.0009334494775770981}}}} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..80e4c639dbe8ccd03bec3f00f19c6797b1a30f28 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.24807700870805302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0032686331426239274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2993366704587394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028897035935155396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.23240946839912788, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020535331913193833 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07542612135833042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019732499800712644 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08771578704686069, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001852341009446473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.06722787106866046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001306015352926357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.19005839255632967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002778132688091869 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.22871074992869012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002406566174307835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17544324351165277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001595587702286189 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.23648500820691407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0031596714430369476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.285430067349801, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027751440475442097 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2213635199796785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019597819111732734 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.6209568989964964, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05693014182524011 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..068ee5e9cc746656b54030b64dec5cacccbd2a7e --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.23084151108139417, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003999536576630713 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22564401884098545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00322268288462845 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18788478661055097, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002472024769674421 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.07199814947709587, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002257359657010455 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0658546131120379, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016996600071196948 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0547277065977853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001316330403077771 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1812422041327713, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0033598941031452102 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.17549664105843907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00262690640098719 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1450771150583464, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019574386379697857 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.21961824449300582, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003836248551627411 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21459589482318636, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003079841928060846 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17866009428807192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023640874867847488 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.2600101837197197, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04841341881147457 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4ffa80f86ed4d534ac4f8da1f027234d40112bf6 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.07762612304263315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0032117324014510297 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07075067661625373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00266141134016641 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05902884784226779, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021304994672767174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.024715265794116245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015375396891910912 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02194236604474347, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001230091261779581 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01811109085502527, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009535025790089693 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.06261195931905689, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002684073457127978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0564250650958304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021744446112801413 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04687343959092794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00171937320425508 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.07398653730135701, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003081234652717845 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06720084909348248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002527336091137797 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.056127739240012624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002031189471014326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.6197416585249048, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.050726126214537225 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..431d206dfec5a8f9534567e1a69a95fd30166391 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.012749197720849256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013678224737624083 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011454983373776283, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001152103487332925 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.01025088836822709, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010237064667092625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004106263613998532, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006621811206101381 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003568369407358366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00047406732675066786 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.003230399473756007, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004373491168596821 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.01021272837958567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001137838670735105 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009264383578933708, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009514107080117658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008151883997164445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008219787823794978 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.012094227953101243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013067091712138548 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010971397408480117, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011066936947192276 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.00977974159773322, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009796733551025018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.394475685873892e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.020668525155571e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_3.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..42b7d4842b8349885a95748dc0a04e8191982772 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.15862259061034525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022705733916664897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.37475206189355903, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004959725343264155 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21823856021867855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00287509423652189 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.040777801871425406, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012722722586737197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.1013174193884724, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0032107881395706537 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05707472809987463, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017670032180951126 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11900917936528578, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017595863454497236 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2823031773424627, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00395533580561216 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16363168956728244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002209845051514402 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1267437227686215, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019625874757798435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.3009970402211979, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00444855277167097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1744236573082813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0025092608797292787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.5396858265074442, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08867651581985392 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_4.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..cf2f2dff8012dba801ded0e68951a61876138460 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.047304211697680565, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0026448858289027655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.09474278252615058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.005308877220848683 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05899878787999313, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0031984402791436907 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.011996087771023494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001099214076178898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0263417241337705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002220861663283421 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0155308783397458, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001299471111442663 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03579263906355918, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00203840434793067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.07128613276076894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0040405231404290805 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.044337528627898845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002409584552086278 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03891697941827118, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0022203695026481044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.07811802978970729, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004450967311039029 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04848597226747422, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0026607333825415256 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.2606242149604632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.14038396719045979 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_5.json b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..cfad6dba558470d3413e9ecb8c6fcd1b651af977 --- /dev/null +++ b/8b7178b44b/evaluation/generation/slim.8b7178b44b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002575838186588936, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007400165754621803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.004319762469932286, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0011903191633291926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0029056409403733605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007789369963578187 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0006086503070847952, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002950028606554242 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.000972008751741797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0003567899989116429 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0006473343958914281, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0002465721330155313 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0017056464200092727, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0004727453480810513 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.002922861912775516, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0008030319091531282 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.001934058917849306, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005094916809588088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.001992523091403346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000590292247129764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0032955304150390267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0009334494775770981 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.002209255524001538, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006016088430140866 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 5.904200875747917e-17, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.845384228352431e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3422c3b7d021b6051d659aa4451a33391bc3eea8 --- /dev/null +++ b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.23457091267008237, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027679657131044456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.34134267442562116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029201119619592353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24748860335539324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020650327174219227}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06809717111480851, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015408937269110604}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.09908181073861143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001996906692665077}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.07016193751579543, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012709364243235631}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.16899767873052451, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021331406852508664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.24978190911695647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024029557797763655}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17762855895363824, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015104607154289617}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.22219258646455792, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026321839454086595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.32435218510070457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002817151691787598}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.23449572227657609, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001955519333700258}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.105745383260244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08432146375641045}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..3df5914762ca28f315c2bba28f446fa1f34ec1d0 --- /dev/null +++ b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2018448766253913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003073161400459354}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28172071543272187, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003594746473409513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20579755698645813, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024814106035235444}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05693855985307953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014354920153072253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08217428533466181, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001967375414041398}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.057766738614790786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001223860321221621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14712383510443716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002367934256298009}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.2074931184166513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002842631573610709}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14896417876723742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001816503510484197}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.19153153433363726, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002934568467456808}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2671010109221112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003425037370895828}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19498864236687288, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023499776362643178}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.150193084650233, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11031172513046578}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ccbcbe8f19fadbafd0ea08a349505e740eaa9f1b --- /dev/null +++ b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.06276227014065545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023884276241260254}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09152089229465758, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003170681109109529}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.0638872592550199, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021736088152739283}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.016993816746714407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009445450958091424}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.026516974130858528, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013633238440930392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.017492971654085164, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008455647267737554}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04661295129618777, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018307969366172505}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06895783497739912, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002463509315480347}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04716945018429479, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016102955217448768}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.059550490378451754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022812820815524093}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.08666480644593474, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003010285669141728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06053208461846568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002064190262965029}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.7482733498786457, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04289432927446872}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b2cae4a80ca99dbd41bd990eda042613316205a9 --- /dev/null +++ b/8b7178b58b/evaluation/generation/agg.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.01062569815388256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001044931714423556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.015829449605584002, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014988937828031954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010963700281898832, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001016356199855156}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.002965306348837173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00039542449556901336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.004883623633485144, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006415779749258939}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0031213037008769227, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00038874767150106574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.00810568450656589, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008158965824147807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.012260844933610076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012027923697108917}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.00827136001527878, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007741743267056739}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.010057343739525522, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009929081454170272}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.0149839670075284, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0014220508785650803}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.010362958827854624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009617992097751725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.2541436019570651e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.589754879521194e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_3.json b/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..531e204fb4f46b498e820a3a1f50202a3c583b17 --- /dev/null +++ b/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.14794374341275107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022457907635443767}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.34719811012821766, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0049938759471077005}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.203288145377478, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002902549912894558}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.038373258945208606, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012224763268285}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.09516942967025471, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0031451684639885744}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05387889032218581, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017118408282623175}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.11596978873700024, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017382600731524656}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2738718049202109, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00398280173794519}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15948452331329185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022439870906278744}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.11685891599330396, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001878971804961673}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2761128747693524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004338710723283792}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16072503506629401, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002448676603417113}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.2460570125021815, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09639536279310482}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_4.json b/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..30d04d9086f7f67ec940464662e652f9acb73cec --- /dev/null +++ b/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.045641403513942344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026877928553966255}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08792590860532339, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004951100510662503}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05559972270834411, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030493540636307315}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.011710935455623647, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012861915474572897}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.02368119469360542, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001960136241976673}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01431853871406581, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011857334383597357}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03727739770750944, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022705131920867042}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.07104571554164463, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004021066125324381}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04491432061444308, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0024680842086334605}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03735291149683435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002314584651645963}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.07107549567098383, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004070397888407412}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.044904907347443464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002501946586732125}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.217900205097583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16547330167199492}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_5.json b/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..bfe4fe7f499add82342a93dbfb8b7b7b8fe3fcdb --- /dev/null +++ b/8b7178b58b/evaluation/generation/agg.8b7178b58b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0025215542744389413, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007044520139374636}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.004537012002175192, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012541613619486166}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002984428901393074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008077475320402855}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0003456350518315212, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00015654732477210276}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0006329432975265091, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00028793862001474474}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.000431120989446868, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00019695106711284816}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0017146833815532188, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004901362650343392}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.003097704549940136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000885003553914414}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.002060124493547919, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005846305244422877}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0019763029300951048, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000583997721778465}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.003481088798128124, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0009743709994872758}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0023450606462443726, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006698668342557274}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.69574011869292e-17, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.684261446109072e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1f7b0d4b9d0267cf9a74acd6e30935f6a1a97f09 100644 --- a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b93d7234b279751e82895a590c61fce703c628430d647c28e624472b0ae4af +size 18871565 diff --git a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ecd4fd8cda18b00dc7ef7e1bbfba357bc7984ab9 100644 --- a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d96615a9e52a4becc791b2771f5ec1d38774222d1f40c406c0cc95413f941534 +size 24286739 diff --git a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..62a2c179fed7df88c73691e6a1b2b4241bad08ab 100644 --- a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09e01c465d3d9627f639f3b908990284d7c1ddce1394813f368b8cbae2c7935 +size 29460613 diff --git a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fc2d7f9bbecb1471a379da85453ac487b21a7126 100644 --- a/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b58b/evaluation/generation/examples.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:961291f7ba7b41dfbc6667f90076da553d7153d6618fc5f5cf1b60501a14f5e6 +size 34799376 diff --git a/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..fdc95eee78107cb5446c460ee0b34d84fac78f3b 100644 --- a/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a43ca45b3784094af2302d0b4b1397d3ac8c987f6cbc6e5d0bc85cd73d227fa +size 9649189 diff --git a/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..eac04d83c709c37613bd9379d780a79d9d1ac7ab 100644 --- a/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4f5c727460ece710ef13ad7cb33b1a934c0b963926cf6f20861b2fd45df4e5e +size 11674423 diff --git a/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9da0fe3421f3b300b68720e241d932a386f2f718 100644 --- a/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b58b/evaluation/generation/examples.8b7178b58b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8362d238885412228d06d2611f95fe7b502c3edc2dac214ef4800d5f61ca4e47 +size 13899514 diff --git a/8b7178b58b/evaluation/generation/merged.csv b/8b7178b58b/evaluation/generation/merged.csv index 3dcbfcf12575fa42996f29b8da3ac240be53d8a9..4e43563943806657d0f4b03b32f8b9464608539a 100644 --- a/8b7178b58b/evaluation/generation/merged.csv +++ b/8b7178b58b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04842756093378267 gem_xsum,1,median,rouge2_fmeasure,0.04842756093378267 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05372147678733591 gem_xsum,2,median,rouge2_fmeasure,0.05372147678733591 -gem_xsum,2,average,multiple,0.05265405051550691 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05387889032218581 +gem_xsum,3,median,rouge2_fmeasure,0.05387889032218581 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01431853871406581 +gem_xsum,4,median,rouge2_fmeasure,0.01431853871406581 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.000431120989446868 +gem_xsum,5,median,rouge2_fmeasure,0.000431120989446868 +gem_xsum,5,average,multiple,0.0377651169287032 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05525608727734654 web_nlg_en,0,median,rouge2_fmeasure,0.05525608727734654 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.059610216940881165 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.037350220833466265 wiki_lingua_en,0,median,rouge2_fmeasure,0.037350220833466265 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06579562483959114 wiki_lingua_en,1,median,rouge2_fmeasure,0.06579562483959114 -wiki_lingua_en,1,average,multiple,0.0515729228365287 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07016193751579543 +wiki_lingua_en,2,median,rouge2_fmeasure,0.07016193751579543 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.057766738614790786 +wiki_lingua_en,3,median,rouge2_fmeasure,0.057766738614790786 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.017492971654085164 +wiki_lingua_en,4,median,rouge2_fmeasure,0.017492971654085164 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0031213037008769227 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0031213037008769227 +wiki_lingua_en,5,average,multiple,0.04194813285976762 diff --git a/8b7178b58b/evaluation/generation/merged.json b/8b7178b58b/evaluation/generation/merged.json index bbc918bf29e9fd462b5be36ffb8464f02a6bfce4..e4e2cfe5e1f7fac1e2551a6f46009b9e1d9e5609 100644 --- a/8b7178b58b/evaluation/generation/merged.json +++ b/8b7178b58b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3677810394757023, "bleu_stderr": 0.03281723526619896, "rouge1_fmeasure": 0.11606407601489055, "rouge1_fmeasure_stderr": 0.001969146758246114, "rouge1_precision": 0.07551756953809975, "rouge1_precision_stderr": 0.0014428401024288566, "rouge1_recall": 0.32667793832637043, "rouge1_recall_stderr": 0.0045432759692835755, "rouge2_fmeasure": 0.05525608727734654, "rouge2_fmeasure_stderr": 0.0012506172623425777, "rouge2_precision": 0.0358576440999003, "rouge2_precision_stderr": 0.0009026300665133467, "rouge2_recall": 0.16122516666992529, "rouge2_recall_stderr": 0.003267212716096976, "rougeL_fmeasure": 0.11192452656660684, "rougeL_fmeasure_stderr": 0.00184821865839915, "rougeL_precision": 0.07265841861456275, "rougeL_precision_stderr": 0.0013478808767448548, "rougeL_recall": 0.3173688843344159, "rougeL_recall_stderr": 0.004415574211500117, "rougeLsum_fmeasure": 0.11130422275494882, "rougeLsum_fmeasure_stderr": 0.0018686028478286372, "rougeLsum_precision": 0.07243892580671336, "rougeLsum_precision_stderr": 0.0013760467900819921, "rougeLsum_recall": 0.3134569235593474, "rougeLsum_recall_stderr": 0.004287283373230945}}, "1": {"PALM_prompt": {"bleu": 0.5497938903834476, "bleu_stderr": 0.030329648907717416, "rouge1_fmeasure": 0.12377098272865582, "rouge1_fmeasure_stderr": 0.0018005918585052136, "rouge1_precision": 0.07921341676188258, "rouge1_precision_stderr": 0.0013409822377619385, "rouge1_recall": 0.39870747987203026, "rouge1_recall_stderr": 0.005183434021587761, "rouge2_fmeasure": 0.059610216940881165, "rouge2_fmeasure_stderr": 0.0011877157461269233, "rouge2_precision": 0.037977631085495546, "rouge2_precision_stderr": 0.0008537095990628626, "rouge2_recall": 0.20482620328537757, "rouge2_recall_stderr": 0.003925783970259362, "rougeL_fmeasure": 0.11717642785021219, "rougeL_fmeasure_stderr": 0.001674344616420258, "rougeL_precision": 0.07494409148525415, "rougeL_precision_stderr": 0.0012339020353945417, "rougeL_recall": 0.3749296897875364, "rougeL_recall_stderr": 0.004746737866261382, "rougeLsum_fmeasure": 0.1180104667449292, "rougeLsum_fmeasure_stderr": 0.0017037357470238706, "rougeLsum_precision": 0.07561615649375214, "rougeLsum_precision_stderr": 0.001274630336916664, "rougeLsum_recall": 0.3781401543982665, "rougeLsum_recall_stderr": 0.004772450320283416}}, "2": {"PALM_prompt": {"bleu": 0.6345753558723145, "bleu_stderr": 0.022376664491002615, "rouge1_fmeasure": 0.12360575844131859, "rouge1_fmeasure_stderr": 0.0016749434032436248, "rouge1_precision": 0.07795669324486854, "rouge1_precision_stderr": 0.00119684640928671, "rouge1_recall": 0.41803228663523345, "rouge1_recall_stderr": 0.0053864905798921125, "rouge2_fmeasure": 0.059094553406324295, "rouge2_fmeasure_stderr": 0.0010986707292730262, "rouge2_precision": 0.03697168943889819, "rouge2_precision_stderr": 0.0007524504443837138, "rouge2_recall": 0.21717607459640267, "rouge2_recall_stderr": 0.004101036268432045, "rougeL_fmeasure": 0.1151080347627734, "rougeL_fmeasure_stderr": 0.0015232281204089519, "rougeL_precision": 0.07270569406411607, "rougeL_precision_stderr": 0.0010983482762338668, "rougeL_recall": 0.38609502456284567, "rougeL_recall_stderr": 0.004797386037833962, "rougeLsum_fmeasure": 0.11783674396217436, "rougeLsum_fmeasure_stderr": 0.0015879949279193565, "rougeLsum_precision": 0.07442415306126304, "rougeLsum_precision_stderr": 0.0011448850348819214, "rougeLsum_recall": 0.396883636105679, "rougeLsum_recall_stderr": 0.0049884922683259055}}, "3": {"PALM_prompt": {"bleu": 0.6611202603466564, "bleu_stderr": 0.026292760196011896, "rouge1_fmeasure": 0.12270682587650009, "rouge1_fmeasure_stderr": 0.001689827532535868, "rouge1_precision": 0.07720993388700956, "rouge1_precision_stderr": 0.0012101497257156794, "rouge1_recall": 0.4133639192496906, "rouge1_recall_stderr": 0.005411628648726416, "rouge2_fmeasure": 0.05861591086080908, "rouge2_fmeasure_stderr": 0.0010907460831452867, "rouge2_precision": 0.03655832324841317, "rouge2_precision_stderr": 0.0007506453443122757, "rouge2_recall": 0.21585685095591742, "rouge2_recall_stderr": 0.004093725567157412, "rougeL_fmeasure": 0.11361332097070079, "rougeL_fmeasure_stderr": 0.0015022331310385422, "rougeL_precision": 0.07151482183987531, "rougeL_precision_stderr": 0.001086199341044554, "rougeL_recall": 0.3823649104455851, "rougeL_recall_stderr": 0.004813722726129101, "rougeLsum_fmeasure": 0.11674347677370367, "rougeLsum_fmeasure_stderr": 0.0015911049077229342, "rougeLsum_precision": 0.07350481492884033, "rougeLsum_precision_stderr": 0.0011485603851221294, "rougeLsum_recall": 0.39303289732256513, "rougeLsum_recall_stderr": 0.005032921309423209}}, "4": {"PALM_prompt": {"bleu": 0.7128978831232388, "bleu_stderr": 0.03892571091725707, "rouge1_fmeasure": 0.12369444007480314, "rouge1_fmeasure_stderr": 0.0016461113907076013, "rouge1_precision": 0.07777752463393256, "rouge1_precision_stderr": 0.0011834566593491008, "rouge1_recall": 0.4222346728535934, "rouge1_recall_stderr": 0.005371874334785877, "rouge2_fmeasure": 0.05896624699685776, "rouge2_fmeasure_stderr": 0.0010852712903198752, "rouge2_precision": 0.03670980373588488, "rouge2_precision_stderr": 0.0007435896517793837, "rouge2_recall": 0.2212321108756897, "rouge2_recall_stderr": 0.004139207390440789, "rougeL_fmeasure": 0.11384905074014122, "rougeL_fmeasure_stderr": 0.00146289053656463, "rougeL_precision": 0.07163006841296378, "rougeL_precision_stderr": 0.001061488881603934, "rougeL_recall": 0.3878434903814413, "rougeL_recall_stderr": 0.004773394354686469, "rougeLsum_fmeasure": 0.11795118372977331, "rougeLsum_fmeasure_stderr": 0.001557940253935983, "rougeLsum_precision": 0.07421506767947293, "rougeLsum_precision_stderr": 0.001124685544226384, "rougeLsum_recall": 0.40133513783035873, "rougeLsum_recall_stderr": 0.005016786537745759}}, "5": {"PALM_prompt": {"bleu": 0.7296500968020738, "bleu_stderr": 0.03986679058747073, "rouge1_fmeasure": 0.12440202385333685, "rouge1_fmeasure_stderr": 0.0016439820305072118, "rouge1_precision": 0.07793578273286862, "rouge1_precision_stderr": 0.001170086916396576, "rouge1_recall": 0.4293926231707037, "rouge1_recall_stderr": 0.005544216357063792, "rouge2_fmeasure": 0.059676320234352584, "rouge2_fmeasure_stderr": 0.0010914957404078694, "rouge2_precision": 0.03706679304975263, "rouge2_precision_stderr": 0.0007461793226765045, "rouge2_recall": 0.22614254310251874, "rouge2_recall_stderr": 0.004258603721661164, "rougeL_fmeasure": 0.11426293462896298, "rougeL_fmeasure_stderr": 0.001470255169191014, "rougeL_precision": 0.07171154548923617, "rougeL_precision_stderr": 0.001063717283795246, "rougeL_recall": 0.39264622495319934, "rougeL_recall_stderr": 0.004853098099750933, "rougeLsum_fmeasure": 0.11846322387156706, "rougeLsum_fmeasure_stderr": 0.0015476540324461025, "rougeLsum_precision": 0.07430376688251292, "rougeLsum_precision_stderr": 0.0011115454477617379, "rougeLsum_recall": 0.40784183583085276, "rougeLsum_recall_stderr": 0.005148423771775361}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6588537886073913, "bleu_stderr": 0.0443627487585664, "rouge1_fmeasure": 0.1771116987872097, "rouge1_fmeasure_stderr": 0.0019001225147585595, "rouge1_precision": 0.15071008120560497, "rouge1_precision_stderr": 0.0019169944124831117, "rouge1_recall": 0.2615215631789747, "rouge1_recall_stderr": 0.0028587647553713004, "rouge2_fmeasure": 0.037350220833466265, "rouge2_fmeasure_stderr": 0.0009013013631899656, "rouge2_precision": 0.03154759752710423, "rouge2_precision_stderr": 0.0007938342962238314, "rouge2_recall": 0.05718093036037643, "rouge2_recall_stderr": 0.0015126905612736802, "rougeL_fmeasure": 0.1360411096046868, "rougeL_fmeasure_stderr": 0.0013491547776702593, "rougeL_precision": 0.11439573579669693, "rougeL_precision_stderr": 0.0013313396769038728, "rougeL_recall": 0.2058353141373366, "rougeL_recall_stderr": 0.0022855149086852887, "rougeLsum_fmeasure": 0.16367916861611936, "rougeLsum_fmeasure_stderr": 0.0017419145199993307, "rougeLsum_precision": 0.13909515224541577, "rougeLsum_precision_stderr": 0.0017556231845242017, "rougeLsum_recall": 0.24244042499581767, "rougeLsum_recall_stderr": 0.0026519653366143562}}, "1": {"tldr_en": {"bleu": 3.5656470388016652, "bleu_stderr": 0.08197311523066556, "rouge1_fmeasure": 0.24342094967396746, "rouge1_fmeasure_stderr": 0.002061008293601447, "rouge1_precision": 0.2174735691686053, "rouge1_precision_stderr": 0.0024727115624900444, "rouge1_recall": 0.3460178647953164, "rouge1_recall_stderr": 0.0029105441665587413, "rouge2_fmeasure": 0.06579562483959114, "rouge2_fmeasure_stderr": 0.0012229149405673168, "rouge2_precision": 0.05974334784565509, "rouge2_precision_stderr": 0.0013520311423251665, "rouge2_recall": 0.09579105477293766, "rouge2_recall_stderr": 0.0019333447701627138, "rougeL_fmeasure": 0.1715894965219696, "rougeL_fmeasure_stderr": 0.0014647407258985053, "rougeL_precision": 0.15264641094715217, "rougeL_precision_stderr": 0.0017989249509303383, "rougeL_recall": 0.24988397509922583, "rougeL_recall_stderr": 0.0023954858097757943, "rougeLsum_fmeasure": 0.2293964977081046, "rougeLsum_fmeasure_stderr": 0.0019478558777254497, "rougeLsum_precision": 0.20485722652733274, "rougeLsum_precision_stderr": 0.002345428003149735, "rougeLsum_recall": 0.32704916730419714, "rougeLsum_recall_stderr": 0.0027965860033602368}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.0829790820445851, "bleu_stderr": 0.02197797601631595, "rouge1_fmeasure": 0.10446518863159925, "rouge1_fmeasure_stderr": 0.000777744935734902, "rouge1_precision": 0.07862879559521596, "rouge1_precision_stderr": 0.0006642205609450126, "rouge1_recall": 0.16475897375079807, "rouge1_recall_stderr": 0.0011383196654042192, "rouge2_fmeasure": 0.009544918858239404, "rouge2_fmeasure_stderr": 0.00027235599690889593, "rouge2_precision": 0.007369812030601589, "rouge2_precision_stderr": 0.00021896094396311517, "rouge2_recall": 0.014262052938121144, "rouge2_recall_stderr": 0.0004147127585356459, "rougeL_fmeasure": 0.1010125389757392, "rougeL_fmeasure_stderr": 0.0007159378810539217, "rougeL_precision": 0.07591871687158004, "rougeL_precision_stderr": 0.000610191190689566, "rougeL_recall": 0.1597865992371287, "rougeL_recall_stderr": 0.0010790075675615354, "rougeLsum_fmeasure": 0.08956898548608715, "rougeLsum_fmeasure_stderr": 0.0006418100960192206, "rougeLsum_precision": 0.0673047657702021, "rougeLsum_precision_stderr": 0.0005522793921438968, "rougeLsum_recall": 0.14195098898955258, "rougeLsum_recall_stderr": 0.0009730497278821901}}, "1": {"generate_text_restaurant": {"bleu": 10.908919859430615, "bleu_stderr": 0.16637332277469416, "rouge1_fmeasure": 0.43946652639475914, "rouge1_fmeasure_stderr": 0.0020459488608388305, "rouge1_precision": 0.4456385087012675, "rouge1_precision_stderr": 0.002353995044749726, "rouge1_recall": 0.47020454378467724, "rouge1_recall_stderr": 0.002936861869075764, "rouge2_fmeasure": 0.19809860044914396, "rouge2_fmeasure_stderr": 0.0017761118795827473, "rouge2_precision": 0.20055830506980424, "rouge2_precision_stderr": 0.0018894709575218048, "rouge2_recall": 0.2135715698673984, "rouge2_recall_stderr": 0.002164535077464, "rougeL_fmeasure": 0.31158605078339746, "rougeL_fmeasure_stderr": 0.0017722355642195858, "rougeL_precision": 0.31683119553670347, "rougeL_precision_stderr": 0.00202952794744226, "rougeL_recall": 0.33327675710241655, "rougeL_recall_stderr": 0.0023834364279679216, "rougeLsum_fmeasure": 0.36402615233989766, "rougeLsum_fmeasure_stderr": 0.0020471793436135748, "rougeLsum_precision": 0.36992976009148365, "rougeLsum_precision_stderr": 0.0023186341348778463, "rougeLsum_recall": 0.38880934305270765, "rougeLsum_recall_stderr": 0.0027097894189422028}}, "2": {"generate_text_restaurant": {"bleu": 12.496024480959404, "bleu_stderr": 0.17088930912135533, "rouge1_fmeasure": 0.46766982775713095, "rouge1_fmeasure_stderr": 0.001918126464011959, "rouge1_precision": 0.46513050554651497, "rouge1_precision_stderr": 0.002223586404391749, "rouge1_recall": 0.5044545963797141, "rouge1_recall_stderr": 0.0028151136740913597, "rouge2_fmeasure": 0.22321575010543798, "rouge2_fmeasure_stderr": 0.0018224289381751444, "rouge2_precision": 0.22174324642559978, "rouge2_precision_stderr": 0.0019100067340740226, "rouge2_recall": 0.24291920242568943, "rouge2_recall_stderr": 0.0022611578365930884, "rougeL_fmeasure": 0.3381497040905446, "rougeL_fmeasure_stderr": 0.0017546749447990841, "rougeL_precision": 0.3367051458657208, "rougeL_precision_stderr": 0.001973018443090721, "rougeL_recall": 0.36493130556649644, "rougeL_recall_stderr": 0.002393148715464209, "rougeLsum_fmeasure": 0.3916198529515355, "rougeLsum_fmeasure_stderr": 0.0020225802259552245, "rougeLsum_precision": 0.3898243742937925, "rougeLsum_precision_stderr": 0.002253177427026344, "rougeLsum_recall": 0.421949500869911, "rougeLsum_recall_stderr": 0.002689886396084085}}, "3": {"generate_text_restaurant": {"bleu": 12.876624238007336, "bleu_stderr": 0.13633367748001868, "rouge1_fmeasure": 0.47400023722312934, "rouge1_fmeasure_stderr": 0.001900202291806821, "rouge1_precision": 0.4666088727251901, "rouge1_precision_stderr": 0.0022313332553631853, "rouge1_recall": 0.5154695171919821, "rouge1_recall_stderr": 0.0027754047400933637, "rouge2_fmeasure": 0.2289130980630063, "rouge2_fmeasure_stderr": 0.0018363466703835796, "rouge2_precision": 0.2243865656346707, "rouge2_precision_stderr": 0.0018719494043509864, "rouge2_recall": 0.25156840321883794, "rouge2_recall_stderr": 0.0023063040992685763, "rougeL_fmeasure": 0.34349469052543125, "rougeL_fmeasure_stderr": 0.0017677223953016846, "rougeL_precision": 0.3380898171473523, "rougeL_precision_stderr": 0.0019534178000102315, "rougeL_recall": 0.37434995371411073, "rougeL_recall_stderr": 0.002444470010384392, "rougeLsum_fmeasure": 0.3977755285897407, "rougeLsum_fmeasure_stderr": 0.0020093339319118943, "rougeLsum_precision": 0.39156150543550683, "rougeLsum_precision_stderr": 0.0022264717816469198, "rougeLsum_recall": 0.43267538620734924, "rougeLsum_recall_stderr": 0.0027083330913664685}}, "4": {"generate_text_restaurant": {"bleu": 13.038419795082826, "bleu_stderr": 0.1976428408924392, "rouge1_fmeasure": 0.47584611404695387, "rouge1_fmeasure_stderr": 0.001947074296264647, "rouge1_precision": 0.466899050531228, "rouge1_precision_stderr": 0.0022803800444716094, "rouge1_recall": 0.5180929741571967, "rouge1_recall_stderr": 0.0027666521523327485, "rouge2_fmeasure": 0.2331128191296418, "rouge2_fmeasure_stderr": 0.0018880871361337953, "rouge2_precision": 0.22797984329863402, "rouge2_precision_stderr": 0.0019493427856802211, "rouge2_recall": 0.2562999197032082, "rouge2_recall_stderr": 0.002335812648509201, "rougeL_fmeasure": 0.34478040188814424, "rougeL_fmeasure_stderr": 0.0018168580160894932, "rougeL_precision": 0.33801665068739845, "rougeL_precision_stderr": 0.0019999447186985647, "rougeL_recall": 0.37648151811835306, "rougeL_recall_stderr": 0.0024685261116801542, "rougeLsum_fmeasure": 0.40002040198984967, "rougeLsum_fmeasure_stderr": 0.00209065471547177, "rougeLsum_precision": 0.3924955108326523, "rougeLsum_precision_stderr": 0.0023101764008543304, "rougeLsum_recall": 0.4357423190744161, "rougeLsum_recall_stderr": 0.0027563832782966625}}, "5": {"generate_text_restaurant": {"bleu": 12.732763368074703, "bleu_stderr": 0.17590369930486258, "rouge1_fmeasure": 0.47424433103008173, "rouge1_fmeasure_stderr": 0.0019121232239086051, "rouge1_precision": 0.46421051231524674, "rouge1_precision_stderr": 0.002228291979514538, "rouge1_recall": 0.5154379150999142, "rouge1_recall_stderr": 0.0026933286526690937, "rouge2_fmeasure": 0.22941163204827142, "rouge2_fmeasure_stderr": 0.001863304249033461, "rouge2_precision": 0.2240248553808123, "rouge2_precision_stderr": 0.001895223667089071, "rouge2_recall": 0.2513696885147789, "rouge2_recall_stderr": 0.0022849010356271104, "rougeL_fmeasure": 0.34311455561819293, "rougeL_fmeasure_stderr": 0.001783681336394135, "rougeL_precision": 0.3353528060064174, "rougeL_precision_stderr": 0.0019239210092200653, "rougeL_recall": 0.37390031423496406, "rougeL_recall_stderr": 0.0023965157476151687, "rougeLsum_fmeasure": 0.3973658692506892, "rougeLsum_fmeasure_stderr": 0.0020284980495066177, "rougeLsum_precision": 0.38889217013284716, "rougeLsum_precision_stderr": 0.0022276201914498226, "rougeLsum_recall": 0.43194157222453305, "rougeLsum_recall_stderr": 0.002653041029715851}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.379481370668616, "bleu_stderr": 0.06717438559240217, "rouge1_fmeasure": 0.22307907562150814, "rouge1_fmeasure_stderr": 0.002574919259740255, "rouge1_precision": 0.1641976469365769, "rouge1_precision_stderr": 0.0020666411427313226, "rouge1_recall": 0.3769795179589196, "rouge1_recall_stderr": 0.004557040772745154, "rouge2_fmeasure": 0.05581311382540214, "rouge2_fmeasure_stderr": 0.0017319768283373825, "rouge2_precision": 0.040059412047117826, "rouge2_precision_stderr": 0.0012577854209926665, "rouge2_recall": 0.09876412540351458, "rouge2_recall_stderr": 0.0031994931770752293, "rougeL_fmeasure": 0.16469766371025393, "rougeL_fmeasure_stderr": 0.0019416684869235467, "rougeL_precision": 0.12066732888153141, "rougeL_precision_stderr": 0.001491352270771391, "rougeL_recall": 0.2811429796223486, "rougeL_recall_stderr": 0.0036958208158303246, "rougeLsum_fmeasure": 0.1764735128921134, "rougeLsum_fmeasure_stderr": 0.0022058663139781407, "rougeLsum_precision": 0.12918261249170612, "rougeLsum_precision_stderr": 0.0016727035049342064, "rougeLsum_recall": 0.30129412715173376, "rougeLsum_recall_stderr": 0.0041384700689898295}}, "1": {"article_DOC_summary": {"bleu": 2.0057184143447615, "bleu_stderr": 0.10784776015193966, "rouge1_fmeasure": 0.19977490507493711, "rouge1_fmeasure_stderr": 0.0027304130941086407, "rouge1_precision": 0.14227928785850447, "rouge1_precision_stderr": 0.002038647007352537, "rouge1_recall": 0.3496445389327938, "rouge1_recall_stderr": 0.004687420539003245, "rouge2_fmeasure": 0.04842756093378267, "rouge2_fmeasure_stderr": 0.0016766359525896973, "rouge2_precision": 0.0341741556467082, "rouge2_precision_stderr": 0.0011919230006458453, "rouge2_recall": 0.0868085117734084, "rouge2_recall_stderr": 0.0030751950321611407, "rougeL_fmeasure": 0.15299299492001714, "rougeL_fmeasure_stderr": 0.0020609720286366845, "rougeL_precision": 0.1087255275926617, "rougeL_precision_stderr": 0.0015201339104374042, "rougeL_recall": 0.26941360644447593, "rougeL_recall_stderr": 0.0036873426379033505, "rougeLsum_fmeasure": 0.1598451013344069, "rougeLsum_fmeasure_stderr": 0.0023180038401233254, "rougeLsum_precision": 0.11362838643634136, "rougeLsum_precision_stderr": 0.0017063249170823618, "rougeLsum_recall": 0.28136113073762375, "rougeLsum_recall_stderr": 0.0041294851110808455}}, "2": {"article_DOC_summary": {"bleu": 2.2208813740601956, "bleu_stderr": 0.08894604311937805, "rouge1_fmeasure": 0.208834898706681, "rouge1_fmeasure_stderr": 0.0026978364031301984, "rouge1_precision": 0.14892949469644043, "rouge1_precision_stderr": 0.00200583160476678, "rouge1_recall": 0.3634135497733337, "rouge1_recall_stderr": 0.004671515326255189, "rouge2_fmeasure": 0.05372147678733591, "rouge2_fmeasure_stderr": 0.0017727625629579478, "rouge2_precision": 0.037878856143553255, "rouge2_precision_stderr": 0.0012582826871233468, "rouge2_recall": 0.09625625668090708, "rouge2_recall_stderr": 0.0032497068654648584, "rougeL_fmeasure": 0.1628407034853261, "rougeL_fmeasure_stderr": 0.0020951830521425178, "rougeL_precision": 0.11593507352191997, "rougeL_precision_stderr": 0.0015458134885275524, "rougeL_recall": 0.2849234476380913, "rougeL_recall_stderr": 0.0037398830811592306, "rougeLsum_fmeasure": 0.16488303511797936, "rougeLsum_fmeasure_stderr": 0.0022824435253937184, "rougeLsum_precision": 0.1172906648583005, "rougeLsum_precision_stderr": 0.0016670598084846545, "rougeLsum_recall": 0.2888804067318797, "rougeLsum_recall_stderr": 0.004097916730726731}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3677810394757023, "bleu_stderr": 0.03281723526619896, "rouge1_fmeasure": 0.11606407601489055, "rouge1_fmeasure_stderr": 0.001969146758246114, "rouge1_precision": 0.07551756953809975, "rouge1_precision_stderr": 0.0014428401024288566, "rouge1_recall": 0.32667793832637043, "rouge1_recall_stderr": 0.0045432759692835755, "rouge2_fmeasure": 0.05525608727734654, "rouge2_fmeasure_stderr": 0.0012506172623425777, "rouge2_precision": 0.0358576440999003, "rouge2_precision_stderr": 0.0009026300665133467, "rouge2_recall": 0.16122516666992529, "rouge2_recall_stderr": 0.003267212716096976, "rougeL_fmeasure": 0.11192452656660684, "rougeL_fmeasure_stderr": 0.00184821865839915, "rougeL_precision": 0.07265841861456275, "rougeL_precision_stderr": 0.0013478808767448548, "rougeL_recall": 0.3173688843344159, "rougeL_recall_stderr": 0.004415574211500117, "rougeLsum_fmeasure": 0.11130422275494882, "rougeLsum_fmeasure_stderr": 0.0018686028478286372, "rougeLsum_precision": 0.07243892580671336, "rougeLsum_precision_stderr": 0.0013760467900819921, "rougeLsum_recall": 0.3134569235593474, "rougeLsum_recall_stderr": 0.004287283373230945}}, "1": {"PALM_prompt": {"bleu": 0.5497938903834476, "bleu_stderr": 0.030329648907717416, "rouge1_fmeasure": 0.12377098272865582, "rouge1_fmeasure_stderr": 0.0018005918585052136, "rouge1_precision": 0.07921341676188258, "rouge1_precision_stderr": 0.0013409822377619385, "rouge1_recall": 0.39870747987203026, "rouge1_recall_stderr": 0.005183434021587761, "rouge2_fmeasure": 0.059610216940881165, "rouge2_fmeasure_stderr": 0.0011877157461269233, "rouge2_precision": 0.037977631085495546, "rouge2_precision_stderr": 0.0008537095990628626, "rouge2_recall": 0.20482620328537757, "rouge2_recall_stderr": 0.003925783970259362, "rougeL_fmeasure": 0.11717642785021219, "rougeL_fmeasure_stderr": 0.001674344616420258, "rougeL_precision": 0.07494409148525415, "rougeL_precision_stderr": 0.0012339020353945417, "rougeL_recall": 0.3749296897875364, "rougeL_recall_stderr": 0.004746737866261382, "rougeLsum_fmeasure": 0.1180104667449292, "rougeLsum_fmeasure_stderr": 0.0017037357470238706, "rougeLsum_precision": 0.07561615649375214, "rougeLsum_precision_stderr": 0.001274630336916664, "rougeLsum_recall": 0.3781401543982665, "rougeLsum_recall_stderr": 0.004772450320283416}}, "2": {"PALM_prompt": {"bleu": 0.6345753558723145, "bleu_stderr": 0.022376664491002615, "rouge1_fmeasure": 0.12360575844131859, "rouge1_fmeasure_stderr": 0.0016749434032436248, "rouge1_precision": 0.07795669324486854, "rouge1_precision_stderr": 0.00119684640928671, "rouge1_recall": 0.41803228663523345, "rouge1_recall_stderr": 0.0053864905798921125, "rouge2_fmeasure": 0.059094553406324295, "rouge2_fmeasure_stderr": 0.0010986707292730262, "rouge2_precision": 0.03697168943889819, "rouge2_precision_stderr": 0.0007524504443837138, "rouge2_recall": 0.21717607459640267, "rouge2_recall_stderr": 0.004101036268432045, "rougeL_fmeasure": 0.1151080347627734, "rougeL_fmeasure_stderr": 0.0015232281204089519, "rougeL_precision": 0.07270569406411607, "rougeL_precision_stderr": 0.0010983482762338668, "rougeL_recall": 0.38609502456284567, "rougeL_recall_stderr": 0.004797386037833962, "rougeLsum_fmeasure": 0.11783674396217436, "rougeLsum_fmeasure_stderr": 0.0015879949279193565, "rougeLsum_precision": 0.07442415306126304, "rougeLsum_precision_stderr": 0.0011448850348819214, "rougeLsum_recall": 0.396883636105679, "rougeLsum_recall_stderr": 0.0049884922683259055}}, "3": {"PALM_prompt": {"bleu": 0.6611202603466564, "bleu_stderr": 0.026292760196011896, "rouge1_fmeasure": 0.12270682587650009, "rouge1_fmeasure_stderr": 0.001689827532535868, "rouge1_precision": 0.07720993388700956, "rouge1_precision_stderr": 0.0012101497257156794, "rouge1_recall": 0.4133639192496906, "rouge1_recall_stderr": 0.005411628648726416, "rouge2_fmeasure": 0.05861591086080908, "rouge2_fmeasure_stderr": 0.0010907460831452867, "rouge2_precision": 0.03655832324841317, "rouge2_precision_stderr": 0.0007506453443122757, "rouge2_recall": 0.21585685095591742, "rouge2_recall_stderr": 0.004093725567157412, "rougeL_fmeasure": 0.11361332097070079, "rougeL_fmeasure_stderr": 0.0015022331310385422, "rougeL_precision": 0.07151482183987531, "rougeL_precision_stderr": 0.001086199341044554, "rougeL_recall": 0.3823649104455851, "rougeL_recall_stderr": 0.004813722726129101, "rougeLsum_fmeasure": 0.11674347677370367, "rougeLsum_fmeasure_stderr": 0.0015911049077229342, "rougeLsum_precision": 0.07350481492884033, "rougeLsum_precision_stderr": 0.0011485603851221294, "rougeLsum_recall": 0.39303289732256513, "rougeLsum_recall_stderr": 0.005032921309423209}}, "4": {"PALM_prompt": {"bleu": 0.7128978831232388, "bleu_stderr": 0.03892571091725707, "rouge1_fmeasure": 0.12369444007480314, "rouge1_fmeasure_stderr": 0.0016461113907076013, "rouge1_precision": 0.07777752463393256, "rouge1_precision_stderr": 0.0011834566593491008, "rouge1_recall": 0.4222346728535934, "rouge1_recall_stderr": 0.005371874334785877, "rouge2_fmeasure": 0.05896624699685776, "rouge2_fmeasure_stderr": 0.0010852712903198752, "rouge2_precision": 0.03670980373588488, "rouge2_precision_stderr": 0.0007435896517793837, "rouge2_recall": 0.2212321108756897, "rouge2_recall_stderr": 0.004139207390440789, "rougeL_fmeasure": 0.11384905074014122, "rougeL_fmeasure_stderr": 0.00146289053656463, "rougeL_precision": 0.07163006841296378, "rougeL_precision_stderr": 0.001061488881603934, "rougeL_recall": 0.3878434903814413, "rougeL_recall_stderr": 0.004773394354686469, "rougeLsum_fmeasure": 0.11795118372977331, "rougeLsum_fmeasure_stderr": 0.001557940253935983, "rougeLsum_precision": 0.07421506767947293, "rougeLsum_precision_stderr": 0.001124685544226384, "rougeLsum_recall": 0.40133513783035873, "rougeLsum_recall_stderr": 0.005016786537745759}}, "5": {"PALM_prompt": {"bleu": 0.7296500968020738, "bleu_stderr": 0.03986679058747073, "rouge1_fmeasure": 0.12440202385333685, "rouge1_fmeasure_stderr": 0.0016439820305072118, "rouge1_precision": 0.07793578273286862, "rouge1_precision_stderr": 0.001170086916396576, "rouge1_recall": 0.4293926231707037, "rouge1_recall_stderr": 0.005544216357063792, "rouge2_fmeasure": 0.059676320234352584, "rouge2_fmeasure_stderr": 0.0010914957404078694, "rouge2_precision": 0.03706679304975263, "rouge2_precision_stderr": 0.0007461793226765045, "rouge2_recall": 0.22614254310251874, "rouge2_recall_stderr": 0.004258603721661164, "rougeL_fmeasure": 0.11426293462896298, "rougeL_fmeasure_stderr": 0.001470255169191014, "rougeL_precision": 0.07171154548923617, "rougeL_precision_stderr": 0.001063717283795246, "rougeL_recall": 0.39264622495319934, "rougeL_recall_stderr": 0.004853098099750933, "rougeLsum_fmeasure": 0.11846322387156706, "rougeLsum_fmeasure_stderr": 0.0015476540324461025, "rougeLsum_precision": 0.07430376688251292, "rougeLsum_precision_stderr": 0.0011115454477617379, "rougeLsum_recall": 0.40784183583085276, "rougeLsum_recall_stderr": 0.005148423771775361}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6588537886073913, "bleu_stderr": 0.0443627487585664, "rouge1_fmeasure": 0.1771116987872097, "rouge1_fmeasure_stderr": 0.0019001225147585595, "rouge1_precision": 0.15071008120560497, "rouge1_precision_stderr": 0.0019169944124831117, "rouge1_recall": 0.2615215631789747, "rouge1_recall_stderr": 0.0028587647553713004, "rouge2_fmeasure": 0.037350220833466265, "rouge2_fmeasure_stderr": 0.0009013013631899656, "rouge2_precision": 0.03154759752710423, "rouge2_precision_stderr": 0.0007938342962238314, "rouge2_recall": 0.05718093036037643, "rouge2_recall_stderr": 0.0015126905612736802, "rougeL_fmeasure": 0.1360411096046868, "rougeL_fmeasure_stderr": 0.0013491547776702593, "rougeL_precision": 0.11439573579669693, "rougeL_precision_stderr": 0.0013313396769038728, "rougeL_recall": 0.2058353141373366, "rougeL_recall_stderr": 0.0022855149086852887, "rougeLsum_fmeasure": 0.16367916861611936, "rougeLsum_fmeasure_stderr": 0.0017419145199993307, "rougeLsum_precision": 0.13909515224541577, "rougeLsum_precision_stderr": 0.0017556231845242017, "rougeLsum_recall": 0.24244042499581767, "rougeLsum_recall_stderr": 0.0026519653366143562}}, "1": {"tldr_en": {"bleu": 3.5656470388016652, "bleu_stderr": 0.08197311523066556, "rouge1_fmeasure": 0.24342094967396746, "rouge1_fmeasure_stderr": 0.002061008293601447, "rouge1_precision": 0.2174735691686053, "rouge1_precision_stderr": 0.0024727115624900444, "rouge1_recall": 0.3460178647953164, "rouge1_recall_stderr": 0.0029105441665587413, "rouge2_fmeasure": 0.06579562483959114, "rouge2_fmeasure_stderr": 0.0012229149405673168, "rouge2_precision": 0.05974334784565509, "rouge2_precision_stderr": 0.0013520311423251665, "rouge2_recall": 0.09579105477293766, "rouge2_recall_stderr": 0.0019333447701627138, "rougeL_fmeasure": 0.1715894965219696, "rougeL_fmeasure_stderr": 0.0014647407258985053, "rougeL_precision": 0.15264641094715217, "rougeL_precision_stderr": 0.0017989249509303383, "rougeL_recall": 0.24988397509922583, "rougeL_recall_stderr": 0.0023954858097757943, "rougeLsum_fmeasure": 0.2293964977081046, "rougeLsum_fmeasure_stderr": 0.0019478558777254497, "rougeLsum_precision": 0.20485722652733274, "rougeLsum_precision_stderr": 0.002345428003149735, "rougeLsum_recall": 0.32704916730419714, "rougeLsum_recall_stderr": 0.0027965860033602368}}, "2": {"tldr_en": {"bleu": 4.105745383260244, "bleu_stderr": 0.08432146375641045, "rouge1_fmeasure": 0.24748860335539324, "rouge1_fmeasure_stderr": 0.0020650327174219227, "rouge1_precision": 0.23457091267008237, "rouge1_precision_stderr": 0.0027679657131044456, "rouge1_recall": 0.34134267442562116, "rouge1_recall_stderr": 0.0029201119619592353, "rouge2_fmeasure": 0.07016193751579543, "rouge2_fmeasure_stderr": 0.0012709364243235631, "rouge2_precision": 0.06809717111480851, "rouge2_precision_stderr": 0.0015408937269110604, "rouge2_recall": 0.09908181073861143, "rouge2_recall_stderr": 0.001996906692665077, "rougeL_fmeasure": 0.17762855895363824, "rougeL_fmeasure_stderr": 0.0015104607154289617, "rougeL_precision": 0.16899767873052451, "rougeL_precision_stderr": 0.0021331406852508664, "rougeL_recall": 0.24978190911695647, "rougeL_recall_stderr": 0.0024029557797763655, "rougeLsum_fmeasure": 0.23449572227657609, "rougeLsum_fmeasure_stderr": 0.001955519333700258, "rougeLsum_precision": 0.22219258646455792, "rougeLsum_precision_stderr": 0.0026321839454086595, "rougeLsum_recall": 0.32435218510070457, "rougeLsum_recall_stderr": 0.002817151691787598}}, "3": {"tldr_en": {"bleu": 4.150193084650233, "bleu_stderr": 0.11031172513046578, "rouge1_fmeasure": 0.20579755698645813, "rouge1_fmeasure_stderr": 0.0024814106035235444, "rouge1_precision": 0.2018448766253913, "rouge1_precision_stderr": 0.003073161400459354, "rouge1_recall": 0.28172071543272187, "rouge1_recall_stderr": 0.003594746473409513, "rouge2_fmeasure": 0.057766738614790786, "rouge2_fmeasure_stderr": 0.001223860321221621, "rouge2_precision": 0.05693855985307953, "rouge2_precision_stderr": 0.0014354920153072253, "rouge2_recall": 0.08217428533466181, "rouge2_recall_stderr": 0.001967375414041398, "rougeL_fmeasure": 0.14896417876723742, "rougeL_fmeasure_stderr": 0.001816503510484197, "rougeL_precision": 0.14712383510443716, "rougeL_precision_stderr": 0.002367934256298009, "rougeL_recall": 0.2074931184166513, "rougeL_recall_stderr": 0.002842631573610709, "rougeLsum_fmeasure": 0.19498864236687288, "rougeLsum_fmeasure_stderr": 0.0023499776362643178, "rougeLsum_precision": 0.19153153433363726, "rougeLsum_precision_stderr": 0.002934568467456808, "rougeLsum_recall": 0.2671010109221112, "rougeLsum_recall_stderr": 0.003425037370895828}}, "4": {"tldr_en": {"bleu": 0.7482733498786457, "bleu_stderr": 0.04289432927446872, "rouge1_fmeasure": 0.0638872592550199, "rouge1_fmeasure_stderr": 0.0021736088152739283, "rouge1_precision": 0.06276227014065545, "rouge1_precision_stderr": 0.0023884276241260254, "rouge1_recall": 0.09152089229465758, "rouge1_recall_stderr": 0.003170681109109529, "rouge2_fmeasure": 0.017492971654085164, "rouge2_fmeasure_stderr": 0.0008455647267737554, "rouge2_precision": 0.016993816746714407, "rouge2_precision_stderr": 0.0009445450958091424, "rouge2_recall": 0.026516974130858528, "rouge2_recall_stderr": 0.0013633238440930392, "rougeL_fmeasure": 0.04716945018429479, "rougeL_fmeasure_stderr": 0.0016102955217448768, "rougeL_precision": 0.04661295129618777, "rougeL_precision_stderr": 0.0018307969366172505, "rougeL_recall": 0.06895783497739912, "rougeL_recall_stderr": 0.002463509315480347, "rougeLsum_fmeasure": 0.06053208461846568, "rougeLsum_fmeasure_stderr": 0.002064190262965029, "rougeLsum_precision": 0.059550490378451754, "rougeLsum_precision_stderr": 0.0022812820815524093, "rougeLsum_recall": 0.08666480644593474, "rougeLsum_recall_stderr": 0.003010285669141728}}, "5": {"tldr_en": {"bleu": 1.2541436019570651e-06, "bleu_stderr": 3.589754879521194e-06, "rouge1_fmeasure": 0.010963700281898832, "rouge1_fmeasure_stderr": 0.001016356199855156, "rouge1_precision": 0.01062569815388256, "rouge1_precision_stderr": 0.001044931714423556, "rouge1_recall": 0.015829449605584002, "rouge1_recall_stderr": 0.0014988937828031954, "rouge2_fmeasure": 0.0031213037008769227, "rouge2_fmeasure_stderr": 0.00038874767150106574, "rouge2_precision": 0.002965306348837173, "rouge2_precision_stderr": 0.00039542449556901336, "rouge2_recall": 0.004883623633485144, "rouge2_recall_stderr": 0.0006415779749258939, "rougeL_fmeasure": 0.00827136001527878, "rougeL_fmeasure_stderr": 0.0007741743267056739, "rougeL_precision": 0.00810568450656589, "rougeL_precision_stderr": 0.0008158965824147807, "rougeL_recall": 0.012260844933610076, "rougeL_recall_stderr": 0.0012027923697108917, "rougeLsum_fmeasure": 0.010362958827854624, "rougeLsum_fmeasure_stderr": 0.0009617992097751725, "rougeLsum_precision": 0.010057343739525522, "rougeLsum_precision_stderr": 0.0009929081454170272, "rougeLsum_recall": 0.0149839670075284, "rougeLsum_recall_stderr": 0.0014220508785650803}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.0829790820445851, "bleu_stderr": 0.02197797601631595, "rouge1_fmeasure": 0.10446518863159925, "rouge1_fmeasure_stderr": 0.000777744935734902, "rouge1_precision": 0.07862879559521596, "rouge1_precision_stderr": 0.0006642205609450126, "rouge1_recall": 0.16475897375079807, "rouge1_recall_stderr": 0.0011383196654042192, "rouge2_fmeasure": 0.009544918858239404, "rouge2_fmeasure_stderr": 0.00027235599690889593, "rouge2_precision": 0.007369812030601589, "rouge2_precision_stderr": 0.00021896094396311517, "rouge2_recall": 0.014262052938121144, "rouge2_recall_stderr": 0.0004147127585356459, "rougeL_fmeasure": 0.1010125389757392, "rougeL_fmeasure_stderr": 0.0007159378810539217, "rougeL_precision": 0.07591871687158004, "rougeL_precision_stderr": 0.000610191190689566, "rougeL_recall": 0.1597865992371287, "rougeL_recall_stderr": 0.0010790075675615354, "rougeLsum_fmeasure": 0.08956898548608715, "rougeLsum_fmeasure_stderr": 0.0006418100960192206, "rougeLsum_precision": 0.0673047657702021, "rougeLsum_precision_stderr": 0.0005522793921438968, "rougeLsum_recall": 0.14195098898955258, "rougeLsum_recall_stderr": 0.0009730497278821901}}, "1": {"generate_text_restaurant": {"bleu": 10.908919859430615, "bleu_stderr": 0.16637332277469416, "rouge1_fmeasure": 0.43946652639475914, "rouge1_fmeasure_stderr": 0.0020459488608388305, "rouge1_precision": 0.4456385087012675, "rouge1_precision_stderr": 0.002353995044749726, "rouge1_recall": 0.47020454378467724, "rouge1_recall_stderr": 0.002936861869075764, "rouge2_fmeasure": 0.19809860044914396, "rouge2_fmeasure_stderr": 0.0017761118795827473, "rouge2_precision": 0.20055830506980424, "rouge2_precision_stderr": 0.0018894709575218048, "rouge2_recall": 0.2135715698673984, "rouge2_recall_stderr": 0.002164535077464, "rougeL_fmeasure": 0.31158605078339746, "rougeL_fmeasure_stderr": 0.0017722355642195858, "rougeL_precision": 0.31683119553670347, "rougeL_precision_stderr": 0.00202952794744226, "rougeL_recall": 0.33327675710241655, "rougeL_recall_stderr": 0.0023834364279679216, "rougeLsum_fmeasure": 0.36402615233989766, "rougeLsum_fmeasure_stderr": 0.0020471793436135748, "rougeLsum_precision": 0.36992976009148365, "rougeLsum_precision_stderr": 0.0023186341348778463, "rougeLsum_recall": 0.38880934305270765, "rougeLsum_recall_stderr": 0.0027097894189422028}}, "2": {"generate_text_restaurant": {"bleu": 12.496024480959404, "bleu_stderr": 0.17088930912135533, "rouge1_fmeasure": 0.46766982775713095, "rouge1_fmeasure_stderr": 0.001918126464011959, "rouge1_precision": 0.46513050554651497, "rouge1_precision_stderr": 0.002223586404391749, "rouge1_recall": 0.5044545963797141, "rouge1_recall_stderr": 0.0028151136740913597, "rouge2_fmeasure": 0.22321575010543798, "rouge2_fmeasure_stderr": 0.0018224289381751444, "rouge2_precision": 0.22174324642559978, "rouge2_precision_stderr": 0.0019100067340740226, "rouge2_recall": 0.24291920242568943, "rouge2_recall_stderr": 0.0022611578365930884, "rougeL_fmeasure": 0.3381497040905446, "rougeL_fmeasure_stderr": 0.0017546749447990841, "rougeL_precision": 0.3367051458657208, "rougeL_precision_stderr": 0.001973018443090721, "rougeL_recall": 0.36493130556649644, "rougeL_recall_stderr": 0.002393148715464209, "rougeLsum_fmeasure": 0.3916198529515355, "rougeLsum_fmeasure_stderr": 0.0020225802259552245, "rougeLsum_precision": 0.3898243742937925, "rougeLsum_precision_stderr": 0.002253177427026344, "rougeLsum_recall": 0.421949500869911, "rougeLsum_recall_stderr": 0.002689886396084085}}, "3": {"generate_text_restaurant": {"bleu": 12.876624238007336, "bleu_stderr": 0.13633367748001868, "rouge1_fmeasure": 0.47400023722312934, "rouge1_fmeasure_stderr": 0.001900202291806821, "rouge1_precision": 0.4666088727251901, "rouge1_precision_stderr": 0.0022313332553631853, "rouge1_recall": 0.5154695171919821, "rouge1_recall_stderr": 0.0027754047400933637, "rouge2_fmeasure": 0.2289130980630063, "rouge2_fmeasure_stderr": 0.0018363466703835796, "rouge2_precision": 0.2243865656346707, "rouge2_precision_stderr": 0.0018719494043509864, "rouge2_recall": 0.25156840321883794, "rouge2_recall_stderr": 0.0023063040992685763, "rougeL_fmeasure": 0.34349469052543125, "rougeL_fmeasure_stderr": 0.0017677223953016846, "rougeL_precision": 0.3380898171473523, "rougeL_precision_stderr": 0.0019534178000102315, "rougeL_recall": 0.37434995371411073, "rougeL_recall_stderr": 0.002444470010384392, "rougeLsum_fmeasure": 0.3977755285897407, "rougeLsum_fmeasure_stderr": 0.0020093339319118943, "rougeLsum_precision": 0.39156150543550683, "rougeLsum_precision_stderr": 0.0022264717816469198, "rougeLsum_recall": 0.43267538620734924, "rougeLsum_recall_stderr": 0.0027083330913664685}}, "4": {"generate_text_restaurant": {"bleu": 13.038419795082826, "bleu_stderr": 0.1976428408924392, "rouge1_fmeasure": 0.47584611404695387, "rouge1_fmeasure_stderr": 0.001947074296264647, "rouge1_precision": 0.466899050531228, "rouge1_precision_stderr": 0.0022803800444716094, "rouge1_recall": 0.5180929741571967, "rouge1_recall_stderr": 0.0027666521523327485, "rouge2_fmeasure": 0.2331128191296418, "rouge2_fmeasure_stderr": 0.0018880871361337953, "rouge2_precision": 0.22797984329863402, "rouge2_precision_stderr": 0.0019493427856802211, "rouge2_recall": 0.2562999197032082, "rouge2_recall_stderr": 0.002335812648509201, "rougeL_fmeasure": 0.34478040188814424, "rougeL_fmeasure_stderr": 0.0018168580160894932, "rougeL_precision": 0.33801665068739845, "rougeL_precision_stderr": 0.0019999447186985647, "rougeL_recall": 0.37648151811835306, "rougeL_recall_stderr": 0.0024685261116801542, "rougeLsum_fmeasure": 0.40002040198984967, "rougeLsum_fmeasure_stderr": 0.00209065471547177, "rougeLsum_precision": 0.3924955108326523, "rougeLsum_precision_stderr": 0.0023101764008543304, "rougeLsum_recall": 0.4357423190744161, "rougeLsum_recall_stderr": 0.0027563832782966625}}, "5": {"generate_text_restaurant": {"bleu": 12.732763368074703, "bleu_stderr": 0.17590369930486258, "rouge1_fmeasure": 0.47424433103008173, "rouge1_fmeasure_stderr": 0.0019121232239086051, "rouge1_precision": 0.46421051231524674, "rouge1_precision_stderr": 0.002228291979514538, "rouge1_recall": 0.5154379150999142, "rouge1_recall_stderr": 0.0026933286526690937, "rouge2_fmeasure": 0.22941163204827142, "rouge2_fmeasure_stderr": 0.001863304249033461, "rouge2_precision": 0.2240248553808123, "rouge2_precision_stderr": 0.001895223667089071, "rouge2_recall": 0.2513696885147789, "rouge2_recall_stderr": 0.0022849010356271104, "rougeL_fmeasure": 0.34311455561819293, "rougeL_fmeasure_stderr": 0.001783681336394135, "rougeL_precision": 0.3353528060064174, "rougeL_precision_stderr": 0.0019239210092200653, "rougeL_recall": 0.37390031423496406, "rougeL_recall_stderr": 0.0023965157476151687, "rougeLsum_fmeasure": 0.3973658692506892, "rougeLsum_fmeasure_stderr": 0.0020284980495066177, "rougeLsum_precision": 0.38889217013284716, "rougeLsum_precision_stderr": 0.0022276201914498226, "rougeLsum_recall": 0.43194157222453305, "rougeLsum_recall_stderr": 0.002653041029715851}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.379481370668616, "bleu_stderr": 0.06717438559240217, "rouge1_fmeasure": 0.22307907562150814, "rouge1_fmeasure_stderr": 0.002574919259740255, "rouge1_precision": 0.1641976469365769, "rouge1_precision_stderr": 0.0020666411427313226, "rouge1_recall": 0.3769795179589196, "rouge1_recall_stderr": 0.004557040772745154, "rouge2_fmeasure": 0.05581311382540214, "rouge2_fmeasure_stderr": 0.0017319768283373825, "rouge2_precision": 0.040059412047117826, "rouge2_precision_stderr": 0.0012577854209926665, "rouge2_recall": 0.09876412540351458, "rouge2_recall_stderr": 0.0031994931770752293, "rougeL_fmeasure": 0.16469766371025393, "rougeL_fmeasure_stderr": 0.0019416684869235467, "rougeL_precision": 0.12066732888153141, "rougeL_precision_stderr": 0.001491352270771391, "rougeL_recall": 0.2811429796223486, "rougeL_recall_stderr": 0.0036958208158303246, "rougeLsum_fmeasure": 0.1764735128921134, "rougeLsum_fmeasure_stderr": 0.0022058663139781407, "rougeLsum_precision": 0.12918261249170612, "rougeLsum_precision_stderr": 0.0016727035049342064, "rougeLsum_recall": 0.30129412715173376, "rougeLsum_recall_stderr": 0.0041384700689898295}}, "1": {"article_DOC_summary": {"bleu": 2.0057184143447615, "bleu_stderr": 0.10784776015193966, "rouge1_fmeasure": 0.19977490507493711, "rouge1_fmeasure_stderr": 0.0027304130941086407, "rouge1_precision": 0.14227928785850447, "rouge1_precision_stderr": 0.002038647007352537, "rouge1_recall": 0.3496445389327938, "rouge1_recall_stderr": 0.004687420539003245, "rouge2_fmeasure": 0.04842756093378267, "rouge2_fmeasure_stderr": 0.0016766359525896973, "rouge2_precision": 0.0341741556467082, "rouge2_precision_stderr": 0.0011919230006458453, "rouge2_recall": 0.0868085117734084, "rouge2_recall_stderr": 0.0030751950321611407, "rougeL_fmeasure": 0.15299299492001714, "rougeL_fmeasure_stderr": 0.0020609720286366845, "rougeL_precision": 0.1087255275926617, "rougeL_precision_stderr": 0.0015201339104374042, "rougeL_recall": 0.26941360644447593, "rougeL_recall_stderr": 0.0036873426379033505, "rougeLsum_fmeasure": 0.1598451013344069, "rougeLsum_fmeasure_stderr": 0.0023180038401233254, "rougeLsum_precision": 0.11362838643634136, "rougeLsum_precision_stderr": 0.0017063249170823618, "rougeLsum_recall": 0.28136113073762375, "rougeLsum_recall_stderr": 0.0041294851110808455}}, "2": {"article_DOC_summary": {"bleu": 2.2208813740601956, "bleu_stderr": 0.08894604311937805, "rouge1_fmeasure": 0.208834898706681, "rouge1_fmeasure_stderr": 0.0026978364031301984, "rouge1_precision": 0.14892949469644043, "rouge1_precision_stderr": 0.00200583160476678, "rouge1_recall": 0.3634135497733337, "rouge1_recall_stderr": 0.004671515326255189, "rouge2_fmeasure": 0.05372147678733591, "rouge2_fmeasure_stderr": 0.0017727625629579478, "rouge2_precision": 0.037878856143553255, "rouge2_precision_stderr": 0.0012582826871233468, "rouge2_recall": 0.09625625668090708, "rouge2_recall_stderr": 0.0032497068654648584, "rougeL_fmeasure": 0.1628407034853261, "rougeL_fmeasure_stderr": 0.0020951830521425178, "rougeL_precision": 0.11593507352191997, "rougeL_precision_stderr": 0.0015458134885275524, "rougeL_recall": 0.2849234476380913, "rougeL_recall_stderr": 0.0037398830811592306, "rougeLsum_fmeasure": 0.16488303511797936, "rougeLsum_fmeasure_stderr": 0.0022824435253937184, "rougeLsum_precision": 0.1172906648583005, "rougeLsum_precision_stderr": 0.0016670598084846545, "rougeLsum_recall": 0.2888804067318797, "rougeLsum_recall_stderr": 0.004097916730726731}}, "3": {"article_DOC_summary": {"bleu": 2.2460570125021815, "bleu_stderr": 0.09639536279310482, "rouge1_fmeasure": 0.203288145377478, "rouge1_fmeasure_stderr": 0.002902549912894558, "rouge1_precision": 0.14794374341275107, "rouge1_precision_stderr": 0.0022457907635443767, "rouge1_recall": 0.34719811012821766, "rouge1_recall_stderr": 0.0049938759471077005, "rouge2_fmeasure": 0.05387889032218581, "rouge2_fmeasure_stderr": 0.0017118408282623175, "rouge2_precision": 0.038373258945208606, "rouge2_precision_stderr": 0.0012224763268285, "rouge2_recall": 0.09516942967025471, "rouge2_recall_stderr": 0.0031451684639885744, "rougeL_fmeasure": 0.15948452331329185, "rougeL_fmeasure_stderr": 0.0022439870906278744, "rougeL_precision": 0.11596978873700024, "rougeL_precision_stderr": 0.0017382600731524656, "rougeL_recall": 0.2738718049202109, "rougeL_recall_stderr": 0.00398280173794519, "rougeLsum_fmeasure": 0.16072503506629401, "rougeLsum_fmeasure_stderr": 0.002448676603417113, "rougeLsum_precision": 0.11685891599330396, "rougeLsum_precision_stderr": 0.001878971804961673, "rougeLsum_recall": 0.2761128747693524, "rougeLsum_recall_stderr": 0.004338710723283792}}, "4": {"article_DOC_summary": {"bleu": 1.217900205097583, "bleu_stderr": 0.16547330167199492, "rouge1_fmeasure": 0.05559972270834411, "rouge1_fmeasure_stderr": 0.0030493540636307315, "rouge1_precision": 0.045641403513942344, "rouge1_precision_stderr": 0.0026877928553966255, "rouge1_recall": 0.08792590860532339, "rouge1_recall_stderr": 0.004951100510662503, "rouge2_fmeasure": 0.01431853871406581, "rouge2_fmeasure_stderr": 0.0011857334383597357, "rouge2_precision": 0.011710935455623647, "rouge2_precision_stderr": 0.0012861915474572897, "rouge2_recall": 0.02368119469360542, "rouge2_recall_stderr": 0.001960136241976673, "rougeL_fmeasure": 0.04491432061444308, "rougeL_fmeasure_stderr": 0.0024680842086334605, "rougeL_precision": 0.03727739770750944, "rougeL_precision_stderr": 0.0022705131920867042, "rougeL_recall": 0.07104571554164463, "rougeL_recall_stderr": 0.004021066125324381, "rougeLsum_fmeasure": 0.044904907347443464, "rougeLsum_fmeasure_stderr": 0.002501946586732125, "rougeLsum_precision": 0.03735291149683435, "rougeLsum_precision_stderr": 0.002314584651645963, "rougeLsum_recall": 0.07107549567098383, "rougeLsum_recall_stderr": 0.004070397888407412}}, "5": {"article_DOC_summary": {"bleu": 3.69574011869292e-17, "bleu_stderr": 2.684261446109072e-14, "rouge1_fmeasure": 0.002984428901393074, "rouge1_fmeasure_stderr": 0.0008077475320402855, "rouge1_precision": 0.0025215542744389413, "rouge1_precision_stderr": 0.0007044520139374636, "rouge1_recall": 0.004537012002175192, "rouge1_recall_stderr": 0.0012541613619486166, "rouge2_fmeasure": 0.000431120989446868, "rouge2_fmeasure_stderr": 0.00019695106711284816, "rouge2_precision": 0.0003456350518315212, "rouge2_precision_stderr": 0.00015654732477210276, "rouge2_recall": 0.0006329432975265091, "rouge2_recall_stderr": 0.00028793862001474474, "rougeL_fmeasure": 0.002060124493547919, "rougeL_fmeasure_stderr": 0.0005846305244422877, "rougeL_precision": 0.0017146833815532188, "rougeL_precision_stderr": 0.0004901362650343392, "rougeL_recall": 0.003097704549940136, "rougeL_recall_stderr": 0.000885003553914414, "rougeLsum_fmeasure": 0.0023450606462443726, "rougeLsum_fmeasure_stderr": 0.0006698668342557274, "rougeLsum_precision": 0.0019763029300951048, "rougeLsum_precision_stderr": 0.000583997721778465, "rougeLsum_recall": 0.003481088798128124, "rougeLsum_recall_stderr": 0.0009743709994872758}}}} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..2ed360b96aa7073fb4f2ab8ab08a47c6b599b0aa --- /dev/null +++ b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.23457091267008237, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027679657131044456 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.34134267442562116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029201119619592353 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.24748860335539324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020650327174219227 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06809717111480851, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015408937269110604 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.09908181073861143, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001996906692665077 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.07016193751579543, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012709364243235631 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.16899767873052451, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021331406852508664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.24978190911695647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024029557797763655 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.17762855895363824, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015104607154289617 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.22219258646455792, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026321839454086595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.32435218510070457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002817151691787598 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.23449572227657609, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001955519333700258 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.105745383260244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08432146375641045 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..104d8febd5ca7f474aac77a9d9c26260ffbec46d --- /dev/null +++ b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2018448766253913, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003073161400459354 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.28172071543272187, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003594746473409513 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20579755698645813, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024814106035235444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05693855985307953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014354920153072253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08217428533466181, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001967375414041398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.057766738614790786, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001223860321221621 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14712383510443716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002367934256298009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2074931184166513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002842631573610709 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14896417876723742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001816503510484197 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.19153153433363726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002934568467456808 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2671010109221112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003425037370895828 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19498864236687288, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023499776362643178 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 4.150193084650233, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.11031172513046578 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f042aebfae681949bf9537c6bf7f5eae730f45a8 --- /dev/null +++ b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.06276227014065545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0023884276241260254 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.09152089229465758, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003170681109109529 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.0638872592550199, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021736088152739283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.016993816746714407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009445450958091424 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.026516974130858528, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013633238440930392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.017492971654085164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008455647267737554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04661295129618777, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018307969366172505 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.06895783497739912, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002463509315480347 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04716945018429479, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016102955217448768 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.059550490378451754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022812820815524093 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.08666480644593474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003010285669141728 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06053208461846568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002064190262965029 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.7482733498786457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04289432927446872 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9b098afa733096f851b6ef9df9337331501bbe2e --- /dev/null +++ b/8b7178b58b/evaluation/generation/slim.8b7178b58b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.01062569815388256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001044931714423556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.015829449605584002, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0014988937828031954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010963700281898832, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001016356199855156 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002965306348837173, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00039542449556901336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.004883623633485144, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006415779749258939 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0031213037008769227, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00038874767150106574 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.00810568450656589, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008158965824147807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.012260844933610076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0012027923697108917 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.00827136001527878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007741743267056739 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.010057343739525522, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009929081454170272 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.0149839670075284, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0014220508785650803 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.010362958827854624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009617992097751725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.2541436019570651e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 3.589754879521194e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_3.json b/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d85c3d20e4dc018e7136261dad113a179882ecc2 --- /dev/null +++ b/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14794374341275107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022457907635443767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.34719811012821766, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0049938759471077005 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.203288145377478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002902549912894558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.038373258945208606, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012224763268285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.09516942967025471, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0031451684639885744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.05387889032218581, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017118408282623175 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11596978873700024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017382600731524656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2738718049202109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00398280173794519 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15948452331329185, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022439870906278744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11685891599330396, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001878971804961673 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2761128747693524, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004338710723283792 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16072503506629401, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002448676603417113 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.2460570125021815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09639536279310482 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_4.json b/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f5690e231613cf66425803629f3e413af3d97648 --- /dev/null +++ b/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.045641403513942344, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0026877928553966255 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08792590860532339, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004951100510662503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05559972270834411, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030493540636307315 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.011710935455623647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012861915474572897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.02368119469360542, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001960136241976673 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01431853871406581, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011857334383597357 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03727739770750944, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0022705131920867042 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.07104571554164463, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.004021066125324381 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04491432061444308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0024680842086334605 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03735291149683435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002314584651645963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.07107549567098383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004070397888407412 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.044904907347443464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002501946586732125 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.217900205097583, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.16547330167199492 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_5.json b/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4f2345d65e9d92cca4ceef2ab72c6c85c95ee81c --- /dev/null +++ b/8b7178b58b/evaluation/generation/slim.8b7178b58b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0025215542744389413, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007044520139374636 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.004537012002175192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0012541613619486166 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002984428901393074, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008077475320402855 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0003456350518315212, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00015654732477210276 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0006329432975265091, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00028793862001474474 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.000431120989446868, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00019695106711284816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0017146833815532188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0004901362650343392 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.003097704549940136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.000885003553914414 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.002060124493547919, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005846305244422877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0019763029300951048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000583997721778465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.003481088798128124, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0009743709994872758 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0023450606462443726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006698668342557274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.69574011869292e-17, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 2.684261446109072e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b58b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..760741527158cd61f45b85cc2129d5a61cb27d77 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.22168273116215406, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002435239313498893}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.34743629317668956, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028712547192456107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2476037847532769, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002025823379351112}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06376061872271946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013048039083209525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.101241459889032, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001953242205361602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.07067445283939051, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012417394471129861}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.16023681566231104, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001772038861775115}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.25897094370699875, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002426935392898513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1802673381893277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001478295743890994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.20950862810749826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002314989258793673}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.32880876166482065, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002739608654356716}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.23404735258003787, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019207642560461376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.9743938237550784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07475699238187014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4fc83e392848c814db5b8f123d0b253130533a90 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1929078707094695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028265761167604094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28273526242062413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003552029067040685}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2047867720982337, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024642538193857724}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.054778926280360955, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013556972713623368}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08337730610072844, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019188280227343476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0583824000959267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012189630804918103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1417601937646707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021376668461159194}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21204142844316634, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002858744024264778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1504897955536091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018172498103686094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18285584254571302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026982092402270926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.268579149278918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0034097338970395575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19403204633681076, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023374505495094874}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.9817750726027175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09479747244511556}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2045abcefcb09072e44b72c7921f5af72f010188 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.06349732024114886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00234492312222753}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09340855339099945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003159444376659221}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06567263366786005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021904578073412223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.01738674391302479, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000886017251546639}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02721797575693733, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013454614244059475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.018412425374697422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008441162033876125}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04817497329724485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018235195878999154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.07173904334429866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002478097323273335}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04945824665200398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016373751055148418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.05985219645183224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022220794271589225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.0881853688625359, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029988916997001343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06188112367809635, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002069249663413523}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.8498125282802876, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07488095870505093}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c6802bf895a24b9d871120bcab7ac339cfcd6854 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.010748286830524562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010977103345240155}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.015508382128794362, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001446236589386957}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010706535802039367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009930611949791836}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0030969030541188967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005268236540414107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.004556824944932366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005664861641115252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.002970906530619929, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003682132280639809}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.008049434192397904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008462473363861482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.011951304347862953, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011515347961576568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008019691411733835, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000754515109085896}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.010213486554639387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001056290927469812}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.014853652320406176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013945219376869788}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.010182849522984623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009497792801374568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.8683848179992505e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 7.381534992907998e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_3.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..18ce7c76dde037e49c292f740e17b36c823276fd --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.13614805917908393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002182928610973459}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.319067828970501, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004854901693972656}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.18698891886072055, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0028344896483067416}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03255285607712267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011709136544211735}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08091915270893477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0030319428460283114}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.04579306632360622, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0016499700154799003}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10907064034611987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016952538239699338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.25710809104483, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003938425696647421}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.14999604713991269, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022346501948052253}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10781469737345471, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018160899239560767}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.25437759661970666, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004197817376394927}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1481937356361328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023789575527353485}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.960346415562878, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10375544830060275}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_4.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..e49eec2b1ffe3f0921491e3d0d8ae666627299e5 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0461290015156943, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026493012385445263}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08781737838132268, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004925448191329273}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05597846129733605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030736717398869183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.011276409349861457, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009910327288713146}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.02322309514343149, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019591268818172957}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.014349999093671009, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00119440286191344}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.036708426223707534, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021597446633885207}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06933837439843676, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0039167983045230665}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0441258641947338, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002422435816075437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.036521786784889244, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021744741340258226}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.06862432645657374, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003921758075614257}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04374778712530494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024277919295063437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.147010044637203, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1483287532182548}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_5.json b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a604d44cda2a00752c64ec07a354e581736caa61 --- /dev/null +++ b/8b7178b88b/evaluation/generation/agg.8b7178b88b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002495597025413209, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007589560712051177}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.004261862032154927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011922597752018513}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002840764770276708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007742945701180914}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0004857664430361433, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000238899022126221}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0005700394684856393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000211601005978903}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00046551069178623087, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001919512728404723}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0018255732115487248, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005443002896546889}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.003014099249198048, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008089040705550355}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0020608235251966707, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005517756786268286}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0020616490944038632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000695995858568096}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0030417584845173606, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008279029688397777}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.002211561514290958, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006392646801827694}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 7.432818302647882e-17, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.736262144559321e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..af9648ed84b48d197424b2041030d9ec0a1d298f 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68eef59fc90432d1ae6c6707cf7ed73a13a91c64bc6382348e4326f901f4ff03 +size 18895091 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d1d3367c26191dbbc5235f4f49182b99bd7c2bda 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f8d63a83f7320b69cba6f4a50dc3239b84e76ed32b158401307f99a4bbdb48b +size 24300550 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ed9e3acd12a77d34a03caa731d36308b606bceb2 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34d5fb7330dc2b1e483056df8f95dc121c4797015f1a6bab96308ab9336e8e79 +size 29463646 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b6d8689bda6e1ec1fec895385e7f72dce3ee5e68 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fccf6c80c17a2dade83d24ade864f2560a49483b85910bf77543485ea97991b9 +size 34799863 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..299a42e2cc7e7b39a7b7bf27ddf3c4c79c602250 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:006709910625573fec29d0682472bc1fe4ecf145cf75df0c9abb91e08002bdd0 +size 9649808 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..763bd09018314041158206754cc209e886a3aa69 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16609386bdc7b86cee681d6fb627645501f36190d805d7ee38144798cdcd5174 +size 11673949 diff --git a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b204a780c2af6cf421a464f54a06f1041d7b1d81 100644 --- a/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl +++ b/8b7178b88b/evaluation/generation/examples.8b7178b88b_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87581b844261fbb738cd7115c6355a1ced5eb26f020ef0c4bb83eb03e432b381 +size 13899558 diff --git a/8b7178b88b/evaluation/generation/merged.csv b/8b7178b88b/evaluation/generation/merged.csv index 6b453e11ed9ebf825dcaa043b71e19cd7bcc8444..befdba8c6985d25ee10975a7bc7cbe9745442cce 100644 --- a/8b7178b88b/evaluation/generation/merged.csv +++ b/8b7178b88b/evaluation/generation/merged.csv @@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.045112546772934525 gem_xsum,1,median,rouge2_fmeasure,0.045112546772934525 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04741261419771178 gem_xsum,2,median,rouge2_fmeasure,0.04741261419771178 -gem_xsum,2,average,multiple,0.04673628136019406 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04579306632360622 +gem_xsum,3,median,rouge2_fmeasure,0.04579306632360622 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.014349999093671009 +gem_xsum,4,median,rouge2_fmeasure,0.014349999093671009 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00046551069178623087 +gem_xsum,5,median,rouge2_fmeasure,0.00046551069178623087 +gem_xsum,5,average,multiple,0.03346957003160761 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04762017526127606 web_nlg_en,0,median,rouge2_fmeasure,0.04762017526127606 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05785803759738568 @@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.037091167992168994 wiki_lingua_en,0,median,rouge2_fmeasure,0.037091167992168994 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06328901574064953 wiki_lingua_en,1,median,rouge2_fmeasure,0.06328901574064953 -wiki_lingua_en,1,average,multiple,0.05019009186640926 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07067445283939051 +wiki_lingua_en,2,median,rouge2_fmeasure,0.07067445283939051 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.0583824000959267 +wiki_lingua_en,3,median,rouge2_fmeasure,0.0583824000959267 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.018412425374697422 +wiki_lingua_en,4,median,rouge2_fmeasure,0.018412425374697422 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002970906530619929 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002970906530619929 +wiki_lingua_en,5,average,multiple,0.04180339476224218 diff --git a/8b7178b88b/evaluation/generation/merged.json b/8b7178b88b/evaluation/generation/merged.json index c7bfc36aa1d5e7b1ed7cd6d726716633b34fb320..811aa3f0ed1b65f3f709e576f209e55f165714e2 100644 --- a/8b7178b88b/evaluation/generation/merged.json +++ b/8b7178b88b/evaluation/generation/merged.json @@ -1 +1 @@ -{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2883786280202115, "bleu_stderr": 0.03936084799729088, "rouge1_fmeasure": 0.103421554792752, "rouge1_fmeasure_stderr": 0.001762257816978338, "rouge1_precision": 0.06719578938907746, "rouge1_precision_stderr": 0.0013004585757370683, "rouge1_recall": 0.2933408335090845, "rouge1_recall_stderr": 0.004382296370554512, "rouge2_fmeasure": 0.04762017526127606, "rouge2_fmeasure_stderr": 0.0010873320211799394, "rouge2_precision": 0.030556722468906044, "rouge2_precision_stderr": 0.0007492988401010578, "rouge2_recall": 0.14088496408629886, "rouge2_recall_stderr": 0.003049315486738709, "rougeL_fmeasure": 0.09993996529791976, "rougeL_fmeasure_stderr": 0.0016728671052772506, "rougeL_precision": 0.06486438547957983, "rougeL_precision_stderr": 0.001231171118936048, "rougeL_recall": 0.28447763202238135, "rougeL_recall_stderr": 0.0042288031883992855, "rougeLsum_fmeasure": 0.09965071694396466, "rougeLsum_fmeasure_stderr": 0.0016836029502195003, "rougeLsum_precision": 0.0647465837461535, "rougeLsum_precision_stderr": 0.00124366132201253, "rougeLsum_recall": 0.28232976762951584, "rougeLsum_recall_stderr": 0.004163361732330291}}, "1": {"PALM_prompt": {"bleu": 0.5634646538427781, "bleu_stderr": 0.03736988855205685, "rouge1_fmeasure": 0.12131056010238059, "rouge1_fmeasure_stderr": 0.0018129711840276857, "rouge1_precision": 0.07697654849580207, "rouge1_precision_stderr": 0.0012951272613183852, "rouge1_recall": 0.3969061719233893, "rouge1_recall_stderr": 0.0056851136357343015, "rouge2_fmeasure": 0.05785803759738568, "rouge2_fmeasure_stderr": 0.0011523078709876162, "rouge2_precision": 0.03642271932626906, "rouge2_precision_stderr": 0.0007956739937252943, "rouge2_recall": 0.2051665201143765, "rouge2_recall_stderr": 0.0041594883413927294, "rougeL_fmeasure": 0.11462071685790733, "rougeL_fmeasure_stderr": 0.001624892567642086, "rougeL_precision": 0.07267273013759175, "rougeL_precision_stderr": 0.0011552399753195406, "rougeL_recall": 0.3755810527939102, "rougeL_recall_stderr": 0.00528545382584031, "rougeLsum_fmeasure": 0.11540379904462905, "rougeLsum_fmeasure_stderr": 0.0016960835886338564, "rougeLsum_precision": 0.07333558102707188, "rougeLsum_precision_stderr": 0.0012195728859027712, "rougeLsum_recall": 0.3757551406945481, "rougeLsum_recall_stderr": 0.005219245200343907}}, "2": {"PALM_prompt": {"bleu": 0.6828060703023814, "bleu_stderr": 0.0425207483296129, "rouge1_fmeasure": 0.12552592683219344, "rouge1_fmeasure_stderr": 0.001752255131105832, "rouge1_precision": 0.0790865886214714, "rouge1_precision_stderr": 0.0012505450756042313, "rouge1_recall": 0.427035567540795, "rouge1_recall_stderr": 0.005749871140223685, "rouge2_fmeasure": 0.05940509370992996, "rouge2_fmeasure_stderr": 0.0011135855631122815, "rouge2_precision": 0.037142287807399386, "rouge2_precision_stderr": 0.000765558996913647, "rouge2_recall": 0.21988251267080597, "rouge2_recall_stderr": 0.00425697447376351, "rougeL_fmeasure": 0.11527329683828981, "rougeL_fmeasure_stderr": 0.001524101349020404, "rougeL_precision": 0.07265150298226193, "rougeL_precision_stderr": 0.0011031397973466682, "rougeL_recall": 0.3936959005975406, "rougeL_recall_stderr": 0.005187697908305124, "rougeLsum_fmeasure": 0.11923137803967343, "rougeLsum_fmeasure_stderr": 0.0016398034437248557, "rougeLsum_precision": 0.07517938427479466, "rougeLsum_precision_stderr": 0.0011789577603185614, "rougeLsum_recall": 0.404943656992868, "rougeLsum_recall_stderr": 0.005313046062998814}}, "3": {"PALM_prompt": {"bleu": 0.8336982343397022, "bleu_stderr": 0.0327887377326258, "rouge1_fmeasure": 0.12549315081985135, "rouge1_fmeasure_stderr": 0.0017756300197492854, "rouge1_precision": 0.07862496938468061, "rouge1_precision_stderr": 0.0012544372219464325, "rouge1_recall": 0.43336757562100137, "rouge1_recall_stderr": 0.005813975851408121, "rouge2_fmeasure": 0.06022531877051832, "rouge2_fmeasure_stderr": 0.0011563600202083746, "rouge2_precision": 0.03741379787478959, "rouge2_precision_stderr": 0.0007895630732377968, "rouge2_recall": 0.22623496077748528, "rouge2_recall_stderr": 0.0042712928549412735, "rougeL_fmeasure": 0.11360185571717625, "rougeL_fmeasure_stderr": 0.0015268821384323132, "rougeL_precision": 0.07124494381793388, "rougeL_precision_stderr": 0.0010962640285763162, "rougeL_recall": 0.39321959371417686, "rougeL_recall_stderr": 0.005101213428819951, "rougeLsum_fmeasure": 0.11874541206482168, "rougeLsum_fmeasure_stderr": 0.0016641712015728286, "rougeLsum_precision": 0.07448805733289557, "rougeLsum_precision_stderr": 0.001187320005987778, "rougeLsum_recall": 0.4097236474085259, "rougeLsum_recall_stderr": 0.005356713714674863}}, "4": {"PALM_prompt": {"bleu": 0.8048993474300223, "bleu_stderr": 0.04157871677547976, "rouge1_fmeasure": 0.12696507303445326, "rouge1_fmeasure_stderr": 0.0017668948839916938, "rouge1_precision": 0.07952578574480215, "rouge1_precision_stderr": 0.0012661942787102602, "rouge1_recall": 0.4386081219638241, "rouge1_recall_stderr": 0.005677266555057123, "rouge2_fmeasure": 0.06061578220428707, "rouge2_fmeasure_stderr": 0.0011175090259571937, "rouge2_precision": 0.0376951838080061, "rouge2_precision_stderr": 0.0007761144319241388, "rouge2_recall": 0.22894151277692995, "rouge2_recall_stderr": 0.004196410528544762, "rougeL_fmeasure": 0.11378598207760166, "rougeL_fmeasure_stderr": 0.0014923445088795922, "rougeL_precision": 0.07134231184259153, "rougeL_precision_stderr": 0.0010918605781200322, "rougeL_recall": 0.39486889829999067, "rougeL_recall_stderr": 0.004923545353622342, "rougeLsum_fmeasure": 0.1197281993743887, "rougeLsum_fmeasure_stderr": 0.0016590997843029978, "rougeLsum_precision": 0.07509698381413346, "rougeLsum_precision_stderr": 0.0012018963700845971, "rougeLsum_recall": 0.41258265582996795, "rougeLsum_recall_stderr": 0.0051847453818666765}}, "5": {"PALM_prompt": {"bleu": 0.9424030277847187, "bleu_stderr": 0.052987929389338294, "rouge1_fmeasure": 0.12911370685505766, "rouge1_fmeasure_stderr": 0.0017515798700797859, "rouge1_precision": 0.08025169576842368, "rouge1_precision_stderr": 0.001236439325453276, "rouge1_recall": 0.4579613447211886, "rouge1_recall_stderr": 0.005880996871821512, "rouge2_fmeasure": 0.0624035474416899, "rouge2_fmeasure_stderr": 0.0011352673015657528, "rouge2_precision": 0.03846872592808952, "rouge2_precision_stderr": 0.0007759633225274091, "rouge2_recall": 0.24371157528692775, "rouge2_recall_stderr": 0.004498931653094872, "rougeL_fmeasure": 0.1142452159028682, "rougeL_fmeasure_stderr": 0.0014684795581487694, "rougeL_precision": 0.07114235942831466, "rougeL_precision_stderr": 0.0010624631237322504, "rougeL_recall": 0.40714301284245485, "rougeL_recall_stderr": 0.005064627475792024, "rougeLsum_fmeasure": 0.12145176665328596, "rougeLsum_fmeasure_stderr": 0.0016352600982162296, "rougeLsum_precision": 0.0756038827316064, "rougeLsum_precision_stderr": 0.0011672546888864264, "rougeLsum_recall": 0.42963457818781536, "rougeLsum_recall_stderr": 0.0053743909806075905}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6054713437075743, "bleu_stderr": 0.05437478201375844, "rouge1_fmeasure": 0.17747760370302318, "rouge1_fmeasure_stderr": 0.0019053538765698758, "rouge1_precision": 0.1502175136439869, "rouge1_precision_stderr": 0.0019030361852174719, "rouge1_recall": 0.261455388498779, "rouge1_recall_stderr": 0.0028100578970410993, "rouge2_fmeasure": 0.037091167992168994, "rouge2_fmeasure_stderr": 0.0008718507339161006, "rouge2_precision": 0.03115494637566214, "rouge2_precision_stderr": 0.0007718936573069445, "rouge2_recall": 0.05641110971379137, "rouge2_recall_stderr": 0.0014645457777589359, "rougeL_fmeasure": 0.1361541337117608, "rougeL_fmeasure_stderr": 0.0013435136569955786, "rougeL_precision": 0.11393251734177375, "rougeL_precision_stderr": 0.0013110775704154515, "rougeL_recall": 0.20550271197177267, "rougeL_recall_stderr": 0.0022393561246885608, "rougeLsum_fmeasure": 0.16373807275193125, "rougeLsum_fmeasure_stderr": 0.0017484417302425664, "rougeLsum_precision": 0.13840143550418207, "rougeLsum_precision_stderr": 0.0017472927323399304, "rougeLsum_recall": 0.24208617369571592, "rougeLsum_recall_stderr": 0.002615495423998821}}, "1": {"tldr_en": {"bleu": 3.3568815479070158, "bleu_stderr": 0.062408281042864173, "rouge1_fmeasure": 0.2384414177661812, "rouge1_fmeasure_stderr": 0.001967738213001087, "rouge1_precision": 0.20521455786196224, "rouge1_precision_stderr": 0.0021566423356602616, "rouge1_recall": 0.3450088538250452, "rouge1_recall_stderr": 0.0028141198924903532, "rouge2_fmeasure": 0.06328901574064953, "rouge2_fmeasure_stderr": 0.0011184668519141068, "rouge2_precision": 0.0541891609093382, "rouge2_precision_stderr": 0.0010252409370391552, "rouge2_recall": 0.09354429527016248, "rouge2_recall_stderr": 0.0017970539715243055, "rougeL_fmeasure": 0.16967595559198553, "rougeL_fmeasure_stderr": 0.0013571592618223686, "rougeL_precision": 0.14442950228734072, "rougeL_precision_stderr": 0.0014388633343097553, "rougeL_recall": 0.25157906179953965, "rougeL_recall_stderr": 0.002266706627328072, "rougeLsum_fmeasure": 0.22437259777229618, "rougeLsum_fmeasure_stderr": 0.0018534765898883354, "rougeLsum_precision": 0.19302045441232318, "rougeLsum_precision_stderr": 0.002028535492752175, "rougeLsum_recall": 0.3251189492217731, "rougeLsum_recall_stderr": 0.0026790268923513527}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.16951406402821598, "bleu_stderr": 0.014615585721645268, "rouge1_fmeasure": 0.14902951599302275, "rouge1_fmeasure_stderr": 0.0011936812421680112, "rouge1_precision": 0.15133664159023066, "rouge1_precision_stderr": 0.003090044411839824, "rouge1_recall": 0.20829872832019666, "rouge1_recall_stderr": 0.0012350500662973555, "rouge2_fmeasure": 0.012769294373367771, "rouge2_fmeasure_stderr": 0.0005241002980862521, "rouge2_precision": 0.03637204499081546, "rouge2_precision_stderr": 0.0028085435311582378, "rouge2_recall": 0.012949043111655439, "rouge2_recall_stderr": 0.0005016259904639799, "rougeL_fmeasure": 0.14100584281893078, "rougeL_fmeasure_stderr": 0.0009038280484109508, "rougeL_precision": 0.139679315386246, "rougeL_precision_stderr": 0.0027670544139910772, "rougeL_recall": 0.2016577246385112, "rougeL_recall_stderr": 0.0012040942408512864, "rougeLsum_fmeasure": 0.1028831030157934, "rougeLsum_fmeasure_stderr": 0.001075855762070383, "rougeLsum_precision": 0.113513620222634, "rougeLsum_precision_stderr": 0.0030413500876753405, "rougeLsum_recall": 0.14029728805033925, "rougeLsum_recall_stderr": 0.0010082613852652347}}, "1": {"generate_text_restaurant": {"bleu": 10.067385696344799, "bleu_stderr": 0.10607089517794384, "rouge1_fmeasure": 0.4237261154678058, "rouge1_fmeasure_stderr": 0.0020488737967454696, "rouge1_precision": 0.42917506799106997, "rouge1_precision_stderr": 0.002365499643400411, "rouge1_recall": 0.45662630550997346, "rouge1_recall_stderr": 0.002963335187859022, "rouge2_fmeasure": 0.18427344702091913, "rouge2_fmeasure_stderr": 0.0017323159960132722, "rouge2_precision": 0.1861748094036103, "rouge2_precision_stderr": 0.0018728109018732715, "rouge2_recall": 0.20088789046913377, "rouge2_recall_stderr": 0.0021605722507918876, "rougeL_fmeasure": 0.30096531146642097, "rougeL_fmeasure_stderr": 0.001710001515688343, "rougeL_precision": 0.3059227854981379, "rougeL_precision_stderr": 0.001998310078066914, "rougeL_recall": 0.32448538965781326, "rougeL_recall_stderr": 0.0023860439595277186, "rougeLsum_fmeasure": 0.3528207417266004, "rougeLsum_fmeasure_stderr": 0.0020083480221353787, "rougeLsum_precision": 0.3579690622651419, "rougeLsum_precision_stderr": 0.0022901580739891005, "rougeLsum_recall": 0.38007301761756745, "rougeLsum_recall_stderr": 0.002747710382969998}}, "2": {"generate_text_restaurant": {"bleu": 11.636919978386004, "bleu_stderr": 0.19788162907948217, "rouge1_fmeasure": 0.45290381111986455, "rouge1_fmeasure_stderr": 0.0019462177421565893, "rouge1_precision": 0.44970849663299084, "rouge1_precision_stderr": 0.0022619593433166924, "rouge1_recall": 0.49228985626567096, "rouge1_recall_stderr": 0.0028656589338168894, "rouge2_fmeasure": 0.2101765831183815, "rouge2_fmeasure_stderr": 0.0017952071516282025, "rouge2_precision": 0.20827536079463077, "rouge2_precision_stderr": 0.0018769282280709765, "rouge2_recall": 0.23091649040389164, "rouge2_recall_stderr": 0.002254284484281101, "rougeL_fmeasure": 0.3265472292605653, "rougeL_fmeasure_stderr": 0.0017398385407963483, "rougeL_precision": 0.3243950518654991, "rougeL_precision_stderr": 0.0019552548930316443, "rougeL_recall": 0.35557783678320415, "rougeL_recall_stderr": 0.002428507434228921, "rougeLsum_fmeasure": 0.3773556348099214, "rougeLsum_fmeasure_stderr": 0.0019945557891501518, "rougeLsum_precision": 0.3749727434579744, "rougeLsum_precision_stderr": 0.002236863975642526, "rougeLsum_recall": 0.4099709634067351, "rougeLsum_recall_stderr": 0.002706090262648214}}, "3": {"generate_text_restaurant": {"bleu": 12.426631559674204, "bleu_stderr": 0.11440038355745161, "rouge1_fmeasure": 0.46304132819339217, "rouge1_fmeasure_stderr": 0.0020173924707804184, "rouge1_precision": 0.4562949226921436, "rouge1_precision_stderr": 0.002294468118391725, "rouge1_recall": 0.5049085061040581, "rouge1_recall_stderr": 0.0029021179641736915, "rouge2_fmeasure": 0.22103332973061124, "rouge2_fmeasure_stderr": 0.0019041506003672232, "rouge2_precision": 0.21679230842159727, "rouge2_precision_stderr": 0.0019314339034455107, "rouge2_recall": 0.24402017960207578, "rouge2_recall_stderr": 0.002398918086552701, "rougeL_fmeasure": 0.33425485052465553, "rougeL_fmeasure_stderr": 0.0018145324900473829, "rougeL_precision": 0.3294449670016489, "rougeL_precision_stderr": 0.0019862700698188526, "rougeL_recall": 0.36499231892760226, "rougeL_recall_stderr": 0.002472604135584095, "rougeLsum_fmeasure": 0.38784195443176667, "rougeLsum_fmeasure_stderr": 0.002095047300699444, "rougeLsum_precision": 0.3822668655881195, "rougeLsum_precision_stderr": 0.0022831595508053123, "rougeLsum_recall": 0.42289921789097146, "rougeLsum_recall_stderr": 0.0028059845754771946}}, "4": {"generate_text_restaurant": {"bleu": 12.590553484195926, "bleu_stderr": 0.1557062455646749, "rouge1_fmeasure": 0.4665951594342427, "rouge1_fmeasure_stderr": 0.0019811140237555058, "rouge1_precision": 0.45730914793351857, "rouge1_precision_stderr": 0.0022800216801940523, "rouge1_recall": 0.5099063296772233, "rouge1_recall_stderr": 0.002848408606334223, "rouge2_fmeasure": 0.22268786623363584, "rouge2_fmeasure_stderr": 0.0018973436430830147, "rouge2_precision": 0.21739148517372925, "rouge2_precision_stderr": 0.0019197356806646647, "rouge2_recall": 0.24585802349091362, "rouge2_recall_stderr": 0.002367288796030392, "rougeL_fmeasure": 0.3359737739996741, "rougeL_fmeasure_stderr": 0.0018304708156164022, "rougeL_precision": 0.32920126816755724, "rougeL_precision_stderr": 0.0019885426030395704, "rougeL_recall": 0.3675548516076049, "rougeL_recall_stderr": 0.0024640793870539996, "rougeLsum_fmeasure": 0.39000018030735667, "rougeLsum_fmeasure_stderr": 0.0020682797216599685, "rougeLsum_precision": 0.3822321205301913, "rougeLsum_precision_stderr": 0.002266945541059726, "rougeLsum_recall": 0.4260724023535776, "rougeLsum_recall_stderr": 0.0027436023351126207}}, "5": {"generate_text_restaurant": {"bleu": 12.568675301062258, "bleu_stderr": 0.16224872119521075, "rouge1_fmeasure": 0.46892055322657583, "rouge1_fmeasure_stderr": 0.0019557972473512363, "rouge1_precision": 0.46019703610358126, "rouge1_precision_stderr": 0.002310541944330683, "rouge1_recall": 0.5121643724875744, "rouge1_recall_stderr": 0.0028028895695916085, "rouge2_fmeasure": 0.22475547301426657, "rouge2_fmeasure_stderr": 0.0018950527696092341, "rouge2_precision": 0.22000376959092752, "rouge2_precision_stderr": 0.0019418480260118285, "rouge2_recall": 0.24793505182999162, "rouge2_recall_stderr": 0.0023724004656612103, "rougeL_fmeasure": 0.338538223656329, "rougeL_fmeasure_stderr": 0.0018504743886393609, "rougeL_precision": 0.3318569141781424, "rougeL_precision_stderr": 0.002016776093642391, "rougeL_recall": 0.37058399083148463, "rougeL_recall_stderr": 0.0024982918049127, "rougeLsum_fmeasure": 0.39172408140249776, "rougeLsum_fmeasure_stderr": 0.0020299941320428634, "rougeLsum_precision": 0.38438568129064205, "rougeLsum_precision_stderr": 0.002263964644987408, "rougeLsum_recall": 0.42799017201029466, "rougeLsum_recall_stderr": 0.0027129978907711573}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.123641882512125, "bleu_stderr": 0.11311464309535657, "rouge1_fmeasure": 0.21703276545763875, "rouge1_fmeasure_stderr": 0.002515369600939235, "rouge1_precision": 0.1660896826959829, "rouge1_precision_stderr": 0.002107288082741575, "rouge1_recall": 0.35083498826967574, "rouge1_recall_stderr": 0.004593723083105133, "rouge2_fmeasure": 0.04768368310993588, "rouge2_fmeasure_stderr": 0.0016080049517398576, "rouge2_precision": 0.035059732318482924, "rouge2_precision_stderr": 0.0011977915786786503, "rouge2_recall": 0.08226070946861705, "rouge2_recall_stderr": 0.0028797184426689847, "rougeL_fmeasure": 0.15826337701676133, "rougeL_fmeasure_stderr": 0.0019458792115624524, "rougeL_precision": 0.12091074416538322, "rougeL_precision_stderr": 0.0016031624561767372, "rougeL_recall": 0.2576388464400407, "rougeL_recall_stderr": 0.00369472688602631, "rougeLsum_fmeasure": 0.16763228337149372, "rougeLsum_fmeasure_stderr": 0.002117910852275356, "rougeLsum_precision": 0.12768714774937476, "rougeLsum_precision_stderr": 0.0016996047438811593, "rougeLsum_recall": 0.27322366216085703, "rougeLsum_recall_stderr": 0.003964231303059884}}, "1": {"article_DOC_summary": {"bleu": 1.824798489457088, "bleu_stderr": 0.07127603588876777, "rouge1_fmeasure": 0.19596597830254703, "rouge1_fmeasure_stderr": 0.0027034863790677227, "rouge1_precision": 0.13945719874562185, "rouge1_precision_stderr": 0.0019983868750527664, "rouge1_recall": 0.34308219072003043, "rouge1_recall_stderr": 0.004720352047720588, "rouge2_fmeasure": 0.045112546772934525, "rouge2_fmeasure_stderr": 0.0016064335304974058, "rouge2_precision": 0.03178740757757239, "rouge2_precision_stderr": 0.0011392122530476615, "rouge2_recall": 0.08118275228432598, "rouge2_recall_stderr": 0.0029455220106513707, "rougeL_fmeasure": 0.15024907208625723, "rougeL_fmeasure_stderr": 0.002045800799748567, "rougeL_precision": 0.10681129267314027, "rougeL_precision_stderr": 0.0015073695015672187, "rougeL_recall": 0.2641425374203942, "rougeL_recall_stderr": 0.003651992949055377, "rougeLsum_fmeasure": 0.15587486680514107, "rougeLsum_fmeasure_stderr": 0.0022911401506731394, "rougeLsum_precision": 0.11073338724734512, "rougeLsum_precision_stderr": 0.0016753848690916494, "rougeLsum_recall": 0.2742974784323833, "rougeLsum_recall_stderr": 0.004087132157440376}}, "2": {"article_DOC_summary": {"bleu": 1.9202286782616684, "bleu_stderr": 0.10294125559660114, "rouge1_fmeasure": 0.19441013280698327, "rouge1_fmeasure_stderr": 0.0026788744932156566, "rouge1_precision": 0.13831430998317307, "rouge1_precision_stderr": 0.001976957405007506, "rouge1_recall": 0.34054180926843874, "rouge1_recall_stderr": 0.004697948807654975, "rouge2_fmeasure": 0.04741261419771178, "rouge2_fmeasure_stderr": 0.0016319693236568167, "rouge2_precision": 0.03335127280499144, "rouge2_precision_stderr": 0.001154865026778613, "rouge2_recall": 0.0856058058231469, "rouge2_recall_stderr": 0.003020560704657517, "rougeL_fmeasure": 0.1548115500775745, "rougeL_fmeasure_stderr": 0.0021179712451710035, "rougeL_precision": 0.11005048631596476, "rougeL_precision_stderr": 0.0015585194589067973, "rougeL_recall": 0.2721344790344476, "rougeL_recall_stderr": 0.003803909579012272, "rougeLsum_fmeasure": 0.15284300610999954, "rougeLsum_fmeasure_stderr": 0.0022472050629649657, "rougeLsum_precision": 0.10851611339383292, "rougeLsum_precision_stderr": 0.0016395893229652913, "rougeLsum_recall": 0.2694877082238382, "rougeLsum_recall_stderr": 0.004071472128097917}}}} \ No newline at end of file +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2883786280202115, "bleu_stderr": 0.03936084799729088, "rouge1_fmeasure": 0.103421554792752, "rouge1_fmeasure_stderr": 0.001762257816978338, "rouge1_precision": 0.06719578938907746, "rouge1_precision_stderr": 0.0013004585757370683, "rouge1_recall": 0.2933408335090845, "rouge1_recall_stderr": 0.004382296370554512, "rouge2_fmeasure": 0.04762017526127606, "rouge2_fmeasure_stderr": 0.0010873320211799394, "rouge2_precision": 0.030556722468906044, "rouge2_precision_stderr": 0.0007492988401010578, "rouge2_recall": 0.14088496408629886, "rouge2_recall_stderr": 0.003049315486738709, "rougeL_fmeasure": 0.09993996529791976, "rougeL_fmeasure_stderr": 0.0016728671052772506, "rougeL_precision": 0.06486438547957983, "rougeL_precision_stderr": 0.001231171118936048, "rougeL_recall": 0.28447763202238135, "rougeL_recall_stderr": 0.0042288031883992855, "rougeLsum_fmeasure": 0.09965071694396466, "rougeLsum_fmeasure_stderr": 0.0016836029502195003, "rougeLsum_precision": 0.0647465837461535, "rougeLsum_precision_stderr": 0.00124366132201253, "rougeLsum_recall": 0.28232976762951584, "rougeLsum_recall_stderr": 0.004163361732330291}}, "1": {"PALM_prompt": {"bleu": 0.5634646538427781, "bleu_stderr": 0.03736988855205685, "rouge1_fmeasure": 0.12131056010238059, "rouge1_fmeasure_stderr": 0.0018129711840276857, "rouge1_precision": 0.07697654849580207, "rouge1_precision_stderr": 0.0012951272613183852, "rouge1_recall": 0.3969061719233893, "rouge1_recall_stderr": 0.0056851136357343015, "rouge2_fmeasure": 0.05785803759738568, "rouge2_fmeasure_stderr": 0.0011523078709876162, "rouge2_precision": 0.03642271932626906, "rouge2_precision_stderr": 0.0007956739937252943, "rouge2_recall": 0.2051665201143765, "rouge2_recall_stderr": 0.0041594883413927294, "rougeL_fmeasure": 0.11462071685790733, "rougeL_fmeasure_stderr": 0.001624892567642086, "rougeL_precision": 0.07267273013759175, "rougeL_precision_stderr": 0.0011552399753195406, "rougeL_recall": 0.3755810527939102, "rougeL_recall_stderr": 0.00528545382584031, "rougeLsum_fmeasure": 0.11540379904462905, "rougeLsum_fmeasure_stderr": 0.0016960835886338564, "rougeLsum_precision": 0.07333558102707188, "rougeLsum_precision_stderr": 0.0012195728859027712, "rougeLsum_recall": 0.3757551406945481, "rougeLsum_recall_stderr": 0.005219245200343907}}, "2": {"PALM_prompt": {"bleu": 0.6828060703023814, "bleu_stderr": 0.0425207483296129, "rouge1_fmeasure": 0.12552592683219344, "rouge1_fmeasure_stderr": 0.001752255131105832, "rouge1_precision": 0.0790865886214714, "rouge1_precision_stderr": 0.0012505450756042313, "rouge1_recall": 0.427035567540795, "rouge1_recall_stderr": 0.005749871140223685, "rouge2_fmeasure": 0.05940509370992996, "rouge2_fmeasure_stderr": 0.0011135855631122815, "rouge2_precision": 0.037142287807399386, "rouge2_precision_stderr": 0.000765558996913647, "rouge2_recall": 0.21988251267080597, "rouge2_recall_stderr": 0.00425697447376351, "rougeL_fmeasure": 0.11527329683828981, "rougeL_fmeasure_stderr": 0.001524101349020404, "rougeL_precision": 0.07265150298226193, "rougeL_precision_stderr": 0.0011031397973466682, "rougeL_recall": 0.3936959005975406, "rougeL_recall_stderr": 0.005187697908305124, "rougeLsum_fmeasure": 0.11923137803967343, "rougeLsum_fmeasure_stderr": 0.0016398034437248557, "rougeLsum_precision": 0.07517938427479466, "rougeLsum_precision_stderr": 0.0011789577603185614, "rougeLsum_recall": 0.404943656992868, "rougeLsum_recall_stderr": 0.005313046062998814}}, "3": {"PALM_prompt": {"bleu": 0.8336982343397022, "bleu_stderr": 0.0327887377326258, "rouge1_fmeasure": 0.12549315081985135, "rouge1_fmeasure_stderr": 0.0017756300197492854, "rouge1_precision": 0.07862496938468061, "rouge1_precision_stderr": 0.0012544372219464325, "rouge1_recall": 0.43336757562100137, "rouge1_recall_stderr": 0.005813975851408121, "rouge2_fmeasure": 0.06022531877051832, "rouge2_fmeasure_stderr": 0.0011563600202083746, "rouge2_precision": 0.03741379787478959, "rouge2_precision_stderr": 0.0007895630732377968, "rouge2_recall": 0.22623496077748528, "rouge2_recall_stderr": 0.0042712928549412735, "rougeL_fmeasure": 0.11360185571717625, "rougeL_fmeasure_stderr": 0.0015268821384323132, "rougeL_precision": 0.07124494381793388, "rougeL_precision_stderr": 0.0010962640285763162, "rougeL_recall": 0.39321959371417686, "rougeL_recall_stderr": 0.005101213428819951, "rougeLsum_fmeasure": 0.11874541206482168, "rougeLsum_fmeasure_stderr": 0.0016641712015728286, "rougeLsum_precision": 0.07448805733289557, "rougeLsum_precision_stderr": 0.001187320005987778, "rougeLsum_recall": 0.4097236474085259, "rougeLsum_recall_stderr": 0.005356713714674863}}, "4": {"PALM_prompt": {"bleu": 0.8048993474300223, "bleu_stderr": 0.04157871677547976, "rouge1_fmeasure": 0.12696507303445326, "rouge1_fmeasure_stderr": 0.0017668948839916938, "rouge1_precision": 0.07952578574480215, "rouge1_precision_stderr": 0.0012661942787102602, "rouge1_recall": 0.4386081219638241, "rouge1_recall_stderr": 0.005677266555057123, "rouge2_fmeasure": 0.06061578220428707, "rouge2_fmeasure_stderr": 0.0011175090259571937, "rouge2_precision": 0.0376951838080061, "rouge2_precision_stderr": 0.0007761144319241388, "rouge2_recall": 0.22894151277692995, "rouge2_recall_stderr": 0.004196410528544762, "rougeL_fmeasure": 0.11378598207760166, "rougeL_fmeasure_stderr": 0.0014923445088795922, "rougeL_precision": 0.07134231184259153, "rougeL_precision_stderr": 0.0010918605781200322, "rougeL_recall": 0.39486889829999067, "rougeL_recall_stderr": 0.004923545353622342, "rougeLsum_fmeasure": 0.1197281993743887, "rougeLsum_fmeasure_stderr": 0.0016590997843029978, "rougeLsum_precision": 0.07509698381413346, "rougeLsum_precision_stderr": 0.0012018963700845971, "rougeLsum_recall": 0.41258265582996795, "rougeLsum_recall_stderr": 0.0051847453818666765}}, "5": {"PALM_prompt": {"bleu": 0.9424030277847187, "bleu_stderr": 0.052987929389338294, "rouge1_fmeasure": 0.12911370685505766, "rouge1_fmeasure_stderr": 0.0017515798700797859, "rouge1_precision": 0.08025169576842368, "rouge1_precision_stderr": 0.001236439325453276, "rouge1_recall": 0.4579613447211886, "rouge1_recall_stderr": 0.005880996871821512, "rouge2_fmeasure": 0.0624035474416899, "rouge2_fmeasure_stderr": 0.0011352673015657528, "rouge2_precision": 0.03846872592808952, "rouge2_precision_stderr": 0.0007759633225274091, "rouge2_recall": 0.24371157528692775, "rouge2_recall_stderr": 0.004498931653094872, "rougeL_fmeasure": 0.1142452159028682, "rougeL_fmeasure_stderr": 0.0014684795581487694, "rougeL_precision": 0.07114235942831466, "rougeL_precision_stderr": 0.0010624631237322504, "rougeL_recall": 0.40714301284245485, "rougeL_recall_stderr": 0.005064627475792024, "rougeLsum_fmeasure": 0.12145176665328596, "rougeLsum_fmeasure_stderr": 0.0016352600982162296, "rougeLsum_precision": 0.0756038827316064, "rougeLsum_precision_stderr": 0.0011672546888864264, "rougeLsum_recall": 0.42963457818781536, "rougeLsum_recall_stderr": 0.0053743909806075905}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6054713437075743, "bleu_stderr": 0.05437478201375844, "rouge1_fmeasure": 0.17747760370302318, "rouge1_fmeasure_stderr": 0.0019053538765698758, "rouge1_precision": 0.1502175136439869, "rouge1_precision_stderr": 0.0019030361852174719, "rouge1_recall": 0.261455388498779, "rouge1_recall_stderr": 0.0028100578970410993, "rouge2_fmeasure": 0.037091167992168994, "rouge2_fmeasure_stderr": 0.0008718507339161006, "rouge2_precision": 0.03115494637566214, "rouge2_precision_stderr": 0.0007718936573069445, "rouge2_recall": 0.05641110971379137, "rouge2_recall_stderr": 0.0014645457777589359, "rougeL_fmeasure": 0.1361541337117608, "rougeL_fmeasure_stderr": 0.0013435136569955786, "rougeL_precision": 0.11393251734177375, "rougeL_precision_stderr": 0.0013110775704154515, "rougeL_recall": 0.20550271197177267, "rougeL_recall_stderr": 0.0022393561246885608, "rougeLsum_fmeasure": 0.16373807275193125, "rougeLsum_fmeasure_stderr": 0.0017484417302425664, "rougeLsum_precision": 0.13840143550418207, "rougeLsum_precision_stderr": 0.0017472927323399304, "rougeLsum_recall": 0.24208617369571592, "rougeLsum_recall_stderr": 0.002615495423998821}}, "1": {"tldr_en": {"bleu": 3.3568815479070158, "bleu_stderr": 0.062408281042864173, "rouge1_fmeasure": 0.2384414177661812, "rouge1_fmeasure_stderr": 0.001967738213001087, "rouge1_precision": 0.20521455786196224, "rouge1_precision_stderr": 0.0021566423356602616, "rouge1_recall": 0.3450088538250452, "rouge1_recall_stderr": 0.0028141198924903532, "rouge2_fmeasure": 0.06328901574064953, "rouge2_fmeasure_stderr": 0.0011184668519141068, "rouge2_precision": 0.0541891609093382, "rouge2_precision_stderr": 0.0010252409370391552, "rouge2_recall": 0.09354429527016248, "rouge2_recall_stderr": 0.0017970539715243055, "rougeL_fmeasure": 0.16967595559198553, "rougeL_fmeasure_stderr": 0.0013571592618223686, "rougeL_precision": 0.14442950228734072, "rougeL_precision_stderr": 0.0014388633343097553, "rougeL_recall": 0.25157906179953965, "rougeL_recall_stderr": 0.002266706627328072, "rougeLsum_fmeasure": 0.22437259777229618, "rougeLsum_fmeasure_stderr": 0.0018534765898883354, "rougeLsum_precision": 0.19302045441232318, "rougeLsum_precision_stderr": 0.002028535492752175, "rougeLsum_recall": 0.3251189492217731, "rougeLsum_recall_stderr": 0.0026790268923513527}}, "2": {"tldr_en": {"bleu": 3.9743938237550784, "bleu_stderr": 0.07475699238187014, "rouge1_fmeasure": 0.2476037847532769, "rouge1_fmeasure_stderr": 0.002025823379351112, "rouge1_precision": 0.22168273116215406, "rouge1_precision_stderr": 0.002435239313498893, "rouge1_recall": 0.34743629317668956, "rouge1_recall_stderr": 0.0028712547192456107, "rouge2_fmeasure": 0.07067445283939051, "rouge2_fmeasure_stderr": 0.0012417394471129861, "rouge2_precision": 0.06376061872271946, "rouge2_precision_stderr": 0.0013048039083209525, "rouge2_recall": 0.101241459889032, "rouge2_recall_stderr": 0.001953242205361602, "rougeL_fmeasure": 0.1802673381893277, "rougeL_fmeasure_stderr": 0.001478295743890994, "rougeL_precision": 0.16023681566231104, "rougeL_precision_stderr": 0.001772038861775115, "rougeL_recall": 0.25897094370699875, "rougeL_recall_stderr": 0.002426935392898513, "rougeLsum_fmeasure": 0.23404735258003787, "rougeLsum_fmeasure_stderr": 0.0019207642560461376, "rougeLsum_precision": 0.20950862810749826, "rougeLsum_precision_stderr": 0.002314989258793673, "rougeLsum_recall": 0.32880876166482065, "rougeLsum_recall_stderr": 0.002739608654356716}}, "3": {"tldr_en": {"bleu": 3.9817750726027175, "bleu_stderr": 0.09479747244511556, "rouge1_fmeasure": 0.2047867720982337, "rouge1_fmeasure_stderr": 0.0024642538193857724, "rouge1_precision": 0.1929078707094695, "rouge1_precision_stderr": 0.0028265761167604094, "rouge1_recall": 0.28273526242062413, "rouge1_recall_stderr": 0.003552029067040685, "rouge2_fmeasure": 0.0583824000959267, "rouge2_fmeasure_stderr": 0.0012189630804918103, "rouge2_precision": 0.054778926280360955, "rouge2_precision_stderr": 0.0013556972713623368, "rouge2_recall": 0.08337730610072844, "rouge2_recall_stderr": 0.0019188280227343476, "rougeL_fmeasure": 0.1504897955536091, "rougeL_fmeasure_stderr": 0.0018172498103686094, "rougeL_precision": 0.1417601937646707, "rougeL_precision_stderr": 0.0021376668461159194, "rougeL_recall": 0.21204142844316634, "rougeL_recall_stderr": 0.002858744024264778, "rougeLsum_fmeasure": 0.19403204633681076, "rougeLsum_fmeasure_stderr": 0.0023374505495094874, "rougeLsum_precision": 0.18285584254571302, "rougeLsum_precision_stderr": 0.0026982092402270926, "rougeLsum_recall": 0.268579149278918, "rougeLsum_recall_stderr": 0.0034097338970395575}}, "4": {"tldr_en": {"bleu": 0.8498125282802876, "bleu_stderr": 0.07488095870505093, "rouge1_fmeasure": 0.06567263366786005, "rouge1_fmeasure_stderr": 0.0021904578073412223, "rouge1_precision": 0.06349732024114886, "rouge1_precision_stderr": 0.00234492312222753, "rouge1_recall": 0.09340855339099945, "rouge1_recall_stderr": 0.003159444376659221, "rouge2_fmeasure": 0.018412425374697422, "rouge2_fmeasure_stderr": 0.0008441162033876125, "rouge2_precision": 0.01738674391302479, "rouge2_precision_stderr": 0.000886017251546639, "rouge2_recall": 0.02721797575693733, "rouge2_recall_stderr": 0.0013454614244059475, "rougeL_fmeasure": 0.04945824665200398, "rougeL_fmeasure_stderr": 0.0016373751055148418, "rougeL_precision": 0.04817497329724485, "rougeL_precision_stderr": 0.0018235195878999154, "rougeL_recall": 0.07173904334429866, "rougeL_recall_stderr": 0.002478097323273335, "rougeLsum_fmeasure": 0.06188112367809635, "rougeLsum_fmeasure_stderr": 0.002069249663413523, "rougeLsum_precision": 0.05985219645183224, "rougeLsum_precision_stderr": 0.0022220794271589225, "rougeLsum_recall": 0.0881853688625359, "rougeLsum_recall_stderr": 0.0029988916997001343}}, "5": {"tldr_en": {"bleu": 2.8683848179992505e-06, "bleu_stderr": 7.381534992907998e-06, "rouge1_fmeasure": 0.010706535802039367, "rouge1_fmeasure_stderr": 0.0009930611949791836, "rouge1_precision": 0.010748286830524562, "rouge1_precision_stderr": 0.0010977103345240155, "rouge1_recall": 0.015508382128794362, "rouge1_recall_stderr": 0.001446236589386957, "rouge2_fmeasure": 0.002970906530619929, "rouge2_fmeasure_stderr": 0.0003682132280639809, "rouge2_precision": 0.0030969030541188967, "rouge2_precision_stderr": 0.0005268236540414107, "rouge2_recall": 0.004556824944932366, "rouge2_recall_stderr": 0.0005664861641115252, "rougeL_fmeasure": 0.008019691411733835, "rougeL_fmeasure_stderr": 0.000754515109085896, "rougeL_precision": 0.008049434192397904, "rougeL_precision_stderr": 0.0008462473363861482, "rougeL_recall": 0.011951304347862953, "rougeL_recall_stderr": 0.0011515347961576568, "rougeLsum_fmeasure": 0.010182849522984623, "rougeLsum_fmeasure_stderr": 0.0009497792801374568, "rougeLsum_precision": 0.010213486554639387, "rougeLsum_precision_stderr": 0.001056290927469812, "rougeLsum_recall": 0.014853652320406176, "rougeLsum_recall_stderr": 0.0013945219376869788}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.16951406402821598, "bleu_stderr": 0.014615585721645268, "rouge1_fmeasure": 0.14902951599302275, "rouge1_fmeasure_stderr": 0.0011936812421680112, "rouge1_precision": 0.15133664159023066, "rouge1_precision_stderr": 0.003090044411839824, "rouge1_recall": 0.20829872832019666, "rouge1_recall_stderr": 0.0012350500662973555, "rouge2_fmeasure": 0.012769294373367771, "rouge2_fmeasure_stderr": 0.0005241002980862521, "rouge2_precision": 0.03637204499081546, "rouge2_precision_stderr": 0.0028085435311582378, "rouge2_recall": 0.012949043111655439, "rouge2_recall_stderr": 0.0005016259904639799, "rougeL_fmeasure": 0.14100584281893078, "rougeL_fmeasure_stderr": 0.0009038280484109508, "rougeL_precision": 0.139679315386246, "rougeL_precision_stderr": 0.0027670544139910772, "rougeL_recall": 0.2016577246385112, "rougeL_recall_stderr": 0.0012040942408512864, "rougeLsum_fmeasure": 0.1028831030157934, "rougeLsum_fmeasure_stderr": 0.001075855762070383, "rougeLsum_precision": 0.113513620222634, "rougeLsum_precision_stderr": 0.0030413500876753405, "rougeLsum_recall": 0.14029728805033925, "rougeLsum_recall_stderr": 0.0010082613852652347}}, "1": {"generate_text_restaurant": {"bleu": 10.067385696344799, "bleu_stderr": 0.10607089517794384, "rouge1_fmeasure": 0.4237261154678058, "rouge1_fmeasure_stderr": 0.0020488737967454696, "rouge1_precision": 0.42917506799106997, "rouge1_precision_stderr": 0.002365499643400411, "rouge1_recall": 0.45662630550997346, "rouge1_recall_stderr": 0.002963335187859022, "rouge2_fmeasure": 0.18427344702091913, "rouge2_fmeasure_stderr": 0.0017323159960132722, "rouge2_precision": 0.1861748094036103, "rouge2_precision_stderr": 0.0018728109018732715, "rouge2_recall": 0.20088789046913377, "rouge2_recall_stderr": 0.0021605722507918876, "rougeL_fmeasure": 0.30096531146642097, "rougeL_fmeasure_stderr": 0.001710001515688343, "rougeL_precision": 0.3059227854981379, "rougeL_precision_stderr": 0.001998310078066914, "rougeL_recall": 0.32448538965781326, "rougeL_recall_stderr": 0.0023860439595277186, "rougeLsum_fmeasure": 0.3528207417266004, "rougeLsum_fmeasure_stderr": 0.0020083480221353787, "rougeLsum_precision": 0.3579690622651419, "rougeLsum_precision_stderr": 0.0022901580739891005, "rougeLsum_recall": 0.38007301761756745, "rougeLsum_recall_stderr": 0.002747710382969998}}, "2": {"generate_text_restaurant": {"bleu": 11.636919978386004, "bleu_stderr": 0.19788162907948217, "rouge1_fmeasure": 0.45290381111986455, "rouge1_fmeasure_stderr": 0.0019462177421565893, "rouge1_precision": 0.44970849663299084, "rouge1_precision_stderr": 0.0022619593433166924, "rouge1_recall": 0.49228985626567096, "rouge1_recall_stderr": 0.0028656589338168894, "rouge2_fmeasure": 0.2101765831183815, "rouge2_fmeasure_stderr": 0.0017952071516282025, "rouge2_precision": 0.20827536079463077, "rouge2_precision_stderr": 0.0018769282280709765, "rouge2_recall": 0.23091649040389164, "rouge2_recall_stderr": 0.002254284484281101, "rougeL_fmeasure": 0.3265472292605653, "rougeL_fmeasure_stderr": 0.0017398385407963483, "rougeL_precision": 0.3243950518654991, "rougeL_precision_stderr": 0.0019552548930316443, "rougeL_recall": 0.35557783678320415, "rougeL_recall_stderr": 0.002428507434228921, "rougeLsum_fmeasure": 0.3773556348099214, "rougeLsum_fmeasure_stderr": 0.0019945557891501518, "rougeLsum_precision": 0.3749727434579744, "rougeLsum_precision_stderr": 0.002236863975642526, "rougeLsum_recall": 0.4099709634067351, "rougeLsum_recall_stderr": 0.002706090262648214}}, "3": {"generate_text_restaurant": {"bleu": 12.426631559674204, "bleu_stderr": 0.11440038355745161, "rouge1_fmeasure": 0.46304132819339217, "rouge1_fmeasure_stderr": 0.0020173924707804184, "rouge1_precision": 0.4562949226921436, "rouge1_precision_stderr": 0.002294468118391725, "rouge1_recall": 0.5049085061040581, "rouge1_recall_stderr": 0.0029021179641736915, "rouge2_fmeasure": 0.22103332973061124, "rouge2_fmeasure_stderr": 0.0019041506003672232, "rouge2_precision": 0.21679230842159727, "rouge2_precision_stderr": 0.0019314339034455107, "rouge2_recall": 0.24402017960207578, "rouge2_recall_stderr": 0.002398918086552701, "rougeL_fmeasure": 0.33425485052465553, "rougeL_fmeasure_stderr": 0.0018145324900473829, "rougeL_precision": 0.3294449670016489, "rougeL_precision_stderr": 0.0019862700698188526, "rougeL_recall": 0.36499231892760226, "rougeL_recall_stderr": 0.002472604135584095, "rougeLsum_fmeasure": 0.38784195443176667, "rougeLsum_fmeasure_stderr": 0.002095047300699444, "rougeLsum_precision": 0.3822668655881195, "rougeLsum_precision_stderr": 0.0022831595508053123, "rougeLsum_recall": 0.42289921789097146, "rougeLsum_recall_stderr": 0.0028059845754771946}}, "4": {"generate_text_restaurant": {"bleu": 12.590553484195926, "bleu_stderr": 0.1557062455646749, "rouge1_fmeasure": 0.4665951594342427, "rouge1_fmeasure_stderr": 0.0019811140237555058, "rouge1_precision": 0.45730914793351857, "rouge1_precision_stderr": 0.0022800216801940523, "rouge1_recall": 0.5099063296772233, "rouge1_recall_stderr": 0.002848408606334223, "rouge2_fmeasure": 0.22268786623363584, "rouge2_fmeasure_stderr": 0.0018973436430830147, "rouge2_precision": 0.21739148517372925, "rouge2_precision_stderr": 0.0019197356806646647, "rouge2_recall": 0.24585802349091362, "rouge2_recall_stderr": 0.002367288796030392, "rougeL_fmeasure": 0.3359737739996741, "rougeL_fmeasure_stderr": 0.0018304708156164022, "rougeL_precision": 0.32920126816755724, "rougeL_precision_stderr": 0.0019885426030395704, "rougeL_recall": 0.3675548516076049, "rougeL_recall_stderr": 0.0024640793870539996, "rougeLsum_fmeasure": 0.39000018030735667, "rougeLsum_fmeasure_stderr": 0.0020682797216599685, "rougeLsum_precision": 0.3822321205301913, "rougeLsum_precision_stderr": 0.002266945541059726, "rougeLsum_recall": 0.4260724023535776, "rougeLsum_recall_stderr": 0.0027436023351126207}}, "5": {"generate_text_restaurant": {"bleu": 12.568675301062258, "bleu_stderr": 0.16224872119521075, "rouge1_fmeasure": 0.46892055322657583, "rouge1_fmeasure_stderr": 0.0019557972473512363, "rouge1_precision": 0.46019703610358126, "rouge1_precision_stderr": 0.002310541944330683, "rouge1_recall": 0.5121643724875744, "rouge1_recall_stderr": 0.0028028895695916085, "rouge2_fmeasure": 0.22475547301426657, "rouge2_fmeasure_stderr": 0.0018950527696092341, "rouge2_precision": 0.22000376959092752, "rouge2_precision_stderr": 0.0019418480260118285, "rouge2_recall": 0.24793505182999162, "rouge2_recall_stderr": 0.0023724004656612103, "rougeL_fmeasure": 0.338538223656329, "rougeL_fmeasure_stderr": 0.0018504743886393609, "rougeL_precision": 0.3318569141781424, "rougeL_precision_stderr": 0.002016776093642391, "rougeL_recall": 0.37058399083148463, "rougeL_recall_stderr": 0.0024982918049127, "rougeLsum_fmeasure": 0.39172408140249776, "rougeLsum_fmeasure_stderr": 0.0020299941320428634, "rougeLsum_precision": 0.38438568129064205, "rougeLsum_precision_stderr": 0.002263964644987408, "rougeLsum_recall": 0.42799017201029466, "rougeLsum_recall_stderr": 0.0027129978907711573}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.123641882512125, "bleu_stderr": 0.11311464309535657, "rouge1_fmeasure": 0.21703276545763875, "rouge1_fmeasure_stderr": 0.002515369600939235, "rouge1_precision": 0.1660896826959829, "rouge1_precision_stderr": 0.002107288082741575, "rouge1_recall": 0.35083498826967574, "rouge1_recall_stderr": 0.004593723083105133, "rouge2_fmeasure": 0.04768368310993588, "rouge2_fmeasure_stderr": 0.0016080049517398576, "rouge2_precision": 0.035059732318482924, "rouge2_precision_stderr": 0.0011977915786786503, "rouge2_recall": 0.08226070946861705, "rouge2_recall_stderr": 0.0028797184426689847, "rougeL_fmeasure": 0.15826337701676133, "rougeL_fmeasure_stderr": 0.0019458792115624524, "rougeL_precision": 0.12091074416538322, "rougeL_precision_stderr": 0.0016031624561767372, "rougeL_recall": 0.2576388464400407, "rougeL_recall_stderr": 0.00369472688602631, "rougeLsum_fmeasure": 0.16763228337149372, "rougeLsum_fmeasure_stderr": 0.002117910852275356, "rougeLsum_precision": 0.12768714774937476, "rougeLsum_precision_stderr": 0.0016996047438811593, "rougeLsum_recall": 0.27322366216085703, "rougeLsum_recall_stderr": 0.003964231303059884}}, "1": {"article_DOC_summary": {"bleu": 1.824798489457088, "bleu_stderr": 0.07127603588876777, "rouge1_fmeasure": 0.19596597830254703, "rouge1_fmeasure_stderr": 0.0027034863790677227, "rouge1_precision": 0.13945719874562185, "rouge1_precision_stderr": 0.0019983868750527664, "rouge1_recall": 0.34308219072003043, "rouge1_recall_stderr": 0.004720352047720588, "rouge2_fmeasure": 0.045112546772934525, "rouge2_fmeasure_stderr": 0.0016064335304974058, "rouge2_precision": 0.03178740757757239, "rouge2_precision_stderr": 0.0011392122530476615, "rouge2_recall": 0.08118275228432598, "rouge2_recall_stderr": 0.0029455220106513707, "rougeL_fmeasure": 0.15024907208625723, "rougeL_fmeasure_stderr": 0.002045800799748567, "rougeL_precision": 0.10681129267314027, "rougeL_precision_stderr": 0.0015073695015672187, "rougeL_recall": 0.2641425374203942, "rougeL_recall_stderr": 0.003651992949055377, "rougeLsum_fmeasure": 0.15587486680514107, "rougeLsum_fmeasure_stderr": 0.0022911401506731394, "rougeLsum_precision": 0.11073338724734512, "rougeLsum_precision_stderr": 0.0016753848690916494, "rougeLsum_recall": 0.2742974784323833, "rougeLsum_recall_stderr": 0.004087132157440376}}, "2": {"article_DOC_summary": {"bleu": 1.9202286782616684, "bleu_stderr": 0.10294125559660114, "rouge1_fmeasure": 0.19441013280698327, "rouge1_fmeasure_stderr": 0.0026788744932156566, "rouge1_precision": 0.13831430998317307, "rouge1_precision_stderr": 0.001976957405007506, "rouge1_recall": 0.34054180926843874, "rouge1_recall_stderr": 0.004697948807654975, "rouge2_fmeasure": 0.04741261419771178, "rouge2_fmeasure_stderr": 0.0016319693236568167, "rouge2_precision": 0.03335127280499144, "rouge2_precision_stderr": 0.001154865026778613, "rouge2_recall": 0.0856058058231469, "rouge2_recall_stderr": 0.003020560704657517, "rougeL_fmeasure": 0.1548115500775745, "rougeL_fmeasure_stderr": 0.0021179712451710035, "rougeL_precision": 0.11005048631596476, "rougeL_precision_stderr": 0.0015585194589067973, "rougeL_recall": 0.2721344790344476, "rougeL_recall_stderr": 0.003803909579012272, "rougeLsum_fmeasure": 0.15284300610999954, "rougeLsum_fmeasure_stderr": 0.0022472050629649657, "rougeLsum_precision": 0.10851611339383292, "rougeLsum_precision_stderr": 0.0016395893229652913, "rougeLsum_recall": 0.2694877082238382, "rougeLsum_recall_stderr": 0.004071472128097917}}, "3": {"article_DOC_summary": {"bleu": 1.960346415562878, "bleu_stderr": 0.10375544830060275, "rouge1_fmeasure": 0.18698891886072055, "rouge1_fmeasure_stderr": 0.0028344896483067416, "rouge1_precision": 0.13614805917908393, "rouge1_precision_stderr": 0.002182928610973459, "rouge1_recall": 0.319067828970501, "rouge1_recall_stderr": 0.004854901693972656, "rouge2_fmeasure": 0.04579306632360622, "rouge2_fmeasure_stderr": 0.0016499700154799003, "rouge2_precision": 0.03255285607712267, "rouge2_precision_stderr": 0.0011709136544211735, "rouge2_recall": 0.08091915270893477, "rouge2_recall_stderr": 0.0030319428460283114, "rougeL_fmeasure": 0.14999604713991269, "rougeL_fmeasure_stderr": 0.0022346501948052253, "rougeL_precision": 0.10907064034611987, "rougeL_precision_stderr": 0.0016952538239699338, "rougeL_recall": 0.25710809104483, "rougeL_recall_stderr": 0.003938425696647421, "rougeLsum_fmeasure": 0.1481937356361328, "rougeLsum_fmeasure_stderr": 0.0023789575527353485, "rougeLsum_precision": 0.10781469737345471, "rougeLsum_precision_stderr": 0.0018160899239560767, "rougeLsum_recall": 0.25437759661970666, "rougeLsum_recall_stderr": 0.004197817376394927}}, "4": {"article_DOC_summary": {"bleu": 1.147010044637203, "bleu_stderr": 0.1483287532182548, "rouge1_fmeasure": 0.05597846129733605, "rouge1_fmeasure_stderr": 0.0030736717398869183, "rouge1_precision": 0.0461290015156943, "rouge1_precision_stderr": 0.0026493012385445263, "rouge1_recall": 0.08781737838132268, "rouge1_recall_stderr": 0.004925448191329273, "rouge2_fmeasure": 0.014349999093671009, "rouge2_fmeasure_stderr": 0.00119440286191344, "rouge2_precision": 0.011276409349861457, "rouge2_precision_stderr": 0.0009910327288713146, "rouge2_recall": 0.02322309514343149, "rouge2_recall_stderr": 0.0019591268818172957, "rougeL_fmeasure": 0.0441258641947338, "rougeL_fmeasure_stderr": 0.002422435816075437, "rougeL_precision": 0.036708426223707534, "rougeL_precision_stderr": 0.0021597446633885207, "rougeL_recall": 0.06933837439843676, "rougeL_recall_stderr": 0.0039167983045230665, "rougeLsum_fmeasure": 0.04374778712530494, "rougeLsum_fmeasure_stderr": 0.0024277919295063437, "rougeLsum_precision": 0.036521786784889244, "rougeLsum_precision_stderr": 0.0021744741340258226, "rougeLsum_recall": 0.06862432645657374, "rougeLsum_recall_stderr": 0.003921758075614257}}, "5": {"article_DOC_summary": {"bleu": 7.432818302647882e-17, "bleu_stderr": 6.736262144559321e-14, "rouge1_fmeasure": 0.002840764770276708, "rouge1_fmeasure_stderr": 0.0007742945701180914, "rouge1_precision": 0.002495597025413209, "rouge1_precision_stderr": 0.0007589560712051177, "rouge1_recall": 0.004261862032154927, "rouge1_recall_stderr": 0.0011922597752018513, "rouge2_fmeasure": 0.00046551069178623087, "rouge2_fmeasure_stderr": 0.0001919512728404723, "rouge2_precision": 0.0004857664430361433, "rouge2_precision_stderr": 0.000238899022126221, "rouge2_recall": 0.0005700394684856393, "rouge2_recall_stderr": 0.000211601005978903, "rougeL_fmeasure": 0.0020608235251966707, "rougeL_fmeasure_stderr": 0.0005517756786268286, "rougeL_precision": 0.0018255732115487248, "rougeL_precision_stderr": 0.0005443002896546889, "rougeL_recall": 0.003014099249198048, "rougeL_recall_stderr": 0.0008089040705550355, "rougeLsum_fmeasure": 0.002211561514290958, "rougeLsum_fmeasure_stderr": 0.0006392646801827694, "rougeLsum_precision": 0.0020616490944038632, "rougeLsum_precision_stderr": 0.000695995858568096, "rougeLsum_recall": 0.0030417584845173606, "rougeLsum_recall_stderr": 0.0008279029688397777}}}} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1f79eccb9a31ce069375ecfbde2c32b1808e6afe --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.22168273116215406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002435239313498893 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.34743629317668956, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028712547192456107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2476037847532769, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002025823379351112 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06376061872271946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013048039083209525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.101241459889032, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001953242205361602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.07067445283939051, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012417394471129861 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.16023681566231104, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001772038861775115 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.25897094370699875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002426935392898513 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1802673381893277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001478295743890994 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.20950862810749826, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002314989258793673 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.32880876166482065, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002739608654356716 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.23404735258003787, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019207642560461376 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.9743938237550784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07475699238187014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4e071432bb9caae34ea8298b759d4fc138363763 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1929078707094695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0028265761167604094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.28273526242062413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003552029067040685 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2047867720982337, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024642538193857724 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.054778926280360955, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013556972713623368 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08337730610072844, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0019188280227343476 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0583824000959267, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012189630804918103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1417601937646707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0021376668461159194 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21204142844316634, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002858744024264778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1504897955536091, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018172498103686094 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18285584254571302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026982092402270926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.268579149278918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0034097338970395575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19403204633681076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023374505495094874 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.9817750726027175, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09479747244511556 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d266da15c39b0f23d246627a4ed6819ca45dcb4f --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.06349732024114886, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00234492312222753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.09340855339099945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003159444376659221 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.06567263366786005, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021904578073412223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.01738674391302479, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000886017251546639 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02721797575693733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013454614244059475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.018412425374697422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008441162033876125 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04817497329724485, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018235195878999154 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.07173904334429866, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002478097323273335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04945824665200398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016373751055148418 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.05985219645183224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022220794271589225 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.0881853688625359, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0029988916997001343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06188112367809635, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002069249663413523 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.8498125282802876, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07488095870505093 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ee0fcbfda09675ea3ff74aae2304b293fc81b382 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.010748286830524562, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010977103345240155 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.015508382128794362, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001446236589386957 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.010706535802039367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009930611949791836 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0030969030541188967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005268236540414107 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.004556824944932366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005664861641115252 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.002970906530619929, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003682132280639809 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.008049434192397904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008462473363861482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.011951304347862953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011515347961576568 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.008019691411733835, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.000754515109085896 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.010213486554639387, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001056290927469812 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.014853652320406176, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0013945219376869788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.010182849522984623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009497792801374568 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8683848179992505e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 7.381534992907998e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_3.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2a4ed799da3af72b2f2b62d5fa8b076957b5d356 --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13614805917908393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002182928610973459 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.319067828970501, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004854901693972656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18698891886072055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028344896483067416 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03255285607712267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011709136544211735 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08091915270893477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030319428460283114 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04579306632360622, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016499700154799003 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10907064034611987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016952538239699338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25710809104483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003938425696647421 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14999604713991269, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022346501948052253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10781469737345471, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018160899239560767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.25437759661970666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004197817376394927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1481937356361328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023789575527353485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.960346415562878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10375544830060275 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_4.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b036b9525bb474555e36712cec92b9aef0e7315f --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0461290015156943, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0026493012385445263 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08781737838132268, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004925448191329273 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05597846129733605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0030736717398869183 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.011276409349861457, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009910327288713146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.02322309514343149, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0019591268818172957 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.014349999093671009, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00119440286191344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.036708426223707534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021597446633885207 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06933837439843676, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0039167983045230665 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0441258641947338, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002422435816075437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.036521786784889244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021744741340258226 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.06862432645657374, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003921758075614257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04374778712530494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024277919295063437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.147010044637203, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1483287532182548 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_5.json b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6d8c16ee56ad40cd9cd8ab0788d61441ef4e84ce --- /dev/null +++ b/8b7178b88b/evaluation/generation/slim.8b7178b88b_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002495597025413209, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007589560712051177 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.004261862032154927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0011922597752018513 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002840764770276708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007742945701180914 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004857664430361433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000238899022126221 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0005700394684856393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.000211601005978903 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00046551069178623087, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0001919512728404723 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0018255732115487248, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005443002896546889 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.003014099249198048, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0008089040705550355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0020608235251966707, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005517756786268286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0020616490944038632, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.000695995858568096 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0030417584845173606, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0008279029688397777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.002211561514290958, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006392646801827694 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 7.432818302647882e-17, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.736262144559321e-14 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b88b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 4, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file