Muennighoff committed
Commit 8263782
1 Parent(s): a706068

Add scores

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  2. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  3. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  4. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  5. 8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json +1 -0
  6. 8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json +1 -0
  7. 8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json +1 -0
  8. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
  9. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
  10. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
  11. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  12. 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
  13. 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  14. 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  15. 8b7178b25b/evaluation/generation/merged.csv +16 -2
  16. 8b7178b25b/evaluation/generation/merged.json +1 -1
  17. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  18. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  19. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  20. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  21. 8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json +133 -0
  22. 8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json +133 -0
  23. 8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json +133 -0
  24. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  25. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  26. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  27. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  28. 8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json +1 -0
  29. 8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json +1 -0
  30. 8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json +1 -0
  31. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
  32. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
  33. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
  34. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  35. 8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
  36. 8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  37. 8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  38. 8b7178b35b/evaluation/generation/merged.csv +16 -2
  39. 8b7178b35b/evaluation/generation/merged.json +1 -1
  40. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  41. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  42. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  43. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  44. 8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json +133 -0
  45. 8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json +133 -0
  46. 8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json +133 -0
  47. 8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  48. 8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  49. 8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  50. 8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.20824164042833873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002234214827494051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3342680558497238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002719351845027267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.235556648636045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018848294908913575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0553447765818299, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001097279257804817}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.09158152498363732, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00182638815550646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06251395339429003, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011032340747877004}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14804254896591035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001582109014998804}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.24506636928994607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002277815665553641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.16869690772741758, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", 
"dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013497024922833358}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.19708639575292217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021089064966209187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.31720897725992786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002596567797319611}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22311024372097865, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017776265096001729}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.5155467010727994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06141556601214586}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17811439171989488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002560275363708373}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.27886090799993984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034448901496326586}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1960859652047859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023366060464774423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04742286059353845, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011604513875678288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07742946215061461, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001866825934930981}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0523326312861241, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011285281285534974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.12852552196126066, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018964172540042934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20545470726815437, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0027326532184944424}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14162844117143206, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016899728317895791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.16868014464807685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002438611490893621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.26411127102133913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003281374640568146}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18556326066261042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002215627597176747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.504980488837104, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07033302280883086}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.057686885490487505, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002077970469715233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09385574649347103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0031890083282469467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06282501006696624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020888213951751034}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.015071313264611244, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007774016714959856}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.026883790080977472, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014075436212441558}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.016837586892487218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000790811753626707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.042638568215269866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015504702364272222}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.07032908173779572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024572632014890276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0461316780279477, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015235672554804573}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.05433748981297747, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019639155165544035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.0885417619284403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030172562108610624}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05917832063435797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001970707904332247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.9086939324156383, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05855823137144976}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.009722491969586005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009564982386894138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.015357271032115101, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001448960021557378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010180728590080238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009430571774107825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0024973101781513925, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000343733403974376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0045816927074040755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006153391357680226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.00281382806546607, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00035214119256937883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.0072997926636164424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007272617797612332}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.011767619522932812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011356853451163903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.00759156476319381, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007014066070016197}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.009129679261559958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009077175108677888}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.014481417293915742, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013773388496390326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.00955145315759681, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008912980968449334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 6.161453193982374e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.2644510555241896e-05}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.12661685012408697, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0021093732895884356}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.30042742106922904, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004793262475513874}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.1741691220448831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027529808698691072}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.027457383236278155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011005894460101174}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06872339351563556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0028264005600779733}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.03848207108872355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015349514843449124}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.10054376826357136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016355832838556733}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.24044972226758587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038558430038978684}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.13848485640839917, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002136410452186076}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.10022509495467177, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001733799927568896}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.2391221551244155, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0040668979594426224}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.13794850154717367, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002276629118568194}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.6273581251014975, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08297152976943713}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.03984641222755382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023583006672399276}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.07781373363270898, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0044678999283725125}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.049025842019891894, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002761462686946734}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.008839770813015032, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009132501416122141}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.018595501088417683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018046371518142589}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.011276208078698832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010882501335885415}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03193279708670165, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019219705874772618}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06201212154363477, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036181165069829016}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.03899309107571578, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": 
"", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022095258634566655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.03172097901935682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019113003105201993}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.06162603968016436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003613145314591428}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.03871502132726552, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021963924186294845}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.9297944407512662, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10452568299951195}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0018747157977051885, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005217271715436668}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.003549283577549333, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0010767599221403455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002273937140206459, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006514011124893405}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00047917769370046107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00017934012188651556}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0007855487711203583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00030506211146057416}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0005538280315731646, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021055797770595916}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.001591076043425948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00044966429790604164}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.002863749112737054, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008323073183414411}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0019019709421522583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005434789167320575}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.00156298958828556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00043239685994550666}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0027767015907513405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007833271709551212}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0018351130157390045, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005055637602691865}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.3519576766092788e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.7148754157361587e-13}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf52e7e89cb05892fcb65745e1779ab79a7bf338c0fd951fce559bbf967d309c
+ size 18902094
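The examples.*.jsonl files in this commit are stored with Git LFS, so their diffs show only the three-line pointer file rather than the JSONL payload: the pointer spec version, the sha256 object id of the payload, and the payload size in bytes. A minimal sketch of parsing such a pointer (values copied from the block above):

```python
# Parse a Git LFS pointer file (the three-line stub stored in place of the real payload).
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {
        "version": fields["version"],  # pointer spec URL
        "oid_algo": algo,              # hash algorithm, sha256 here
        "oid": digest,                 # content address of the payload
        "size": int(fields["size"]),   # payload size in bytes
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:bf52e7e89cb05892fcb65745e1779ab79a7bf338c0fd951fce559bbf967d309c
size 18902094"""
print(parse_lfs_pointer(pointer))
```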
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d48b05d6c928cef63c3cbdedcfb8bca843c623bb49ed5e05c5ba8bafd912330a
+ size 24327036
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f03b9fc6ca034ddb19e6558c89401d6667967bb919e5e43cbb227c773e165f4
+ size 29475398
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0696f3da8c2c1666d68d54ba9c6a0adf0015721431da594f6fd3d5ffd0de34b
+ size 34801699
8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66dd94ebc21d8fb6bc777be380e845ad59128f54b1275c0ea4139ffc162849d5
+ size 9647268
8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f81a6460b234aa2e50301e03345295e21bd5c5a28290727bc037d1865867fef
+ size 11673456
8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a8cc00d33bfd2864acc67c803f643c08279364e5b8639b6a77f56c7e52fe3cc
+ size 13899565
8b7178b25b/evaluation/generation/merged.csv CHANGED
@@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03598895109620969
  gem_xsum,1,median,rouge2_fmeasure,0.03598895109620969
  gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03728951183116744
  gem_xsum,2,median,rouge2_fmeasure,0.03728951183116744
- gem_xsum,2,average,multiple,0.04324025638606534
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03848207108872355
+ gem_xsum,3,median,rouge2_fmeasure,0.03848207108872355
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.011276208078698832
+ gem_xsum,4,median,rouge2_fmeasure,0.011276208078698832
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005538280315731646
+ gem_xsum,5,median,rouge2_fmeasure,0.0005538280315731646
+ gem_xsum,5,average,multiple,0.03000547939286526
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04746695901782121
  web_nlg_en,0,median,rouge2_fmeasure,0.04746695901782121
  web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.052665704185835084
@@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04047796652620078
  wiki_lingua_en,0,median,rouge2_fmeasure,0.04047796652620078
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.0622366147541512
  wiki_lingua_en,1,median,rouge2_fmeasure,0.0622366147541512
- wiki_lingua_en,1,average,multiple,0.051357290640175995
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06251395339429003
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06251395339429003
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.0523326312861241
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.0523326312861241
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.016837586892487218
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.016837586892487218
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.00281382806546607
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.00281382806546607
+ wiki_lingua_en,5,average,multiple,0.0395354301531199
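The removed average rows are superseded because new few-shot settings were added: each average is the arithmetic mean of the per-shot median values, now taken over shots 0 through 5 (the gem_xsum 0-shot rows sit above this hunk). A quick check for wiki_lingua_en, whose six medians are all visible in the diff above:

```python
# Recompute the new wiki_lingua_en average from the per-shot medians in this diff.
medians = {
    0: 0.04047796652620078,
    1: 0.0622366147541512,
    2: 0.06251395339429003,
    3: 0.0523326312861241,
    4: 0.016837586892487218,
    5: 0.00281382806546607,
}
average = sum(medians.values()) / len(medians)
print(average)  # 0.0395354301531199 -- matches the new "wiki_lingua_en,5,average" row
```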
8b7178b25b/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.31606716251794437, "bleu_stderr": 0.02563944351471059, "rouge1_fmeasure": 0.10246108718545202, "rouge1_fmeasure_stderr": 0.0017992092481067788, "rouge1_precision": 0.06664761750887295, "rouge1_precision_stderr": 0.0013732959148702218, "rouge1_recall": 0.2940457093239289, "rouge1_recall_stderr": 0.004555499438691762, "rouge2_fmeasure": 0.04746695901782121, "rouge2_fmeasure_stderr": 0.0011129470648980045, "rouge2_precision": 0.030425492420252335, "rouge2_precision_stderr": 0.0007598510657945697, "rouge2_recall": 0.1409722690208349, "rouge2_recall_stderr": 0.0031027610869315144, "rougeL_fmeasure": 0.09860897831373452, "rougeL_fmeasure_stderr": 0.00170202692564, "rougeL_precision": 0.06400114097684483, "rougeL_precision_stderr": 0.0012983914696295615, "rougeL_recall": 0.2849573873935468, "rougeL_recall_stderr": 0.004452046846530438, "rougeLsum_fmeasure": 0.09784615421581211, "rougeLsum_fmeasure_stderr": 0.001694890791053129, "rougeLsum_precision": 0.06366499205115346, "rougeLsum_precision_stderr": 0.0013067510671992333, "rougeLsum_recall": 0.281020892101209, "rougeLsum_recall_stderr": 0.004305691903165883}}, "1": {"PALM_prompt": {"bleu": 0.4594356029919803, "bleu_stderr": 0.025125558903473632, "rouge1_fmeasure": 0.10986827089930207, "rouge1_fmeasure_stderr": 0.0017427056384300328, "rouge1_precision": 0.07027188536501679, "rouge1_precision_stderr": 0.0012558026173472138, "rouge1_recall": 0.3450765288308058, "rouge1_recall_stderr": 0.005055990718374951, "rouge2_fmeasure": 0.052665704185835084, "rouge2_fmeasure_stderr": 0.001116219363919427, "rouge2_precision": 0.03347891417115085, "rouge2_precision_stderr": 0.0007753123397897254, "rouge2_recall": 0.1749435024767411, "rouge2_recall_stderr": 0.0036147215578007344, "rougeL_fmeasure": 0.10429693361721804, "rougeL_fmeasure_stderr": 0.0016079944655777501, "rougeL_precision": 0.06659999496172114, "rougeL_precision_stderr": 0.001145948530216064, "rougeL_recall": 0.32753689197885233, "rougeL_recall_stderr": 0.004724892115185279, "rougeLsum_fmeasure": 0.10453836123127251, "rougeLsum_fmeasure_stderr": 0.0016341468252465234, "rougeLsum_precision": 0.06687788385741289, "rougeLsum_precision_stderr": 0.0011787909083336645, "rougeLsum_recall": 0.32738351484636163, "rougeLsum_recall_stderr": 0.004663465991453483}}, "2": {"PALM_prompt": {"bleu": 0.4626917691641033, "bleu_stderr": 0.026701626014007735, "rouge1_fmeasure": 0.11037120367033523, "rouge1_fmeasure_stderr": 0.0016020141226316521, "rouge1_precision": 0.07008037252994849, "rouge1_precision_stderr": 0.0012035993637679488, "rouge1_recall": 0.36892015349821927, "rouge1_recall_stderr": 0.0052030339175267275, "rouge2_fmeasure": 0.05157944416358925, "rouge2_fmeasure_stderr": 0.0010362585811456641, "rouge2_precision": 0.032540593106790174, "rouge2_precision_stderr": 0.000731515411120624, "rouge2_recall": 0.18396134575469794, "rouge2_recall_stderr": 0.0036503252463729982, "rougeL_fmeasure": 0.10359385842660361, "rougeL_fmeasure_stderr": 0.001472076271258242, "rougeL_precision": 0.06586167470031352, "rougeL_precision_stderr": 0.0011239315244069698, "rougeL_recall": 0.3440427226978943, "rougeL_recall_stderr": 0.0046759733900977925, "rougeLsum_fmeasure": 0.1053829136834338, "rougeLsum_fmeasure_stderr": 0.0015312528661206545, "rougeLsum_precision": 0.06702892136385862, "rougeLsum_precision_stderr": 0.0011636056187413796, "rougeLsum_recall": 0.3498407885544722, "rougeLsum_recall_stderr": 0.004802606835414834}}, "3": {"PALM_prompt": {"bleu": 0.5229218783075383, 
"bleu_stderr": 0.03287750689960854, "rouge1_fmeasure": 0.11129898486093194, "rouge1_fmeasure_stderr": 0.001603697213150543, "rouge1_precision": 0.07078968601889724, "rouge1_precision_stderr": 0.0012530189540338403, "rouge1_recall": 0.370854174943212, "rouge1_recall_stderr": 0.005131257813808184, "rouge2_fmeasure": 0.052270761889885206, "rouge2_fmeasure_stderr": 0.00103207142359725, "rouge2_precision": 0.03298499960390668, "rouge2_precision_stderr": 0.0007381370373430346, "rouge2_recall": 0.18675978429729426, "rouge2_recall_stderr": 0.0037180617775293043, "rougeL_fmeasure": 0.10352172121642274, "rougeL_fmeasure_stderr": 0.001462770227679166, "rougeL_precision": 0.06597257056782777, "rougeL_precision_stderr": 0.0011757840654232027, "rougeL_recall": 0.3435597898095353, "rougeL_recall_stderr": 0.004593814760542546, "rougeLsum_fmeasure": 0.10603634362382153, "rougeLsum_fmeasure_stderr": 0.0015192749230585549, "rougeLsum_precision": 0.06757005923050743, "rougeLsum_precision_stderr": 0.001208590145084183, "rougeLsum_recall": 0.35225955402286013, "rougeLsum_recall_stderr": 0.004772649571059099}}, "4": {"PALM_prompt": {"bleu": 0.6153677621128861, "bleu_stderr": 0.06053705735401149, "rouge1_fmeasure": 0.11264103250558938, "rouge1_fmeasure_stderr": 0.00163118942788917, "rouge1_precision": 0.07071127507433782, "rouge1_precision_stderr": 0.0011520192275434789, "rouge1_recall": 0.38147578757395145, "rouge1_recall_stderr": 0.005244875143584364, "rouge2_fmeasure": 0.05295652860215851, "rouge2_fmeasure_stderr": 0.0010479736263979194, "rouge2_precision": 0.032992739334882344, "rouge2_precision_stderr": 0.0007156556105984286, "rouge2_recall": 0.1932194114953707, "rouge2_recall_stderr": 0.00374859289181986, "rougeL_fmeasure": 0.10299927861932978, "rougeL_fmeasure_stderr": 0.0014284397671676585, "rougeL_precision": 0.06471147959433785, "rougeL_precision_stderr": 0.0010212506145839731, "rougeL_recall": 0.3483975351615344, "rougeL_recall_stderr": 0.004606426897749366, "rougeLsum_fmeasure": 0.10684797875052807, "rougeLsum_fmeasure_stderr": 0.0015286400891968755, "rougeLsum_precision": 0.06714267179992058, "rougeLsum_precision_stderr": 0.0010872321667830566, "rougeLsum_recall": 0.3609556696570317, "rougeLsum_recall_stderr": 0.004821987801086811}}, "5": {"PALM_prompt": {"bleu": 0.6522310718826546, "bleu_stderr": 0.0342543434913143, "rouge1_fmeasure": 0.11496831895443499, "rouge1_fmeasure_stderr": 0.0016131648938140647, "rouge1_precision": 0.07224798328936255, "rouge1_precision_stderr": 0.0011956217365912398, "rouge1_recall": 0.3930127103047457, "rouge1_recall_stderr": 0.005342634817712522, "rouge2_fmeasure": 0.054266401974880724, "rouge2_fmeasure_stderr": 0.0010279738093731765, "rouge2_precision": 0.03383327897010777, "rouge2_precision_stderr": 0.0007155324088112528, "rouge2_recall": 0.20017152367590102, "rouge2_recall_stderr": 0.0038194708260410088, "rougeL_fmeasure": 0.10413378847184503, "rougeL_fmeasure_stderr": 0.001404569572763983, "rougeL_precision": 0.06558923187444043, "rougeL_precision_stderr": 0.0010757591438449354, "rougeL_recall": 0.35557332046269224, "rougeL_recall_stderr": 0.0046352766809419025, "rougeLsum_fmeasure": 0.10879985852497424, "rougeLsum_fmeasure_stderr": 0.001514236323280586, "rougeLsum_precision": 0.0684743244702778, "rougeLsum_precision_stderr": 0.0011401249172099674, "rougeLsum_recall": 0.3715362630204789, "rougeLsum_recall_stderr": 0.004921937595664833}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8695101649478336, "bleu_stderr": 0.0774798723948649, "rouge1_fmeasure": 
0.18492809283993106, "rouge1_fmeasure_stderr": 0.0019138467632652185, "rouge1_precision": 0.16049598964045364, "rouge1_precision_stderr": 0.0019462983151563367, "rouge1_recall": 0.2676713790563651, "rouge1_recall_stderr": 0.0028735302834564675, "rouge2_fmeasure": 0.04047796652620078, "rouge2_fmeasure_stderr": 0.0009161392431729087, "rouge2_precision": 0.034697662184252506, "rouge2_precision_stderr": 0.0008205861453711819, "rouge2_recall": 0.06110103740366906, "rouge2_recall_stderr": 0.0015502965185361641, "rougeL_fmeasure": 0.13965276114673036, "rougeL_fmeasure_stderr": 0.0013470236487103265, "rougeL_precision": 0.11976282577213969, "rougeL_precision_stderr": 0.0013342259804847978, "rougeL_recall": 0.20766916878715935, "rougeL_recall_stderr": 0.0023245522996809573, "rougeLsum_fmeasure": 0.17014737831445742, "rougeLsum_fmeasure_stderr": 0.0017471450796099739, "rougeLsum_precision": 0.14751551166205068, "rougeLsum_precision_stderr": 0.001780031941907223, "rougeLsum_recall": 0.24735686937216053, "rougeLsum_recall_stderr": 0.0026833728446082504}}, "1": {"tldr_en": {"bleu": 3.2332562389930515, "bleu_stderr": 0.0799951186296727, "rouge1_fmeasure": 0.23690998702342858, "rouge1_fmeasure_stderr": 0.0019890231968532827, "rouge1_precision": 0.20729964457048256, "rouge1_precision_stderr": 0.002274245844031677, "rouge1_recall": 0.33969139617897154, "rouge1_recall_stderr": 0.002833556197863864, "rouge2_fmeasure": 0.0622366147541512, "rouge2_fmeasure_stderr": 0.0011025942027406593, "rouge2_precision": 0.05460146452130771, "rouge2_precision_stderr": 0.001093799845706477, "rouge2_recall": 0.09225300993047651, "rouge2_recall_stderr": 0.0018248020314785354, "rougeL_fmeasure": 0.16727654774531026, "rougeL_fmeasure_stderr": 0.0013621791164943034, "rougeL_precision": 0.14525073667888475, "rougeL_precision_stderr": 0.0015687845277167008, "rougeL_recall": 0.24628845510672517, "rougeL_recall_stderr": 0.0023082894542053194, "rougeLsum_fmeasure": 0.2230496791129934, "rougeLsum_fmeasure_stderr": 0.0018757377324494942, "rougeLsum_precision": 0.19509659175047353, "rougeLsum_precision_stderr": 0.0021449940715420894, "rougeLsum_recall": 0.3204235648372041, "rougeLsum_recall_stderr": 0.0027128359683571725}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.03454795333510589, "bleu_stderr": 0.010722245919714395, "rouge1_fmeasure": 0.009983083620835689, "rouge1_fmeasure_stderr": 0.0007216517112036717, "rouge1_precision": 0.008330877542768076, "rouge1_precision_stderr": 0.0006789467135202171, "rouge1_recall": 0.01512213229502354, "rouge1_recall_stderr": 0.0010598056749213033, "rouge2_fmeasure": 0.0015194719382464828, "rouge2_fmeasure_stderr": 0.0002186426847142056, "rouge2_precision": 0.0012462152328688286, "rouge2_precision_stderr": 0.0001850740246851996, "rouge2_recall": 0.0021941504736507373, "rouge2_recall_stderr": 0.00031789022057788433, "rougeL_fmeasure": 0.009451610525850281, "rougeL_fmeasure_stderr": 0.0006635438560036569, "rougeL_precision": 0.0076678486076520065, "rougeL_precision_stderr": 0.0005780206165870198, "rougeL_recall": 0.014540667195886358, "rougeL_recall_stderr": 0.0010048165449253396, "rougeLsum_fmeasure": 0.008102826704141256, "rougeLsum_fmeasure_stderr": 0.0005948187054074003, "rougeLsum_precision": 0.006854342833172471, "rougeLsum_precision_stderr": 0.0005808821610184307, "rougeLsum_recall": 0.01222352423290077, "rougeLsum_recall_stderr": 0.0008679529695598302}}, "1": {"generate_text_restaurant": {"bleu": 10.110683183008721, "bleu_stderr": 0.10856580026912326, "rouge1_fmeasure": 
0.4261931768500333, "rouge1_fmeasure_stderr": 0.0020338904219177937, "rouge1_precision": 0.4273885051454009, "rouge1_precision_stderr": 0.0023386626945434775, "rouge1_recall": 0.4619168895673238, "rouge1_recall_stderr": 0.002910080537073359, "rouge2_fmeasure": 0.18637115332137155, "rouge2_fmeasure_stderr": 0.0017490252095085136, "rouge2_precision": 0.18621605242973624, "rouge2_precision_stderr": 0.0018420288481743644, "rouge2_recall": 0.20425913452615088, "rouge2_recall_stderr": 0.002171191668790452, "rougeL_fmeasure": 0.30007012360581886, "rougeL_fmeasure_stderr": 0.0017073194851728616, "rougeL_precision": 0.30141579223639636, "rougeL_precision_stderr": 0.0019324093600811055, "rougeL_recall": 0.3256528804455511, "rougeL_recall_stderr": 0.002364609519861813, "rougeLsum_fmeasure": 0.35343380568090416, "rougeLsum_fmeasure_stderr": 0.0020183396806264077, "rougeLsum_precision": 0.3549239657030783, "rougeLsum_precision_stderr": 0.0022624041723470494, "rougeLsum_recall": 0.38280848019973457, "rougeLsum_recall_stderr": 0.002716917275700221}}, "2": {"generate_text_restaurant": {"bleu": 11.76941570816289, "bleu_stderr": 0.15573900245748828, "rouge1_fmeasure": 0.45040818552274897, "rouge1_fmeasure_stderr": 0.001961202454945367, "rouge1_precision": 0.44751016860851195, "rouge1_precision_stderr": 0.002306632595634685, "rouge1_recall": 0.4895112200039268, "rouge1_recall_stderr": 0.0028440252957840422, "rouge2_fmeasure": 0.21309279835498804, "rouge2_fmeasure_stderr": 0.0018084420773651576, "rouge2_precision": 0.21124466241603482, "rouge2_precision_stderr": 0.001920141813437847, "rouge2_recall": 0.23408285111523244, "rouge2_recall_stderr": 0.0022741821628275683, "rougeL_fmeasure": 0.3267435445462492, "rougeL_fmeasure_stderr": 0.0017435336297606837, "rougeL_precision": 0.3250865088115516, "rougeL_precision_stderr": 0.0020090753832284394, "rougeL_recall": 0.35555451083591993, "rougeL_recall_stderr": 0.0024076335160479975, "rougeLsum_fmeasure": 0.37491707249596695, "rougeLsum_fmeasure_stderr": 0.0020146819194002373, "rougeLsum_precision": 0.3727655581276283, "rougeLsum_precision_stderr": 0.002277906383748808, "rougeLsum_recall": 0.40742138628815283, "rougeLsum_recall_stderr": 0.0027125462981769503}}, "3": {"generate_text_restaurant": {"bleu": 12.357057866617781, "bleu_stderr": 0.11622210780282331, "rouge1_fmeasure": 0.4562426842321961, "rouge1_fmeasure_stderr": 0.0019386232886629857, "rouge1_precision": 0.45217337104133665, "rouge1_precision_stderr": 0.002294505554504784, "rouge1_recall": 0.4942792865855869, "rouge1_recall_stderr": 0.002743597811824126, "rouge2_fmeasure": 0.22032333042194527, "rouge2_fmeasure_stderr": 0.001852412446316337, "rouge2_precision": 0.2175811124968712, "rouge2_precision_stderr": 0.0019112105040248436, "rouge2_recall": 0.2409573657644072, "rouge2_recall_stderr": 0.0023044463187969245, "rougeL_fmeasure": 0.3334482257131471, "rougeL_fmeasure_stderr": 0.0017866016624809113, "rougeL_precision": 0.33057426692006453, "rougeL_precision_stderr": 0.0020126164762619228, "rougeL_recall": 0.3619557938558922, "rougeL_recall_stderr": 0.0024177454342242813, "rougeLsum_fmeasure": 0.3816770816558835, "rougeLsum_fmeasure_stderr": 0.002007876237485467, "rougeLsum_precision": 0.37808218718729336, "rougeLsum_precision_stderr": 0.0022431062970265465, "rougeLsum_recall": 0.41390580636859037, "rougeLsum_recall_stderr": 0.0026729564370056775}}, "4": {"generate_text_restaurant": {"bleu": 12.468466502215751, "bleu_stderr": 0.09990648109577639, "rouge1_fmeasure": 0.45529352918386296, "rouge1_fmeasure_stderr": 
0.0019729158566691536, "rouge1_precision": 0.45220424528442504, "rouge1_precision_stderr": 0.002337808233031794, "rouge1_recall": 0.4921231264159867, "rouge1_recall_stderr": 0.0027602306430746792, "rouge2_fmeasure": 0.21993594978939215, "rouge2_fmeasure_stderr": 0.001850258705420329, "rouge2_precision": 0.21806635417357498, "rouge2_precision_stderr": 0.0019446195531272607, "rouge2_recall": 0.2401062314536212, "rouge2_recall_stderr": 0.00230192039014965, "rougeL_fmeasure": 0.3320801238250487, "rougeL_fmeasure_stderr": 0.001796774794658876, "rougeL_precision": 0.32979623685018056, "rougeL_precision_stderr": 0.002025955430723721, "rougeL_recall": 0.35968401480469203, "rougeL_recall_stderr": 0.0024104842159546787, "rougeLsum_fmeasure": 0.38073707749695784, "rougeLsum_fmeasure_stderr": 0.0020603619333531575, "rougeLsum_precision": 0.37807324998983866, "rougeLsum_precision_stderr": 0.002305718057682757, "rougeLsum_recall": 0.4118614121463698, "rougeLsum_recall_stderr": 0.0027125157827382973}}, "5": {"generate_text_restaurant": {"bleu": 12.11108163274697, "bleu_stderr": 0.1511086038189987, "rouge1_fmeasure": 0.45448318084710065, "rouge1_fmeasure_stderr": 0.0019502573764627392, "rouge1_precision": 0.4494689335142834, "rouge1_precision_stderr": 0.0023139900407069896, "rouge1_recall": 0.49189724241328564, "rouge1_recall_stderr": 0.002720672072662264, "rouge2_fmeasure": 0.21793438022656353, "rouge2_fmeasure_stderr": 0.0018382910372242867, "rouge2_precision": 0.21502011952369768, "rouge2_precision_stderr": 0.0018993108425412735, "rouge2_recall": 0.23790406650497267, "rouge2_recall_stderr": 0.002257304748049696, "rougeL_fmeasure": 0.33100721144592343, "rougeL_fmeasure_stderr": 0.0017930595424372638, "rougeL_precision": 0.3270263161597272, "rougeL_precision_stderr": 0.001992100365911566, "rougeL_recall": 0.3590215756959229, "rougeL_recall_stderr": 0.0023843408170312035, "rougeLsum_fmeasure": 0.3800458168267657, "rougeLsum_fmeasure_stderr": 0.002019377882054415, "rougeLsum_precision": 0.3757504695199971, "rougeLsum_precision_stderr": 0.00226312608847076, "rougeLsum_recall": 0.4115209951658525, "rougeLsum_recall_stderr": 0.002643403932683506}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.4185935712214466, "bleu_stderr": 0.11565432283117048, "rouge1_fmeasure": 0.22302727328338223, "rouge1_fmeasure_stderr": 0.002543649203320007, "rouge1_precision": 0.16079100538036975, "rouge1_precision_stderr": 0.0019317155011328527, "rouge1_recall": 0.38463169025745175, "rouge1_recall_stderr": 0.004526539026922928, "rouge2_fmeasure": 0.056442306230818876, "rouge2_fmeasure_stderr": 0.0016804520319396248, "rouge2_precision": 0.03999269013558242, "rouge2_precision_stderr": 0.0011971308201383902, "rouge2_recall": 0.10113339981669754, "rouge2_recall_stderr": 0.0031343748451553286, "rougeL_fmeasure": 0.16717619407627404, "rougeL_fmeasure_stderr": 0.0019481211634922503, "rougeL_precision": 0.12033234686017578, "rougeL_precision_stderr": 0.0014585203532609563, "rougeL_recall": 0.2903778922407286, "rougeL_recall_stderr": 0.003692559366146235, "rougeLsum_fmeasure": 0.17608763758164225, "rougeLsum_fmeasure_stderr": 0.002145834312930374, "rougeLsum_precision": 0.12659083153302422, "rougeLsum_precision_stderr": 0.0015911326356590116, "rougeLsum_recall": 0.30622228336959245, "rougeLsum_recall_stderr": 0.004035233836387343}}, "1": {"article_DOC_summary": {"bleu": 1.4753760157360585, "bleu_stderr": 0.053942288193643405, "rouge1_fmeasure": 0.17409533783255507, "rouge1_fmeasure_stderr": 0.002615183917058679, "rouge1_precision": 
0.123975541885696, "rouge1_precision_stderr": 0.001937069913810257, "rouge1_recall": 0.3041634033606052, "rouge1_recall_stderr": 0.004462494037919621, "rouge2_fmeasure": 0.03598895109620969, "rouge2_fmeasure_stderr": 0.001496555270960858, "rouge2_precision": 0.02530985857085611, "rouge2_precision_stderr": 0.0010516965631042789, "rouge2_recall": 0.06497657707857857, "rouge2_recall_stderr": 0.0028062229327394097, "rougeL_fmeasure": 0.13719393806212205, "rougeL_fmeasure_stderr": 0.001988792672158033, "rougeL_precision": 0.09739516038141627, "rougeL_precision_stderr": 0.0014498365795769053, "rougeL_recall": 0.24191669859950216, "rougeL_recall_stderr": 0.0036028516933205405, "rougeLsum_fmeasure": 0.1371396960626393, "rougeLsum_fmeasure_stderr": 0.0021254210571197236, "rougeLsum_precision": 0.0973784808310999, "rougeLsum_precision_stderr": 0.0015541782441477088, "rougeLsum_recall": 0.24166767773481146, "rougeLsum_recall_stderr": 0.0037775246270440404}}, "2": {"article_DOC_summary": {"bleu": 1.4658077495541642, "bleu_stderr": 0.05656663077206122, "rouge1_fmeasure": 0.17850901621898715, "rouge1_fmeasure_stderr": 0.002631185065208956, "rouge1_precision": 0.12701574570405383, "rouge1_precision_stderr": 0.0019497483407488042, "rouge1_recall": 0.31209308297568034, "rouge1_recall_stderr": 0.0044823221140013695, "rouge2_fmeasure": 0.03728951183116744, "rouge2_fmeasure_stderr": 0.0014839130164513042, "rouge2_precision": 0.02628489874277305, "rouge2_precision_stderr": 0.0010496767573945315, "rouge2_recall": 0.06678550185926611, "rouge2_recall_stderr": 0.002741050501727463, "rougeL_fmeasure": 0.14040861303541324, "rougeL_fmeasure_stderr": 0.001957888420474033, "rougeL_precision": 0.09965436966997658, "rougeL_precision_stderr": 0.0014375571522234208, "rougeL_recall": 0.24737259975268608, "rougeL_recall_stderr": 0.003487423921250019, "rougeLsum_fmeasure": 0.1402274218021141, "rougeLsum_fmeasure_stderr": 0.00214102737994103, "rougeLsum_precision": 0.0995909554171278, "rougeLsum_precision_stderr": 0.001569684395558083, "rougeLsum_recall": 0.2465747406900749, "rougeLsum_recall_stderr": 0.0037567150545319778}}}}
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.31606716251794437, "bleu_stderr": 0.02563944351471059, "rouge1_fmeasure": 0.10246108718545202, "rouge1_fmeasure_stderr": 0.0017992092481067788, "rouge1_precision": 0.06664761750887295, "rouge1_precision_stderr": 0.0013732959148702218, "rouge1_recall": 0.2940457093239289, "rouge1_recall_stderr": 0.004555499438691762, "rouge2_fmeasure": 0.04746695901782121, "rouge2_fmeasure_stderr": 0.0011129470648980045, "rouge2_precision": 0.030425492420252335, "rouge2_precision_stderr": 0.0007598510657945697, "rouge2_recall": 0.1409722690208349, "rouge2_recall_stderr": 0.0031027610869315144, "rougeL_fmeasure": 0.09860897831373452, "rougeL_fmeasure_stderr": 0.00170202692564, "rougeL_precision": 0.06400114097684483, "rougeL_precision_stderr": 0.0012983914696295615, "rougeL_recall": 0.2849573873935468, "rougeL_recall_stderr": 0.004452046846530438, "rougeLsum_fmeasure": 0.09784615421581211, "rougeLsum_fmeasure_stderr": 0.001694890791053129, "rougeLsum_precision": 0.06366499205115346, "rougeLsum_precision_stderr": 0.0013067510671992333, "rougeLsum_recall": 0.281020892101209, "rougeLsum_recall_stderr": 0.004305691903165883}}, "1": {"PALM_prompt": {"bleu": 0.4594356029919803, "bleu_stderr": 0.025125558903473632, "rouge1_fmeasure": 0.10986827089930207, "rouge1_fmeasure_stderr": 0.0017427056384300328, "rouge1_precision": 0.07027188536501679, "rouge1_precision_stderr": 0.0012558026173472138, "rouge1_recall": 0.3450765288308058, "rouge1_recall_stderr": 0.005055990718374951, "rouge2_fmeasure": 0.052665704185835084, "rouge2_fmeasure_stderr": 0.001116219363919427, "rouge2_precision": 0.03347891417115085, "rouge2_precision_stderr": 0.0007753123397897254, "rouge2_recall": 0.1749435024767411, "rouge2_recall_stderr": 0.0036147215578007344, "rougeL_fmeasure": 0.10429693361721804, "rougeL_fmeasure_stderr": 0.0016079944655777501, "rougeL_precision": 0.06659999496172114, "rougeL_precision_stderr": 0.001145948530216064, "rougeL_recall": 0.32753689197885233, "rougeL_recall_stderr": 0.004724892115185279, "rougeLsum_fmeasure": 0.10453836123127251, "rougeLsum_fmeasure_stderr": 0.0016341468252465234, "rougeLsum_precision": 0.06687788385741289, "rougeLsum_precision_stderr": 0.0011787909083336645, "rougeLsum_recall": 0.32738351484636163, "rougeLsum_recall_stderr": 0.004663465991453483}}, "2": {"PALM_prompt": {"bleu": 0.4626917691641033, "bleu_stderr": 0.026701626014007735, "rouge1_fmeasure": 0.11037120367033523, "rouge1_fmeasure_stderr": 0.0016020141226316521, "rouge1_precision": 0.07008037252994849, "rouge1_precision_stderr": 0.0012035993637679488, "rouge1_recall": 0.36892015349821927, "rouge1_recall_stderr": 0.0052030339175267275, "rouge2_fmeasure": 0.05157944416358925, "rouge2_fmeasure_stderr": 0.0010362585811456641, "rouge2_precision": 0.032540593106790174, "rouge2_precision_stderr": 0.000731515411120624, "rouge2_recall": 0.18396134575469794, "rouge2_recall_stderr": 0.0036503252463729982, "rougeL_fmeasure": 0.10359385842660361, "rougeL_fmeasure_stderr": 0.001472076271258242, "rougeL_precision": 0.06586167470031352, "rougeL_precision_stderr": 0.0011239315244069698, "rougeL_recall": 0.3440427226978943, "rougeL_recall_stderr": 0.0046759733900977925, "rougeLsum_fmeasure": 0.1053829136834338, "rougeLsum_fmeasure_stderr": 0.0015312528661206545, "rougeLsum_precision": 0.06702892136385862, "rougeLsum_precision_stderr": 0.0011636056187413796, "rougeLsum_recall": 0.3498407885544722, "rougeLsum_recall_stderr": 0.004802606835414834}}, "3": {"PALM_prompt": {"bleu": 0.5229218783075383, 
"bleu_stderr": 0.03287750689960854, "rouge1_fmeasure": 0.11129898486093194, "rouge1_fmeasure_stderr": 0.001603697213150543, "rouge1_precision": 0.07078968601889724, "rouge1_precision_stderr": 0.0012530189540338403, "rouge1_recall": 0.370854174943212, "rouge1_recall_stderr": 0.005131257813808184, "rouge2_fmeasure": 0.052270761889885206, "rouge2_fmeasure_stderr": 0.00103207142359725, "rouge2_precision": 0.03298499960390668, "rouge2_precision_stderr": 0.0007381370373430346, "rouge2_recall": 0.18675978429729426, "rouge2_recall_stderr": 0.0037180617775293043, "rougeL_fmeasure": 0.10352172121642274, "rougeL_fmeasure_stderr": 0.001462770227679166, "rougeL_precision": 0.06597257056782777, "rougeL_precision_stderr": 0.0011757840654232027, "rougeL_recall": 0.3435597898095353, "rougeL_recall_stderr": 0.004593814760542546, "rougeLsum_fmeasure": 0.10603634362382153, "rougeLsum_fmeasure_stderr": 0.0015192749230585549, "rougeLsum_precision": 0.06757005923050743, "rougeLsum_precision_stderr": 0.001208590145084183, "rougeLsum_recall": 0.35225955402286013, "rougeLsum_recall_stderr": 0.004772649571059099}}, "4": {"PALM_prompt": {"bleu": 0.6153677621128861, "bleu_stderr": 0.06053705735401149, "rouge1_fmeasure": 0.11264103250558938, "rouge1_fmeasure_stderr": 0.00163118942788917, "rouge1_precision": 0.07071127507433782, "rouge1_precision_stderr": 0.0011520192275434789, "rouge1_recall": 0.38147578757395145, "rouge1_recall_stderr": 0.005244875143584364, "rouge2_fmeasure": 0.05295652860215851, "rouge2_fmeasure_stderr": 0.0010479736263979194, "rouge2_precision": 0.032992739334882344, "rouge2_precision_stderr": 0.0007156556105984286, "rouge2_recall": 0.1932194114953707, "rouge2_recall_stderr": 0.00374859289181986, "rougeL_fmeasure": 0.10299927861932978, "rougeL_fmeasure_stderr": 0.0014284397671676585, "rougeL_precision": 0.06471147959433785, "rougeL_precision_stderr": 0.0010212506145839731, "rougeL_recall": 0.3483975351615344, "rougeL_recall_stderr": 0.004606426897749366, "rougeLsum_fmeasure": 0.10684797875052807, "rougeLsum_fmeasure_stderr": 0.0015286400891968755, "rougeLsum_precision": 0.06714267179992058, "rougeLsum_precision_stderr": 0.0010872321667830566, "rougeLsum_recall": 0.3609556696570317, "rougeLsum_recall_stderr": 0.004821987801086811}}, "5": {"PALM_prompt": {"bleu": 0.6522310718826546, "bleu_stderr": 0.0342543434913143, "rouge1_fmeasure": 0.11496831895443499, "rouge1_fmeasure_stderr": 0.0016131648938140647, "rouge1_precision": 0.07224798328936255, "rouge1_precision_stderr": 0.0011956217365912398, "rouge1_recall": 0.3930127103047457, "rouge1_recall_stderr": 0.005342634817712522, "rouge2_fmeasure": 0.054266401974880724, "rouge2_fmeasure_stderr": 0.0010279738093731765, "rouge2_precision": 0.03383327897010777, "rouge2_precision_stderr": 0.0007155324088112528, "rouge2_recall": 0.20017152367590102, "rouge2_recall_stderr": 0.0038194708260410088, "rougeL_fmeasure": 0.10413378847184503, "rougeL_fmeasure_stderr": 0.001404569572763983, "rougeL_precision": 0.06558923187444043, "rougeL_precision_stderr": 0.0010757591438449354, "rougeL_recall": 0.35557332046269224, "rougeL_recall_stderr": 0.0046352766809419025, "rougeLsum_fmeasure": 0.10879985852497424, "rougeLsum_fmeasure_stderr": 0.001514236323280586, "rougeLsum_precision": 0.0684743244702778, "rougeLsum_precision_stderr": 0.0011401249172099674, "rougeLsum_recall": 0.3715362630204789, "rougeLsum_recall_stderr": 0.004921937595664833}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8695101649478336, "bleu_stderr": 0.0774798723948649, "rouge1_fmeasure": 
0.18492809283993106, "rouge1_fmeasure_stderr": 0.0019138467632652185, "rouge1_precision": 0.16049598964045364, "rouge1_precision_stderr": 0.0019462983151563367, "rouge1_recall": 0.2676713790563651, "rouge1_recall_stderr": 0.0028735302834564675, "rouge2_fmeasure": 0.04047796652620078, "rouge2_fmeasure_stderr": 0.0009161392431729087, "rouge2_precision": 0.034697662184252506, "rouge2_precision_stderr": 0.0008205861453711819, "rouge2_recall": 0.06110103740366906, "rouge2_recall_stderr": 0.0015502965185361641, "rougeL_fmeasure": 0.13965276114673036, "rougeL_fmeasure_stderr": 0.0013470236487103265, "rougeL_precision": 0.11976282577213969, "rougeL_precision_stderr": 0.0013342259804847978, "rougeL_recall": 0.20766916878715935, "rougeL_recall_stderr": 0.0023245522996809573, "rougeLsum_fmeasure": 0.17014737831445742, "rougeLsum_fmeasure_stderr": 0.0017471450796099739, "rougeLsum_precision": 0.14751551166205068, "rougeLsum_precision_stderr": 0.001780031941907223, "rougeLsum_recall": 0.24735686937216053, "rougeLsum_recall_stderr": 0.0026833728446082504}}, "1": {"tldr_en": {"bleu": 3.2332562389930515, "bleu_stderr": 0.0799951186296727, "rouge1_fmeasure": 0.23690998702342858, "rouge1_fmeasure_stderr": 0.0019890231968532827, "rouge1_precision": 0.20729964457048256, "rouge1_precision_stderr": 0.002274245844031677, "rouge1_recall": 0.33969139617897154, "rouge1_recall_stderr": 0.002833556197863864, "rouge2_fmeasure": 0.0622366147541512, "rouge2_fmeasure_stderr": 0.0011025942027406593, "rouge2_precision": 0.05460146452130771, "rouge2_precision_stderr": 0.001093799845706477, "rouge2_recall": 0.09225300993047651, "rouge2_recall_stderr": 0.0018248020314785354, "rougeL_fmeasure": 0.16727654774531026, "rougeL_fmeasure_stderr": 0.0013621791164943034, "rougeL_precision": 0.14525073667888475, "rougeL_precision_stderr": 0.0015687845277167008, "rougeL_recall": 0.24628845510672517, "rougeL_recall_stderr": 0.0023082894542053194, "rougeLsum_fmeasure": 0.2230496791129934, "rougeLsum_fmeasure_stderr": 0.0018757377324494942, "rougeLsum_precision": 0.19509659175047353, "rougeLsum_precision_stderr": 0.0021449940715420894, "rougeLsum_recall": 0.3204235648372041, "rougeLsum_recall_stderr": 0.0027128359683571725}}, "2": {"tldr_en": {"bleu": 3.5155467010727994, "bleu_stderr": 0.06141556601214586, "rouge1_fmeasure": 0.235556648636045, "rouge1_fmeasure_stderr": 0.0018848294908913575, "rouge1_precision": 0.20824164042833873, "rouge1_precision_stderr": 0.002234214827494051, "rouge1_recall": 0.3342680558497238, "rouge1_recall_stderr": 0.002719351845027267, "rouge2_fmeasure": 0.06251395339429003, "rouge2_fmeasure_stderr": 0.0011032340747877004, "rouge2_precision": 0.0553447765818299, "rouge2_precision_stderr": 0.001097279257804817, "rouge2_recall": 0.09158152498363732, "rouge2_recall_stderr": 0.00182638815550646, "rougeL_fmeasure": 0.16869690772741758, "rougeL_fmeasure_stderr": 0.0013497024922833358, "rougeL_precision": 0.14804254896591035, "rougeL_precision_stderr": 0.001582109014998804, "rougeL_recall": 0.24506636928994607, "rougeL_recall_stderr": 0.002277815665553641, "rougeLsum_fmeasure": 0.22311024372097865, "rougeLsum_fmeasure_stderr": 0.0017776265096001729, "rougeLsum_precision": 0.19708639575292217, "rougeLsum_precision_stderr": 0.0021089064966209187, "rougeLsum_recall": 0.31720897725992786, "rougeLsum_recall_stderr": 0.002596567797319611}}, "3": {"tldr_en": {"bleu": 3.504980488837104, "bleu_stderr": 0.07033302280883086, "rouge1_fmeasure": 0.1960859652047859, "rouge1_fmeasure_stderr": 0.0023366060464774423, "rouge1_precision": 
0.17811439171989488, "rouge1_precision_stderr": 0.002560275363708373, "rouge1_recall": 0.27886090799993984, "rouge1_recall_stderr": 0.0034448901496326586, "rouge2_fmeasure": 0.0523326312861241, "rouge2_fmeasure_stderr": 0.0011285281285534974, "rouge2_precision": 0.04742286059353845, "rouge2_precision_stderr": 0.0011604513875678288, "rouge2_recall": 0.07742946215061461, "rouge2_recall_stderr": 0.001866825934930981, "rougeL_fmeasure": 0.14162844117143206, "rougeL_fmeasure_stderr": 0.0016899728317895791, "rougeL_precision": 0.12852552196126066, "rougeL_precision_stderr": 0.0018964172540042934, "rougeL_recall": 0.20545470726815437, "rougeL_recall_stderr": 0.0027326532184944424, "rougeLsum_fmeasure": 0.18556326066261042, "rougeLsum_fmeasure_stderr": 0.002215627597176747, "rougeLsum_precision": 0.16868014464807685, "rougeLsum_precision_stderr": 0.002438611490893621, "rougeLsum_recall": 0.26411127102133913, "rougeLsum_recall_stderr": 0.003281374640568146}}, "4": {"tldr_en": {"bleu": 0.9086939324156383, "bleu_stderr": 0.05855823137144976, "rouge1_fmeasure": 0.06282501006696624, "rouge1_fmeasure_stderr": 0.0020888213951751034, "rouge1_precision": 0.057686885490487505, "rouge1_precision_stderr": 0.002077970469715233, "rouge1_recall": 0.09385574649347103, "rouge1_recall_stderr": 0.0031890083282469467, "rouge2_fmeasure": 0.016837586892487218, "rouge2_fmeasure_stderr": 0.000790811753626707, "rouge2_precision": 0.015071313264611244, "rouge2_precision_stderr": 0.0007774016714959856, "rouge2_recall": 0.026883790080977472, "rouge2_recall_stderr": 0.0014075436212441558, "rougeL_fmeasure": 0.0461316780279477, "rougeL_fmeasure_stderr": 0.0015235672554804573, "rougeL_precision": 0.042638568215269866, "rougeL_precision_stderr": 0.0015504702364272222, "rougeL_recall": 0.07032908173779572, "rougeL_recall_stderr": 0.0024572632014890276, "rougeLsum_fmeasure": 0.05917832063435797, "rougeLsum_fmeasure_stderr": 0.001970707904332247, "rougeLsum_precision": 0.05433748981297747, "rougeLsum_precision_stderr": 0.0019639155165544035, "rougeLsum_recall": 0.0885417619284403, "rougeLsum_recall_stderr": 0.0030172562108610624}}, "5": {"tldr_en": {"bleu": 6.161453193982374e-06, "bleu_stderr": 1.2644510555241896e-05, "rouge1_fmeasure": 0.010180728590080238, "rouge1_fmeasure_stderr": 0.0009430571774107825, "rouge1_precision": 0.009722491969586005, "rouge1_precision_stderr": 0.0009564982386894138, "rouge1_recall": 0.015357271032115101, "rouge1_recall_stderr": 0.001448960021557378, "rouge2_fmeasure": 0.00281382806546607, "rouge2_fmeasure_stderr": 0.00035214119256937883, "rouge2_precision": 0.0024973101781513925, "rouge2_precision_stderr": 0.000343733403974376, "rouge2_recall": 0.0045816927074040755, "rouge2_recall_stderr": 0.0006153391357680226, "rougeL_fmeasure": 0.00759156476319381, "rougeL_fmeasure_stderr": 0.0007014066070016197, "rougeL_precision": 0.0072997926636164424, "rougeL_precision_stderr": 0.0007272617797612332, "rougeL_recall": 0.011767619522932812, "rougeL_recall_stderr": 0.0011356853451163903, "rougeLsum_fmeasure": 0.00955145315759681, "rougeLsum_fmeasure_stderr": 0.0008912980968449334, "rougeLsum_precision": 0.009129679261559958, "rougeLsum_precision_stderr": 0.0009077175108677888, "rougeLsum_recall": 0.014481417293915742, "rougeLsum_recall_stderr": 0.0013773388496390326}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.03454795333510589, "bleu_stderr": 0.010722245919714395, "rouge1_fmeasure": 0.009983083620835689, "rouge1_fmeasure_stderr": 0.0007216517112036717, "rouge1_precision": 
0.008330877542768076, "rouge1_precision_stderr": 0.0006789467135202171, "rouge1_recall": 0.01512213229502354, "rouge1_recall_stderr": 0.0010598056749213033, "rouge2_fmeasure": 0.0015194719382464828, "rouge2_fmeasure_stderr": 0.0002186426847142056, "rouge2_precision": 0.0012462152328688286, "rouge2_precision_stderr": 0.0001850740246851996, "rouge2_recall": 0.0021941504736507373, "rouge2_recall_stderr": 0.00031789022057788433, "rougeL_fmeasure": 0.009451610525850281, "rougeL_fmeasure_stderr": 0.0006635438560036569, "rougeL_precision": 0.0076678486076520065, "rougeL_precision_stderr": 0.0005780206165870198, "rougeL_recall": 0.014540667195886358, "rougeL_recall_stderr": 0.0010048165449253396, "rougeLsum_fmeasure": 0.008102826704141256, "rougeLsum_fmeasure_stderr": 0.0005948187054074003, "rougeLsum_precision": 0.006854342833172471, "rougeLsum_precision_stderr": 0.0005808821610184307, "rougeLsum_recall": 0.01222352423290077, "rougeLsum_recall_stderr": 0.0008679529695598302}}, "1": {"generate_text_restaurant": {"bleu": 10.110683183008721, "bleu_stderr": 0.10856580026912326, "rouge1_fmeasure": 0.4261931768500333, "rouge1_fmeasure_stderr": 0.0020338904219177937, "rouge1_precision": 0.4273885051454009, "rouge1_precision_stderr": 0.0023386626945434775, "rouge1_recall": 0.4619168895673238, "rouge1_recall_stderr": 0.002910080537073359, "rouge2_fmeasure": 0.18637115332137155, "rouge2_fmeasure_stderr": 0.0017490252095085136, "rouge2_precision": 0.18621605242973624, "rouge2_precision_stderr": 0.0018420288481743644, "rouge2_recall": 0.20425913452615088, "rouge2_recall_stderr": 0.002171191668790452, "rougeL_fmeasure": 0.30007012360581886, "rougeL_fmeasure_stderr": 0.0017073194851728616, "rougeL_precision": 0.30141579223639636, "rougeL_precision_stderr": 0.0019324093600811055, "rougeL_recall": 0.3256528804455511, "rougeL_recall_stderr": 0.002364609519861813, "rougeLsum_fmeasure": 0.35343380568090416, "rougeLsum_fmeasure_stderr": 0.0020183396806264077, "rougeLsum_precision": 0.3549239657030783, "rougeLsum_precision_stderr": 0.0022624041723470494, "rougeLsum_recall": 0.38280848019973457, "rougeLsum_recall_stderr": 0.002716917275700221}}, "2": {"generate_text_restaurant": {"bleu": 11.76941570816289, "bleu_stderr": 0.15573900245748828, "rouge1_fmeasure": 0.45040818552274897, "rouge1_fmeasure_stderr": 0.001961202454945367, "rouge1_precision": 0.44751016860851195, "rouge1_precision_stderr": 0.002306632595634685, "rouge1_recall": 0.4895112200039268, "rouge1_recall_stderr": 0.0028440252957840422, "rouge2_fmeasure": 0.21309279835498804, "rouge2_fmeasure_stderr": 0.0018084420773651576, "rouge2_precision": 0.21124466241603482, "rouge2_precision_stderr": 0.001920141813437847, "rouge2_recall": 0.23408285111523244, "rouge2_recall_stderr": 0.0022741821628275683, "rougeL_fmeasure": 0.3267435445462492, "rougeL_fmeasure_stderr": 0.0017435336297606837, "rougeL_precision": 0.3250865088115516, "rougeL_precision_stderr": 0.0020090753832284394, "rougeL_recall": 0.35555451083591993, "rougeL_recall_stderr": 0.0024076335160479975, "rougeLsum_fmeasure": 0.37491707249596695, "rougeLsum_fmeasure_stderr": 0.0020146819194002373, "rougeLsum_precision": 0.3727655581276283, "rougeLsum_precision_stderr": 0.002277906383748808, "rougeLsum_recall": 0.40742138628815283, "rougeLsum_recall_stderr": 0.0027125462981769503}}, "3": {"generate_text_restaurant": {"bleu": 12.357057866617781, "bleu_stderr": 0.11622210780282331, "rouge1_fmeasure": 0.4562426842321961, "rouge1_fmeasure_stderr": 0.0019386232886629857, "rouge1_precision": 0.45217337104133665, 
"rouge1_precision_stderr": 0.002294505554504784, "rouge1_recall": 0.4942792865855869, "rouge1_recall_stderr": 0.002743597811824126, "rouge2_fmeasure": 0.22032333042194527, "rouge2_fmeasure_stderr": 0.001852412446316337, "rouge2_precision": 0.2175811124968712, "rouge2_precision_stderr": 0.0019112105040248436, "rouge2_recall": 0.2409573657644072, "rouge2_recall_stderr": 0.0023044463187969245, "rougeL_fmeasure": 0.3334482257131471, "rougeL_fmeasure_stderr": 0.0017866016624809113, "rougeL_precision": 0.33057426692006453, "rougeL_precision_stderr": 0.0020126164762619228, "rougeL_recall": 0.3619557938558922, "rougeL_recall_stderr": 0.0024177454342242813, "rougeLsum_fmeasure": 0.3816770816558835, "rougeLsum_fmeasure_stderr": 0.002007876237485467, "rougeLsum_precision": 0.37808218718729336, "rougeLsum_precision_stderr": 0.0022431062970265465, "rougeLsum_recall": 0.41390580636859037, "rougeLsum_recall_stderr": 0.0026729564370056775}}, "4": {"generate_text_restaurant": {"bleu": 12.468466502215751, "bleu_stderr": 0.09990648109577639, "rouge1_fmeasure": 0.45529352918386296, "rouge1_fmeasure_stderr": 0.0019729158566691536, "rouge1_precision": 0.45220424528442504, "rouge1_precision_stderr": 0.002337808233031794, "rouge1_recall": 0.4921231264159867, "rouge1_recall_stderr": 0.0027602306430746792, "rouge2_fmeasure": 0.21993594978939215, "rouge2_fmeasure_stderr": 0.001850258705420329, "rouge2_precision": 0.21806635417357498, "rouge2_precision_stderr": 0.0019446195531272607, "rouge2_recall": 0.2401062314536212, "rouge2_recall_stderr": 0.00230192039014965, "rougeL_fmeasure": 0.3320801238250487, "rougeL_fmeasure_stderr": 0.001796774794658876, "rougeL_precision": 0.32979623685018056, "rougeL_precision_stderr": 0.002025955430723721, "rougeL_recall": 0.35968401480469203, "rougeL_recall_stderr": 0.0024104842159546787, "rougeLsum_fmeasure": 0.38073707749695784, "rougeLsum_fmeasure_stderr": 0.0020603619333531575, "rougeLsum_precision": 0.37807324998983866, "rougeLsum_precision_stderr": 0.002305718057682757, "rougeLsum_recall": 0.4118614121463698, "rougeLsum_recall_stderr": 0.0027125157827382973}}, "5": {"generate_text_restaurant": {"bleu": 12.11108163274697, "bleu_stderr": 0.1511086038189987, "rouge1_fmeasure": 0.45448318084710065, "rouge1_fmeasure_stderr": 0.0019502573764627392, "rouge1_precision": 0.4494689335142834, "rouge1_precision_stderr": 0.0023139900407069896, "rouge1_recall": 0.49189724241328564, "rouge1_recall_stderr": 0.002720672072662264, "rouge2_fmeasure": 0.21793438022656353, "rouge2_fmeasure_stderr": 0.0018382910372242867, "rouge2_precision": 0.21502011952369768, "rouge2_precision_stderr": 0.0018993108425412735, "rouge2_recall": 0.23790406650497267, "rouge2_recall_stderr": 0.002257304748049696, "rougeL_fmeasure": 0.33100721144592343, "rougeL_fmeasure_stderr": 0.0017930595424372638, "rougeL_precision": 0.3270263161597272, "rougeL_precision_stderr": 0.001992100365911566, "rougeL_recall": 0.3590215756959229, "rougeL_recall_stderr": 0.0023843408170312035, "rougeLsum_fmeasure": 0.3800458168267657, "rougeLsum_fmeasure_stderr": 0.002019377882054415, "rougeLsum_precision": 0.3757504695199971, "rougeLsum_precision_stderr": 0.00226312608847076, "rougeLsum_recall": 0.4115209951658525, "rougeLsum_recall_stderr": 0.002643403932683506}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.4185935712214466, "bleu_stderr": 0.11565432283117048, "rouge1_fmeasure": 0.22302727328338223, "rouge1_fmeasure_stderr": 0.002543649203320007, "rouge1_precision": 0.16079100538036975, "rouge1_precision_stderr": 
0.0019317155011328527, "rouge1_recall": 0.38463169025745175, "rouge1_recall_stderr": 0.004526539026922928, "rouge2_fmeasure": 0.056442306230818876, "rouge2_fmeasure_stderr": 0.0016804520319396248, "rouge2_precision": 0.03999269013558242, "rouge2_precision_stderr": 0.0011971308201383902, "rouge2_recall": 0.10113339981669754, "rouge2_recall_stderr": 0.0031343748451553286, "rougeL_fmeasure": 0.16717619407627404, "rougeL_fmeasure_stderr": 0.0019481211634922503, "rougeL_precision": 0.12033234686017578, "rougeL_precision_stderr": 0.0014585203532609563, "rougeL_recall": 0.2903778922407286, "rougeL_recall_stderr": 0.003692559366146235, "rougeLsum_fmeasure": 0.17608763758164225, "rougeLsum_fmeasure_stderr": 0.002145834312930374, "rougeLsum_precision": 0.12659083153302422, "rougeLsum_precision_stderr": 0.0015911326356590116, "rougeLsum_recall": 0.30622228336959245, "rougeLsum_recall_stderr": 0.004035233836387343}}, "1": {"article_DOC_summary": {"bleu": 1.4753760157360585, "bleu_stderr": 0.053942288193643405, "rouge1_fmeasure": 0.17409533783255507, "rouge1_fmeasure_stderr": 0.002615183917058679, "rouge1_precision": 0.123975541885696, "rouge1_precision_stderr": 0.001937069913810257, "rouge1_recall": 0.3041634033606052, "rouge1_recall_stderr": 0.004462494037919621, "rouge2_fmeasure": 0.03598895109620969, "rouge2_fmeasure_stderr": 0.001496555270960858, "rouge2_precision": 0.02530985857085611, "rouge2_precision_stderr": 0.0010516965631042789, "rouge2_recall": 0.06497657707857857, "rouge2_recall_stderr": 0.0028062229327394097, "rougeL_fmeasure": 0.13719393806212205, "rougeL_fmeasure_stderr": 0.001988792672158033, "rougeL_precision": 0.09739516038141627, "rougeL_precision_stderr": 0.0014498365795769053, "rougeL_recall": 0.24191669859950216, "rougeL_recall_stderr": 0.0036028516933205405, "rougeLsum_fmeasure": 0.1371396960626393, "rougeLsum_fmeasure_stderr": 0.0021254210571197236, "rougeLsum_precision": 0.0973784808310999, "rougeLsum_precision_stderr": 0.0015541782441477088, "rougeLsum_recall": 0.24166767773481146, "rougeLsum_recall_stderr": 0.0037775246270440404}}, "2": {"article_DOC_summary": {"bleu": 1.4658077495541642, "bleu_stderr": 0.05656663077206122, "rouge1_fmeasure": 0.17850901621898715, "rouge1_fmeasure_stderr": 0.002631185065208956, "rouge1_precision": 0.12701574570405383, "rouge1_precision_stderr": 0.0019497483407488042, "rouge1_recall": 0.31209308297568034, "rouge1_recall_stderr": 0.0044823221140013695, "rouge2_fmeasure": 0.03728951183116744, "rouge2_fmeasure_stderr": 0.0014839130164513042, "rouge2_precision": 0.02628489874277305, "rouge2_precision_stderr": 0.0010496767573945315, "rouge2_recall": 0.06678550185926611, "rouge2_recall_stderr": 0.002741050501727463, "rougeL_fmeasure": 0.14040861303541324, "rougeL_fmeasure_stderr": 0.001957888420474033, "rougeL_precision": 0.09965436966997658, "rougeL_precision_stderr": 0.0014375571522234208, "rougeL_recall": 0.24737259975268608, "rougeL_recall_stderr": 0.003487423921250019, "rougeLsum_fmeasure": 0.1402274218021141, "rougeLsum_fmeasure_stderr": 0.00214102737994103, "rougeLsum_precision": 0.0995909554171278, "rougeLsum_precision_stderr": 0.001569684395558083, "rougeLsum_recall": 0.2465747406900749, "rougeLsum_recall_stderr": 0.0037567150545319778}}, "3": {"article_DOC_summary": {"bleu": 1.6273581251014975, "bleu_stderr": 0.08297152976943713, "rouge1_fmeasure": 0.1741691220448831, "rouge1_fmeasure_stderr": 0.0027529808698691072, "rouge1_precision": 0.12661685012408697, "rouge1_precision_stderr": 0.0021093732895884356, "rouge1_recall": 
0.30042742106922904, "rouge1_recall_stderr": 0.004793262475513874, "rouge2_fmeasure": 0.03848207108872355, "rouge2_fmeasure_stderr": 0.0015349514843449124, "rouge2_precision": 0.027457383236278155, "rouge2_precision_stderr": 0.0011005894460101174, "rouge2_recall": 0.06872339351563556, "rouge2_recall_stderr": 0.0028264005600779733, "rougeL_fmeasure": 0.13848485640839917, "rougeL_fmeasure_stderr": 0.002136410452186076, "rougeL_precision": 0.10054376826357136, "rougeL_precision_stderr": 0.0016355832838556733, "rougeL_recall": 0.24044972226758587, "rougeL_recall_stderr": 0.0038558430038978684, "rougeLsum_fmeasure": 0.13794850154717367, "rougeLsum_fmeasure_stderr": 0.002276629118568194, "rougeLsum_precision": 0.10022509495467177, "rougeLsum_precision_stderr": 0.001733799927568896, "rougeLsum_recall": 0.2391221551244155, "rougeLsum_recall_stderr": 0.0040668979594426224}}, "4": {"article_DOC_summary": {"bleu": 0.9297944407512662, "bleu_stderr": 0.10452568299951195, "rouge1_fmeasure": 0.049025842019891894, "rouge1_fmeasure_stderr": 0.002761462686946734, "rouge1_precision": 0.03984641222755382, "rouge1_precision_stderr": 0.0023583006672399276, "rouge1_recall": 0.07781373363270898, "rouge1_recall_stderr": 0.0044678999283725125, "rouge2_fmeasure": 0.011276208078698832, "rouge2_fmeasure_stderr": 0.0010882501335885415, "rouge2_precision": 0.008839770813015032, "rouge2_precision_stderr": 0.0009132501416122141, "rouge2_recall": 0.018595501088417683, "rouge2_recall_stderr": 0.0018046371518142589, "rougeL_fmeasure": 0.03899309107571578, "rougeL_fmeasure_stderr": 0.0022095258634566655, "rougeL_precision": 0.03193279708670165, "rougeL_precision_stderr": 0.0019219705874772618, "rougeL_recall": 0.06201212154363477, "rougeL_recall_stderr": 0.0036181165069829016, "rougeLsum_fmeasure": 0.03871502132726552, "rougeLsum_fmeasure_stderr": 0.0021963924186294845, "rougeLsum_precision": 0.03172097901935682, "rougeLsum_precision_stderr": 0.0019113003105201993, "rougeLsum_recall": 0.06162603968016436, "rougeLsum_recall_stderr": 0.003613145314591428}}, "5": {"article_DOC_summary": {"bleu": 1.3519576766092788e-16, "bleu_stderr": 1.7148754157361587e-13, "rouge1_fmeasure": 0.002273937140206459, "rouge1_fmeasure_stderr": 0.0006514011124893405, "rouge1_precision": 0.0018747157977051885, "rouge1_precision_stderr": 0.0005217271715436668, "rouge1_recall": 0.003549283577549333, "rouge1_recall_stderr": 0.0010767599221403455, "rouge2_fmeasure": 0.0005538280315731646, "rouge2_fmeasure_stderr": 0.00021055797770595916, "rouge2_precision": 0.00047917769370046107, "rouge2_precision_stderr": 0.00017934012188651556, "rouge2_recall": 0.0007855487711203583, "rouge2_recall_stderr": 0.00030506211146057416, "rougeL_fmeasure": 0.0019019709421522583, "rougeL_fmeasure_stderr": 0.0005434789167320575, "rougeL_precision": 0.001591076043425948, "rougeL_precision_stderr": 0.00044966429790604164, "rougeL_recall": 0.002863749112737054, "rougeL_recall_stderr": 0.0008323073183414411, "rougeLsum_fmeasure": 0.0018351130157390045, "rougeLsum_fmeasure_stderr": 0.0005055637602691865, "rougeLsum_precision": 0.00156298958828556, "rougeLsum_precision_stderr": 0.00043239685994550666, "rougeLsum_recall": 0.0027767015907513405, "rougeLsum_recall_stderr": 0.0007833271709551212}}}}
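The merged.json payload above is a single nested mapping: task name -> few-shot count (stored as a string key) -> prompt name -> metric values, with each metric accompanied by a matching *_stderr field. Below is a minimal sketch of reading it back, assuming the file has been fetched from this repository to a local path (the path is illustrative):

import json

# Load the merged scores for this model variant (local path is an assumption).
with open("8b7178b25b/evaluation/generation/merged.json") as f:
    scores = json.load(f)

# Walk task -> fewshot -> prompt and print BLEU per shot count.
for task, by_shots in scores.items():
    for shots, by_prompt in sorted(by_shots.items(), key=lambda kv: int(kv[0])):
        for prompt, metrics in by_prompt.items():
            print(f"{task} ({prompt}), {shots}-shot: BLEU = {metrics['bleu']:.4f}")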
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.20824164042833873,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.002234214827494051
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.3342680558497238,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.002719351845027267
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.235556648636045,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0018848294908913575
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.0553447765818299,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.001097279257804817
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.09158152498363732,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.00182638815550646
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.06251395339429003,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0011032340747877004
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.14804254896591035,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.001582109014998804
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.24506636928994607,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.002277815665553641
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.16869690772741758,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0013497024922833358
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.19708639575292217,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0021089064966209187
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.31720897725992786,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.002596567797319611
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.22311024372097865,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0017776265096001729
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 3.5155467010727994,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.06141556601214586
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 2,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
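Each slim.*.json file added in this commit repeats the task/prompt identifiers once per metric: every entry in "results" carries exactly one score plus its matching *_stderr, and the run settings live under "config". A sketch of folding that list back into one flat dict, assuming the file above has been downloaded locally:

import json

# Fields shared by every entry; everything else is a metric value or its stderr.
META = {"task_name", "prompt_name", "dataset_path", "dataset_name", "subset"}

with open("8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json") as f:
    slim = json.load(f)

# Collapse the one-metric-per-entry list into a single mapping.
metrics = {k: v for entry in slim["results"] for k, v in entry.items() if k not in META}
print(slim["config"]["num_fewshot"], "shots:", metrics["rouge2_fmeasure"])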
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.17811439171989488,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.002560275363708373
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.27886090799993984,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0034448901496326586
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.1960859652047859,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0023366060464774423
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.04742286059353845,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0011604513875678288
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.07742946215061461,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.001866825934930981
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.0523326312861241,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0011285281285534974
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.12852552196126066,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0018964172540042934
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.20545470726815437,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0027326532184944424
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.14162844117143206,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0016899728317895791
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.16868014464807685,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.002438611490893621
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.26411127102133913,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.003281374640568146
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.18556326066261042,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.002215627597176747
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 3.504980488837104,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.07033302280883086
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.057686885490487505,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.002077970469715233
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.09385574649347103,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0031890083282469467
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.06282501006696624,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0020888213951751034
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.015071313264611244,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0007774016714959856
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.026883790080977472,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0014075436212441558
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.016837586892487218,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.000790811753626707
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.042638568215269866,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0015504702364272222
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.07032908173779572,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0024572632014890276
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.0461316780279477,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0015235672554804573
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.05433748981297747,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0019639155165544035
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.0885417619284403,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0030172562108610624
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.05917832063435797,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.001970707904332247
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 0.9086939324156383,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.05855823137144976
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.009722491969586005,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0009564982386894138
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.015357271032115101,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.001448960021557378
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.010180728590080238,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0009430571774107825
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.0024973101781513925,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.000343733403974376
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.0045816927074040755,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0006153391357680226
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.00281382806546607,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.00035214119256937883
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.0072997926636164424,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0007272617797612332
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.011767619522932812,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0011356853451163903
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.00759156476319381,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0007014066070016197
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.009129679261559958,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0009077175108677888
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.014481417293915742,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0013773388496390326
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.00955145315759681,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0008912980968449334
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 6.161453193982374e-06,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 1.2644510555241896e-05
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.12661685012408697,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0021093732895884356
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.30042742106922904,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.004793262475513874
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.1741691220448831,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.0027529808698691072
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.027457383236278155,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.0011005894460101174
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.06872339351563556,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.0028264005600779733
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.03848207108872355,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.0015349514843449124
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.10054376826357136,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0016355832838556733
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.24044972226758587,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0038558430038978684
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.13848485640839917,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.002136410452186076
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.10022509495467177,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.001733799927568896
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.2391221551244155,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.0040668979594426224
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.13794850154717367,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.002276629118568194
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 1.6273581251014975,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.08297152976943713
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.03984641222755382,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0023583006672399276
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.07781373363270898,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0044678999283725125
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.049025842019891894,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.002761462686946734
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.008839770813015032,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.0009132501416122141
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.018595501088417683,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.0018046371518142589
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.011276208078698832,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.0010882501335885415
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.03193279708670165,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0019219705874772618
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.06201212154363477,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0036181165069829016
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.03899309107571578,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0022095258634566655
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.03172097901935682,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.0019113003105201993
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.06162603968016436,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.003613145314591428
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.03871502132726552,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.0021963924186294845
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 0.9297944407512662,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.10452568299951195
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.0018747157977051885,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0005217271715436668
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.003549283577549333,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0010767599221403455
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.002273937140206459,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.0006514011124893405
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.00047917769370046107,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.00017934012188651556
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.0007855487711203583,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.00030506211146057416
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.0005538280315731646,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.00021055797770595916
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.001591076043425948,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.00044966429790604164
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.002863749112737054,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0008323073183414411
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.0019019709421522583,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0005434789167320575
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.00156298958828556,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.00043239685994550666
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.0027767015907513405,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.0007833271709551212
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.0018351130157390045,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.0005055637602691865
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 1.3519576766092788e-16,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 1.7148754157361587e-13
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
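Across the four slim files for the 8b7178b25b checkpoint, the scores collapse as the shot count grows (e.g. xsum BLEU falls from about 1.63 at 3-shot to roughly 1e-16 at 5-shot), presumably because the longer prompts exceed the model's context. A quick sketch to tabulate that; the file-name pattern is inferred from this commit and nothing here is repo tooling:

import glob
import json

pattern = ("8b7178b25b/evaluation/generation/"
           "slim.8b7178b25b_gem_xsum_article_DOC_summary_*.json")
for path in sorted(glob.glob(pattern)):
    with open(path) as f:
        blob = json.load(f)
    shots = blob["config"]["num_fewshot"]
    # Each metric lives in its own record; find the one carrying "bleu".
    bleu = next(r["bleu"] for r in blob["results"] if "bleu" in r)
    print(f"{shots}-shot BLEU: {bleu:.3g}")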
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.22135115607796563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002466762662359004}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3499365920126553, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029103670069339765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24678912656203086, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002037101161511086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06270201642870808, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001276661389687198}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.10236690363201062, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019960722912629107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06998330324965386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012429781942009475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.15820160856971907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001803134257764531}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.25667905638991584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002404233442497606}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17719154028942327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014768440178427862}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.20961741686683294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0023343675964983485}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.332399007963623, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027852830969350697}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2338283238156954, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019162970817694011}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.9658166802535484, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07925927931568665}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.1877914020690942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002780603639421943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28469073541532774, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003552554685317243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2013934895923683, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0024101386411320806}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05231468260770844, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00130393473012376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08172649028680513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018880261158435462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05612430703445463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011944949124968374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1363160410891958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021057536768160825}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.21040942089474482, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002825298497100027}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.14593849406958256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017576382763014175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.17789698988425304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026448838544979162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2699836129539428, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003393162716447727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19069625305891352, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002278271153942546}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.971856308559798, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09289145672499902}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05986373469352374, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022413247620520016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09299451528533728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003182662484214728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06294177276794305, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021234318631902995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.01607251699061772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008627575848833899}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.026217246289428207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013573347317069598}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01701967942257221, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008072426334728445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04416852187112644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016929957101032731}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06977139563193656, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002453474360518891}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04627660806628133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001562406165753214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.056539524010580496, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002122275707889583}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.08802275906234883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00301921948656467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05949644164272974, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002009657736950104}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.8216828826403728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06116206229347704}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.010026792511549231, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009722843833023145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.014997054249515373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014200291615747175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010348132353778797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009643798928416376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.002614111292262381, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003582186570242107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0043272321523977674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000556694180882867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.002947850654970828, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00038101201967536574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.007696209279533862, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007541223407519663}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.01149712833393674, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001106103576275103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007846213305722558, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007330941188437008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.009502293292635411, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009264781441363666}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.01418906709799213, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013478881437725126}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009790499222718256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009156513115428641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.939501013520158e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.1416042954126946e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1451633530537778, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022577436785926417}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3454037481783916, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005024002677948197}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.2005996992165322, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002922307185273832}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03399383589660795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011970297160356436}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.08515533208064306, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00305750154560904}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.047805324803931785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001665552432783648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1082275772002201, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016303512277000601}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.26018618860799503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0038502171494847245}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15011348459355398, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021656916632942095}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.1163681459054013, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018244541208628394}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.27969335970329556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004315552235909818}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.1614133142157355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00243279237750366}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.0101663995130106, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.13035938633140529}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.04604504358511682, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026703224008911334}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.08906761715367195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050274498862542885}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05612514432374489, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0030731828239370837}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.011276377923656494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012571358140648242}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.023708158418892494, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002002425619950796}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013937278041326827, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011465942789645456}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.03500836518053345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021377284451202998}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.06645887444884206, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0037898342177074525}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04184105680154411, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002285547204915818}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0378191473406664, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002272186759930814}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.07247078079866072, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004137833255697601}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04557476167159472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002503478149457618}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.1210416443624807, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.18886560236043304}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0022472741378328047, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006307356242527256}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.004133903246439151, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011668769662793363}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0026984759176508304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000738763092128848}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00044268370843308837, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00017799352097082119}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0008499613285961788, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003602171813535447}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.000551357929598578, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00022851833366345053}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0016336554354190846, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004503176195113186}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.003027477992470196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0008730946685903953}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001991063584250406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005515138571257219}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0017629252737840803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005014954502254297}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0031744555131817086, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008937483641872015}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.002097477776387118, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005739873846094995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.7628232823336373e-17, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.8521803760603906e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:633b67c477b50749ee0df318352f9c26b8246632303e42a04130096db86134b3
+ size 18901229
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb0375c0b10c4c1729c1811d0079a86d3b6f67ab5947b77f4d83c597606dc8a1
+ size 24315240
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd549626f85f15b551ff61f922588c672e9dea8045df9f459c0b25c747f712b4
+ size 29469414
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:83f21fa77bb6a64f6ca3dc23d6cb94b8078265eb91430652eedcf44cb11c30b0
+ size 34800828
8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c98fd50bf29300b85ea2c8c42cc1dac19e47a5deddd7ab319507ef9553809abe
+ size 9647374
8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8140c5d43dadd582c3d79c1c06d9ee99ca459810a3b1e56368fb4f12a2b4cb63
+ size 11673693
8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04979a4e0509f41ac7b144862e4b7aded18e0e4e2e69fdcbff4d50eb8aed3dc2
+ size 13899458
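The seven examples.*.jsonl diffs above each add the same three-line Git LFS pointer stanza (version, oid sha256:..., size in bytes); the generation examples themselves live in LFS storage. A small sketch, assuming the files have been fetched with `git lfs pull`, that checks a downloaded file against the oid recorded in its pointer:

```python
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    # Stream the file so multi-GB example dumps need not fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

# oid taken from the pointer for the 5-shot xsum examples above.
path = "8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl"
assert sha256_of(path) == "04979a4e0509f41ac7b144862e4b7aded18e0e4e2e69fdcbff4d50eb8aed3dc2"
```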
8b7178b35b/evaluation/generation/merged.csv CHANGED
@@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04451173090845151
 gem_xsum,1,median,rouge2_fmeasure,0.04451173090845151
 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05083267238349886
 gem_xsum,2,median,rouge2_fmeasure,0.05083267238349886
- gem_xsum,2,average,multiple,0.04955506903105885
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.047805324803931785
+ gem_xsum,3,median,rouge2_fmeasure,0.047805324803931785
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013937278041326827
+ gem_xsum,4,median,rouge2_fmeasure,0.013937278041326827
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.000551357929598578
+ gem_xsum,5,median,rouge2_fmeasure,0.000551357929598578
+ gem_xsum,5,average,multiple,0.035159861311338955
 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05212321093989505
 web_nlg_en,0,median,rouge2_fmeasure,0.05212321093989505
 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.058822699744876736
@@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03900064299554168
 wiki_lingua_en,0,median,rouge2_fmeasure,0.03900064299554168
 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.06561109570973458
 wiki_lingua_en,1,median,rouge2_fmeasure,0.06561109570973458
- wiki_lingua_en,1,average,multiple,0.05230586935263813
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06998330324965386
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06998330324965386
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05612430703445463
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05612430703445463
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01701967942257221
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01701967942257221
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002947850654970828
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.002947850654970828
+ wiki_lingua_en,5,average,multiple,0.04178114651115463
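For orientation: each "average,multiple" row written here is simply the arithmetic mean of that task's per-fewshot "median" values (the six gem_xsum medians above average to 0.035159861311338955). A minimal sketch that recomputes them from this header-less five-column CSV:

```python
import csv
from statistics import mean

medians = {}
with open("merged.csv") as f:
    # Columns: task, num_fewshot, prompt_or_aggregate, metric, value.
    for task, shots, name, metric, value in csv.reader(f):
        if name == "median":
            medians.setdefault(task, []).append(float(value))

for task, values in medians.items():
    print(task, mean(values))  # gem_xsum -> 0.035159861311338955
```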
8b7178b35b/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32917980943922837, "bleu_stderr": 0.028436979522399264, "rouge1_fmeasure": 0.11092359878172349, "rouge1_fmeasure_stderr": 0.0021083743244835647, "rouge1_precision": 0.07454038083418474, "rouge1_precision_stderr": 0.001766454284632946, "rouge1_recall": 0.30290353802392916, "rouge1_recall_stderr": 0.004835002819067963, "rouge2_fmeasure": 0.05212321093989505, "rouge2_fmeasure_stderr": 0.0013412640798549406, "rouge2_precision": 0.03405534335780326, "rouge2_precision_stderr": 0.0009795255375204467, "rouge2_recall": 0.14725829338052365, "rouge2_recall_stderr": 0.0033496237842817397, "rougeL_fmeasure": 0.1059469350929842, "rougeL_fmeasure_stderr": 0.0019397934888169463, "rougeL_precision": 0.07086594439969038, "rougeL_precision_stderr": 0.0016082892660627373, "rougeL_recall": 0.29226896530687846, "rougeL_recall_stderr": 0.00466575095416219, "rougeLsum_fmeasure": 0.1060475373688485, "rougeLsum_fmeasure_stderr": 0.0019809059744097615, "rougeLsum_precision": 0.0711343154862909, "rougeLsum_precision_stderr": 0.001649638611842115, "rougeLsum_recall": 0.2903231316909427, "rougeLsum_recall_stderr": 0.004582555178091478}}, "1": {"PALM_prompt": {"bleu": 0.5285748263104777, "bleu_stderr": 0.05154539253732287, "rouge1_fmeasure": 0.1218665522358165, "rouge1_fmeasure_stderr": 0.0019158434392141466, "rouge1_precision": 0.07826126347996014, "rouge1_precision_stderr": 0.001412323883175065, "rouge1_recall": 0.38231102690350377, "rouge1_recall_stderr": 0.0053881864820272286, "rouge2_fmeasure": 0.058822699744876736, "rouge2_fmeasure_stderr": 0.0012405607498159584, "rouge2_precision": 0.037391127717872924, "rouge2_precision_stderr": 0.0008664479676328054, "rouge2_recall": 0.19839458948033734, "rouge2_recall_stderr": 0.004010973785657583, "rougeL_fmeasure": 0.11559328292690177, "rougeL_fmeasure_stderr": 0.001764131335387481, "rougeL_precision": 0.07407975935897594, "rougeL_precision_stderr": 0.0012854765812861943, "rougeL_recall": 0.3629315958992141, "rougeL_recall_stderr": 0.005069344323262684, "rougeLsum_fmeasure": 0.11544948693652962, "rougeLsum_fmeasure_stderr": 0.0017842924340211324, "rougeLsum_precision": 0.07415121638849131, "rougeLsum_precision_stderr": 0.001318142684082723, "rougeLsum_recall": 0.3614370946625198, "rougeLsum_recall_stderr": 0.004964006794864235}}, "2": {"PALM_prompt": {"bleu": 0.6066453441760302, "bleu_stderr": 0.02113523644793951, "rouge1_fmeasure": 0.12529215862638043, "rouge1_fmeasure_stderr": 0.0018259072915703836, "rouge1_precision": 0.08017864305448325, "rouge1_precision_stderr": 0.0014244751923494794, "rouge1_recall": 0.40480204267227765, "rouge1_recall_stderr": 0.005138963278066339, "rouge2_fmeasure": 0.059482069465634994, "rouge2_fmeasure_stderr": 0.0011595782202414246, "rouge2_precision": 0.03746116833011539, "rouge2_precision_stderr": 0.0008054382500683709, "rouge2_recall": 0.2078327469176558, "rouge2_recall_stderr": 0.003905498610846629, "rougeL_fmeasure": 0.11765794068577944, "rougeL_fmeasure_stderr": 0.0016600536639441099, "rougeL_precision": 0.0752258255540516, "rougeL_precision_stderr": 0.0013017426600869019, "rougeL_recall": 0.37921672738638584, "rougeL_recall_stderr": 0.004724125031038082, "rougeLsum_fmeasure": 0.11886578443137756, "rougeLsum_fmeasure_stderr": 0.001710265056607144, "rougeLsum_precision": 0.07612402487838264, "rougeLsum_precision_stderr": 0.0013499817758363393, "rougeLsum_recall": 0.3833520840989382, "rougeLsum_recall_stderr": 0.004787520387833171}}, "3": {"PALM_prompt": {"bleu": 0.6647933502215259, 
"bleu_stderr": 0.04923433986433192, "rouge1_fmeasure": 0.12320395767637426, "rouge1_fmeasure_stderr": 0.0017848979409527468, "rouge1_precision": 0.0781723421483203, "rouge1_precision_stderr": 0.0013023299811420648, "rouge1_recall": 0.4046567231900049, "rouge1_recall_stderr": 0.005137890673994109, "rouge2_fmeasure": 0.05931767519712951, "rouge2_fmeasure_stderr": 0.0011649351056668448, "rouge2_precision": 0.037339890555931926, "rouge2_precision_stderr": 0.0008144074272058679, "rouge2_recall": 0.2106193724526676, "rouge2_recall_stderr": 0.003927640735394401, "rougeL_fmeasure": 0.1150473458151772, "rougeL_fmeasure_stderr": 0.001619592187307252, "rougeL_precision": 0.07298298160971306, "rougeL_precision_stderr": 0.0011729650830812643, "rougeL_recall": 0.37541756571956564, "rougeL_recall_stderr": 0.004598099062061018, "rougeLsum_fmeasure": 0.1167920444605632, "rougeLsum_fmeasure_stderr": 0.0016685972133379994, "rougeLsum_precision": 0.0741403065968848, "rougeLsum_precision_stderr": 0.0012209548996141538, "rougeLsum_recall": 0.38296199684709586, "rougeLsum_recall_stderr": 0.004753402389332228}}, "4": {"PALM_prompt": {"bleu": 0.6566656848041873, "bleu_stderr": 0.03433459283803238, "rouge1_fmeasure": 0.12360753221007545, "rouge1_fmeasure_stderr": 0.001737583890202225, "rouge1_precision": 0.07771155841369212, "rouge1_precision_stderr": 0.0012374008770267796, "rouge1_recall": 0.4160226119034251, "rouge1_recall_stderr": 0.005178203274221526, "rouge2_fmeasure": 0.05886517252410502, "rouge2_fmeasure_stderr": 0.0011242425541824303, "rouge2_precision": 0.036708952345678336, "rouge2_precision_stderr": 0.0007671406759682118, "rouge2_recall": 0.2142530872032143, "rouge2_recall_stderr": 0.003938344188890309, "rougeL_fmeasure": 0.11421873690833906, "rougeL_fmeasure_stderr": 0.0015701261033135242, "rougeL_precision": 0.07189004426671647, "rougeL_precision_stderr": 0.001119612418604389, "rougeL_recall": 0.381893221081078, "rougeL_recall_stderr": 0.004577229345577291, "rougeLsum_fmeasure": 0.11777877062717519, "rougeLsum_fmeasure_stderr": 0.0016474868402205588, "rougeLsum_precision": 0.0741079421040528, "rougeLsum_precision_stderr": 0.0011766632070640108, "rougeLsum_recall": 0.39536320623753024, "rougeLsum_recall_stderr": 0.004834163190323313}}, "5": {"PALM_prompt": {"bleu": 0.7608515369903156, "bleu_stderr": 0.05544338902785279, "rouge1_fmeasure": 0.12542945154484628, "rouge1_fmeasure_stderr": 0.0017246642532628591, "rouge1_precision": 0.07874251282939974, "rouge1_precision_stderr": 0.0012390781355337446, "rouge1_recall": 0.4274915903037139, "rouge1_recall_stderr": 0.0052178214889688, "rouge2_fmeasure": 0.059919950381812144, "rouge2_fmeasure_stderr": 0.0011224239943820847, "rouge2_precision": 0.03725892588478996, "rouge2_precision_stderr": 0.0007691554935322187, "rouge2_recall": 0.22193078536394883, "rouge2_recall_stderr": 0.003999018222823969, "rougeL_fmeasure": 0.11503193912763204, "rougeL_fmeasure_stderr": 0.0015332329154476773, "rougeL_precision": 0.07225670171843512, "rougeL_precision_stderr": 0.001101933354024317, "rougeL_recall": 0.390252551254915, "rougeL_recall_stderr": 0.004591599013709285, "rougeLsum_fmeasure": 0.11871364855558929, "rougeLsum_fmeasure_stderr": 0.0016202511632686142, "rougeLsum_precision": 0.07457349456330893, "rougeLsum_precision_stderr": 0.0011662890645907052, "rougeLsum_recall": 0.40390693081462914, "rougeLsum_recall_stderr": 0.004850186326089204}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.7311283501769414, "bleu_stderr": 0.07292848467922458, "rouge1_fmeasure": 
0.18112864366064585, "rouge1_fmeasure_stderr": 0.0019261005266957636, "rouge1_precision": 0.15412231045043578, "rouge1_precision_stderr": 0.0019133423603737852, "rouge1_recall": 0.26596610436897833, "rouge1_recall_stderr": 0.0029078617781084343, "rouge2_fmeasure": 0.03900064299554168, "rouge2_fmeasure_stderr": 0.0009076451372655918, "rouge2_precision": 0.032745788295643126, "rouge2_precision_stderr": 0.000790295587407252, "rouge2_recall": 0.06015567293207339, "rouge2_recall_stderr": 0.0015850260882465787, "rougeL_fmeasure": 0.13940156755327857, "rougeL_fmeasure_stderr": 0.0013668263941440067, "rougeL_precision": 0.1171936085278983, "rougeL_precision_stderr": 0.00132039683220814, "rougeL_recall": 0.21005905052835785, "rougeL_recall_stderr": 0.002357155817310683, "rougeLsum_fmeasure": 0.16698321861271978, "rougeLsum_fmeasure_stderr": 0.0017578260329471065, "rougeLsum_precision": 0.14187748986714835, "rougeLsum_precision_stderr": 0.0017420221740131032, "rougeLsum_recall": 0.2462694168340833, "rougeLsum_recall_stderr": 0.002709379098155295}}, "1": {"tldr_en": {"bleu": 3.5922679202027847, "bleu_stderr": 0.05984463327930867, "rouge1_fmeasure": 0.24327622952771533, "rouge1_fmeasure_stderr": 0.002037288100861591, "rouge1_precision": 0.2123430992404537, "rouge1_precision_stderr": 0.002316190617323844, "rouge1_recall": 0.349869999562048, "rouge1_recall_stderr": 0.0028776097446615145, "rouge2_fmeasure": 0.06561109570973458, "rouge2_fmeasure_stderr": 0.0011754883309790891, "rouge2_precision": 0.05726169522995292, "rouge2_precision_stderr": 0.0011305457211571272, "rouge2_recall": 0.09761319314398223, "rouge2_recall_stderr": 0.001931346687472591, "rougeL_fmeasure": 0.17044218669835878, "rougeL_fmeasure_stderr": 0.001393233918351284, "rougeL_precision": 0.147593790999516, "rougeL_precision_stderr": 0.0015843013179494662, "rougeL_recall": 0.25163325238017475, "rougeL_recall_stderr": 0.0023185700212327545, "rougeLsum_fmeasure": 0.22970291581408894, "rougeLsum_fmeasure_stderr": 0.0019241182984497916, "rougeLsum_precision": 0.20040929059968782, "rougeLsum_precision_stderr": 0.0021906873980096493, "rougeLsum_recall": 0.33105684407128727, "rougeLsum_recall_stderr": 0.0027539492668926546}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.17854969577688523, "bleu_stderr": 0.02350515901837014, "rouge1_fmeasure": 0.14982661873128114, "rouge1_fmeasure_stderr": 0.0010486948402501617, "rouge1_precision": 0.34481003507654684, "rouge1_precision_stderr": 0.0024960642876163215, "rouge1_recall": 0.11140479511426793, "rouge1_recall_stderr": 0.0012326825192403048, "rouge2_fmeasure": 0.010042498177274786, "rouge2_fmeasure_stderr": 0.0004526496809829142, "rouge2_precision": 0.02652033223020393, "rouge2_precision_stderr": 0.0012104924394046484, "rouge2_recall": 0.007738642998342415, "rouge2_recall_stderr": 0.00040331722748865453, "rougeL_fmeasure": 0.13556698518148783, "rougeL_fmeasure_stderr": 0.0010029610186637107, "rougeL_precision": 0.31148977629184244, "rougeL_precision_stderr": 0.0024243327495898086, "rougeL_recall": 0.10191142585459571, "rougeL_recall_stderr": 0.0012109507603402815, "rougeLsum_fmeasure": 0.1364054799839495, "rougeLsum_fmeasure_stderr": 0.0010086900095262318, "rougeLsum_precision": 0.3173006007013496, "rougeLsum_precision_stderr": 0.002513826319580817, "rougeLsum_recall": 0.10030998557950792, "rougeLsum_recall_stderr": 0.0010758913304541808}}, "1": {"generate_text_restaurant": {"bleu": 8.550714951198435, "bleu_stderr": 0.06794955954281734, "rouge1_fmeasure": 0.3940891272584714, 
"rouge1_fmeasure_stderr": 0.00233253448293986, "rouge1_precision": 0.37441894592411723, "rouge1_precision_stderr": 0.00285043399066679, "rouge1_recall": 0.4681809355112701, "rouge1_recall_stderr": 0.0029387631347472464, "rouge2_fmeasure": 0.17310173953900088, "rouge2_fmeasure_stderr": 0.0017673218319637287, "rouge2_precision": 0.16514863563264978, "rouge2_precision_stderr": 0.0019360690941877992, "rouge2_recall": 0.20649495583013436, "rouge2_recall_stderr": 0.0021624061718912763, "rougeL_fmeasure": 0.2901588169021053, "rougeL_fmeasure_stderr": 0.0017017464729554115, "rougeL_precision": 0.2743117208263101, "rougeL_precision_stderr": 0.0021002662608094564, "rougeL_recall": 0.34924701312209255, "rougeL_recall_stderr": 0.0023705538319379695, "rougeLsum_fmeasure": 0.327671306182913, "rougeLsum_fmeasure_stderr": 0.0022171207606254796, "rougeLsum_precision": 0.31201556645452433, "rougeLsum_precision_stderr": 0.002622342685847327, "rougeLsum_recall": 0.3885350242195471, "rougeLsum_recall_stderr": 0.0027432106106179788}}, "2": {"generate_text_restaurant": {"bleu": 11.867888027230435, "bleu_stderr": 0.15989930331829488, "rouge1_fmeasure": 0.4442817836593718, "rouge1_fmeasure_stderr": 0.0019988950330102873, "rouge1_precision": 0.44246406218448636, "rouge1_precision_stderr": 0.002320846131050864, "rouge1_recall": 0.4822237492014395, "rouge1_recall_stderr": 0.0028480697292934715, "rouge2_fmeasure": 0.20571413885055867, "rouge2_fmeasure_stderr": 0.001815833869403785, "rouge2_precision": 0.20447987966953268, "rouge2_precision_stderr": 0.0019087131351150518, "rouge2_recall": 0.22575098541036898, "rouge2_recall_stderr": 0.0022679440038537117, "rougeL_fmeasure": 0.3232871465388061, "rougeL_fmeasure_stderr": 0.0017540023019821938, "rougeL_precision": 0.3220661789895318, "rougeL_precision_stderr": 0.0019807481977887516, "rougeL_recall": 0.35167106294020406, "rougeL_recall_stderr": 0.002412719701186727, "rougeLsum_fmeasure": 0.37187691948404467, "rougeLsum_fmeasure_stderr": 0.0020334507090512886, "rougeLsum_precision": 0.37040704761920407, "rougeLsum_precision_stderr": 0.002271535999762247, "rougeLsum_recall": 0.4036635693660681, "rougeLsum_recall_stderr": 0.0027100283713887775}}, "3": {"generate_text_restaurant": {"bleu": 12.176808245577714, "bleu_stderr": 0.1522141950985559, "rouge1_fmeasure": 0.4492578440402226, "rouge1_fmeasure_stderr": 0.0019667066974463896, "rouge1_precision": 0.44578562677217626, "rouge1_precision_stderr": 0.0022755220565382653, "rouge1_recall": 0.4889266631673112, "rouge1_recall_stderr": 0.0028806835048615135, "rouge2_fmeasure": 0.21198726153120898, "rouge2_fmeasure_stderr": 0.0018195540133290194, "rouge2_precision": 0.20934965847705986, "rouge2_precision_stderr": 0.0018748523719357, "rouge2_recall": 0.23374791215018093, "rouge2_recall_stderr": 0.0023263783868214196, "rougeL_fmeasure": 0.3276087887468011, "rougeL_fmeasure_stderr": 0.0017546007227396715, "rougeL_precision": 0.32511799875091574, "rougeL_precision_stderr": 0.001967046506806585, "rougeL_recall": 0.35739711489377646, "rougeL_recall_stderr": 0.002465697341506471, "rougeLsum_fmeasure": 0.3771411500357944, "rougeLsum_fmeasure_stderr": 0.0020137589520510426, "rougeLsum_precision": 0.3741382823179302, "rougeLsum_precision_stderr": 0.002234766274837387, "rougeLsum_recall": 0.4108171966535684, "rougeLsum_recall_stderr": 0.0027621282248548955}}, "4": {"generate_text_restaurant": {"bleu": 12.402384292539924, "bleu_stderr": 0.135024423037405, "rouge1_fmeasure": 0.4552455893734124, "rouge1_fmeasure_stderr": 0.0019583919126399917, 
"rouge1_precision": 0.4500297698600483, "rouge1_precision_stderr": 0.002279184782036985, "rouge1_recall": 0.49411520793728253, "rouge1_recall_stderr": 0.0028065754284846934, "rouge2_fmeasure": 0.2155975402534293, "rouge2_fmeasure_stderr": 0.001873826925854458, "rouge2_precision": 0.2124489885242478, "rouge2_precision_stderr": 0.001924872387744921, "rouge2_recall": 0.23653217550233827, "rouge2_recall_stderr": 0.0023355573505469554, "rougeL_fmeasure": 0.33116918396735845, "rougeL_fmeasure_stderr": 0.0017792810189211944, "rougeL_precision": 0.32709295569865143, "rougeL_precision_stderr": 0.001956608790763274, "rougeL_recall": 0.3602236836443227, "rougeL_recall_stderr": 0.002427496751180958, "rougeLsum_fmeasure": 0.3808847920405725, "rougeLsum_fmeasure_stderr": 0.002065156905180787, "rougeLsum_precision": 0.37596847671524797, "rougeLsum_precision_stderr": 0.002251960910942368, "rougeLsum_recall": 0.41399734380099557, "rougeLsum_recall_stderr": 0.0027661212857413163}}, "5": {"generate_text_restaurant": {"bleu": 12.262420161138401, "bleu_stderr": 0.18903914838075944, "rouge1_fmeasure": 0.45590768378496416, "rouge1_fmeasure_stderr": 0.001962173998693794, "rouge1_precision": 0.4494732628845236, "rouge1_precision_stderr": 0.0023023795088449608, "rouge1_recall": 0.49537684561519607, "rouge1_recall_stderr": 0.002767197249479356, "rouge2_fmeasure": 0.21627498227149344, "rouge2_fmeasure_stderr": 0.0018499821709783027, "rouge2_precision": 0.21297442064638186, "rouge2_precision_stderr": 0.001926070075686657, "rouge2_recall": 0.23703970881860875, "rouge2_recall_stderr": 0.0022735756287929677, "rougeL_fmeasure": 0.3331238951307013, "rougeL_fmeasure_stderr": 0.0017799650141595914, "rougeL_precision": 0.3281425210661801, "rougeL_precision_stderr": 0.001972901049188069, "rougeL_recall": 0.3628730729029585, "rougeL_recall_stderr": 0.0024176721948621425, "rougeLsum_fmeasure": 0.38246870938641964, "rougeLsum_fmeasure_stderr": 0.0020417804934292076, "rougeLsum_precision": 0.3768630011135112, "rougeLsum_precision_stderr": 0.0022664490914563948, "rougeLsum_recall": 0.41605530731114854, "rougeLsum_recall_stderr": 0.0027097026120309876}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.24608084382355, "bleu_stderr": 0.11799890497604383, "rouge1_fmeasure": 0.22178670977262957, "rouge1_fmeasure_stderr": 0.0026603762671440385, "rouge1_precision": 0.17400888678487844, "rouge1_precision_stderr": 0.002464330237468343, "rouge1_recall": 0.3502023417511915, "rouge1_recall_stderr": 0.004626152702942569, "rouge2_fmeasure": 0.05332080380122619, "rouge2_fmeasure_stderr": 0.0017188362970072153, "rouge2_precision": 0.040704743736010573, "rouge2_precision_stderr": 0.001393829442555013, "rouge2_recall": 0.08799878519146338, "rouge2_recall_stderr": 0.0029196986234636744, "rougeL_fmeasure": 0.16609949129334142, "rougeL_fmeasure_stderr": 0.0020543947028805716, "rougeL_precision": 0.12992137120722907, "rougeL_precision_stderr": 0.0018781656373678345, "rougeL_recall": 0.2641999836224796, "rougeL_recall_stderr": 0.0036812097901456225, "rougeLsum_fmeasure": 0.17331928576093303, "rougeLsum_fmeasure_stderr": 0.0022434557390865763, "rougeLsum_precision": 0.13526811734802355, "rougeLsum_precision_stderr": 0.001975414967835506, "rougeLsum_recall": 0.27591607890974656, "rougeLsum_recall_stderr": 0.004020504841332198}}, "1": {"article_DOC_summary": {"bleu": 1.8189118554354626, "bleu_stderr": 0.10029587869243096, "rouge1_fmeasure": 0.19671334605636295, "rouge1_fmeasure_stderr": 0.0027188446769093686, "rouge1_precision": 0.14019002697714425, 
"rouge1_precision_stderr": 0.002033439876939429, "rouge1_recall": 0.3437328928644267, "rouge1_recall_stderr": 0.004622184360550014, "rouge2_fmeasure": 0.04451173090845151, "rouge2_fmeasure_stderr": 0.0016237255917685457, "rouge2_precision": 0.031342583459769556, "rouge2_precision_stderr": 0.0011487670391079187, "rouge2_recall": 0.0803191311481453, "rouge2_recall_stderr": 0.003010334414958687, "rougeL_fmeasure": 0.14825510655652663, "rougeL_fmeasure_stderr": 0.0020028300424767575, "rougeL_precision": 0.1053445488770993, "rougeL_precision_stderr": 0.001476955053927121, "rougeL_recall": 0.2613986979531857, "rougeL_recall_stderr": 0.0035961912243557873, "rougeLsum_fmeasure": 0.1592072439127538, "rougeLsum_fmeasure_stderr": 0.002270863529627419, "rougeLsum_precision": 0.11322117707364136, "rougeLsum_precision_stderr": 0.001678631461460253, "rougeLsum_recall": 0.27993778037490374, "rougeLsum_recall_stderr": 0.003988822898958396}}, "2": {"article_DOC_summary": {"bleu": 2.0454443359589503, "bleu_stderr": 0.08368970692051826, "rouge1_fmeasure": 0.20889368966335237, "rouge1_fmeasure_stderr": 0.0026887281040736378, "rouge1_precision": 0.1488455450923004, "rouge1_precision_stderr": 0.002009481565468108, "rouge1_recall": 0.3642751817540943, "rouge1_recall_stderr": 0.004565498697439836, "rouge2_fmeasure": 0.05083267238349886, "rouge2_fmeasure_stderr": 0.001670794310427006, "rouge2_precision": 0.03580310577134537, "rouge2_precision_stderr": 0.0011818056791973873, "rouge2_recall": 0.09140711571916593, "rouge2_recall_stderr": 0.003096432205185466, "rougeL_fmeasure": 0.15708491403271296, "rougeL_fmeasure_stderr": 0.0019982552541995204, "rougeL_precision": 0.11164126156649677, "rougeL_precision_stderr": 0.001473760236619727, "rougeL_recall": 0.27611637516436727, "rougeL_recall_stderr": 0.003575935893484929, "rougeLsum_fmeasure": 0.1672065849237029, "rougeLsum_fmeasure_stderr": 0.002247088064282535, "rougeLsum_precision": 0.1188385887857819, "rougeLsum_precision_stderr": 0.0016510150185409913, "rougeLsum_recall": 0.2937542835590118, "rougeLsum_recall_stderr": 0.003999046140359821}}}}
 
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32917980943922837, "bleu_stderr": 0.028436979522399264, "rouge1_fmeasure": 0.11092359878172349, "rouge1_fmeasure_stderr": 0.0021083743244835647, "rouge1_precision": 0.07454038083418474, "rouge1_precision_stderr": 0.001766454284632946, "rouge1_recall": 0.30290353802392916, "rouge1_recall_stderr": 0.004835002819067963, "rouge2_fmeasure": 0.05212321093989505, "rouge2_fmeasure_stderr": 0.0013412640798549406, "rouge2_precision": 0.03405534335780326, "rouge2_precision_stderr": 0.0009795255375204467, "rouge2_recall": 0.14725829338052365, "rouge2_recall_stderr": 0.0033496237842817397, "rougeL_fmeasure": 0.1059469350929842, "rougeL_fmeasure_stderr": 0.0019397934888169463, "rougeL_precision": 0.07086594439969038, "rougeL_precision_stderr": 0.0016082892660627373, "rougeL_recall": 0.29226896530687846, "rougeL_recall_stderr": 0.00466575095416219, "rougeLsum_fmeasure": 0.1060475373688485, "rougeLsum_fmeasure_stderr": 0.0019809059744097615, "rougeLsum_precision": 0.0711343154862909, "rougeLsum_precision_stderr": 0.001649638611842115, "rougeLsum_recall": 0.2903231316909427, "rougeLsum_recall_stderr": 0.004582555178091478}}, "1": {"PALM_prompt": {"bleu": 0.5285748263104777, "bleu_stderr": 0.05154539253732287, "rouge1_fmeasure": 0.1218665522358165, "rouge1_fmeasure_stderr": 0.0019158434392141466, "rouge1_precision": 0.07826126347996014, "rouge1_precision_stderr": 0.001412323883175065, "rouge1_recall": 0.38231102690350377, "rouge1_recall_stderr": 0.0053881864820272286, "rouge2_fmeasure": 0.058822699744876736, "rouge2_fmeasure_stderr": 0.0012405607498159584, "rouge2_precision": 0.037391127717872924, "rouge2_precision_stderr": 0.0008664479676328054, "rouge2_recall": 0.19839458948033734, "rouge2_recall_stderr": 0.004010973785657583, "rougeL_fmeasure": 0.11559328292690177, "rougeL_fmeasure_stderr": 0.001764131335387481, "rougeL_precision": 0.07407975935897594, "rougeL_precision_stderr": 0.0012854765812861943, "rougeL_recall": 0.3629315958992141, "rougeL_recall_stderr": 0.005069344323262684, "rougeLsum_fmeasure": 0.11544948693652962, "rougeLsum_fmeasure_stderr": 0.0017842924340211324, "rougeLsum_precision": 0.07415121638849131, "rougeLsum_precision_stderr": 0.001318142684082723, "rougeLsum_recall": 0.3614370946625198, "rougeLsum_recall_stderr": 0.004964006794864235}}, "2": {"PALM_prompt": {"bleu": 0.6066453441760302, "bleu_stderr": 0.02113523644793951, "rouge1_fmeasure": 0.12529215862638043, "rouge1_fmeasure_stderr": 0.0018259072915703836, "rouge1_precision": 0.08017864305448325, "rouge1_precision_stderr": 0.0014244751923494794, "rouge1_recall": 0.40480204267227765, "rouge1_recall_stderr": 0.005138963278066339, "rouge2_fmeasure": 0.059482069465634994, "rouge2_fmeasure_stderr": 0.0011595782202414246, "rouge2_precision": 0.03746116833011539, "rouge2_precision_stderr": 0.0008054382500683709, "rouge2_recall": 0.2078327469176558, "rouge2_recall_stderr": 0.003905498610846629, "rougeL_fmeasure": 0.11765794068577944, "rougeL_fmeasure_stderr": 0.0016600536639441099, "rougeL_precision": 0.0752258255540516, "rougeL_precision_stderr": 0.0013017426600869019, "rougeL_recall": 0.37921672738638584, "rougeL_recall_stderr": 0.004724125031038082, "rougeLsum_fmeasure": 0.11886578443137756, "rougeLsum_fmeasure_stderr": 0.001710265056607144, "rougeLsum_precision": 0.07612402487838264, "rougeLsum_precision_stderr": 0.0013499817758363393, "rougeLsum_recall": 0.3833520840989382, "rougeLsum_recall_stderr": 0.004787520387833171}}, "3": {"PALM_prompt": {"bleu": 0.6647933502215259, 
"bleu_stderr": 0.04923433986433192, "rouge1_fmeasure": 0.12320395767637426, "rouge1_fmeasure_stderr": 0.0017848979409527468, "rouge1_precision": 0.0781723421483203, "rouge1_precision_stderr": 0.0013023299811420648, "rouge1_recall": 0.4046567231900049, "rouge1_recall_stderr": 0.005137890673994109, "rouge2_fmeasure": 0.05931767519712951, "rouge2_fmeasure_stderr": 0.0011649351056668448, "rouge2_precision": 0.037339890555931926, "rouge2_precision_stderr": 0.0008144074272058679, "rouge2_recall": 0.2106193724526676, "rouge2_recall_stderr": 0.003927640735394401, "rougeL_fmeasure": 0.1150473458151772, "rougeL_fmeasure_stderr": 0.001619592187307252, "rougeL_precision": 0.07298298160971306, "rougeL_precision_stderr": 0.0011729650830812643, "rougeL_recall": 0.37541756571956564, "rougeL_recall_stderr": 0.004598099062061018, "rougeLsum_fmeasure": 0.1167920444605632, "rougeLsum_fmeasure_stderr": 0.0016685972133379994, "rougeLsum_precision": 0.0741403065968848, "rougeLsum_precision_stderr": 0.0012209548996141538, "rougeLsum_recall": 0.38296199684709586, "rougeLsum_recall_stderr": 0.004753402389332228}}, "4": {"PALM_prompt": {"bleu": 0.6566656848041873, "bleu_stderr": 0.03433459283803238, "rouge1_fmeasure": 0.12360753221007545, "rouge1_fmeasure_stderr": 0.001737583890202225, "rouge1_precision": 0.07771155841369212, "rouge1_precision_stderr": 0.0012374008770267796, "rouge1_recall": 0.4160226119034251, "rouge1_recall_stderr": 0.005178203274221526, "rouge2_fmeasure": 0.05886517252410502, "rouge2_fmeasure_stderr": 0.0011242425541824303, "rouge2_precision": 0.036708952345678336, "rouge2_precision_stderr": 0.0007671406759682118, "rouge2_recall": 0.2142530872032143, "rouge2_recall_stderr": 0.003938344188890309, "rougeL_fmeasure": 0.11421873690833906, "rougeL_fmeasure_stderr": 0.0015701261033135242, "rougeL_precision": 0.07189004426671647, "rougeL_precision_stderr": 0.001119612418604389, "rougeL_recall": 0.381893221081078, "rougeL_recall_stderr": 0.004577229345577291, "rougeLsum_fmeasure": 0.11777877062717519, "rougeLsum_fmeasure_stderr": 0.0016474868402205588, "rougeLsum_precision": 0.0741079421040528, "rougeLsum_precision_stderr": 0.0011766632070640108, "rougeLsum_recall": 0.39536320623753024, "rougeLsum_recall_stderr": 0.004834163190323313}}, "5": {"PALM_prompt": {"bleu": 0.7608515369903156, "bleu_stderr": 0.05544338902785279, "rouge1_fmeasure": 0.12542945154484628, "rouge1_fmeasure_stderr": 0.0017246642532628591, "rouge1_precision": 0.07874251282939974, "rouge1_precision_stderr": 0.0012390781355337446, "rouge1_recall": 0.4274915903037139, "rouge1_recall_stderr": 0.0052178214889688, "rouge2_fmeasure": 0.059919950381812144, "rouge2_fmeasure_stderr": 0.0011224239943820847, "rouge2_precision": 0.03725892588478996, "rouge2_precision_stderr": 0.0007691554935322187, "rouge2_recall": 0.22193078536394883, "rouge2_recall_stderr": 0.003999018222823969, "rougeL_fmeasure": 0.11503193912763204, "rougeL_fmeasure_stderr": 0.0015332329154476773, "rougeL_precision": 0.07225670171843512, "rougeL_precision_stderr": 0.001101933354024317, "rougeL_recall": 0.390252551254915, "rougeL_recall_stderr": 0.004591599013709285, "rougeLsum_fmeasure": 0.11871364855558929, "rougeLsum_fmeasure_stderr": 0.0016202511632686142, "rougeLsum_precision": 0.07457349456330893, "rougeLsum_precision_stderr": 0.0011662890645907052, "rougeLsum_recall": 0.40390693081462914, "rougeLsum_recall_stderr": 0.004850186326089204}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.7311283501769414, "bleu_stderr": 0.07292848467922458, "rouge1_fmeasure": 
0.18112864366064585, "rouge1_fmeasure_stderr": 0.0019261005266957636, "rouge1_precision": 0.15412231045043578, "rouge1_precision_stderr": 0.0019133423603737852, "rouge1_recall": 0.26596610436897833, "rouge1_recall_stderr": 0.0029078617781084343, "rouge2_fmeasure": 0.03900064299554168, "rouge2_fmeasure_stderr": 0.0009076451372655918, "rouge2_precision": 0.032745788295643126, "rouge2_precision_stderr": 0.000790295587407252, "rouge2_recall": 0.06015567293207339, "rouge2_recall_stderr": 0.0015850260882465787, "rougeL_fmeasure": 0.13940156755327857, "rougeL_fmeasure_stderr": 0.0013668263941440067, "rougeL_precision": 0.1171936085278983, "rougeL_precision_stderr": 0.00132039683220814, "rougeL_recall": 0.21005905052835785, "rougeL_recall_stderr": 0.002357155817310683, "rougeLsum_fmeasure": 0.16698321861271978, "rougeLsum_fmeasure_stderr": 0.0017578260329471065, "rougeLsum_precision": 0.14187748986714835, "rougeLsum_precision_stderr": 0.0017420221740131032, "rougeLsum_recall": 0.2462694168340833, "rougeLsum_recall_stderr": 0.002709379098155295}}, "1": {"tldr_en": {"bleu": 3.5922679202027847, "bleu_stderr": 0.05984463327930867, "rouge1_fmeasure": 0.24327622952771533, "rouge1_fmeasure_stderr": 0.002037288100861591, "rouge1_precision": 0.2123430992404537, "rouge1_precision_stderr": 0.002316190617323844, "rouge1_recall": 0.349869999562048, "rouge1_recall_stderr": 0.0028776097446615145, "rouge2_fmeasure": 0.06561109570973458, "rouge2_fmeasure_stderr": 0.0011754883309790891, "rouge2_precision": 0.05726169522995292, "rouge2_precision_stderr": 0.0011305457211571272, "rouge2_recall": 0.09761319314398223, "rouge2_recall_stderr": 0.001931346687472591, "rougeL_fmeasure": 0.17044218669835878, "rougeL_fmeasure_stderr": 0.001393233918351284, "rougeL_precision": 0.147593790999516, "rougeL_precision_stderr": 0.0015843013179494662, "rougeL_recall": 0.25163325238017475, "rougeL_recall_stderr": 0.0023185700212327545, "rougeLsum_fmeasure": 0.22970291581408894, "rougeLsum_fmeasure_stderr": 0.0019241182984497916, "rougeLsum_precision": 0.20040929059968782, "rougeLsum_precision_stderr": 0.0021906873980096493, "rougeLsum_recall": 0.33105684407128727, "rougeLsum_recall_stderr": 0.0027539492668926546}}, "2": {"tldr_en": {"bleu": 3.9658166802535484, "bleu_stderr": 0.07925927931568665, "rouge1_fmeasure": 0.24678912656203086, "rouge1_fmeasure_stderr": 0.002037101161511086, "rouge1_precision": 0.22135115607796563, "rouge1_precision_stderr": 0.002466762662359004, "rouge1_recall": 0.3499365920126553, "rouge1_recall_stderr": 0.0029103670069339765, "rouge2_fmeasure": 0.06998330324965386, "rouge2_fmeasure_stderr": 0.0012429781942009475, "rouge2_precision": 0.06270201642870808, "rouge2_precision_stderr": 0.001276661389687198, "rouge2_recall": 0.10236690363201062, "rouge2_recall_stderr": 0.0019960722912629107, "rougeL_fmeasure": 0.17719154028942327, "rougeL_fmeasure_stderr": 0.0014768440178427862, "rougeL_precision": 0.15820160856971907, "rougeL_precision_stderr": 0.001803134257764531, "rougeL_recall": 0.25667905638991584, "rougeL_recall_stderr": 0.002404233442497606, "rougeLsum_fmeasure": 0.2338283238156954, "rougeLsum_fmeasure_stderr": 0.0019162970817694011, "rougeLsum_precision": 0.20961741686683294, "rougeLsum_precision_stderr": 0.0023343675964983485, "rougeLsum_recall": 0.332399007963623, "rougeLsum_recall_stderr": 0.0027852830969350697}}, "3": {"tldr_en": {"bleu": 3.971856308559798, "bleu_stderr": 0.09289145672499902, "rouge1_fmeasure": 0.2013934895923683, "rouge1_fmeasure_stderr": 0.0024101386411320806, "rouge1_precision": 
0.1877914020690942, "rouge1_precision_stderr": 0.002780603639421943, "rouge1_recall": 0.28469073541532774, "rouge1_recall_stderr": 0.003552554685317243, "rouge2_fmeasure": 0.05612430703445463, "rouge2_fmeasure_stderr": 0.0011944949124968374, "rouge2_precision": 0.05231468260770844, "rouge2_precision_stderr": 0.00130393473012376, "rouge2_recall": 0.08172649028680513, "rouge2_recall_stderr": 0.0018880261158435462, "rougeL_fmeasure": 0.14593849406958256, "rougeL_fmeasure_stderr": 0.0017576382763014175, "rougeL_precision": 0.1363160410891958, "rougeL_precision_stderr": 0.0021057536768160825, "rougeL_recall": 0.21040942089474482, "rougeL_recall_stderr": 0.002825298497100027, "rougeLsum_fmeasure": 0.19069625305891352, "rougeLsum_fmeasure_stderr": 0.002278271153942546, "rougeLsum_precision": 0.17789698988425304, "rougeLsum_precision_stderr": 0.0026448838544979162, "rougeLsum_recall": 0.2699836129539428, "rougeLsum_recall_stderr": 0.003393162716447727}}, "4": {"tldr_en": {"bleu": 0.8216828826403728, "bleu_stderr": 0.06116206229347704, "rouge1_fmeasure": 0.06294177276794305, "rouge1_fmeasure_stderr": 0.0021234318631902995, "rouge1_precision": 0.05986373469352374, "rouge1_precision_stderr": 0.0022413247620520016, "rouge1_recall": 0.09299451528533728, "rouge1_recall_stderr": 0.003182662484214728, "rouge2_fmeasure": 0.01701967942257221, "rouge2_fmeasure_stderr": 0.0008072426334728445, "rouge2_precision": 0.01607251699061772, "rouge2_precision_stderr": 0.0008627575848833899, "rouge2_recall": 0.026217246289428207, "rouge2_recall_stderr": 0.0013573347317069598, "rougeL_fmeasure": 0.04627660806628133, "rougeL_fmeasure_stderr": 0.001562406165753214, "rougeL_precision": 0.04416852187112644, "rougeL_precision_stderr": 0.0016929957101032731, "rougeL_recall": 0.06977139563193656, "rougeL_recall_stderr": 0.002453474360518891, "rougeLsum_fmeasure": 0.05949644164272974, "rougeLsum_fmeasure_stderr": 0.002009657736950104, "rougeLsum_precision": 0.056539524010580496, "rougeLsum_precision_stderr": 0.002122275707889583, "rougeLsum_recall": 0.08802275906234883, "rougeLsum_recall_stderr": 0.00301921948656467}}, "5": {"tldr_en": {"bleu": 2.939501013520158e-06, "bleu_stderr": 6.1416042954126946e-06, "rouge1_fmeasure": 0.010348132353778797, "rouge1_fmeasure_stderr": 0.0009643798928416376, "rouge1_precision": 0.010026792511549231, "rouge1_precision_stderr": 0.0009722843833023145, "rouge1_recall": 0.014997054249515373, "rouge1_recall_stderr": 0.0014200291615747175, "rouge2_fmeasure": 0.002947850654970828, "rouge2_fmeasure_stderr": 0.00038101201967536574, "rouge2_precision": 0.002614111292262381, "rouge2_precision_stderr": 0.0003582186570242107, "rouge2_recall": 0.0043272321523977674, "rouge2_recall_stderr": 0.000556694180882867, "rougeL_fmeasure": 0.007846213305722558, "rougeL_fmeasure_stderr": 0.0007330941188437008, "rougeL_precision": 0.007696209279533862, "rougeL_precision_stderr": 0.0007541223407519663, "rougeL_recall": 0.01149712833393674, "rougeL_recall_stderr": 0.001106103576275103, "rougeLsum_fmeasure": 0.009790499222718256, "rougeLsum_fmeasure_stderr": 0.0009156513115428641, "rougeLsum_precision": 0.009502293292635411, "rougeLsum_precision_stderr": 0.0009264781441363666, "rougeLsum_recall": 0.01418906709799213, "rougeLsum_recall_stderr": 0.0013478881437725126}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.17854969577688523, "bleu_stderr": 0.02350515901837014, "rouge1_fmeasure": 0.14982661873128114, "rouge1_fmeasure_stderr": 0.0010486948402501617, "rouge1_precision": 0.34481003507654684, 
"rouge1_precision_stderr": 0.0024960642876163215, "rouge1_recall": 0.11140479511426793, "rouge1_recall_stderr": 0.0012326825192403048, "rouge2_fmeasure": 0.010042498177274786, "rouge2_fmeasure_stderr": 0.0004526496809829142, "rouge2_precision": 0.02652033223020393, "rouge2_precision_stderr": 0.0012104924394046484, "rouge2_recall": 0.007738642998342415, "rouge2_recall_stderr": 0.00040331722748865453, "rougeL_fmeasure": 0.13556698518148783, "rougeL_fmeasure_stderr": 0.0010029610186637107, "rougeL_precision": 0.31148977629184244, "rougeL_precision_stderr": 0.0024243327495898086, "rougeL_recall": 0.10191142585459571, "rougeL_recall_stderr": 0.0012109507603402815, "rougeLsum_fmeasure": 0.1364054799839495, "rougeLsum_fmeasure_stderr": 0.0010086900095262318, "rougeLsum_precision": 0.3173006007013496, "rougeLsum_precision_stderr": 0.002513826319580817, "rougeLsum_recall": 0.10030998557950792, "rougeLsum_recall_stderr": 0.0010758913304541808}}, "1": {"generate_text_restaurant": {"bleu": 8.550714951198435, "bleu_stderr": 0.06794955954281734, "rouge1_fmeasure": 0.3940891272584714, "rouge1_fmeasure_stderr": 0.00233253448293986, "rouge1_precision": 0.37441894592411723, "rouge1_precision_stderr": 0.00285043399066679, "rouge1_recall": 0.4681809355112701, "rouge1_recall_stderr": 0.0029387631347472464, "rouge2_fmeasure": 0.17310173953900088, "rouge2_fmeasure_stderr": 0.0017673218319637287, "rouge2_precision": 0.16514863563264978, "rouge2_precision_stderr": 0.0019360690941877992, "rouge2_recall": 0.20649495583013436, "rouge2_recall_stderr": 0.0021624061718912763, "rougeL_fmeasure": 0.2901588169021053, "rougeL_fmeasure_stderr": 0.0017017464729554115, "rougeL_precision": 0.2743117208263101, "rougeL_precision_stderr": 0.0021002662608094564, "rougeL_recall": 0.34924701312209255, "rougeL_recall_stderr": 0.0023705538319379695, "rougeLsum_fmeasure": 0.327671306182913, "rougeLsum_fmeasure_stderr": 0.0022171207606254796, "rougeLsum_precision": 0.31201556645452433, "rougeLsum_precision_stderr": 0.002622342685847327, "rougeLsum_recall": 0.3885350242195471, "rougeLsum_recall_stderr": 0.0027432106106179788}}, "2": {"generate_text_restaurant": {"bleu": 11.867888027230435, "bleu_stderr": 0.15989930331829488, "rouge1_fmeasure": 0.4442817836593718, "rouge1_fmeasure_stderr": 0.0019988950330102873, "rouge1_precision": 0.44246406218448636, "rouge1_precision_stderr": 0.002320846131050864, "rouge1_recall": 0.4822237492014395, "rouge1_recall_stderr": 0.0028480697292934715, "rouge2_fmeasure": 0.20571413885055867, "rouge2_fmeasure_stderr": 0.001815833869403785, "rouge2_precision": 0.20447987966953268, "rouge2_precision_stderr": 0.0019087131351150518, "rouge2_recall": 0.22575098541036898, "rouge2_recall_stderr": 0.0022679440038537117, "rougeL_fmeasure": 0.3232871465388061, "rougeL_fmeasure_stderr": 0.0017540023019821938, "rougeL_precision": 0.3220661789895318, "rougeL_precision_stderr": 0.0019807481977887516, "rougeL_recall": 0.35167106294020406, "rougeL_recall_stderr": 0.002412719701186727, "rougeLsum_fmeasure": 0.37187691948404467, "rougeLsum_fmeasure_stderr": 0.0020334507090512886, "rougeLsum_precision": 0.37040704761920407, "rougeLsum_precision_stderr": 0.002271535999762247, "rougeLsum_recall": 0.4036635693660681, "rougeLsum_recall_stderr": 0.0027100283713887775}}, "3": {"generate_text_restaurant": {"bleu": 12.176808245577714, "bleu_stderr": 0.1522141950985559, "rouge1_fmeasure": 0.4492578440402226, "rouge1_fmeasure_stderr": 0.0019667066974463896, "rouge1_precision": 0.44578562677217626, "rouge1_precision_stderr": 
0.0022755220565382653, "rouge1_recall": 0.4889266631673112, "rouge1_recall_stderr": 0.0028806835048615135, "rouge2_fmeasure": 0.21198726153120898, "rouge2_fmeasure_stderr": 0.0018195540133290194, "rouge2_precision": 0.20934965847705986, "rouge2_precision_stderr": 0.0018748523719357, "rouge2_recall": 0.23374791215018093, "rouge2_recall_stderr": 0.0023263783868214196, "rougeL_fmeasure": 0.3276087887468011, "rougeL_fmeasure_stderr": 0.0017546007227396715, "rougeL_precision": 0.32511799875091574, "rougeL_precision_stderr": 0.001967046506806585, "rougeL_recall": 0.35739711489377646, "rougeL_recall_stderr": 0.002465697341506471, "rougeLsum_fmeasure": 0.3771411500357944, "rougeLsum_fmeasure_stderr": 0.0020137589520510426, "rougeLsum_precision": 0.3741382823179302, "rougeLsum_precision_stderr": 0.002234766274837387, "rougeLsum_recall": 0.4108171966535684, "rougeLsum_recall_stderr": 0.0027621282248548955}}, "4": {"generate_text_restaurant": {"bleu": 12.402384292539924, "bleu_stderr": 0.135024423037405, "rouge1_fmeasure": 0.4552455893734124, "rouge1_fmeasure_stderr": 0.0019583919126399917, "rouge1_precision": 0.4500297698600483, "rouge1_precision_stderr": 0.002279184782036985, "rouge1_recall": 0.49411520793728253, "rouge1_recall_stderr": 0.0028065754284846934, "rouge2_fmeasure": 0.2155975402534293, "rouge2_fmeasure_stderr": 0.001873826925854458, "rouge2_precision": 0.2124489885242478, "rouge2_precision_stderr": 0.001924872387744921, "rouge2_recall": 0.23653217550233827, "rouge2_recall_stderr": 0.0023355573505469554, "rougeL_fmeasure": 0.33116918396735845, "rougeL_fmeasure_stderr": 0.0017792810189211944, "rougeL_precision": 0.32709295569865143, "rougeL_precision_stderr": 0.001956608790763274, "rougeL_recall": 0.3602236836443227, "rougeL_recall_stderr": 0.002427496751180958, "rougeLsum_fmeasure": 0.3808847920405725, "rougeLsum_fmeasure_stderr": 0.002065156905180787, "rougeLsum_precision": 0.37596847671524797, "rougeLsum_precision_stderr": 0.002251960910942368, "rougeLsum_recall": 0.41399734380099557, "rougeLsum_recall_stderr": 0.0027661212857413163}}, "5": {"generate_text_restaurant": {"bleu": 12.262420161138401, "bleu_stderr": 0.18903914838075944, "rouge1_fmeasure": 0.45590768378496416, "rouge1_fmeasure_stderr": 0.001962173998693794, "rouge1_precision": 0.4494732628845236, "rouge1_precision_stderr": 0.0023023795088449608, "rouge1_recall": 0.49537684561519607, "rouge1_recall_stderr": 0.002767197249479356, "rouge2_fmeasure": 0.21627498227149344, "rouge2_fmeasure_stderr": 0.0018499821709783027, "rouge2_precision": 0.21297442064638186, "rouge2_precision_stderr": 0.001926070075686657, "rouge2_recall": 0.23703970881860875, "rouge2_recall_stderr": 0.0022735756287929677, "rougeL_fmeasure": 0.3331238951307013, "rougeL_fmeasure_stderr": 0.0017799650141595914, "rougeL_precision": 0.3281425210661801, "rougeL_precision_stderr": 0.001972901049188069, "rougeL_recall": 0.3628730729029585, "rougeL_recall_stderr": 0.0024176721948621425, "rougeLsum_fmeasure": 0.38246870938641964, "rougeLsum_fmeasure_stderr": 0.0020417804934292076, "rougeLsum_precision": 0.3768630011135112, "rougeLsum_precision_stderr": 0.0022664490914563948, "rougeLsum_recall": 0.41605530731114854, "rougeLsum_recall_stderr": 0.0027097026120309876}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.24608084382355, "bleu_stderr": 0.11799890497604383, "rouge1_fmeasure": 0.22178670977262957, "rouge1_fmeasure_stderr": 0.0026603762671440385, "rouge1_precision": 0.17400888678487844, "rouge1_precision_stderr": 0.002464330237468343, "rouge1_recall": 
0.3502023417511915, "rouge1_recall_stderr": 0.004626152702942569, "rouge2_fmeasure": 0.05332080380122619, "rouge2_fmeasure_stderr": 0.0017188362970072153, "rouge2_precision": 0.040704743736010573, "rouge2_precision_stderr": 0.001393829442555013, "rouge2_recall": 0.08799878519146338, "rouge2_recall_stderr": 0.0029196986234636744, "rougeL_fmeasure": 0.16609949129334142, "rougeL_fmeasure_stderr": 0.0020543947028805716, "rougeL_precision": 0.12992137120722907, "rougeL_precision_stderr": 0.0018781656373678345, "rougeL_recall": 0.2641999836224796, "rougeL_recall_stderr": 0.0036812097901456225, "rougeLsum_fmeasure": 0.17331928576093303, "rougeLsum_fmeasure_stderr": 0.0022434557390865763, "rougeLsum_precision": 0.13526811734802355, "rougeLsum_precision_stderr": 0.001975414967835506, "rougeLsum_recall": 0.27591607890974656, "rougeLsum_recall_stderr": 0.004020504841332198}}, "1": {"article_DOC_summary": {"bleu": 1.8189118554354626, "bleu_stderr": 0.10029587869243096, "rouge1_fmeasure": 0.19671334605636295, "rouge1_fmeasure_stderr": 0.0027188446769093686, "rouge1_precision": 0.14019002697714425, "rouge1_precision_stderr": 0.002033439876939429, "rouge1_recall": 0.3437328928644267, "rouge1_recall_stderr": 0.004622184360550014, "rouge2_fmeasure": 0.04451173090845151, "rouge2_fmeasure_stderr": 0.0016237255917685457, "rouge2_precision": 0.031342583459769556, "rouge2_precision_stderr": 0.0011487670391079187, "rouge2_recall": 0.0803191311481453, "rouge2_recall_stderr": 0.003010334414958687, "rougeL_fmeasure": 0.14825510655652663, "rougeL_fmeasure_stderr": 0.0020028300424767575, "rougeL_precision": 0.1053445488770993, "rougeL_precision_stderr": 0.001476955053927121, "rougeL_recall": 0.2613986979531857, "rougeL_recall_stderr": 0.0035961912243557873, "rougeLsum_fmeasure": 0.1592072439127538, "rougeLsum_fmeasure_stderr": 0.002270863529627419, "rougeLsum_precision": 0.11322117707364136, "rougeLsum_precision_stderr": 0.001678631461460253, "rougeLsum_recall": 0.27993778037490374, "rougeLsum_recall_stderr": 0.003988822898958396}}, "2": {"article_DOC_summary": {"bleu": 2.0454443359589503, "bleu_stderr": 0.08368970692051826, "rouge1_fmeasure": 0.20889368966335237, "rouge1_fmeasure_stderr": 0.0026887281040736378, "rouge1_precision": 0.1488455450923004, "rouge1_precision_stderr": 0.002009481565468108, "rouge1_recall": 0.3642751817540943, "rouge1_recall_stderr": 0.004565498697439836, "rouge2_fmeasure": 0.05083267238349886, "rouge2_fmeasure_stderr": 0.001670794310427006, "rouge2_precision": 0.03580310577134537, "rouge2_precision_stderr": 0.0011818056791973873, "rouge2_recall": 0.09140711571916593, "rouge2_recall_stderr": 0.003096432205185466, "rougeL_fmeasure": 0.15708491403271296, "rougeL_fmeasure_stderr": 0.0019982552541995204, "rougeL_precision": 0.11164126156649677, "rougeL_precision_stderr": 0.001473760236619727, "rougeL_recall": 0.27611637516436727, "rougeL_recall_stderr": 0.003575935893484929, "rougeLsum_fmeasure": 0.1672065849237029, "rougeLsum_fmeasure_stderr": 0.002247088064282535, "rougeLsum_precision": 0.1188385887857819, "rougeLsum_precision_stderr": 0.0016510150185409913, "rougeLsum_recall": 0.2937542835590118, "rougeLsum_recall_stderr": 0.003999046140359821}}, "3": {"article_DOC_summary": {"bleu": 2.0101663995130106, "bleu_stderr": 0.13035938633140529, "rouge1_fmeasure": 0.2005996992165322, "rouge1_fmeasure_stderr": 0.002922307185273832, "rouge1_precision": 0.1451633530537778, "rouge1_precision_stderr": 0.0022577436785926417, "rouge1_recall": 0.3454037481783916, "rouge1_recall_stderr": 
0.005024002677948197, "rouge2_fmeasure": 0.047805324803931785, "rouge2_fmeasure_stderr": 0.001665552432783648, "rouge2_precision": 0.03399383589660795, "rouge2_precision_stderr": 0.0011970297160356436, "rouge2_recall": 0.08515533208064306, "rouge2_recall_stderr": 0.00305750154560904, "rougeL_fmeasure": 0.15011348459355398, "rougeL_fmeasure_stderr": 0.0021656916632942095, "rougeL_precision": 0.1082275772002201, "rougeL_precision_stderr": 0.0016303512277000601, "rougeL_recall": 0.26018618860799503, "rougeL_recall_stderr": 0.0038502171494847245, "rougeLsum_fmeasure": 0.1614133142157355, "rougeLsum_fmeasure_stderr": 0.00243279237750366, "rougeLsum_precision": 0.1163681459054013, "rougeLsum_precision_stderr": 0.0018244541208628394, "rougeLsum_recall": 0.27969335970329556, "rougeLsum_recall_stderr": 0.004315552235909818}}, "4": {"article_DOC_summary": {"bleu": 1.1210416443624807, "bleu_stderr": 0.18886560236043304, "rouge1_fmeasure": 0.05612514432374489, "rouge1_fmeasure_stderr": 0.0030731828239370837, "rouge1_precision": 0.04604504358511682, "rouge1_precision_stderr": 0.0026703224008911334, "rouge1_recall": 0.08906761715367195, "rouge1_recall_stderr": 0.0050274498862542885, "rouge2_fmeasure": 0.013937278041326827, "rouge2_fmeasure_stderr": 0.0011465942789645456, "rouge2_precision": 0.011276377923656494, "rouge2_precision_stderr": 0.0012571358140648242, "rouge2_recall": 0.023708158418892494, "rouge2_recall_stderr": 0.002002425619950796, "rougeL_fmeasure": 0.04184105680154411, "rougeL_fmeasure_stderr": 0.002285547204915818, "rougeL_precision": 0.03500836518053345, "rougeL_precision_stderr": 0.0021377284451202998, "rougeL_recall": 0.06645887444884206, "rougeL_recall_stderr": 0.0037898342177074525, "rougeLsum_fmeasure": 0.04557476167159472, "rougeLsum_fmeasure_stderr": 0.002503478149457618, "rougeLsum_precision": 0.0378191473406664, "rougeLsum_precision_stderr": 0.002272186759930814, "rougeLsum_recall": 0.07247078079866072, "rougeLsum_recall_stderr": 0.004137833255697601}}, "5": {"article_DOC_summary": {"bleu": 1.7628232823336373e-17, "bleu_stderr": 2.8521803760603906e-14, "rouge1_fmeasure": 0.0026984759176508304, "rouge1_fmeasure_stderr": 0.000738763092128848, "rouge1_precision": 0.0022472741378328047, "rouge1_precision_stderr": 0.0006307356242527256, "rouge1_recall": 0.004133903246439151, "rouge1_recall_stderr": 0.0011668769662793363, "rouge2_fmeasure": 0.000551357929598578, "rouge2_fmeasure_stderr": 0.00022851833366345053, "rouge2_precision": 0.00044268370843308837, "rouge2_precision_stderr": 0.00017799352097082119, "rouge2_recall": 0.0008499613285961788, "rouge2_recall_stderr": 0.0003602171813535447, "rougeL_fmeasure": 0.001991063584250406, "rougeL_fmeasure_stderr": 0.0005515138571257219, "rougeL_precision": 0.0016336554354190846, "rougeL_precision_stderr": 0.0004503176195113186, "rougeL_recall": 0.003027477992470196, "rougeL_recall_stderr": 0.0008730946685903953, "rougeLsum_fmeasure": 0.002097477776387118, "rougeLsum_fmeasure_stderr": 0.0005739873846094995, "rougeLsum_precision": 0.0017629252737840803, "rougeLsum_precision_stderr": 0.0005014954502254297, "rougeLsum_recall": 0.0031744555131817086, "rougeLsum_recall_stderr": 0.0008937483641872015}}}}
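The replacement merged.json keeps the same shape as before, now extended to fewshot settings 0-5: task -> fewshot (a string key) -> prompt name -> flat metric dict. A quick sketch of pulling one number out:

```python
import json

with open("8b7178b35b/evaluation/generation/merged.json") as f:
    merged = json.load(f)

# Fewshot counts are string keys; 5-shot xsum ROUGE-2 F1 under the
# article_DOC_summary prompt:
print(merged["gem_xsum"]["5"]["article_DOC_summary"]["rouge2_fmeasure"])
# -> 0.000551357929598578
```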
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.22135115607796563,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.002466762662359004
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.3499365920126553,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0029103670069339765
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.24678912656203086,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.002037101161511086
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.06270201642870808,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.001276661389687198
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.10236690363201062,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0019960722912629107
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.06998330324965386,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0012429781942009475
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.15820160856971907,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.001803134257764531
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.25667905638991584,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.002404233442497606
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.17719154028942327,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0014768440178427862
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.20961741686683294,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0023343675964983485
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.332399007963623,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0027852830969350697
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.2338283238156954,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0019162970817694011
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 3.9658166802535484,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.07925927931568665
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 2,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
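Each added slim.*.json follows the layout just shown: a "results" list in which every entry carries exactly one metric value plus its matching *_stderr key, alongside repeated task metadata, and a "config" block recording the run settings (here num_fewshot: 2). A sketch that flattens one of these files into a single metric dict:

```python
import json

META = {"task_name", "prompt_name", "dataset_path", "dataset_name", "subset"}

with open("8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json") as f:
    slim = json.load(f)

# Merge the one-metric-per-entry records into a flat {metric: value} dict.
metrics = {k: v for entry in slim["results"] for k, v in entry.items() if k not in META}
print(metrics["rouge2_fmeasure"], slim["config"]["num_fewshot"])  # 0.06998330324965386 2
```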
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.1877914020690942,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.002780603639421943
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.28469073541532774,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.003552554685317243
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.2013934895923683,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0024101386411320806
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.05231468260770844,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.00130393473012376
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.08172649028680513,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0018880261158435462
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.05612430703445463,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0011944949124968374
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.1363160410891958,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0021057536768160825
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.21040942089474482,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.002825298497100027
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.14593849406958256,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0017576382763014175
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.17789698988425304,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0026448838544979162
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.2699836129539428,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.003393162716447727
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.19069625305891352,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.002278271153942546
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 3.971856308559798,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.09289145672499902
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.05986373469352374,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0022413247620520016
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.09299451528533728,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.003182662484214728
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.06294177276794305,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0021234318631902995
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.01607251699061772,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0008627575848833899
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.026217246289428207,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0013573347317069598
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.01701967942257221,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0008072426334728445
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.04416852187112644,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0016929957101032731
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.06977139563193656,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.002453474360518891
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.04627660806628133,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.001562406165753214
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.056539524010580496,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.002122275707889583
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.08802275906234883,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.00301921948656467
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.05949644164272974,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.002009657736950104
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 0.8216828826403728,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.06116206229347704
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.010026792511549231,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0009722843833023145
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.014997054249515373,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0014200291615747175
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.010348132353778797,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0009643798928416376
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.002614111292262381,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0003582186570242107
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.0043272321523977674,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.000556694180882867
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.002947850654970828,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.00038101201967536574
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.007696209279533862,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0007541223407519663
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.01149712833393674,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.001106103576275103
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.007846213305722558,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0007330941188437008
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.009502293292635411,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0009264781441363666
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.01418906709799213,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0013478881437725126
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.009790499222718256,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0009156513115428641
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 2.939501013520158e-06,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 6.1416042954126946e-06
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.1451633530537778,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0022577436785926417
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.3454037481783916,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.005024002677948197
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.2005996992165322,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.002922307185273832
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.03399383589660795,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.0011970297160356436
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.08515533208064306,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.00305750154560904
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.047805324803931785,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.001665552432783648
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.1082275772002201,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0016303512277000601
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.26018618860799503,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0038502171494847245
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.15011348459355398,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0021656916632942095
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.1163681459054013,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.0018244541208628394
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.27969335970329556,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.004315552235909818
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.1614133142157355,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.00243279237750366
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 2.0101663995130106,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.13035938633140529
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.04604504358511682,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0026703224008911334
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.08906761715367195,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0050274498862542885
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.05612514432374489,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.0030731828239370837
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.011276377923656494,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.0012571358140648242
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.023708158418892494,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.002002425619950796
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.013937278041326827,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.0011465942789645456
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.03500836518053345,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0021377284451202998
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.06645887444884206,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0037898342177074525
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.04184105680154411,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.002285547204915818
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.0378191473406664,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.002272186759930814
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.07247078079866072,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.004137833255697601
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.04557476167159472,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.002503478149457618
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 1.1210416443624807,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.18886560236043304
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.0022472741378328047,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0006307356242527256
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.004133903246439151,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0011668769662793363
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.0026984759176508304,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.000738763092128848
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.00044268370843308837,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.00017799352097082119
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.0008499613285961788,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.0003602171813535447
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.000551357929598578,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.00022851833366345053
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.0016336554354190846,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0004503176195113186
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.003027477992470196,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0008730946685903953
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.001991063584250406,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0005515138571257219
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.0017629252737840803,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.0005014954502254297
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.0031744555131817086,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.0008937483641872015
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.002097477776387118,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.0005739873846094995
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 1.7628232823336373e-17,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 2.8521803760603906e-14
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
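The fewshot count appears both as the trailing digit of each slim filename and as "num_fewshot" in each file's "config" block, so the series of files added above can be scanned to see how a metric moves with the number of shots. A sketch under the same illustrative assumptions as the reader above:

import glob
import json

META_KEYS = {"task_name", "prompt_name", "dataset_path", "dataset_name", "subset"}

# One slim file per fewshot setting; print rouge1_fmeasure per shot count.
for path in sorted(glob.glob(
    "8b7178b35b/evaluation/generation/slim.*_gem_xsum_article_DOC_summary_*.json"
)):
    with open(path) as f:
        data = json.load(f)
    metrics = {k: v for rec in data["results"] for k, v in rec.items() if k not in META_KEYS}
    print(data["config"]["num_fewshot"], round(metrics["rouge1_fmeasure"], 4))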
8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.24807700870805302, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032686331426239274}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2993366704587394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028897035935155396}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.23240946839912788, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020535331913193833}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07542612135833042, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019732499800712644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.08771578704686069, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001852341009446473}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06722787106866046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001306015352926357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19005839255632967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002778132688091869}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.22871074992869012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002406566174307835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17544324351165277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001595587702286189}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.23648500820691407, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031596714430369476}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.285430067349801, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027751440475442097}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2213635199796785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019597819111732734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.6209568989964964, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05693014182524011}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.23084151108139417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003999536576630713}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22564401884098545, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00322268288462845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18788478661055097, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002472024769674421}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07199814947709587, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002257359657010455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0658546131120379, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016996600071196948}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0547277065977853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001316330403077771}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.1812422041327713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0033598941031452102}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.17549664105843907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00262690640098719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1450771150583464, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019574386379697857}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.21961824449300582, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003836248551627411}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21459589482318636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003079841928060846}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17866009428807192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023640874867847488}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.2600101837197197, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04841341881147457}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.07762612304263315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0032117324014510297}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07075067661625373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00266141134016641}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05902884784226779, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021304994672767174}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.024715265794116245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0015375396891910912}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02194236604474347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001230091261779581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01811109085502527, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009535025790089693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06261195931905689, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002684073457127978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.0564250650958304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021744446112801413}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04687343959092794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00171937320425508}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07398653730135701, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003081234652717845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06720084909348248, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002527336091137797}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.056127739240012624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002031189471014326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.6197416585249048, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.050726126214537225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.012749197720849256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013678224737624083}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011454983373776283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001152103487332925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.01025088836822709, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010237064667092625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004106263613998532, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006621811206101381}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003568369407358366, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00047406732675066786}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.003230399473756007, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004373491168596821}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.01021272837958567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001137838670735105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009264383578933708, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009514107080117658}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008151883997164445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008219787823794978}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.012094227953101243, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013067091712138548}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010971397408480117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011066936947192276}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.00977974159773322, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009796733551025018}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.394475685873892e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.020668525155571e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-c4-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}