Muennighoff committed on
Commit
9defb95
1 Parent(s): 5c159b1
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +1 -0
  2. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  3. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  4. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  5. 8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  6. 8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json +1 -0
  7. 8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json +1 -0
  8. 8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json +1 -0
  9. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
  10. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
  11. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
  12. 8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  13. 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
  14. 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  15. 8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  16. 8b7178b25b/evaluation/generation/merged.csv +16 -2
  17. 8b7178b25b/evaluation/generation/merged.json +1 -1
  18. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  19. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  20. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  21. 8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  22. 8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json +133 -0
  23. 8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json +133 -0
  24. 8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json +133 -0
  25. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  26. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  27. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  28. 8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  29. 8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json +1 -0
  30. 8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json +1 -0
  31. 8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json +1 -0
  32. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
  33. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
  34. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
  35. 8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  36. 8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
  37. 8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  38. 8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  39. 8b7178b35b/evaluation/generation/merged.csv +16 -2
  40. 8b7178b35b/evaluation/generation/merged.json +1 -1
  41. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  42. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  43. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  44. 8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  45. 8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json +133 -0
  46. 8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json +133 -0
  47. 8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json +133 -0
  48. 8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  49. 8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  50. 8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
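All of the result files follow one naming convention: `<model>/evaluation/generation/<kind>.<model>_<task>_<prompt>_<fewshot>.<json|jsonl>`, where the trailing digit matches `num_fewshot` in each file's config block. A minimal parsing sketch (the regex is an assumption inferred from the listing above, not part of the repo):

```python
# A minimal sketch splitting a result filename into its parts; the regex is
# an assumption based on the file listing in this commit.
import re

name = "agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json"
m = re.match(r"(agg|examples|slim)\.([0-9a-z]+)_(.+)_(\d+)\.(jsonl?)$", name)
kind, model, task_prompt, fewshot, ext = m.groups()
print(kind, model, task_prompt, fewshot)
# -> agg 8b7178b25b GEM-wiki_lingua_en_tldr_en 2
```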
.gitattributes CHANGED
@@ -153,3 +153,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  8b7178b25b/evaluation/generation/examples.8b7178b25b_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
  8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
  8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+ */evaluatuon/generation/examples*.jsonl filter=lfs diff=lfs merge=lfs -text
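Note that the added pattern spells `evaluatuon` rather than `evaluation`, so as written it would not match the example paths listed above; the pre-existing per-file rules still cover them. A minimal sketch, assuming it is run from the root of a local checkout, that asks git which attributes actually apply to one of the example files:

```python
# A minimal sketch: ask git which attributes apply to one example file,
# to see whether an LFS rule in .gitattributes matches it.
import subprocess

path = ("8b7178b25b/evaluation/generation/"
        "examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl")
out = subprocess.run(
    ["git", "check-attr", "filter", "diff", "merge", "--", path],
    capture_output=True, text=True, check=True,
)
print(out.stdout)  # e.g. "<path>: filter: lfs" when a rule matches
```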
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.30679877376441267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003595563898423582}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.25602196677157735, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002878244091822486}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24392276306894944, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023000547396419086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.09387841885638748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002194124801129067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0740439804559834, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001644608125064704}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.07100355255541753, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014153910474231026}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.23412738417408802, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029250825035858734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19426592228730813, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023089496783281213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.18445520873996413, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001797621869515743}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.28859199947210284, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003452141980034977}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23966421160449644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027125548096589747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.2286765314746336, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021801688851002263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.074528254683577, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10710940903995719}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.272311629129121, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0039505188546921826}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.21816270627496134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032448735585025937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.2094195871488414, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027068429969493734}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08329439938013247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021782819294409984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06480971019253767, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016827399941316222}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06204427812400913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014401113659882462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.20896331681491714, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031500865399556595}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1664625275086072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025592493419649177}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1590668148316741, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020766324795938023}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2563037368454477, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0037619221351167056}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.20447650361202882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030540843932082493}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.1964756114951262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0025500807038816515}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.168617248016535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08041979746938775}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0910819200952716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0033747683121654196}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07028567198192245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002640964970940966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06757697669893872, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023707490073172043}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.028646478253466095, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016076285180980135}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.020868526921797886, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011537893245677335}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.019948424307073904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000986170441488306}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07082578661014885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002691634510208164}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05448506471222756, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020854622601667937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05201990108593777, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018314763879696625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08551950005220435, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0031983492312005384}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06543250975997009, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024532813023277916}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06308719820573185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002213451741392458}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.07788444152267485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01315525656911281}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.014515441910190443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001501466217296952}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011301943813768834, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012288119306628567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.010473201550267177, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010554421691169314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004591980814271401, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006515529824929627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.00416888931272546, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005930107839352501}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0035604598673033626, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004625418185814249}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.011779972055249343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012586537746088814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009040049894954919, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009873007391796482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.008371164394493067, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008468942819407445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.013708349668480058, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014293611241212706}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010579263673799228, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011521180305532216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.009841875351812794, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.000994934070773035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.4470192084599072e-15, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.1547920765171173e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.2511767032658256, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004670743744963309}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.2497224746972405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004054437554455434}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.23697855013529284, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003931784147472208}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.06435388180319074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002934840810442915}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.060376215777618605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002521653823700491}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.059313458488013185, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0025641694242472487}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.19057534922020994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038620821553629983}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1901634518957205, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0033714037003502494}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.17977672776005849, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00326317345808302}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.19211901704097034, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003850775083612694}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.19288110429850555, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003421247538700275}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.18163235303784986, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003264460230665997}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.267835808853254, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.26356764701611307}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06645727693786248, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004158365862658779}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.061174931724787315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003797370735030977}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05891769720848019, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0035369396004201713}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.01730890961224063, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017698712455992984}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.016289532767626923, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017234032328131926}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.015297632126595406, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00149002417901287}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.050637820115463826, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032826504244346}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04683971382625969, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003039332848507969}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04476173097758684, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027730017560448727}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.051192679124604304, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0033047254109378095}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0475395827212547, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030900986283582004}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04533434532971928, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0028025558912498296}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.17970307621766216, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04965869269887041}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/agg.8b7178b25b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0025272416860947825, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007813236132809655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002586618259740399, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008670624223009818}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0025036711184808556, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000808568948388182}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0003236042782341223, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001381414258477455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0003324696313480705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00015129466488844655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0003242107125389267, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00014221025181447073}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0016454419652040672, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004929867496820194}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.001639076487035711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005281423440330004}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.00159555474061474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004956987539243005}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0016454419652040672, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004929867496820194}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.001639076487035711, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005281423440330004}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.00159555474061474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004956987539243005}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.393056311124318e-43, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.2487431746784071e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a65f5bdecdcdbda148d7cf5f9a8083572b9b5126c74c6d30245dd0c6cd2c9014
+ size 18548218
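Each `examples.*.jsonl` entry above is stored as a Git LFS pointer: a three-line stub recording the spec version, the sha256 of the real content (`oid`), and its size in bytes. A minimal sketch that checks a downloaded file against the pointer (the path and a full local download are assumptions):

```python
# A minimal sketch verifying a downloaded examples file against its
# Git LFS pointer: the pointer stores the content's sha256 and byte size.
import hashlib

path = "8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.jsonl"
expected_oid = "a65f5bdecdcdbda148d7cf5f9a8083572b9b5126c74c6d30245dd0c6cd2c9014"
expected_size = 18548218

with open(path, "rb") as f:
    data = f.read()

assert len(data) == expected_size
assert hashlib.sha256(data).hexdigest() == expected_oid
```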
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5374e9109304e0df5ed2574cfc9f109a1bb720175a205d37d9ac84ded27edfb
+ size 24030292
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:310cb6d4407a529f7d950d4793387bcd9f135ca965d953c9b52223fe42eb205c
+ size 29368906
8b7178b25b/evaluation/generation/examples.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfcf599522eee8de2bb128abdb6f9b462612cd1e029f7e9fff32a4a6eea5edf7
+ size 34782587
8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b173fc150a337823b0701d7368bfb431e82df82576249bb67449c03fcb9ecdd
+ size 9475426
8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46702c5bc39b1b96361477587114a9a3aa24c24502dbe7b328a34ab3585bef66
+ size 11630243
8b7178b25b/evaluation/generation/examples.8b7178b25b_gem_xsum_article_DOC_summary_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e07d25012d66b17c9c8318583932c7cae300c19f37c931271015911964b1436a
+ size 13897359
8b7178b25b/evaluation/generation/merged.csv CHANGED
@@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.05891708042714147
  gem_xsum,1,median,rouge2_fmeasure,0.05891708042714147
  gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.06271701269177109
  gem_xsum,2,median,rouge2_fmeasure,0.06271701269177109
- gem_xsum,2,average,multiple,0.05525598610960542
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.059313458488013185
+ gem_xsum,3,median,rouge2_fmeasure,0.059313458488013185
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.015297632126595406
+ gem_xsum,4,median,rouge2_fmeasure,0.015297632126595406
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003242107125389267
+ gem_xsum,5,median,rouge2_fmeasure,0.0003242107125389267
+ gem_xsum,5,average,multiple,0.04011720994266063
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04353421604632926
  web_nlg_en,0,median,rouge2_fmeasure,0.04353421604632926
  web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.07820906828691397
@@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04445750772336212
  wiki_lingua_en,0,median,rouge2_fmeasure,0.04445750772336212
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.053780768787129375
  wiki_lingua_en,1,median,rouge2_fmeasure,0.053780768787129375
- wiki_lingua_en,1,average,multiple,0.04911913825524575
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07100355255541753
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.07100355255541753
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.06204427812400913
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.06204427812400913
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.019948424307073904
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.019948424307073904
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0035604598673033626
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0035604598673033626
+ wiki_lingua_en,5,average,multiple,0.04246583189404924
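The new `average` rows can be re-derived from the `median` rows: for each task, the value recorded at the highest fewshot setting equals the mean of the per-fewshot medians (the six wiki_lingua_en medians average to exactly 0.04246583189404924). A minimal sketch, assuming merged.csv has no header row and the five columns suggested by the rows above:

```python
# A minimal sketch (column names are assumptions) re-deriving the "average"
# rows in merged.csv as the mean of each task's per-fewshot median rows.
import pandas as pd

cols = ["task", "fewshot", "prompt", "metric", "value"]
df = pd.read_csv("8b7178b25b/evaluation/generation/merged.csv", names=cols)

medians = df[df["prompt"] == "median"]
print(medians.groupby("task")["value"].mean())
# e.g. wiki_lingua_en -> 0.04246583189404924, matching its average row
```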
8b7178b25b/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33618386090404295, "bleu_stderr": 0.043286547374004225, "rouge1_fmeasure": 0.09550360928926843, "rouge1_fmeasure_stderr": 0.0021611192824620468, "rouge1_precision": 0.07608115633976532, "rouge1_precision_stderr": 0.0028424335866037475, "rouge1_recall": 0.243734587530279, "rouge1_recall_stderr": 0.005032751672173048, "rouge2_fmeasure": 0.04353421604632926, "rouge2_fmeasure_stderr": 0.0013022741040944032, "rouge2_precision": 0.03391274645846755, "rouge2_precision_stderr": 0.0017782441786429572, "rouge2_recall": 0.11477361563470867, "rouge2_recall_stderr": 0.0030177112931945443, "rougeL_fmeasure": 0.0908905716888711, "rougeL_fmeasure_stderr": 0.0019748576875776485, "rougeL_precision": 0.07249954185930807, "rougeL_precision_stderr": 0.002716866763111571, "rougeL_recall": 0.23447583587320983, "rougeL_recall_stderr": 0.004820896551192642, "rougeLsum_fmeasure": 0.09063319866429098, "rougeLsum_fmeasure_stderr": 0.002018782468480028, "rougeLsum_precision": 0.07260565563718514, "rougeLsum_precision_stderr": 0.0027622865524293628, "rougeLsum_recall": 0.2312007323793673, "rougeLsum_recall_stderr": 0.004666750865618609}}, "1": {"PALM_prompt": {"bleu": 0.5152510207869508, "bleu_stderr": 0.039697768768641464, "rouge1_fmeasure": 0.15419793855600986, "rouge1_fmeasure_stderr": 0.0037752055082234935, "rouge1_precision": 0.1335251141437586, "rouge1_precision_stderr": 0.004421796872864802, "rouge1_recall": 0.3040669240102272, "rouge1_recall_stderr": 0.0049926465687368735, "rouge2_fmeasure": 0.07820906828691397, "rouge2_fmeasure_stderr": 0.0026222161094111586, "rouge2_precision": 0.06857339375782437, "rouge2_precision_stderr": 0.0030273264083127568, "rouge2_recall": 0.1555986112064758, "rouge2_recall_stderr": 0.0036341014671041087, "rougeL_fmeasure": 0.13965714245032726, "rougeL_fmeasure_stderr": 0.0032487392690607572, "rougeL_precision": 0.1195649771956543, "rougeL_precision_stderr": 0.0038518942294089972, "rougeL_recall": 0.2838951533263151, "rougeL_recall_stderr": 0.004599064364517073, "rougeLsum_fmeasure": 0.14150307262693815, "rougeLsum_fmeasure_stderr": 0.0033007395324502467, "rougeLsum_precision": 0.12140487798820795, "rougeLsum_precision_stderr": 0.003915984273012587, "rougeLsum_recall": 0.2857833042522088, "rougeLsum_recall_stderr": 0.004601461544101827}}, "2": {"PALM_prompt": {"bleu": 0.6450721424031474, "bleu_stderr": 0.035938592978876936, "rouge1_fmeasure": 0.19228537050298922, "rouge1_fmeasure_stderr": 0.004356183491682661, "rouge1_precision": 0.17715496451419446, "rouge1_precision_stderr": 0.005377686826508908, "rouge1_recall": 0.3442204927205235, "rouge1_recall_stderr": 0.00499209962075588, "rouge2_fmeasure": 0.1017490401630499, "rouge2_fmeasure_stderr": 0.003080557280075834, "rouge2_precision": 0.09659406582034752, "rouge2_precision_stderr": 0.0036720254666308344, "rouge2_recall": 0.1816245156374427, "rouge2_recall_stderr": 0.0038421174433974025, "rougeL_fmeasure": 0.17158392384169643, "rougeL_fmeasure_stderr": 0.0037128302671958994, "rougeL_precision": 0.15519367504713033, "rougeL_precision_stderr": 0.004580706449804959, "rougeL_recall": 0.31893881029765586, "rougeL_recall_stderr": 0.004597766706728441, "rougeLsum_fmeasure": 0.17498254398589475, "rougeLsum_fmeasure_stderr": 0.0037985612156152086, "rougeLsum_precision": 0.1592800915810083, "rougeLsum_precision_stderr": 0.0047149524911017215, "rougeLsum_recall": 0.3221825821541532, "rougeLsum_recall_stderr": 0.0046252310161258746}}, "3": {"PALM_prompt": {"bleu": 0.8785043636862591, 
"bleu_stderr": 0.02652000083973547, "rouge1_fmeasure": 0.21217682060449367, "rouge1_fmeasure_stderr": 0.004581243352642801, "rouge1_precision": 0.2002191357399255, "rouge1_precision_stderr": 0.005700191155087746, "rouge1_recall": 0.3622650735119951, "rouge1_recall_stderr": 0.005105318033672305, "rouge2_fmeasure": 0.11312048852329888, "rouge2_fmeasure_stderr": 0.003186877123824252, "rouge2_precision": 0.10955905558045385, "rouge2_precision_stderr": 0.0038849982073218873, "rouge2_recall": 0.19262377036723086, "rouge2_recall_stderr": 0.003902442560309681, "rougeL_fmeasure": 0.1874319081280966, "rougeL_fmeasure_stderr": 0.0038899168243038715, "rougeL_precision": 0.1744946205047383, "rougeL_precision_stderr": 0.004899868234689971, "rougeL_recall": 0.33190561350734876, "rougeL_recall_stderr": 0.004628476785750535, "rougeLsum_fmeasure": 0.19207381785134825, "rougeLsum_fmeasure_stderr": 0.004003703743860644, "rougeLsum_precision": 0.17972186968155374, "rougeLsum_precision_stderr": 0.005058207025529124, "rougeLsum_recall": 0.33723235220496645, "rougeLsum_recall_stderr": 0.0046889691038068775}}, "4": {"PALM_prompt": {"bleu": 1.020225734756704, "bleu_stderr": 0.06946466420275214, "rouge1_fmeasure": 0.22615455778377724, "rouge1_fmeasure_stderr": 0.004584303113870005, "rouge1_precision": 0.21447559212644274, "rouge1_precision_stderr": 0.005787602630220149, "rouge1_recall": 0.3797811786576114, "rouge1_recall_stderr": 0.005078572769442694, "rouge2_fmeasure": 0.12153309984630654, "rouge2_fmeasure_stderr": 0.0032009564132827184, "rouge2_precision": 0.11802182161980528, "rouge2_precision_stderr": 0.003954053081043237, "rouge2_recall": 0.20495230642568182, "rouge2_recall_stderr": 0.003956396099130858, "rougeL_fmeasure": 0.1985084531863174, "rougeL_fmeasure_stderr": 0.003863020108912202, "rougeL_precision": 0.18574729294363898, "rougeL_precision_stderr": 0.00493965653212434, "rougeL_recall": 0.34661430398431914, "rougeL_recall_stderr": 0.004643977843173891, "rougeLsum_fmeasure": 0.20493355705865743, "rougeLsum_fmeasure_stderr": 0.004011218572127487, "rougeLsum_precision": 0.19295820334137398, "rougeLsum_precision_stderr": 0.005149923293184489, "rougeLsum_recall": 0.3532599821115898, "rougeLsum_recall_stderr": 0.004685486372918684}}, "5": {"PALM_prompt": {"bleu": 1.1738991703843653, "bleu_stderr": 0.05137420728965795, "rouge1_fmeasure": 0.2452749113827306, "rouge1_fmeasure_stderr": 0.004866057995099286, "rouge1_precision": 0.23842752486609714, "rouge1_precision_stderr": 0.00616558462817229, "rouge1_recall": 0.39104608539227964, "rouge1_recall_stderr": 0.005079097315439687, "rouge2_fmeasure": 0.13673599319945717, "rouge2_fmeasure_stderr": 0.003545382189555846, "rouge2_precision": 0.13743673376990653, "rouge2_precision_stderr": 0.004448566773563913, "rouge2_recall": 0.21617363071605727, "rouge2_recall_stderr": 0.004065117072186819, "rougeL_fmeasure": 0.2145149869812429, "rougeL_fmeasure_stderr": 0.004135695136449003, "rougeL_precision": 0.20669362004199143, "rougeL_precision_stderr": 0.0053303444381744315, "rougeL_recall": 0.35345362943589836, "rougeL_recall_stderr": 0.004581784867877382, "rougeLsum_fmeasure": 0.22169261840373009, "rougeLsum_fmeasure_stderr": 0.004292938565416988, "rougeLsum_precision": 0.21482050441032427, "rougeLsum_precision_stderr": 0.005549487778372521, "rougeLsum_recall": 0.3612051449037077, "rougeLsum_recall_stderr": 0.004657096205531439}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.2204812581094187, "bleu_stderr": 0.0968937530189579, "rouge1_fmeasure": 0.17281704832161807, 
"rouge1_fmeasure_stderr": 0.0025639705499622615, "rouge1_precision": 0.16222110345712215, "rouge1_precision_stderr": 0.002918181626000938, "rouge1_recall": 0.23556142979563985, "rouge1_recall_stderr": 0.0035092798940514463, "rouge2_fmeasure": 0.04445750772336212, "rouge2_fmeasure_stderr": 0.0010694173040695087, "rouge2_precision": 0.04035916262517482, "rouge2_precision_stderr": 0.0011089852877097363, "rouge2_recall": 0.06187933259063034, "rouge2_recall_stderr": 0.0015927286355937283, "rougeL_fmeasure": 0.12828595447707558, "rougeL_fmeasure_stderr": 0.0018630698897748783, "rougeL_precision": 0.12115800294799915, "rougeL_precision_stderr": 0.0023302786580353405, "rougeL_recall": 0.1793437869920672, "rougeL_recall_stderr": 0.0027904364481447434, "rougeLsum_fmeasure": 0.16125234602130137, "rougeLsum_fmeasure_stderr": 0.0024054029356366125, "rougeLsum_precision": 0.15193645088089489, "rougeLsum_precision_stderr": 0.002794596188541928, "rougeLsum_recall": 0.22017606445792967, "rougeLsum_recall_stderr": 0.003307741344782461}}, "1": {"tldr_en": {"bleu": 3.030764799858383, "bleu_stderr": 0.10180137949258324, "rouge1_fmeasure": 0.20057722026752264, "rouge1_fmeasure_stderr": 0.00237727127672953, "rouge1_precision": 0.24835405674131716, "rouge1_precision_stderr": 0.003595946507957674, "rouge1_recall": 0.22065222520601965, "rouge1_recall_stderr": 0.0030326234032250735, "rouge2_fmeasure": 0.053780768787129375, "rouge2_fmeasure_stderr": 0.0012652221510456923, "rouge2_precision": 0.07072572329707479, "rouge2_precision_stderr": 0.002023483136430399, "rouge2_recall": 0.05888749376061464, "rouge2_recall_stderr": 0.0015271270172799948, "rougeL_fmeasure": 0.15099195793433165, "rougeL_fmeasure_stderr": 0.0017939149386485946, "rougeL_precision": 0.18962847615836217, "rougeL_precision_stderr": 0.0029056148560385174, "rougeL_recall": 0.16669593752772216, "rougeL_recall_stderr": 0.0023452388157941375, "rougeLsum_fmeasure": 0.18619742012806825, "rougeLsum_fmeasure_stderr": 0.0022080861119325576, "rougeLsum_precision": 0.23119707647225618, "rougeLsum_precision_stderr": 0.0034029730598757074, "rougeLsum_recall": 0.20514467867088854, "rougeLsum_recall_stderr": 0.0028358663517956315}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.8080075016885704, "bleu_stderr": 0.061027545228237855, "rouge1_fmeasure": 0.17861979049522314, "rouge1_fmeasure_stderr": 0.0018002125016461491, "rouge1_precision": 0.1356910382997071, "rouge1_precision_stderr": 0.0015410643240858764, "rouge1_recall": 0.2778453886224831, "rouge1_recall_stderr": 0.0023539268248121705, "rouge2_fmeasure": 0.046215425350075, "rouge2_fmeasure_stderr": 0.0010566397123776003, "rouge2_precision": 0.03490809623131593, "rouge2_precision_stderr": 0.0008357898041576343, "rouge2_recall": 0.07284634942140401, "rouge2_recall_stderr": 0.0015701168954429102, "rougeL_fmeasure": 0.16201597249905292, "rougeL_fmeasure_stderr": 0.0013971090679042123, "rougeL_precision": 0.12227403244327856, "rougeL_precision_stderr": 0.0011855454993101654, "rougeL_recall": 0.2551033251187225, "rougeL_recall_stderr": 0.0019031145709281366, "rougeLsum_fmeasure": 0.14908466648922475, "rougeLsum_fmeasure_stderr": 0.0016626826909044133, "rougeLsum_precision": 0.113242028542405, "rougeLsum_precision_stderr": 0.0013998996791185847, "rougeLsum_recall": 0.23203443734017606, "rougeLsum_recall_stderr": 0.0022390418934348437}}, "1": {"generate_text_restaurant": {"bleu": 12.173849341913149, "bleu_stderr": 0.0669092468478733, "rouge1_fmeasure": 0.48129644782721304, "rouge1_fmeasure_stderr": 
0.0023219173855042303, "rouge1_precision": 0.5970385739901486, "rouge1_precision_stderr": 0.003272991717133695, "rouge1_recall": 0.44205108840831364, "rouge1_recall_stderr": 0.002990943407358534, "rouge2_fmeasure": 0.23032000484043905, "rouge2_fmeasure_stderr": 0.0021147211593944798, "rouge2_precision": 0.2911262146867607, "rouge2_precision_stderr": 0.002892523328423528, "rouge2_recall": 0.2107609477318019, "rouge2_recall_stderr": 0.0022166981158979714, "rougeL_fmeasure": 0.34966965256911114, "rougeL_fmeasure_stderr": 0.002108342891501849, "rougeL_precision": 0.4377488624210833, "rougeL_precision_stderr": 0.003110832463080634, "rougeL_recall": 0.3198170788492075, "rougeL_recall_stderr": 0.002444190246186975, "rougeLsum_fmeasure": 0.39331173897680705, "rougeLsum_fmeasure_stderr": 0.0023364505709058668, "rougeLsum_precision": 0.48957738460024036, "rougeLsum_precision_stderr": 0.00327157474691632, "rougeLsum_recall": 0.36068779636155734, "rougeLsum_recall_stderr": 0.0027524175233159713}}, "2": {"generate_text_restaurant": {"bleu": 14.360624153571306, "bleu_stderr": 0.15724700369597677, "rouge1_fmeasure": 0.5041317140273615, "rouge1_fmeasure_stderr": 0.0022753134233410278, "rouge1_precision": 0.6032331597687249, "rouge1_precision_stderr": 0.003142877448057622, "rouge1_recall": 0.4697575589134947, "rouge1_recall_stderr": 0.0029526957130554953, "rouge2_fmeasure": 0.25374280478792693, "rouge2_fmeasure_stderr": 0.002169411661246531, "rouge2_precision": 0.30851618494266825, "rouge2_precision_stderr": 0.002855313490834544, "rouge2_recall": 0.23616858770973606, "rouge2_recall_stderr": 0.002325964607821345, "rougeL_fmeasure": 0.3721296496966484, "rougeL_fmeasure_stderr": 0.002164018647000424, "rougeL_precision": 0.4477614821909179, "rougeL_precision_stderr": 0.003011118677636196, "rougeL_recall": 0.3459816794847227, "rougeL_recall_stderr": 0.002532543665969986, "rougeLsum_fmeasure": 0.42160904040581204, "rougeLsum_fmeasure_stderr": 0.002384317338752149, "rougeLsum_precision": 0.5054563233222107, "rougeLsum_precision_stderr": 0.0032185989379280733, "rougeLsum_recall": 0.39233324008569126, "rougeLsum_recall_stderr": 0.0027964602331100393}}, "3": {"generate_text_restaurant": {"bleu": 15.173778916293744, "bleu_stderr": 0.24020125390484678, "rouge1_fmeasure": 0.5122282123885756, "rouge1_fmeasure_stderr": 0.0022794016747342454, "rouge1_precision": 0.6038782195503243, "rouge1_precision_stderr": 0.003130218028564903, "rouge1_recall": 0.4797961783746489, "rouge1_recall_stderr": 0.0029264432136497623, "rouge2_fmeasure": 0.2611392423531194, "rouge2_fmeasure_stderr": 0.0021928684064362373, "rouge2_precision": 0.31128306325561933, "rouge2_precision_stderr": 0.0028044649859898306, "rouge2_recall": 0.2447494516430458, "rouge2_recall_stderr": 0.002368198668181156, "rougeL_fmeasure": 0.37928627551530325, "rougeL_fmeasure_stderr": 0.0021871071072142663, "rougeL_precision": 0.44868202075612884, "rougeL_precision_stderr": 0.0029681578589764264, "rougeL_recall": 0.35482043982343436, "rougeL_recall_stderr": 0.002532913929369527, "rougeLsum_fmeasure": 0.43123333798993546, "rougeLsum_fmeasure_stderr": 0.002401597688343299, "rougeLsum_precision": 0.5087702457406414, "rougeLsum_precision_stderr": 0.0031818356512290486, "rougeLsum_recall": 0.4036850753818513, "rougeLsum_recall_stderr": 0.0028061971662930287}}, "4": {"generate_text_restaurant": {"bleu": 15.78836334652692, "bleu_stderr": 0.16277087917874616, "rouge1_fmeasure": 0.5201963806927633, "rouge1_fmeasure_stderr": 0.0022664890545995124, "rouge1_precision": 
0.6052306848983445, "rouge1_precision_stderr": 0.0031205758451821283, "rouge1_recall": 0.49015291142822803, "rouge1_recall_stderr": 0.002870806275113502, "rouge2_fmeasure": 0.2653610425045864, "rouge2_fmeasure_stderr": 0.0021838284720614534, "rouge2_precision": 0.31194428367362853, "rouge2_precision_stderr": 0.0027609843407215874, "rouge2_recall": 0.2500276639929066, "rouge2_recall_stderr": 0.0023425058959290804, "rougeL_fmeasure": 0.3838642484464721, "rougeL_fmeasure_stderr": 0.002185515405072229, "rougeL_precision": 0.4475978408946653, "rougeL_precision_stderr": 0.002901824428209634, "rougeL_recall": 0.36147111146407906, "rougeL_recall_stderr": 0.002523200720958866, "rougeLsum_fmeasure": 0.43919378367441203, "rougeLsum_fmeasure_stderr": 0.002408659068024605, "rougeLsum_precision": 0.5110579537103325, "rougeLsum_precision_stderr": 0.0031404148777386892, "rougeLsum_recall": 0.4135967411759738, "rougeLsum_recall_stderr": 0.0027946622087117543}}, "5": {"generate_text_restaurant": {"bleu": 15.879323420491742, "bleu_stderr": 0.1522927233333457, "rouge1_fmeasure": 0.5216373530710218, "rouge1_fmeasure_stderr": 0.002262224044659468, "rouge1_precision": 0.6064323245894362, "rouge1_precision_stderr": 0.00315510278665659, "rouge1_recall": 0.49013366730559393, "rouge1_recall_stderr": 0.0028136604439425025, "rouge2_fmeasure": 0.26786622625063644, "rouge2_fmeasure_stderr": 0.002208434520877025, "rouge2_precision": 0.3150817079641561, "rouge2_precision_stderr": 0.002829192112850875, "rouge2_recall": 0.25132514912770915, "rouge2_recall_stderr": 0.002318826430015561, "rougeL_fmeasure": 0.38681699896646954, "rougeL_fmeasure_stderr": 0.002220989687079745, "rougeL_precision": 0.45023806138186195, "rougeL_precision_stderr": 0.002949168521598567, "rougeL_recall": 0.36355539000425124, "rougeL_recall_stderr": 0.002526510160020316, "rougeLsum_fmeasure": 0.4417889417099761, "rougeLsum_fmeasure_stderr": 0.0024158193931717187, "rougeLsum_precision": 0.5137337228447252, "rougeLsum_precision_stderr": 0.0031936836223124105, "rougeLsum_recall": 0.41496788459604567, "rougeLsum_recall_stderr": 0.0027605438893900454}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9472662234201847, "bleu_stderr": 0.0954010664780363, "rouge1_fmeasure": 0.20062931818668506, "rouge1_fmeasure_stderr": 0.002892087092668822, "rouge1_precision": 0.1512189693389312, "rouge1_precision_stderr": 0.002590198483449934, "rouge1_recall": 0.32951007687075756, "rouge1_recall_stderr": 0.004923756202298708, "rouge2_fmeasure": 0.044133865209903714, "rouge2_fmeasure_stderr": 0.0015693045278532822, "rouge2_precision": 0.03240083882273609, "rouge2_precision_stderr": 0.0011887319293568278, "rouge2_recall": 0.07461053739480651, "rouge2_recall_stderr": 0.0027791525130900266, "rougeL_fmeasure": 0.14533977808404885, "rougeL_fmeasure_stderr": 0.0021630231020242576, "rougeL_precision": 0.11056422701824653, "rougeL_precision_stderr": 0.0021853162208514175, "rougeL_recall": 0.23921769394978765, "rougeL_recall_stderr": 0.003783594428094317, "rougeLsum_fmeasure": 0.15912733837926105, "rougeLsum_fmeasure_stderr": 0.0023937408312475765, "rougeLsum_precision": 0.12055829236568064, "rougeLsum_precision_stderr": 0.002297890666943901, "rougeLsum_recall": 0.2623026232832005, "rougeLsum_recall_stderr": 0.004182995633448934}}, "1": {"article_DOC_summary": {"bleu": 2.857997728240847, "bleu_stderr": 0.15132505328438867, "rouge1_fmeasure": 0.24058051563117627, "rouge1_fmeasure_stderr": 0.003619950901783304, "rouge1_precision": 0.2431489069099882, "rouge1_precision_stderr": 
0.004319149380207122, "rouge1_recall": 0.27274551008869324, "rouge1_recall_stderr": 0.004181125297952178, "rouge2_fmeasure": 0.05891708042714147, "rouge2_fmeasure_stderr": 0.002369248568269019, "rouge2_precision": 0.06156338993755893, "rouge2_precision_stderr": 0.0027090151794028584, "rouge2_recall": 0.06564279411701872, "rouge2_recall_stderr": 0.0026141410795920464, "rougeL_fmeasure": 0.1845786738464429, "rougeL_fmeasure_stderr": 0.0030093215424524295, "rougeL_precision": 0.18703509762652787, "rougeL_precision_stderr": 0.0036036431129194833, "rougeL_recall": 0.20956010556672522, "rougeL_recall_stderr": 0.00345405751329193, "rougeLsum_fmeasure": 0.18768186260751757, "rougeLsum_fmeasure_stderr": 0.003017598467168456, "rougeLsum_precision": 0.18954846036997042, "rougeLsum_precision_stderr": 0.0035941447605698394, "rougeLsum_recall": 0.21418956311713871, "rougeLsum_recall_stderr": 0.0035468160066617596}}, "2": {"article_DOC_summary": {"bleu": 3.19195992073562, "bleu_stderr": 0.2067955639958893, "rouge1_fmeasure": 0.24657160673903905, "rouge1_fmeasure_stderr": 0.0037739176215048444, "rouge1_precision": 0.25234682823435783, "rouge1_precision_stderr": 0.004406932928762655, "rouge1_recall": 0.26739840698638306, "rouge1_recall_stderr": 0.0039999072862688684, "rouge2_fmeasure": 0.06271701269177109, "rouge2_fmeasure_stderr": 0.0025663699767628305, "rouge2_precision": 0.06567480327233229, "rouge2_precision_stderr": 0.0027842377535723913, "rouge2_recall": 0.06599412859644187, "rouge2_recall_stderr": 0.0026775951767211374, "rougeL_fmeasure": 0.1876781101224112, "rougeL_fmeasure_stderr": 0.0031437463539169125, "rougeL_precision": 0.1921361411782326, "rougeL_precision_stderr": 0.0036263536814562453, "rougeL_recall": 0.20386135964058816, "rougeL_recall_stderr": 0.0033147209011893724, "rougeLsum_fmeasure": 0.1896272533115015, "rougeLsum_fmeasure_stderr": 0.0031550558124475133, "rougeLsum_precision": 0.19374913452852363, "rougeLsum_precision_stderr": 0.0036251531161871544, "rougeLsum_recall": 0.20676693183261305, "rougeLsum_recall_stderr": 0.0034014354284824795}}}}
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33618386090404295, "bleu_stderr": 0.043286547374004225, "rouge1_fmeasure": 0.09550360928926843, "rouge1_fmeasure_stderr": 0.0021611192824620468, "rouge1_precision": 0.07608115633976532, "rouge1_precision_stderr": 0.0028424335866037475, "rouge1_recall": 0.243734587530279, "rouge1_recall_stderr": 0.005032751672173048, "rouge2_fmeasure": 0.04353421604632926, "rouge2_fmeasure_stderr": 0.0013022741040944032, "rouge2_precision": 0.03391274645846755, "rouge2_precision_stderr": 0.0017782441786429572, "rouge2_recall": 0.11477361563470867, "rouge2_recall_stderr": 0.0030177112931945443, "rougeL_fmeasure": 0.0908905716888711, "rougeL_fmeasure_stderr": 0.0019748576875776485, "rougeL_precision": 0.07249954185930807, "rougeL_precision_stderr": 0.002716866763111571, "rougeL_recall": 0.23447583587320983, "rougeL_recall_stderr": 0.004820896551192642, "rougeLsum_fmeasure": 0.09063319866429098, "rougeLsum_fmeasure_stderr": 0.002018782468480028, "rougeLsum_precision": 0.07260565563718514, "rougeLsum_precision_stderr": 0.0027622865524293628, "rougeLsum_recall": 0.2312007323793673, "rougeLsum_recall_stderr": 0.004666750865618609}}, "1": {"PALM_prompt": {"bleu": 0.5152510207869508, "bleu_stderr": 0.039697768768641464, "rouge1_fmeasure": 0.15419793855600986, "rouge1_fmeasure_stderr": 0.0037752055082234935, "rouge1_precision": 0.1335251141437586, "rouge1_precision_stderr": 0.004421796872864802, "rouge1_recall": 0.3040669240102272, "rouge1_recall_stderr": 0.0049926465687368735, "rouge2_fmeasure": 0.07820906828691397, "rouge2_fmeasure_stderr": 0.0026222161094111586, "rouge2_precision": 0.06857339375782437, "rouge2_precision_stderr": 0.0030273264083127568, "rouge2_recall": 0.1555986112064758, "rouge2_recall_stderr": 0.0036341014671041087, "rougeL_fmeasure": 0.13965714245032726, "rougeL_fmeasure_stderr": 0.0032487392690607572, "rougeL_precision": 0.1195649771956543, "rougeL_precision_stderr": 0.0038518942294089972, "rougeL_recall": 0.2838951533263151, "rougeL_recall_stderr": 0.004599064364517073, "rougeLsum_fmeasure": 0.14150307262693815, "rougeLsum_fmeasure_stderr": 0.0033007395324502467, "rougeLsum_precision": 0.12140487798820795, "rougeLsum_precision_stderr": 0.003915984273012587, "rougeLsum_recall": 0.2857833042522088, "rougeLsum_recall_stderr": 0.004601461544101827}}, "2": {"PALM_prompt": {"bleu": 0.6450721424031474, "bleu_stderr": 0.035938592978876936, "rouge1_fmeasure": 0.19228537050298922, "rouge1_fmeasure_stderr": 0.004356183491682661, "rouge1_precision": 0.17715496451419446, "rouge1_precision_stderr": 0.005377686826508908, "rouge1_recall": 0.3442204927205235, "rouge1_recall_stderr": 0.00499209962075588, "rouge2_fmeasure": 0.1017490401630499, "rouge2_fmeasure_stderr": 0.003080557280075834, "rouge2_precision": 0.09659406582034752, "rouge2_precision_stderr": 0.0036720254666308344, "rouge2_recall": 0.1816245156374427, "rouge2_recall_stderr": 0.0038421174433974025, "rougeL_fmeasure": 0.17158392384169643, "rougeL_fmeasure_stderr": 0.0037128302671958994, "rougeL_precision": 0.15519367504713033, "rougeL_precision_stderr": 0.004580706449804959, "rougeL_recall": 0.31893881029765586, "rougeL_recall_stderr": 0.004597766706728441, "rougeLsum_fmeasure": 0.17498254398589475, "rougeLsum_fmeasure_stderr": 0.0037985612156152086, "rougeLsum_precision": 0.1592800915810083, "rougeLsum_precision_stderr": 0.0047149524911017215, "rougeLsum_recall": 0.3221825821541532, "rougeLsum_recall_stderr": 0.0046252310161258746}}, "3": {"PALM_prompt": {"bleu": 0.8785043636862591, 
"bleu_stderr": 0.02652000083973547, "rouge1_fmeasure": 0.21217682060449367, "rouge1_fmeasure_stderr": 0.004581243352642801, "rouge1_precision": 0.2002191357399255, "rouge1_precision_stderr": 0.005700191155087746, "rouge1_recall": 0.3622650735119951, "rouge1_recall_stderr": 0.005105318033672305, "rouge2_fmeasure": 0.11312048852329888, "rouge2_fmeasure_stderr": 0.003186877123824252, "rouge2_precision": 0.10955905558045385, "rouge2_precision_stderr": 0.0038849982073218873, "rouge2_recall": 0.19262377036723086, "rouge2_recall_stderr": 0.003902442560309681, "rougeL_fmeasure": 0.1874319081280966, "rougeL_fmeasure_stderr": 0.0038899168243038715, "rougeL_precision": 0.1744946205047383, "rougeL_precision_stderr": 0.004899868234689971, "rougeL_recall": 0.33190561350734876, "rougeL_recall_stderr": 0.004628476785750535, "rougeLsum_fmeasure": 0.19207381785134825, "rougeLsum_fmeasure_stderr": 0.004003703743860644, "rougeLsum_precision": 0.17972186968155374, "rougeLsum_precision_stderr": 0.005058207025529124, "rougeLsum_recall": 0.33723235220496645, "rougeLsum_recall_stderr": 0.0046889691038068775}}, "4": {"PALM_prompt": {"bleu": 1.020225734756704, "bleu_stderr": 0.06946466420275214, "rouge1_fmeasure": 0.22615455778377724, "rouge1_fmeasure_stderr": 0.004584303113870005, "rouge1_precision": 0.21447559212644274, "rouge1_precision_stderr": 0.005787602630220149, "rouge1_recall": 0.3797811786576114, "rouge1_recall_stderr": 0.005078572769442694, "rouge2_fmeasure": 0.12153309984630654, "rouge2_fmeasure_stderr": 0.0032009564132827184, "rouge2_precision": 0.11802182161980528, "rouge2_precision_stderr": 0.003954053081043237, "rouge2_recall": 0.20495230642568182, "rouge2_recall_stderr": 0.003956396099130858, "rougeL_fmeasure": 0.1985084531863174, "rougeL_fmeasure_stderr": 0.003863020108912202, "rougeL_precision": 0.18574729294363898, "rougeL_precision_stderr": 0.00493965653212434, "rougeL_recall": 0.34661430398431914, "rougeL_recall_stderr": 0.004643977843173891, "rougeLsum_fmeasure": 0.20493355705865743, "rougeLsum_fmeasure_stderr": 0.004011218572127487, "rougeLsum_precision": 0.19295820334137398, "rougeLsum_precision_stderr": 0.005149923293184489, "rougeLsum_recall": 0.3532599821115898, "rougeLsum_recall_stderr": 0.004685486372918684}}, "5": {"PALM_prompt": {"bleu": 1.1738991703843653, "bleu_stderr": 0.05137420728965795, "rouge1_fmeasure": 0.2452749113827306, "rouge1_fmeasure_stderr": 0.004866057995099286, "rouge1_precision": 0.23842752486609714, "rouge1_precision_stderr": 0.00616558462817229, "rouge1_recall": 0.39104608539227964, "rouge1_recall_stderr": 0.005079097315439687, "rouge2_fmeasure": 0.13673599319945717, "rouge2_fmeasure_stderr": 0.003545382189555846, "rouge2_precision": 0.13743673376990653, "rouge2_precision_stderr": 0.004448566773563913, "rouge2_recall": 0.21617363071605727, "rouge2_recall_stderr": 0.004065117072186819, "rougeL_fmeasure": 0.2145149869812429, "rougeL_fmeasure_stderr": 0.004135695136449003, "rougeL_precision": 0.20669362004199143, "rougeL_precision_stderr": 0.0053303444381744315, "rougeL_recall": 0.35345362943589836, "rougeL_recall_stderr": 0.004581784867877382, "rougeLsum_fmeasure": 0.22169261840373009, "rougeLsum_fmeasure_stderr": 0.004292938565416988, "rougeLsum_precision": 0.21482050441032427, "rougeLsum_precision_stderr": 0.005549487778372521, "rougeLsum_recall": 0.3612051449037077, "rougeLsum_recall_stderr": 0.004657096205531439}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.2204812581094187, "bleu_stderr": 0.0968937530189579, "rouge1_fmeasure": 0.17281704832161807, 
"rouge1_fmeasure_stderr": 0.0025639705499622615, "rouge1_precision": 0.16222110345712215, "rouge1_precision_stderr": 0.002918181626000938, "rouge1_recall": 0.23556142979563985, "rouge1_recall_stderr": 0.0035092798940514463, "rouge2_fmeasure": 0.04445750772336212, "rouge2_fmeasure_stderr": 0.0010694173040695087, "rouge2_precision": 0.04035916262517482, "rouge2_precision_stderr": 0.0011089852877097363, "rouge2_recall": 0.06187933259063034, "rouge2_recall_stderr": 0.0015927286355937283, "rougeL_fmeasure": 0.12828595447707558, "rougeL_fmeasure_stderr": 0.0018630698897748783, "rougeL_precision": 0.12115800294799915, "rougeL_precision_stderr": 0.0023302786580353405, "rougeL_recall": 0.1793437869920672, "rougeL_recall_stderr": 0.0027904364481447434, "rougeLsum_fmeasure": 0.16125234602130137, "rougeLsum_fmeasure_stderr": 0.0024054029356366125, "rougeLsum_precision": 0.15193645088089489, "rougeLsum_precision_stderr": 0.002794596188541928, "rougeLsum_recall": 0.22017606445792967, "rougeLsum_recall_stderr": 0.003307741344782461}}, "1": {"tldr_en": {"bleu": 3.030764799858383, "bleu_stderr": 0.10180137949258324, "rouge1_fmeasure": 0.20057722026752264, "rouge1_fmeasure_stderr": 0.00237727127672953, "rouge1_precision": 0.24835405674131716, "rouge1_precision_stderr": 0.003595946507957674, "rouge1_recall": 0.22065222520601965, "rouge1_recall_stderr": 0.0030326234032250735, "rouge2_fmeasure": 0.053780768787129375, "rouge2_fmeasure_stderr": 0.0012652221510456923, "rouge2_precision": 0.07072572329707479, "rouge2_precision_stderr": 0.002023483136430399, "rouge2_recall": 0.05888749376061464, "rouge2_recall_stderr": 0.0015271270172799948, "rougeL_fmeasure": 0.15099195793433165, "rougeL_fmeasure_stderr": 0.0017939149386485946, "rougeL_precision": 0.18962847615836217, "rougeL_precision_stderr": 0.0029056148560385174, "rougeL_recall": 0.16669593752772216, "rougeL_recall_stderr": 0.0023452388157941375, "rougeLsum_fmeasure": 0.18619742012806825, "rougeLsum_fmeasure_stderr": 0.0022080861119325576, "rougeLsum_precision": 0.23119707647225618, "rougeLsum_precision_stderr": 0.0034029730598757074, "rougeLsum_recall": 0.20514467867088854, "rougeLsum_recall_stderr": 0.0028358663517956315}}, "2": {"tldr_en": {"bleu": 4.074528254683577, "bleu_stderr": 0.10710940903995719, "rouge1_fmeasure": 0.24392276306894944, "rouge1_fmeasure_stderr": 0.0023000547396419086, "rouge1_precision": 0.30679877376441267, "rouge1_precision_stderr": 0.003595563898423582, "rouge1_recall": 0.25602196677157735, "rouge1_recall_stderr": 0.002878244091822486, "rouge2_fmeasure": 0.07100355255541753, "rouge2_fmeasure_stderr": 0.0014153910474231026, "rouge2_precision": 0.09387841885638748, "rouge2_precision_stderr": 0.002194124801129067, "rouge2_recall": 0.0740439804559834, "rouge2_recall_stderr": 0.001644608125064704, "rougeL_fmeasure": 0.18445520873996413, "rougeL_fmeasure_stderr": 0.001797621869515743, "rougeL_precision": 0.23412738417408802, "rougeL_precision_stderr": 0.0029250825035858734, "rougeL_recall": 0.19426592228730813, "rougeL_recall_stderr": 0.0023089496783281213, "rougeLsum_fmeasure": 0.2286765314746336, "rougeLsum_fmeasure_stderr": 0.0021801688851002263, "rougeLsum_precision": 0.28859199947210284, "rougeLsum_precision_stderr": 0.003452141980034977, "rougeLsum_recall": 0.23966421160449644, "rougeLsum_recall_stderr": 0.0027125548096589747}}, "3": {"tldr_en": {"bleu": 3.168617248016535, "bleu_stderr": 0.08041979746938775, "rouge1_fmeasure": 0.2094195871488414, "rouge1_fmeasure_stderr": 0.0027068429969493734, "rouge1_precision": 0.272311629129121, 
"rouge1_precision_stderr": 0.0039505188546921826, "rouge1_recall": 0.21816270627496134, "rouge1_recall_stderr": 0.0032448735585025937, "rouge2_fmeasure": 0.06204427812400913, "rouge2_fmeasure_stderr": 0.0014401113659882462, "rouge2_precision": 0.08329439938013247, "rouge2_precision_stderr": 0.0021782819294409984, "rouge2_recall": 0.06480971019253767, "rouge2_recall_stderr": 0.0016827399941316222, "rougeL_fmeasure": 0.1590668148316741, "rougeL_fmeasure_stderr": 0.0020766324795938023, "rougeL_precision": 0.20896331681491714, "rougeL_precision_stderr": 0.0031500865399556595, "rougeL_recall": 0.1664625275086072, "rougeL_recall_stderr": 0.0025592493419649177, "rougeLsum_fmeasure": 0.1964756114951262, "rougeLsum_fmeasure_stderr": 0.0025500807038816515, "rougeLsum_precision": 0.2563037368454477, "rougeLsum_precision_stderr": 0.0037619221351167056, "rougeLsum_recall": 0.20447650361202882, "rougeLsum_recall_stderr": 0.0030540843932082493}}, "4": {"tldr_en": {"bleu": 0.07788444152267485, "bleu_stderr": 0.01315525656911281, "rouge1_fmeasure": 0.06757697669893872, "rouge1_fmeasure_stderr": 0.0023707490073172043, "rouge1_precision": 0.0910819200952716, "rouge1_precision_stderr": 0.0033747683121654196, "rouge1_recall": 0.07028567198192245, "rouge1_recall_stderr": 0.002640964970940966, "rouge2_fmeasure": 0.019948424307073904, "rouge2_fmeasure_stderr": 0.000986170441488306, "rouge2_precision": 0.028646478253466095, "rouge2_precision_stderr": 0.0016076285180980135, "rouge2_recall": 0.020868526921797886, "rouge2_recall_stderr": 0.0011537893245677335, "rougeL_fmeasure": 0.05201990108593777, "rougeL_fmeasure_stderr": 0.0018314763879696625, "rougeL_precision": 0.07082578661014885, "rougeL_precision_stderr": 0.002691634510208164, "rougeL_recall": 0.05448506471222756, "rougeL_recall_stderr": 0.0020854622601667937, "rougeLsum_fmeasure": 0.06308719820573185, "rougeLsum_fmeasure_stderr": 0.002213451741392458, "rougeLsum_precision": 0.08551950005220435, "rougeLsum_precision_stderr": 0.0031983492312005384, "rougeLsum_recall": 0.06543250975997009, "rougeLsum_recall_stderr": 0.0024532813023277916}}, "5": {"tldr_en": {"bleu": 1.4470192084599072e-15, "bleu_stderr": 5.1547920765171173e-14, "rouge1_fmeasure": 0.010473201550267177, "rouge1_fmeasure_stderr": 0.0010554421691169314, "rouge1_precision": 0.014515441910190443, "rouge1_precision_stderr": 0.001501466217296952, "rouge1_recall": 0.011301943813768834, "rouge1_recall_stderr": 0.0012288119306628567, "rouge2_fmeasure": 0.0035604598673033626, "rouge2_fmeasure_stderr": 0.0004625418185814249, "rouge2_precision": 0.004591980814271401, "rouge2_precision_stderr": 0.0006515529824929627, "rouge2_recall": 0.00416888931272546, "rouge2_recall_stderr": 0.0005930107839352501, "rougeL_fmeasure": 0.008371164394493067, "rougeL_fmeasure_stderr": 0.0008468942819407445, "rougeL_precision": 0.011779972055249343, "rougeL_precision_stderr": 0.0012586537746088814, "rougeL_recall": 0.009040049894954919, "rougeL_recall_stderr": 0.0009873007391796482, "rougeLsum_fmeasure": 0.009841875351812794, "rougeLsum_fmeasure_stderr": 0.000994934070773035, "rougeLsum_precision": 0.013708349668480058, "rougeLsum_precision_stderr": 0.0014293611241212706, "rougeLsum_recall": 0.010579263673799228, "rougeLsum_recall_stderr": 0.0011521180305532216}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.8080075016885704, "bleu_stderr": 0.061027545228237855, "rouge1_fmeasure": 0.17861979049522314, "rouge1_fmeasure_stderr": 0.0018002125016461491, "rouge1_precision": 0.1356910382997071, 
"rouge1_precision_stderr": 0.0015410643240858764, "rouge1_recall": 0.2778453886224831, "rouge1_recall_stderr": 0.0023539268248121705, "rouge2_fmeasure": 0.046215425350075, "rouge2_fmeasure_stderr": 0.0010566397123776003, "rouge2_precision": 0.03490809623131593, "rouge2_precision_stderr": 0.0008357898041576343, "rouge2_recall": 0.07284634942140401, "rouge2_recall_stderr": 0.0015701168954429102, "rougeL_fmeasure": 0.16201597249905292, "rougeL_fmeasure_stderr": 0.0013971090679042123, "rougeL_precision": 0.12227403244327856, "rougeL_precision_stderr": 0.0011855454993101654, "rougeL_recall": 0.2551033251187225, "rougeL_recall_stderr": 0.0019031145709281366, "rougeLsum_fmeasure": 0.14908466648922475, "rougeLsum_fmeasure_stderr": 0.0016626826909044133, "rougeLsum_precision": 0.113242028542405, "rougeLsum_precision_stderr": 0.0013998996791185847, "rougeLsum_recall": 0.23203443734017606, "rougeLsum_recall_stderr": 0.0022390418934348437}}, "1": {"generate_text_restaurant": {"bleu": 12.173849341913149, "bleu_stderr": 0.0669092468478733, "rouge1_fmeasure": 0.48129644782721304, "rouge1_fmeasure_stderr": 0.0023219173855042303, "rouge1_precision": 0.5970385739901486, "rouge1_precision_stderr": 0.003272991717133695, "rouge1_recall": 0.44205108840831364, "rouge1_recall_stderr": 0.002990943407358534, "rouge2_fmeasure": 0.23032000484043905, "rouge2_fmeasure_stderr": 0.0021147211593944798, "rouge2_precision": 0.2911262146867607, "rouge2_precision_stderr": 0.002892523328423528, "rouge2_recall": 0.2107609477318019, "rouge2_recall_stderr": 0.0022166981158979714, "rougeL_fmeasure": 0.34966965256911114, "rougeL_fmeasure_stderr": 0.002108342891501849, "rougeL_precision": 0.4377488624210833, "rougeL_precision_stderr": 0.003110832463080634, "rougeL_recall": 0.3198170788492075, "rougeL_recall_stderr": 0.002444190246186975, "rougeLsum_fmeasure": 0.39331173897680705, "rougeLsum_fmeasure_stderr": 0.0023364505709058668, "rougeLsum_precision": 0.48957738460024036, "rougeLsum_precision_stderr": 0.00327157474691632, "rougeLsum_recall": 0.36068779636155734, "rougeLsum_recall_stderr": 0.0027524175233159713}}, "2": {"generate_text_restaurant": {"bleu": 14.360624153571306, "bleu_stderr": 0.15724700369597677, "rouge1_fmeasure": 0.5041317140273615, "rouge1_fmeasure_stderr": 0.0022753134233410278, "rouge1_precision": 0.6032331597687249, "rouge1_precision_stderr": 0.003142877448057622, "rouge1_recall": 0.4697575589134947, "rouge1_recall_stderr": 0.0029526957130554953, "rouge2_fmeasure": 0.25374280478792693, "rouge2_fmeasure_stderr": 0.002169411661246531, "rouge2_precision": 0.30851618494266825, "rouge2_precision_stderr": 0.002855313490834544, "rouge2_recall": 0.23616858770973606, "rouge2_recall_stderr": 0.002325964607821345, "rougeL_fmeasure": 0.3721296496966484, "rougeL_fmeasure_stderr": 0.002164018647000424, "rougeL_precision": 0.4477614821909179, "rougeL_precision_stderr": 0.003011118677636196, "rougeL_recall": 0.3459816794847227, "rougeL_recall_stderr": 0.002532543665969986, "rougeLsum_fmeasure": 0.42160904040581204, "rougeLsum_fmeasure_stderr": 0.002384317338752149, "rougeLsum_precision": 0.5054563233222107, "rougeLsum_precision_stderr": 0.0032185989379280733, "rougeLsum_recall": 0.39233324008569126, "rougeLsum_recall_stderr": 0.0027964602331100393}}, "3": {"generate_text_restaurant": {"bleu": 15.173778916293744, "bleu_stderr": 0.24020125390484678, "rouge1_fmeasure": 0.5122282123885756, "rouge1_fmeasure_stderr": 0.0022794016747342454, "rouge1_precision": 0.6038782195503243, "rouge1_precision_stderr": 0.003130218028564903, 
"rouge1_recall": 0.4797961783746489, "rouge1_recall_stderr": 0.0029264432136497623, "rouge2_fmeasure": 0.2611392423531194, "rouge2_fmeasure_stderr": 0.0021928684064362373, "rouge2_precision": 0.31128306325561933, "rouge2_precision_stderr": 0.0028044649859898306, "rouge2_recall": 0.2447494516430458, "rouge2_recall_stderr": 0.002368198668181156, "rougeL_fmeasure": 0.37928627551530325, "rougeL_fmeasure_stderr": 0.0021871071072142663, "rougeL_precision": 0.44868202075612884, "rougeL_precision_stderr": 0.0029681578589764264, "rougeL_recall": 0.35482043982343436, "rougeL_recall_stderr": 0.002532913929369527, "rougeLsum_fmeasure": 0.43123333798993546, "rougeLsum_fmeasure_stderr": 0.002401597688343299, "rougeLsum_precision": 0.5087702457406414, "rougeLsum_precision_stderr": 0.0031818356512290486, "rougeLsum_recall": 0.4036850753818513, "rougeLsum_recall_stderr": 0.0028061971662930287}}, "4": {"generate_text_restaurant": {"bleu": 15.78836334652692, "bleu_stderr": 0.16277087917874616, "rouge1_fmeasure": 0.5201963806927633, "rouge1_fmeasure_stderr": 0.0022664890545995124, "rouge1_precision": 0.6052306848983445, "rouge1_precision_stderr": 0.0031205758451821283, "rouge1_recall": 0.49015291142822803, "rouge1_recall_stderr": 0.002870806275113502, "rouge2_fmeasure": 0.2653610425045864, "rouge2_fmeasure_stderr": 0.0021838284720614534, "rouge2_precision": 0.31194428367362853, "rouge2_precision_stderr": 0.0027609843407215874, "rouge2_recall": 0.2500276639929066, "rouge2_recall_stderr": 0.0023425058959290804, "rougeL_fmeasure": 0.3838642484464721, "rougeL_fmeasure_stderr": 0.002185515405072229, "rougeL_precision": 0.4475978408946653, "rougeL_precision_stderr": 0.002901824428209634, "rougeL_recall": 0.36147111146407906, "rougeL_recall_stderr": 0.002523200720958866, "rougeLsum_fmeasure": 0.43919378367441203, "rougeLsum_fmeasure_stderr": 0.002408659068024605, "rougeLsum_precision": 0.5110579537103325, "rougeLsum_precision_stderr": 0.0031404148777386892, "rougeLsum_recall": 0.4135967411759738, "rougeLsum_recall_stderr": 0.0027946622087117543}}, "5": {"generate_text_restaurant": {"bleu": 15.879323420491742, "bleu_stderr": 0.1522927233333457, "rouge1_fmeasure": 0.5216373530710218, "rouge1_fmeasure_stderr": 0.002262224044659468, "rouge1_precision": 0.6064323245894362, "rouge1_precision_stderr": 0.00315510278665659, "rouge1_recall": 0.49013366730559393, "rouge1_recall_stderr": 0.0028136604439425025, "rouge2_fmeasure": 0.26786622625063644, "rouge2_fmeasure_stderr": 0.002208434520877025, "rouge2_precision": 0.3150817079641561, "rouge2_precision_stderr": 0.002829192112850875, "rouge2_recall": 0.25132514912770915, "rouge2_recall_stderr": 0.002318826430015561, "rougeL_fmeasure": 0.38681699896646954, "rougeL_fmeasure_stderr": 0.002220989687079745, "rougeL_precision": 0.45023806138186195, "rougeL_precision_stderr": 0.002949168521598567, "rougeL_recall": 0.36355539000425124, "rougeL_recall_stderr": 0.002526510160020316, "rougeLsum_fmeasure": 0.4417889417099761, "rougeLsum_fmeasure_stderr": 0.0024158193931717187, "rougeLsum_precision": 0.5137337228447252, "rougeLsum_precision_stderr": 0.0031936836223124105, "rougeLsum_recall": 0.41496788459604567, "rougeLsum_recall_stderr": 0.0027605438893900454}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9472662234201847, "bleu_stderr": 0.0954010664780363, "rouge1_fmeasure": 0.20062931818668506, "rouge1_fmeasure_stderr": 0.002892087092668822, "rouge1_precision": 0.1512189693389312, "rouge1_precision_stderr": 0.002590198483449934, "rouge1_recall": 0.32951007687075756, 
"rouge1_recall_stderr": 0.004923756202298708, "rouge2_fmeasure": 0.044133865209903714, "rouge2_fmeasure_stderr": 0.0015693045278532822, "rouge2_precision": 0.03240083882273609, "rouge2_precision_stderr": 0.0011887319293568278, "rouge2_recall": 0.07461053739480651, "rouge2_recall_stderr": 0.0027791525130900266, "rougeL_fmeasure": 0.14533977808404885, "rougeL_fmeasure_stderr": 0.0021630231020242576, "rougeL_precision": 0.11056422701824653, "rougeL_precision_stderr": 0.0021853162208514175, "rougeL_recall": 0.23921769394978765, "rougeL_recall_stderr": 0.003783594428094317, "rougeLsum_fmeasure": 0.15912733837926105, "rougeLsum_fmeasure_stderr": 0.0023937408312475765, "rougeLsum_precision": 0.12055829236568064, "rougeLsum_precision_stderr": 0.002297890666943901, "rougeLsum_recall": 0.2623026232832005, "rougeLsum_recall_stderr": 0.004182995633448934}}, "1": {"article_DOC_summary": {"bleu": 2.857997728240847, "bleu_stderr": 0.15132505328438867, "rouge1_fmeasure": 0.24058051563117627, "rouge1_fmeasure_stderr": 0.003619950901783304, "rouge1_precision": 0.2431489069099882, "rouge1_precision_stderr": 0.004319149380207122, "rouge1_recall": 0.27274551008869324, "rouge1_recall_stderr": 0.004181125297952178, "rouge2_fmeasure": 0.05891708042714147, "rouge2_fmeasure_stderr": 0.002369248568269019, "rouge2_precision": 0.06156338993755893, "rouge2_precision_stderr": 0.0027090151794028584, "rouge2_recall": 0.06564279411701872, "rouge2_recall_stderr": 0.0026141410795920464, "rougeL_fmeasure": 0.1845786738464429, "rougeL_fmeasure_stderr": 0.0030093215424524295, "rougeL_precision": 0.18703509762652787, "rougeL_precision_stderr": 0.0036036431129194833, "rougeL_recall": 0.20956010556672522, "rougeL_recall_stderr": 0.00345405751329193, "rougeLsum_fmeasure": 0.18768186260751757, "rougeLsum_fmeasure_stderr": 0.003017598467168456, "rougeLsum_precision": 0.18954846036997042, "rougeLsum_precision_stderr": 0.0035941447605698394, "rougeLsum_recall": 0.21418956311713871, "rougeLsum_recall_stderr": 0.0035468160066617596}}, "2": {"article_DOC_summary": {"bleu": 3.19195992073562, "bleu_stderr": 0.2067955639958893, "rouge1_fmeasure": 0.24657160673903905, "rouge1_fmeasure_stderr": 0.0037739176215048444, "rouge1_precision": 0.25234682823435783, "rouge1_precision_stderr": 0.004406932928762655, "rouge1_recall": 0.26739840698638306, "rouge1_recall_stderr": 0.0039999072862688684, "rouge2_fmeasure": 0.06271701269177109, "rouge2_fmeasure_stderr": 0.0025663699767628305, "rouge2_precision": 0.06567480327233229, "rouge2_precision_stderr": 0.0027842377535723913, "rouge2_recall": 0.06599412859644187, "rouge2_recall_stderr": 0.0026775951767211374, "rougeL_fmeasure": 0.1876781101224112, "rougeL_fmeasure_stderr": 0.0031437463539169125, "rougeL_precision": 0.1921361411782326, "rougeL_precision_stderr": 0.0036263536814562453, "rougeL_recall": 0.20386135964058816, "rougeL_recall_stderr": 0.0033147209011893724, "rougeLsum_fmeasure": 0.1896272533115015, "rougeLsum_fmeasure_stderr": 0.0031550558124475133, "rougeLsum_precision": 0.19374913452852363, "rougeLsum_precision_stderr": 0.0036251531161871544, "rougeLsum_recall": 0.20676693183261305, "rougeLsum_recall_stderr": 0.0034014354284824795}}, "3": {"article_DOC_summary": {"bleu": 3.267835808853254, "bleu_stderr": 0.26356764701611307, "rouge1_fmeasure": 0.23697855013529284, "rouge1_fmeasure_stderr": 0.003931784147472208, "rouge1_precision": 0.2511767032658256, "rouge1_precision_stderr": 0.004670743744963309, "rouge1_recall": 0.2497224746972405, "rouge1_recall_stderr": 0.004054437554455434, 
"rouge2_fmeasure": 0.059313458488013185, "rouge2_fmeasure_stderr": 0.0025641694242472487, "rouge2_precision": 0.06435388180319074, "rouge2_precision_stderr": 0.002934840810442915, "rouge2_recall": 0.060376215777618605, "rouge2_recall_stderr": 0.002521653823700491, "rougeL_fmeasure": 0.17977672776005849, "rougeL_fmeasure_stderr": 0.00326317345808302, "rougeL_precision": 0.19057534922020994, "rougeL_precision_stderr": 0.0038620821553629983, "rougeL_recall": 0.1901634518957205, "rougeL_recall_stderr": 0.0033714037003502494, "rougeLsum_fmeasure": 0.18163235303784986, "rougeLsum_fmeasure_stderr": 0.003264460230665997, "rougeLsum_precision": 0.19211901704097034, "rougeLsum_precision_stderr": 0.003850775083612694, "rougeLsum_recall": 0.19288110429850555, "rougeLsum_recall_stderr": 0.003421247538700275}}, "4": {"article_DOC_summary": {"bleu": 0.17970307621766216, "bleu_stderr": 0.04965869269887041, "rouge1_fmeasure": 0.05891769720848019, "rouge1_fmeasure_stderr": 0.0035369396004201713, "rouge1_precision": 0.06645727693786248, "rouge1_precision_stderr": 0.004158365862658779, "rouge1_recall": 0.061174931724787315, "rouge1_recall_stderr": 0.003797370735030977, "rouge2_fmeasure": 0.015297632126595406, "rouge2_fmeasure_stderr": 0.00149002417901287, "rouge2_precision": 0.01730890961224063, "rouge2_precision_stderr": 0.0017698712455992984, "rouge2_recall": 0.016289532767626923, "rouge2_recall_stderr": 0.0017234032328131926, "rougeL_fmeasure": 0.04476173097758684, "rougeL_fmeasure_stderr": 0.0027730017560448727, "rougeL_precision": 0.050637820115463826, "rougeL_precision_stderr": 0.0032826504244346, "rougeL_recall": 0.04683971382625969, "rougeL_recall_stderr": 0.003039332848507969, "rougeLsum_fmeasure": 0.04533434532971928, "rougeLsum_fmeasure_stderr": 0.0028025558912498296, "rougeLsum_precision": 0.051192679124604304, "rougeLsum_precision_stderr": 0.0033047254109378095, "rougeLsum_recall": 0.0475395827212547, "rougeLsum_recall_stderr": 0.0030900986283582004}}, "5": {"article_DOC_summary": {"bleu": 1.393056311124318e-43, "bleu_stderr": 1.2487431746784071e-33, "rouge1_fmeasure": 0.0025036711184808556, "rouge1_fmeasure_stderr": 0.000808568948388182, "rouge1_precision": 0.0025272416860947825, "rouge1_precision_stderr": 0.0007813236132809655, "rouge1_recall": 0.002586618259740399, "rouge1_recall_stderr": 0.0008670624223009818, "rouge2_fmeasure": 0.0003242107125389267, "rouge2_fmeasure_stderr": 0.00014221025181447073, "rouge2_precision": 0.0003236042782341223, "rouge2_precision_stderr": 0.0001381414258477455, "rouge2_recall": 0.0003324696313480705, "rouge2_recall_stderr": 0.00015129466488844655, "rougeL_fmeasure": 0.00159555474061474, "rougeL_fmeasure_stderr": 0.0004956987539243005, "rougeL_precision": 0.0016454419652040672, "rougeL_precision_stderr": 0.0004929867496820194, "rougeL_recall": 0.001639076487035711, "rougeL_recall_stderr": 0.0005281423440330004, "rougeLsum_fmeasure": 0.00159555474061474, "rougeLsum_fmeasure_stderr": 0.0004956987539243005, "rougeLsum_precision": 0.0016454419652040672, "rougeLsum_precision_stderr": 0.0004929867496820194, "rougeLsum_recall": 0.001639076487035711, "rougeLsum_recall_stderr": 0.0005281423440330004}}}}
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.30679877376441267,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.003595563898423582
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.25602196677157735,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.002878244091822486
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.24392276306894944,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0023000547396419086
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.09387841885638748,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.002194124801129067
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.0740439804559834,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.001644608125064704
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.07100355255541753,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0014153910474231026
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.23412738417408802,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0029250825035858734
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.19426592228730813,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0023089496783281213
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.18445520873996413,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.001797621869515743
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.28859199947210284,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.003452141980034977
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.23966421160449644,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0027125548096589747
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.2286765314746336,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0021801688851002263
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 4.074528254683577,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.10710940903995719
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 2,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
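Each slim.*.json file in this commit carries the same numbers as one (task, num_fewshot) cell of merged.json, flattened into a "results" list in which every entry holds a single metric plus its *_stderr companion, with the few-shot count recorded only in "config". A hedged sketch of folding such a file back into the merged nesting; the file name and the helper `merge_slim` are illustrative, not part of the repo:

```python
import json

METADATA_KEYS = ("task_name", "prompt_name", "dataset_path", "dataset_name", "subset")

def merge_slim(path):
    """Collapse a slim results file into {task: {fewshot: {prompt: {metric: value}}}}."""
    with open(path) as f:
        slim = json.load(f)
    fewshot = str(slim["config"]["num_fewshot"])  # merged.json keys fewshot counts as strings
    nested = {}
    for entry in slim["results"]:
        bucket = (nested.setdefault(entry["task_name"], {})
                        .setdefault(fewshot, {})
                        .setdefault(entry["prompt_name"], {}))
        for key, value in entry.items():
            if key not in METADATA_KEYS:  # keep the metric and its stderr, drop metadata
                bucket[key] = value
    return nested

# Hypothetical usage against the file added above:
print(merge_slim("slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_2.json"))
```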
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.272311629129121,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0039505188546921826
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.21816270627496134,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0032448735585025937
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.2094195871488414,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0027068429969493734
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.08329439938013247,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0021782819294409984
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.06480971019253767,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0016827399941316222
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.06204427812400913,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0014401113659882462
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.20896331681491714,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0031500865399556595
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.1664625275086072,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0025592493419649177
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.1590668148316741,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0020766324795938023
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.2563037368454477,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0037619221351167056
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.20447650361202882,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0030540843932082493
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.1964756114951262,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0025500807038816515
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 3.168617248016535,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.08041979746938775
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.0910819200952716,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0033747683121654196
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.07028567198192245,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.002640964970940966
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.06757697669893872,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0023707490073172043
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.028646478253466095,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0016076285180980135
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.020868526921797886,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0011537893245677335
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.019948424307073904,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.000986170441488306
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.07082578661014885,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.002691634510208164
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.05448506471222756,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0020854622601667937
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.05201990108593777,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0018314763879696625
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.08551950005220435,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0031983492312005384
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.06543250975997009,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0024532813023277916
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.06308719820573185,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.002213451741392458
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 0.07788444152267485,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.01315525656911281
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.014515441910190443,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.001501466217296952
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.011301943813768834,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0012288119306628567
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.010473201550267177,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.0010554421691169314
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.004591980814271401,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0006515529824929627
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.00416888931272546,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0005930107839352501
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.0035604598673033626,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0004625418185814249
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.011779972055249343,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0012586537746088814
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.009040049894954919,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0009873007391796482
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.008371164394493067,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0008468942819407445
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.013708349668480058,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0014293611241212706
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.010579263673799228,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0011521180305532216
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.009841875351812794,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.000994934070773035
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 1.4470192084599072e-15,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 5.1547920765171173e-14
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
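Editor's note: the slim.*.json files in this commit all share the record-per-metric layout shown above. A minimal sketch (not part of the commit; `load_slim` is a hypothetical helper name) of how such a file could be flattened into metric/value pairs, assuming that layout:

```python
import json

def load_slim(path):
    """Flatten one slim.*.json result file into {metric: (value, stderr)}.

    Assumes the layout shown above: each entry in "results" carries one
    numeric metric plus a "<metric>_stderr" companion field.
    """
    with open(path) as f:
        data = json.load(f)
    metrics = {}
    for entry in data["results"]:
        for key, value in entry.items():
            # Skip stderr companions and non-numeric metadata fields.
            if key.endswith("_stderr") or not isinstance(value, (int, float)):
                continue
            metrics[key] = (value, entry.get(key + "_stderr"))
    return data["config"]["num_fewshot"], metrics

# e.g. shots, m = load_slim("slim.8b7178b25b_GEM-wiki_lingua_en_tldr_en_5.json")
# m["rouge2_fmeasure"] -> (0.0035604598673033626, 0.0004625418185814249)
```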
8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.2511767032658256,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.004670743744963309
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.2497224746972405,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.004054437554455434
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.23697855013529284,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.003931784147472208
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.06435388180319074,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.002934840810442915
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.060376215777618605,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.002521653823700491
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.059313458488013185,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.0025641694242472487
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.19057534922020994,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0038620821553629983
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.1901634518957205,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0033714037003502494
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.17977672776005849,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.00326317345808302
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.19211901704097034,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.003850775083612694
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.19288110429850555,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.003421247538700275
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.18163235303784986,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.003264460230665997
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 3.267835808853254,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.26356764701611307
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.06645727693786248,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.004158365862658779
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.061174931724787315,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.003797370735030977
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.05891769720848019,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.0035369396004201713
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.01730890961224063,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.0017698712455992984
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.016289532767626923,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.0017234032328131926
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.015297632126595406,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.00149002417901287
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.050637820115463826,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0032826504244346
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.04683971382625969,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.003039332848507969
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.04476173097758684,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0027730017560448727
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.051192679124604304,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.0033047254109378095
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.0475395827212547,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.0030900986283582004
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.04533434532971928,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.0028025558912498296
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 0.17970307621766216,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.04965869269887041
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b25b/evaluation/generation/slim.8b7178b25b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.0025272416860947825,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0007813236132809655
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.002586618259740399,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0008670624223009818
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.0025036711184808556,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.000808568948388182
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.0003236042782341223,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.0001381414258477455
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.0003324696313480705,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.00015129466488844655
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.0003242107125389267,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.00014221025181447073
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.0016454419652040672,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.0004929867496820194
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.001639076487035711,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0005281423440330004
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.00159555474061474,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0004956987539243005
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.0016454419652040672,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.0004929867496820194
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.001639076487035711,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.0005281423440330004
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.00159555474061474,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.0004956987539243005
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 1.393056311124318e-43,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 1.2487431746784071e-33
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.31947645525912327, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038030155885502574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.26784034634508286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003035447513908994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24919698955059236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002338885547952402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.10032559893960638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024161327250120574}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07957621599352918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017828844980089247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.07449487544541136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015020810496653879}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2468115494253994, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0032025411700306672}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20411776076410448, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024277091219038756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.18970698519609924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018593336285269075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.3009096197940564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036645356865323267}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.251048984299023, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002859798214873002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.23380521600569548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022238011394774266}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.480739832822098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11578330856151871}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2871095429771904, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004185534949827048}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22207861869609608, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033297349416556445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.21276268379590219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002745499539353653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08919737395825947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00234436562476581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06668770358204071, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017001378203776699}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06395005883547085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014814659874998457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22428029391025195, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034506563669751745}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.17117251055926017, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002654839665946902}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.16381135583162096, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021664607071098315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.27186005782650335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004024882931968101}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2092333223784394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031542921648456132}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.20067896546125685, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026118338406594167}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.416486588059767, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05871038450450248}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09854206688714345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036437541450359593}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07142922091930162, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026886044555192072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06934635599486277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002419160037746626}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.03200012684242646, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001809559832121984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02183618792959388, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011694185534509838}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.021348512828022314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001055380041694426}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07858578853182602, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003017215712567151}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.055804214936503324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021264630991137314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05429349387264579, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001919877615338037}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.0930444540425215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003479952193050714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06690767452071034, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025201480735759655}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06504921478523501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002272098654651633}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.08573923963165347, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01281159560771889}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.019085080359257173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018850577941172406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.011645843404748045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012042546813690255}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.012068612050969258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001175572535963405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.006894559713384093, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010187138526863938}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.003894968708693124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000532555761738665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004052279635441424, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005200626541191212}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.016020963520094682, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016304077510915223}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.00943399649665596, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000984307352962584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.009809956651925644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009599920054622314}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.018298677104570716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001820476829671769}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.011022020415497191, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011405019701246609}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.011452198317124414, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011152369987060954}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 7.249514036066506e-16, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.339336428482265e-14}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.27838096040655913, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004710736944579292}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.256517363989891, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0043630766599847375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.25539660890379623, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00408774094209769}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.07429604893692207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029381253193927104}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06987562130359767, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002769991072408768}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.06904105989761854, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026844346914174943}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.21060638862089187, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003887329749618103}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.19474922081143328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0036536236006364338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.19320917420626382, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0033971194258629626}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.21158267461706523, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003892248180779862}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.1959669205905632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0036938624037444975}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.19420752036343683, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034107307485400193}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 3.892045553878533, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.15338529710936155}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06908582074094831, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004346958400381964}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0576659953376829, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0036525448833407675}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05924716020491812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0036326176527612343}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.018055976337857795, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001722423057966192}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.015572542649221232, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014824376881666455}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.015889553201926813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001483207722000358}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0526906303370273, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00345962655866441}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04371274927728761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00285083893180292}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.04481653988120624, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028149978857180923}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05278495458510948, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034751372117411933}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04367091698209838, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002855118245232499}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.044863049752252554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002827990480856111}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.0793183178082136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.033794375862044715}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b35b/evaluation/generation/agg.8b7178b35b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002843906337499315, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009956544746623187}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0024927455220140884, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008268054962235649}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002561173851810196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008604007460830072}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0005943099210680686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00031949865564033644}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0004189368714895907, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00021998461769511408}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00047911010108716033, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00025331584786903913}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0019474370851628444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000688363115056785}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0016525178188876207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005530966506647648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0017324699968285832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005921585827612698}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0019474370851628444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.000688363115056785}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0016525178188876207, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005530966506647648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0017324699968285832, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005921585827612698}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 4.872051358237048e-56, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0994911419227211e-38}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
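Editor's note: comparing the agg.* entries above with the slim.* entries earlier in the diff, the slim files appear to be the agg files with the per-prompt metadata removed and the JSON pretty-printed. A hedged sketch of that apparent relationship (`slim_from_agg` is a hypothetical name, not a script in this repo):

```python
import json

# Fields present in agg.* result entries but absent from slim.* ones.
PROMPT_METADATA = {
    "fixed_answer_choice_list", "prompt_id", "prompt_jinja",
    "prompt_original_task", "comment",
}

def slim_from_agg(agg_path, slim_path):
    """Derive a slim-style file from an agg-style one (an assumption
    about how the two formats relate, based on the entries in this diff)."""
    with open(agg_path) as f:
        agg = json.load(f)
    agg["results"] = [
        {k: v for k, v in entry.items() if k not in PROMPT_METADATA}
        for entry in agg["results"]
    ]
    with open(slim_path, "w") as f:
        json.dump(agg, f, indent=4)
```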
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cef9e23c45b67405beb3ff4d8e97a8060db73f0e64df98df4af5d1acd5a1c31
+ size 18570445
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d8ab41153f8c5cbdc0e5db1ae88eb357e47d8a89ad418763eecd322d316ca53
+ size 24036414
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b63a61d5ecbd64af6b936ef71578a174d5f6e8a17c62f394bac9e9bc9a989e2e
+ size 29368136
8b7178b35b/evaluation/generation/examples.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c3059fcd2c910dc6f6c2c416d6d9b01454f1e9159acc33a0ca091228d556a9f
+ size 34782138
8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93d43594e15f1d34ea0fbf2c7239b5a7c541f3b067f5cc9e2c9642eaa2e12a98
+ size 9452835
8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94abf8a4ac5eb39351ff0818bc2877bbf39ffecb2edb70b911dc7ab3111798c9
+ size 11623564
8b7178b35b/evaluation/generation/examples.8b7178b35b_gem_xsum_article_DOC_summary_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:826540fe8c81c1c81615e699be345425a39b09165794b22a59b7bae5dea47af6
+ size 13897110
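
The three-line `examples.*.jsonl` entries above are Git LFS pointer files rather than the JSONL payloads themselves: each records the pointer-spec version, the SHA-256 digest of the actual file, and its size in bytes. As a minimal sketch, assuming only the standard three-line pointer format shown above (`parse_lfs_pointer` is an illustrative helper, not part of any library):

```python
# Parse a Git LFS pointer file into its version, digest, and size fields.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    algo, _, digest = fields["oid"].partition(":")
    return {"version": fields["version"], "algo": algo,
            "digest": digest, "size": int(fields["size"])}

# One of the pointers from this diff:
pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:826540fe8c81c1c81615e699be345425a39b09165794b22a59b7bae5dea47af6
size 13897110"""
print(parse_lfs_pointer(pointer)["size"])  # 13897110
```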
8b7178b35b/evaluation/generation/merged.csv CHANGED
@@ -18,7 +18,13 @@ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.05697362472735911
 gem_xsum,1,median,rouge2_fmeasure,0.05697362472735911
 gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.06850101702956525
 gem_xsum,2,median,rouge2_fmeasure,0.06850101702956525
- gem_xsum,2,average,multiple,0.056975464704954716
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.06904105989761854
+ gem_xsum,3,median,rouge2_fmeasure,0.06904105989761854
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.015889553201926813
+ gem_xsum,4,median,rouge2_fmeasure,0.015889553201926813
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00047911010108716033
+ gem_xsum,5,median,rouge2_fmeasure,0.00047911010108716033
+ gem_xsum,5,average,multiple,0.04272268621924944
 web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05172430149730381
 web_nlg_en,0,median,rouge2_fmeasure,0.05172430149730381
 web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0838728953683248
@@ -36,4 +42,12 @@ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.049529716403418626
 wiki_lingua_en,0,median,rouge2_fmeasure,0.049529716403418626
 wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04749462077218236
 wiki_lingua_en,1,median,rouge2_fmeasure,0.04749462077218236
- wiki_lingua_en,1,average,multiple,0.0485121685878005
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.07449487544541136
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.07449487544541136
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.06395005883547085
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.06395005883547085
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.021348512828022314
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.021348512828022314
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.004052279635441424
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.004052279635441424
+ wiki_lingua_en,5,average,multiple,0.04347834398665782
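
The `average,multiple` rows appended by this diff are the arithmetic mean of the per-fewshot `median` rows above them; the gem_xsum 0-shot median (0.04545175235793978) comes from the merged.json entry below. A quick sanity check of the new `gem_xsum,5,average` value:

```python
# Mean of the six per-fewshot rouge2_fmeasure medians (0-shot .. 5-shot)
# recorded for gem_xsum in this file.
gem_xsum_medians = [
    0.04545175235793978,     # 0-shot
    0.05697362472735911,     # 1-shot
    0.06850101702956525,     # 2-shot
    0.06904105989761854,     # 3-shot
    0.015889553201926813,    # 4-shot
    0.00047911010108716033,  # 5-shot
]
print(sum(gem_xsum_medians) / len(gem_xsum_medians))
# ~0.04272268621924944, matching the new average row
```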
8b7178b35b/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.347289354458488, "bleu_stderr": 0.03777797007924809, "rouge1_fmeasure": 0.10890954353426278, "rouge1_fmeasure_stderr": 0.002110258935484955, "rouge1_precision": 0.079226368742227, "rouge1_precision_stderr": 0.0023231969845277887, "rouge1_recall": 0.28110828630725015, "rouge1_recall_stderr": 0.004882004381200993, "rouge2_fmeasure": 0.05172430149730381, "rouge2_fmeasure_stderr": 0.0013180585408877222, "rouge2_precision": 0.03743726607123581, "rouge2_precision_stderr": 0.001494228694268955, "rouge2_recall": 0.13690374884005102, "rouge2_recall_stderr": 0.003169133187878269, "rougeL_fmeasure": 0.10554978776184795, "rougeL_fmeasure_stderr": 0.0020007799763877027, "rougeL_precision": 0.07666350308702279, "rougeL_precision_stderr": 0.0022367957170880146, "rougeL_recall": 0.2745019482026448, "rougeL_recall_stderr": 0.004789753920178336, "rougeLsum_fmeasure": 0.10411236694388043, "rougeLsum_fmeasure_stderr": 0.0019983020211639833, "rougeLsum_precision": 0.07592638133051025, "rougeLsum_precision_stderr": 0.002251359653603086, "rougeLsum_recall": 0.2686557447211624, "rougeLsum_recall_stderr": 0.004613052990575398}}, "1": {"PALM_prompt": {"bleu": 0.5749625372797772, "bleu_stderr": 0.042168587225678913, "rouge1_fmeasure": 0.1639618887300654, "rouge1_fmeasure_stderr": 0.003768473720694237, "rouge1_precision": 0.1384507797063748, "rouge1_precision_stderr": 0.004372707845611215, "rouge1_recall": 0.322644364844527, "rouge1_recall_stderr": 0.004855940304467451, "rouge2_fmeasure": 0.0838728953683248, "rouge2_fmeasure_stderr": 0.002551976059024005, "rouge2_precision": 0.0725164426950792, "rouge2_precision_stderr": 0.0030197107273513208, "rouge2_recall": 0.16687390506721525, "rouge2_recall_stderr": 0.0035088962671374286, "rougeL_fmeasure": 0.14973459838273895, "rougeL_fmeasure_stderr": 0.003227143648327153, "rougeL_precision": 0.12455299793053924, "rougeL_precision_stderr": 0.0037936801945088155, "rougeL_recall": 0.3050028170207448, "rougeL_recall_stderr": 0.004518320595879189, "rougeLsum_fmeasure": 0.15125353657464896, "rougeLsum_fmeasure_stderr": 0.0032779530465988412, "rougeLsum_precision": 0.1263163648899627, "rougeLsum_precision_stderr": 0.0038597130269363692, "rougeLsum_recall": 0.3062159795615415, "rougeLsum_recall_stderr": 0.004522396040564721}}, "2": {"PALM_prompt": {"bleu": 0.6896289267842831, "bleu_stderr": 0.04363432869368013, "rouge1_fmeasure": 0.19577491362940544, "rouge1_fmeasure_stderr": 0.004183820712066921, "rouge1_precision": 0.1736763143186271, "rouge1_precision_stderr": 0.005190144069158226, "rouge1_recall": 0.3690495199973296, "rouge1_recall_stderr": 0.004674550419980399, "rouge2_fmeasure": 0.10468204406915697, "rouge2_fmeasure_stderr": 0.002958633901985044, "rouge2_precision": 0.09630923485520457, "rouge2_precision_stderr": 0.003591484421705004, "rouge2_recall": 0.19774238776363945, "rouge2_recall_stderr": 0.0036773455774089777, "rougeL_fmeasure": 0.17655301985743188, "rougeL_fmeasure_stderr": 0.003543990303517691, "rougeL_precision": 0.15346964126360133, "rougeL_precision_stderr": 0.004394358403285555, "rougeL_recall": 0.34605615384304983, "rougeL_recall_stderr": 0.0043553659806620975, "rougeLsum_fmeasure": 0.17970682314011696, "rougeLsum_fmeasure_stderr": 0.003629660902642662, "rougeLsum_precision": 0.15737438288230765, "rougeLsum_precision_stderr": 0.004551780258151066, "rougeLsum_recall": 0.34891501370883077, "rougeLsum_recall_stderr": 0.004358767783710016}}, "3": {"PALM_prompt": {"bleu": 0.7168649256709774, "bleu_stderr": 
0.04675221435801664, "rouge1_fmeasure": 0.20039314889286589, "rouge1_fmeasure_stderr": 0.00433393844787534, "rouge1_precision": 0.1766984056786614, "rouge1_precision_stderr": 0.0052889863501627, "rouge1_recall": 0.3784023798729607, "rouge1_recall_stderr": 0.004834949146967397, "rouge2_fmeasure": 0.10711581894262515, "rouge2_fmeasure_stderr": 0.00302754276771825, "rouge2_precision": 0.09764044765164406, "rouge2_precision_stderr": 0.0036619448797318528, "rouge2_recall": 0.2030335209107597, "rouge2_recall_stderr": 0.003804312974743258, "rougeL_fmeasure": 0.18005823477527308, "rougeL_fmeasure_stderr": 0.003625480259314596, "rougeL_precision": 0.15566438191020568, "rougeL_precision_stderr": 0.004489975630701125, "rougeL_recall": 0.3537310697482285, "rougeL_recall_stderr": 0.004433439714918285, "rougeLsum_fmeasure": 0.18404319632429025, "rougeLsum_fmeasure_stderr": 0.0037645994118713916, "rougeLsum_precision": 0.1606361822943105, "rougeLsum_precision_stderr": 0.004693428623221418, "rougeLsum_recall": 0.3571516509838202, "rougeLsum_recall_stderr": 0.004469628307077723}}, "4": {"PALM_prompt": {"bleu": 0.8264767377736691, "bleu_stderr": 0.053779296385247366, "rouge1_fmeasure": 0.20221675887809062, "rouge1_fmeasure_stderr": 0.004283401665288551, "rouge1_precision": 0.1812068418232479, "rouge1_precision_stderr": 0.005398716331751978, "rouge1_recall": 0.38369035451056227, "rouge1_recall_stderr": 0.00470013713630282, "rouge2_fmeasure": 0.10880911597097326, "rouge2_fmeasure_stderr": 0.0029539167777379253, "rouge2_precision": 0.1006261222318028, "rouge2_precision_stderr": 0.0036927051039995647, "rouge2_recall": 0.2074416749104224, "rouge2_recall_stderr": 0.0036680698832641224, "rougeL_fmeasure": 0.18068778740178215, "rougeL_fmeasure_stderr": 0.0035531966464061853, "rougeL_precision": 0.15862122681407606, "rougeL_precision_stderr": 0.004530792062039435, "rougeL_recall": 0.35743845572365024, "rougeL_recall_stderr": 0.004293662429385583, "rougeLsum_fmeasure": 0.185234182101754, "rougeLsum_fmeasure_stderr": 0.00371289144584664, "rougeLsum_precision": 0.16438209047726673, "rougeLsum_precision_stderr": 0.004775185780341334, "rougeLsum_recall": 0.3611996661245711, "rougeLsum_recall_stderr": 0.00431294873519373}}, "5": {"PALM_prompt": {"bleu": 0.8765489182001927, "bleu_stderr": 0.040019456401903304, "rouge1_fmeasure": 0.20880684983729342, "rouge1_fmeasure_stderr": 0.004530179904806841, "rouge1_precision": 0.1926905497088421, "rouge1_precision_stderr": 0.0057538188799125855, "rouge1_recall": 0.3858328625547419, "rouge1_recall_stderr": 0.004823164304577201, "rouge2_fmeasure": 0.11565656452634464, "rouge2_fmeasure_stderr": 0.003314764756228703, "rouge2_precision": 0.11101027585585174, "rouge2_precision_stderr": 0.004140682260724748, "rouge2_recall": 0.21158156307350992, "rouge2_recall_stderr": 0.003943074328986311, "rougeL_fmeasure": 0.18694354311801048, "rougeL_fmeasure_stderr": 0.0038293768377802495, "rougeL_precision": 0.16975713372550302, "rougeL_precision_stderr": 0.0049633182199218225, "rougeL_recall": 0.3591348058816613, "rougeL_recall_stderr": 0.004445791053730718, "rougeLsum_fmeasure": 0.1915290665625795, "rougeLsum_fmeasure_stderr": 0.00396816747113494, "rougeLsum_precision": 0.17553725023286618, "rougeLsum_precision_stderr": 0.005177574879953629, "rougeLsum_recall": 0.36311828129199303, "rougeLsum_recall_stderr": 0.0044570305199087895}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.586033759423314, "bleu_stderr": 0.09221854298529836, "rouge1_fmeasure": 0.18563962193300107, 
"rouge1_fmeasure_stderr": 0.00254964484535964, "rouge1_precision": 0.1716696703091672, "rouge1_precision_stderr": 0.0028501359020609955, "rouge1_recall": 0.25185497103999677, "rouge1_recall_stderr": 0.0034752671023028777, "rouge2_fmeasure": 0.049529716403418626, "rouge2_fmeasure_stderr": 0.001152868080944952, "rouge2_precision": 0.04455795738149438, "rouge2_precision_stderr": 0.0011159745679413468, "rouge2_recall": 0.06850978198098658, "rouge2_recall_stderr": 0.0017237693104594396, "rougeL_fmeasure": 0.13995045719718288, "rougeL_fmeasure_stderr": 0.0018763707447302035, "rougeL_precision": 0.1292734642683531, "rougeL_precision_stderr": 0.0022188669387071605, "rougeL_recall": 0.1943570396317403, "rougeL_recall_stderr": 0.0027761702028766522, "rougeLsum_fmeasure": 0.17363683324091103, "rougeLsum_fmeasure_stderr": 0.0023982582496962375, "rougeLsum_precision": 0.1610229468194331, "rougeLsum_precision_stderr": 0.002727168940548498, "rougeLsum_recall": 0.2357960382787708, "rougeLsum_recall_stderr": 0.0032789217359646683}}, "1": {"tldr_en": {"bleu": 3.0885083531376645, "bleu_stderr": 0.09894775701896347, "rouge1_fmeasure": 0.18742769556123198, "rouge1_fmeasure_stderr": 0.0024179522944743578, "rouge1_precision": 0.22141486924288373, "rouge1_precision_stderr": 0.003489092311505774, "rouge1_recall": 0.21774103131853748, "rouge1_recall_stderr": 0.003219238261544544, "rouge2_fmeasure": 0.04749462077218236, "rouge2_fmeasure_stderr": 0.0013391994596319457, "rouge2_precision": 0.05911494441028749, "rouge2_precision_stderr": 0.001977492552657427, "rouge2_recall": 0.056021987595245265, "rouge2_recall_stderr": 0.0016925719239639409, "rougeL_fmeasure": 0.14177658902154563, "rougeL_fmeasure_stderr": 0.0018452092691051947, "rougeL_precision": 0.17052813755243684, "rougeL_precision_stderr": 0.0028543436290086777, "rougeL_recall": 0.16498921391985125, "rougeL_recall_stderr": 0.0024957799702815765, "rougeLsum_fmeasure": 0.17566654570551468, "rougeLsum_fmeasure_stderr": 0.0022619479293200127, "rougeLsum_precision": 0.20843084756852662, "rougeLsum_precision_stderr": 0.003321140474934239, "rougeLsum_recall": 0.20397276049127364, "rougeLsum_recall_stderr": 0.0030133958209501476}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.8008125605460465, "bleu_stderr": 0.08334728702857473, "rouge1_fmeasure": 0.2613118947613821, "rouge1_fmeasure_stderr": 0.002127912583794242, "rouge1_precision": 0.2090865590305055, "rouge1_precision_stderr": 0.0018470177010718961, "rouge1_recall": 0.38188549726437987, "rouge1_recall_stderr": 0.003279728734456605, "rouge2_fmeasure": 0.07297635058372318, "rouge2_fmeasure_stderr": 0.001261248605220041, "rouge2_precision": 0.05673694028468428, "rouge2_precision_stderr": 0.0009833704090490367, "rouge2_recall": 0.1101409383371642, "rouge2_recall_stderr": 0.001995492794975982, "rougeL_fmeasure": 0.22836186321205557, "rougeL_fmeasure_stderr": 0.0015605125797972495, "rougeL_precision": 0.18275732889261928, "rougeL_precision_stderr": 0.0013921067012969746, "rougeL_recall": 0.3338079929800571, "rougeL_recall_stderr": 0.0024505755903694226, "rougeLsum_fmeasure": 0.21899459401932223, "rougeLsum_fmeasure_stderr": 0.002102657843438912, "rougeLsum_precision": 0.17512630400560542, "rougeLsum_precision_stderr": 0.0017783983400275266, "rougeLsum_recall": 0.32141092125405557, "rougeLsum_recall_stderr": 0.0032662985068743528}}, "1": {"generate_text_restaurant": {"bleu": 12.455563496168116, "bleu_stderr": 0.15428951986028144, "rouge1_fmeasure": 0.4844618589871223, "rouge1_fmeasure_stderr": 
0.002388878380866762, "rouge1_precision": 0.5954981850070853, "rouge1_precision_stderr": 0.0032755377697093577, "rouge1_recall": 0.44832902186902923, "rouge1_recall_stderr": 0.0030688387452721486, "rouge2_fmeasure": 0.23224231794118463, "rouge2_fmeasure_stderr": 0.0020931392807158806, "rouge2_precision": 0.29040989982948295, "rouge2_precision_stderr": 0.0028204861869818784, "rouge2_recall": 0.21450317916657013, "rouge2_recall_stderr": 0.002222395267553394, "rougeL_fmeasure": 0.34990734371381305, "rougeL_fmeasure_stderr": 0.0021300316703662904, "rougeL_precision": 0.43368332329208353, "rougeL_precision_stderr": 0.003056248156608264, "rougeL_recall": 0.32270327842717356, "rougeL_recall_stderr": 0.0024894724353540653, "rougeLsum_fmeasure": 0.39499604706438096, "rougeLsum_fmeasure_stderr": 0.0023754095690034817, "rougeLsum_precision": 0.48684291022612447, "rougeLsum_precision_stderr": 0.0032370809243618716, "rougeLsum_recall": 0.3651475528395549, "rougeLsum_recall_stderr": 0.002807567486848305}}, "2": {"generate_text_restaurant": {"bleu": 15.187891510159636, "bleu_stderr": 0.2005340847158173, "rouge1_fmeasure": 0.5149400564373232, "rouge1_fmeasure_stderr": 0.002281960188799717, "rouge1_precision": 0.6089225605030864, "rouge1_precision_stderr": 0.0031749506024181815, "rouge1_recall": 0.4850253270309617, "rouge1_recall_stderr": 0.0029712295443402523, "rouge2_fmeasure": 0.26064964037026866, "rouge2_fmeasure_stderr": 0.002149918786285521, "rouge2_precision": 0.3125612676733449, "rouge2_precision_stderr": 0.002807570058211188, "rouge2_recall": 0.24520830668256238, "rouge2_recall_stderr": 0.0023166169808109976, "rougeL_fmeasure": 0.3791251783681887, "rougeL_fmeasure_stderr": 0.0021638392150190966, "rougeL_precision": 0.45032557438452553, "rougeL_precision_stderr": 0.002995769757075173, "rougeL_recall": 0.3564750276333542, "rougeL_recall_stderr": 0.002545550505071048, "rougeLsum_fmeasure": 0.4311876269636124, "rougeLsum_fmeasure_stderr": 0.0024222285980316075, "rougeLsum_precision": 0.5099476065779874, "rougeLsum_precision_stderr": 0.003210910234266457, "rougeLsum_recall": 0.40596954377109, "rougeLsum_recall_stderr": 0.002856919494329858}}, "3": {"generate_text_restaurant": {"bleu": 16.060659394975822, "bleu_stderr": 0.17284642655126536, "rouge1_fmeasure": 0.5239878618101873, "rouge1_fmeasure_stderr": 0.0022891503962730515, "rouge1_precision": 0.6148474805598093, "rouge1_precision_stderr": 0.003175147795291498, "rouge1_recall": 0.4938729679272436, "rouge1_recall_stderr": 0.0029612490025592896, "rouge2_fmeasure": 0.27038735747786175, "rouge2_fmeasure_stderr": 0.002196480015942097, "rouge2_precision": 0.32141615003361074, "rouge2_precision_stderr": 0.00284369777288176, "rouge2_recall": 0.2546854150197406, "rouge2_recall_stderr": 0.0023748524797935944, "rougeL_fmeasure": 0.3858853477225852, "rougeL_fmeasure_stderr": 0.002195201839278644, "rougeL_precision": 0.45482796385524316, "rougeL_precision_stderr": 0.0030285934039142345, "rougeL_recall": 0.3632799719766207, "rougeL_recall_stderr": 0.0025747603467030653, "rougeLsum_fmeasure": 0.44212298577697895, "rougeLsum_fmeasure_stderr": 0.0024314611034434628, "rougeLsum_precision": 0.5189202264105984, "rougeLsum_precision_stderr": 0.0032211525451001028, "rougeLsum_recall": 0.41656979202046523, "rougeLsum_recall_stderr": 0.0028606302793073527}}, "4": {"generate_text_restaurant": {"bleu": 16.35064977606228, "bleu_stderr": 0.15318703610951076, "rouge1_fmeasure": 0.5272948463681918, "rouge1_fmeasure_stderr": 0.002276632828321617, "rouge1_precision": 
0.6141579219291876, "rouge1_precision_stderr": 0.0031375276423873367, "rouge1_recall": 0.49642790420160215, "rouge1_recall_stderr": 0.002890279129158977, "rouge2_fmeasure": 0.2730606307438099, "rouge2_fmeasure_stderr": 0.002234438055141524, "rouge2_precision": 0.3216796674918701, "rouge2_precision_stderr": 0.0028452669990831196, "rouge2_recall": 0.2567141692534647, "rouge2_recall_stderr": 0.002372105255220152, "rougeL_fmeasure": 0.38874867977872807, "rougeL_fmeasure_stderr": 0.0021952914125996665, "rougeL_precision": 0.45388329842280073, "rougeL_precision_stderr": 0.0029409250663779253, "rougeL_recall": 0.36571295934350256, "rougeL_recall_stderr": 0.0025294206995360793, "rougeLsum_fmeasure": 0.44624941166081916, "rougeLsum_fmeasure_stderr": 0.0024399419515776116, "rougeLsum_precision": 0.5193905306564236, "rougeLsum_precision_stderr": 0.0031731739172339655, "rougeLsum_recall": 0.4201701863764282, "rougeLsum_recall_stderr": 0.0028340856821376906}}, "5": {"generate_text_restaurant": {"bleu": 16.33989554218624, "bleu_stderr": 0.17192498787094718, "rouge1_fmeasure": 0.5272012483217712, "rouge1_fmeasure_stderr": 0.0022727033449929193, "rouge1_precision": 0.6138887802372733, "rouge1_precision_stderr": 0.003148762043973577, "rouge1_recall": 0.49579908715969256, "rouge1_recall_stderr": 0.002855704257905023, "rouge2_fmeasure": 0.2739732304449431, "rouge2_fmeasure_stderr": 0.0022083117263179994, "rouge2_precision": 0.32296239014683903, "rouge2_precision_stderr": 0.002836651143474931, "rouge2_recall": 0.25721295886814427, "rouge2_recall_stderr": 0.0023323863742251865, "rougeL_fmeasure": 0.39053942113540757, "rougeL_fmeasure_stderr": 0.0022084680095602566, "rougeL_precision": 0.4557756119232901, "rougeL_precision_stderr": 0.002957529253678058, "rougeL_recall": 0.36723275796263094, "rougeL_recall_stderr": 0.002545583277155397, "rougeLsum_fmeasure": 0.44667560043167237, "rougeLsum_fmeasure_stderr": 0.002425231007898886, "rougeLsum_precision": 0.520413988660044, "rougeLsum_precision_stderr": 0.003202124211985598, "rougeLsum_recall": 0.41985936033161986, "rougeLsum_recall_stderr": 0.002793790259009851}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9172897233734647, "bleu_stderr": 0.07612979258741162, "rouge1_fmeasure": 0.19497689185235167, "rouge1_fmeasure_stderr": 0.003037459759774, "rouge1_precision": 0.14704474756783373, "rouge1_precision_stderr": 0.00260243994472115, "rouge1_recall": 0.3179426585120001, "rouge1_recall_stderr": 0.005091359061361595, "rouge2_fmeasure": 0.04545175235793978, "rouge2_fmeasure_stderr": 0.0016613895167432173, "rouge2_precision": 0.033133611529725375, "rouge2_precision_stderr": 0.0012222134099385885, "rouge2_recall": 0.07643755617885226, "rouge2_recall_stderr": 0.002852925969928893, "rougeL_fmeasure": 0.14329622002037548, "rougeL_fmeasure_stderr": 0.002251083734524946, "rougeL_precision": 0.10872152825582168, "rougeL_precision_stderr": 0.0020985136406316504, "rougeL_recall": 0.23420658227277613, "rougeL_recall_stderr": 0.0038579727721169913, "rougeLsum_fmeasure": 0.15671768300152755, "rougeLsum_fmeasure_stderr": 0.0024938518539829066, "rougeLsum_precision": 0.11878580563146314, "rougeLsum_precision_stderr": 0.0022791200374934033, "rougeLsum_recall": 0.2561567987942721, "rougeLsum_recall_stderr": 0.004250537243559008}}, "1": {"article_DOC_summary": {"bleu": 2.9617243214867686, "bleu_stderr": 0.2121625238447871, "rouge1_fmeasure": 0.2429048922678839, "rouge1_fmeasure_stderr": 0.0035470294629619853, "rouge1_precision": 0.2506117820029391, "rouge1_precision_stderr": 
0.004197193925421806, "rouge1_recall": 0.265051427357207, "rouge1_recall_stderr": 0.004113838412416427, "rouge2_fmeasure": 0.05697362472735911, "rouge2_fmeasure_stderr": 0.002319852424027265, "rouge2_precision": 0.059355725931601024, "rouge2_precision_stderr": 0.0026146656506255265, "rouge2_recall": 0.06240115907434781, "rouge2_recall_stderr": 0.002528707836330792, "rougeL_fmeasure": 0.1806939148235656, "rougeL_fmeasure_stderr": 0.002897868234740968, "rougeL_precision": 0.18675397233452354, "rougeL_precision_stderr": 0.003437969762757092, "rougeL_recall": 0.19744938072178414, "rougeL_recall_stderr": 0.0033460143453592026, "rougeLsum_fmeasure": 0.18340626109421165, "rougeLsum_fmeasure_stderr": 0.002921137289513508, "rougeLsum_precision": 0.18896297297256576, "rougeLsum_precision_stderr": 0.003436743475620793, "rougeLsum_recall": 0.20147068575801652, "rougeLsum_recall_stderr": 0.0034574795296360844}}, "2": {"article_DOC_summary": {"bleu": 3.790180948775114, "bleu_stderr": 0.21949698226608796, "rouge1_fmeasure": 0.2684969037756941, "rouge1_fmeasure_stderr": 0.003538515661007479, "rouge1_precision": 0.2862450933885503, "rouge1_precision_stderr": 0.0042447208222623255, "rouge1_recall": 0.2741637614630613, "rouge1_recall_stderr": 0.0038690227267741645, "rouge2_fmeasure": 0.06850101702956525, "rouge2_fmeasure_stderr": 0.002469575816155894, "rouge2_precision": 0.07370429393444561, "rouge2_precision_stderr": 0.002782856999759695, "rouge2_recall": 0.07014627514180007, "rouge2_recall_stderr": 0.002596637645417416, "rougeL_fmeasure": 0.20209449252255896, "rougeL_fmeasure_stderr": 0.003025686120342655, "rougeL_precision": 0.21544246954500543, "rougeL_precision_stderr": 0.003580176542199536, "rougeL_recall": 0.20674126511922747, "rougeL_recall_stderr": 0.0032887383484612206, "rougeLsum_fmeasure": 0.20333325134792413, "rougeLsum_fmeasure_stderr": 0.0030149542501899957, "rougeLsum_precision": 0.21658328451330233, "rougeLsum_precision_stderr": 0.003567997453456091, "rougeLsum_recall": 0.2083566653493878, "rougeLsum_recall_stderr": 0.0032948366397107273}}}}
 
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.347289354458488, "bleu_stderr": 0.03777797007924809, "rouge1_fmeasure": 0.10890954353426278, "rouge1_fmeasure_stderr": 0.002110258935484955, "rouge1_precision": 0.079226368742227, "rouge1_precision_stderr": 0.0023231969845277887, "rouge1_recall": 0.28110828630725015, "rouge1_recall_stderr": 0.004882004381200993, "rouge2_fmeasure": 0.05172430149730381, "rouge2_fmeasure_stderr": 0.0013180585408877222, "rouge2_precision": 0.03743726607123581, "rouge2_precision_stderr": 0.001494228694268955, "rouge2_recall": 0.13690374884005102, "rouge2_recall_stderr": 0.003169133187878269, "rougeL_fmeasure": 0.10554978776184795, "rougeL_fmeasure_stderr": 0.0020007799763877027, "rougeL_precision": 0.07666350308702279, "rougeL_precision_stderr": 0.0022367957170880146, "rougeL_recall": 0.2745019482026448, "rougeL_recall_stderr": 0.004789753920178336, "rougeLsum_fmeasure": 0.10411236694388043, "rougeLsum_fmeasure_stderr": 0.0019983020211639833, "rougeLsum_precision": 0.07592638133051025, "rougeLsum_precision_stderr": 0.002251359653603086, "rougeLsum_recall": 0.2686557447211624, "rougeLsum_recall_stderr": 0.004613052990575398}}, "1": {"PALM_prompt": {"bleu": 0.5749625372797772, "bleu_stderr": 0.042168587225678913, "rouge1_fmeasure": 0.1639618887300654, "rouge1_fmeasure_stderr": 0.003768473720694237, "rouge1_precision": 0.1384507797063748, "rouge1_precision_stderr": 0.004372707845611215, "rouge1_recall": 0.322644364844527, "rouge1_recall_stderr": 0.004855940304467451, "rouge2_fmeasure": 0.0838728953683248, "rouge2_fmeasure_stderr": 0.002551976059024005, "rouge2_precision": 0.0725164426950792, "rouge2_precision_stderr": 0.0030197107273513208, "rouge2_recall": 0.16687390506721525, "rouge2_recall_stderr": 0.0035088962671374286, "rougeL_fmeasure": 0.14973459838273895, "rougeL_fmeasure_stderr": 0.003227143648327153, "rougeL_precision": 0.12455299793053924, "rougeL_precision_stderr": 0.0037936801945088155, "rougeL_recall": 0.3050028170207448, "rougeL_recall_stderr": 0.004518320595879189, "rougeLsum_fmeasure": 0.15125353657464896, "rougeLsum_fmeasure_stderr": 0.0032779530465988412, "rougeLsum_precision": 0.1263163648899627, "rougeLsum_precision_stderr": 0.0038597130269363692, "rougeLsum_recall": 0.3062159795615415, "rougeLsum_recall_stderr": 0.004522396040564721}}, "2": {"PALM_prompt": {"bleu": 0.6896289267842831, "bleu_stderr": 0.04363432869368013, "rouge1_fmeasure": 0.19577491362940544, "rouge1_fmeasure_stderr": 0.004183820712066921, "rouge1_precision": 0.1736763143186271, "rouge1_precision_stderr": 0.005190144069158226, "rouge1_recall": 0.3690495199973296, "rouge1_recall_stderr": 0.004674550419980399, "rouge2_fmeasure": 0.10468204406915697, "rouge2_fmeasure_stderr": 0.002958633901985044, "rouge2_precision": 0.09630923485520457, "rouge2_precision_stderr": 0.003591484421705004, "rouge2_recall": 0.19774238776363945, "rouge2_recall_stderr": 0.0036773455774089777, "rougeL_fmeasure": 0.17655301985743188, "rougeL_fmeasure_stderr": 0.003543990303517691, "rougeL_precision": 0.15346964126360133, "rougeL_precision_stderr": 0.004394358403285555, "rougeL_recall": 0.34605615384304983, "rougeL_recall_stderr": 0.0043553659806620975, "rougeLsum_fmeasure": 0.17970682314011696, "rougeLsum_fmeasure_stderr": 0.003629660902642662, "rougeLsum_precision": 0.15737438288230765, "rougeLsum_precision_stderr": 0.004551780258151066, "rougeLsum_recall": 0.34891501370883077, "rougeLsum_recall_stderr": 0.004358767783710016}}, "3": {"PALM_prompt": {"bleu": 0.7168649256709774, "bleu_stderr": 
0.04675221435801664, "rouge1_fmeasure": 0.20039314889286589, "rouge1_fmeasure_stderr": 0.00433393844787534, "rouge1_precision": 0.1766984056786614, "rouge1_precision_stderr": 0.0052889863501627, "rouge1_recall": 0.3784023798729607, "rouge1_recall_stderr": 0.004834949146967397, "rouge2_fmeasure": 0.10711581894262515, "rouge2_fmeasure_stderr": 0.00302754276771825, "rouge2_precision": 0.09764044765164406, "rouge2_precision_stderr": 0.0036619448797318528, "rouge2_recall": 0.2030335209107597, "rouge2_recall_stderr": 0.003804312974743258, "rougeL_fmeasure": 0.18005823477527308, "rougeL_fmeasure_stderr": 0.003625480259314596, "rougeL_precision": 0.15566438191020568, "rougeL_precision_stderr": 0.004489975630701125, "rougeL_recall": 0.3537310697482285, "rougeL_recall_stderr": 0.004433439714918285, "rougeLsum_fmeasure": 0.18404319632429025, "rougeLsum_fmeasure_stderr": 0.0037645994118713916, "rougeLsum_precision": 0.1606361822943105, "rougeLsum_precision_stderr": 0.004693428623221418, "rougeLsum_recall": 0.3571516509838202, "rougeLsum_recall_stderr": 0.004469628307077723}}, "4": {"PALM_prompt": {"bleu": 0.8264767377736691, "bleu_stderr": 0.053779296385247366, "rouge1_fmeasure": 0.20221675887809062, "rouge1_fmeasure_stderr": 0.004283401665288551, "rouge1_precision": 0.1812068418232479, "rouge1_precision_stderr": 0.005398716331751978, "rouge1_recall": 0.38369035451056227, "rouge1_recall_stderr": 0.00470013713630282, "rouge2_fmeasure": 0.10880911597097326, "rouge2_fmeasure_stderr": 0.0029539167777379253, "rouge2_precision": 0.1006261222318028, "rouge2_precision_stderr": 0.0036927051039995647, "rouge2_recall": 0.2074416749104224, "rouge2_recall_stderr": 0.0036680698832641224, "rougeL_fmeasure": 0.18068778740178215, "rougeL_fmeasure_stderr": 0.0035531966464061853, "rougeL_precision": 0.15862122681407606, "rougeL_precision_stderr": 0.004530792062039435, "rougeL_recall": 0.35743845572365024, "rougeL_recall_stderr": 0.004293662429385583, "rougeLsum_fmeasure": 0.185234182101754, "rougeLsum_fmeasure_stderr": 0.00371289144584664, "rougeLsum_precision": 0.16438209047726673, "rougeLsum_precision_stderr": 0.004775185780341334, "rougeLsum_recall": 0.3611996661245711, "rougeLsum_recall_stderr": 0.00431294873519373}}, "5": {"PALM_prompt": {"bleu": 0.8765489182001927, "bleu_stderr": 0.040019456401903304, "rouge1_fmeasure": 0.20880684983729342, "rouge1_fmeasure_stderr": 0.004530179904806841, "rouge1_precision": 0.1926905497088421, "rouge1_precision_stderr": 0.0057538188799125855, "rouge1_recall": 0.3858328625547419, "rouge1_recall_stderr": 0.004823164304577201, "rouge2_fmeasure": 0.11565656452634464, "rouge2_fmeasure_stderr": 0.003314764756228703, "rouge2_precision": 0.11101027585585174, "rouge2_precision_stderr": 0.004140682260724748, "rouge2_recall": 0.21158156307350992, "rouge2_recall_stderr": 0.003943074328986311, "rougeL_fmeasure": 0.18694354311801048, "rougeL_fmeasure_stderr": 0.0038293768377802495, "rougeL_precision": 0.16975713372550302, "rougeL_precision_stderr": 0.0049633182199218225, "rougeL_recall": 0.3591348058816613, "rougeL_recall_stderr": 0.004445791053730718, "rougeLsum_fmeasure": 0.1915290665625795, "rougeLsum_fmeasure_stderr": 0.00396816747113494, "rougeLsum_precision": 0.17553725023286618, "rougeLsum_precision_stderr": 0.005177574879953629, "rougeLsum_recall": 0.36311828129199303, "rougeLsum_recall_stderr": 0.0044570305199087895}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 3.586033759423314, "bleu_stderr": 0.09221854298529836, "rouge1_fmeasure": 0.18563962193300107, 
"rouge1_fmeasure_stderr": 0.00254964484535964, "rouge1_precision": 0.1716696703091672, "rouge1_precision_stderr": 0.0028501359020609955, "rouge1_recall": 0.25185497103999677, "rouge1_recall_stderr": 0.0034752671023028777, "rouge2_fmeasure": 0.049529716403418626, "rouge2_fmeasure_stderr": 0.001152868080944952, "rouge2_precision": 0.04455795738149438, "rouge2_precision_stderr": 0.0011159745679413468, "rouge2_recall": 0.06850978198098658, "rouge2_recall_stderr": 0.0017237693104594396, "rougeL_fmeasure": 0.13995045719718288, "rougeL_fmeasure_stderr": 0.0018763707447302035, "rougeL_precision": 0.1292734642683531, "rougeL_precision_stderr": 0.0022188669387071605, "rougeL_recall": 0.1943570396317403, "rougeL_recall_stderr": 0.0027761702028766522, "rougeLsum_fmeasure": 0.17363683324091103, "rougeLsum_fmeasure_stderr": 0.0023982582496962375, "rougeLsum_precision": 0.1610229468194331, "rougeLsum_precision_stderr": 0.002727168940548498, "rougeLsum_recall": 0.2357960382787708, "rougeLsum_recall_stderr": 0.0032789217359646683}}, "1": {"tldr_en": {"bleu": 3.0885083531376645, "bleu_stderr": 0.09894775701896347, "rouge1_fmeasure": 0.18742769556123198, "rouge1_fmeasure_stderr": 0.0024179522944743578, "rouge1_precision": 0.22141486924288373, "rouge1_precision_stderr": 0.003489092311505774, "rouge1_recall": 0.21774103131853748, "rouge1_recall_stderr": 0.003219238261544544, "rouge2_fmeasure": 0.04749462077218236, "rouge2_fmeasure_stderr": 0.0013391994596319457, "rouge2_precision": 0.05911494441028749, "rouge2_precision_stderr": 0.001977492552657427, "rouge2_recall": 0.056021987595245265, "rouge2_recall_stderr": 0.0016925719239639409, "rougeL_fmeasure": 0.14177658902154563, "rougeL_fmeasure_stderr": 0.0018452092691051947, "rougeL_precision": 0.17052813755243684, "rougeL_precision_stderr": 0.0028543436290086777, "rougeL_recall": 0.16498921391985125, "rougeL_recall_stderr": 0.0024957799702815765, "rougeLsum_fmeasure": 0.17566654570551468, "rougeLsum_fmeasure_stderr": 0.0022619479293200127, "rougeLsum_precision": 0.20843084756852662, "rougeLsum_precision_stderr": 0.003321140474934239, "rougeLsum_recall": 0.20397276049127364, "rougeLsum_recall_stderr": 0.0030133958209501476}}, "2": {"tldr_en": {"bleu": 4.480739832822098, "bleu_stderr": 0.11578330856151871, "rouge1_fmeasure": 0.24919698955059236, "rouge1_fmeasure_stderr": 0.002338885547952402, "rouge1_precision": 0.31947645525912327, "rouge1_precision_stderr": 0.0038030155885502574, "rouge1_recall": 0.26784034634508286, "rouge1_recall_stderr": 0.003035447513908994, "rouge2_fmeasure": 0.07449487544541136, "rouge2_fmeasure_stderr": 0.0015020810496653879, "rouge2_precision": 0.10032559893960638, "rouge2_precision_stderr": 0.0024161327250120574, "rouge2_recall": 0.07957621599352918, "rouge2_recall_stderr": 0.0017828844980089247, "rougeL_fmeasure": 0.18970698519609924, "rougeL_fmeasure_stderr": 0.0018593336285269075, "rougeL_precision": 0.2468115494253994, "rougeL_precision_stderr": 0.0032025411700306672, "rougeL_recall": 0.20411776076410448, "rougeL_recall_stderr": 0.0024277091219038756, "rougeLsum_fmeasure": 0.23380521600569548, "rougeLsum_fmeasure_stderr": 0.0022238011394774266, "rougeLsum_precision": 0.3009096197940564, "rougeLsum_precision_stderr": 0.0036645356865323267, "rougeLsum_recall": 0.251048984299023, "rougeLsum_recall_stderr": 0.002859798214873002}}, "3": {"tldr_en": {"bleu": 3.416486588059767, "bleu_stderr": 0.05871038450450248, "rouge1_fmeasure": 0.21276268379590219, "rouge1_fmeasure_stderr": 0.002745499539353653, "rouge1_precision": 0.2871095429771904, 
"rouge1_precision_stderr": 0.004185534949827048, "rouge1_recall": 0.22207861869609608, "rouge1_recall_stderr": 0.0033297349416556445, "rouge2_fmeasure": 0.06395005883547085, "rouge2_fmeasure_stderr": 0.0014814659874998457, "rouge2_precision": 0.08919737395825947, "rouge2_precision_stderr": 0.00234436562476581, "rouge2_recall": 0.06668770358204071, "rouge2_recall_stderr": 0.0017001378203776699, "rougeL_fmeasure": 0.16381135583162096, "rougeL_fmeasure_stderr": 0.0021664607071098315, "rougeL_precision": 0.22428029391025195, "rougeL_precision_stderr": 0.0034506563669751745, "rougeL_recall": 0.17117251055926017, "rougeL_recall_stderr": 0.002654839665946902, "rougeLsum_fmeasure": 0.20067896546125685, "rougeLsum_fmeasure_stderr": 0.0026118338406594167, "rougeLsum_precision": 0.27186005782650335, "rougeLsum_precision_stderr": 0.004024882931968101, "rougeLsum_recall": 0.2092333223784394, "rougeLsum_recall_stderr": 0.0031542921648456132}}, "4": {"tldr_en": {"bleu": 0.08573923963165347, "bleu_stderr": 0.01281159560771889, "rouge1_fmeasure": 0.06934635599486277, "rouge1_fmeasure_stderr": 0.002419160037746626, "rouge1_precision": 0.09854206688714345, "rouge1_precision_stderr": 0.0036437541450359593, "rouge1_recall": 0.07142922091930162, "rouge1_recall_stderr": 0.0026886044555192072, "rouge2_fmeasure": 0.021348512828022314, "rouge2_fmeasure_stderr": 0.001055380041694426, "rouge2_precision": 0.03200012684242646, "rouge2_precision_stderr": 0.001809559832121984, "rouge2_recall": 0.02183618792959388, "rouge2_recall_stderr": 0.0011694185534509838, "rougeL_fmeasure": 0.05429349387264579, "rougeL_fmeasure_stderr": 0.001919877615338037, "rougeL_precision": 0.07858578853182602, "rougeL_precision_stderr": 0.003017215712567151, "rougeL_recall": 0.055804214936503324, "rougeL_recall_stderr": 0.0021264630991137314, "rougeLsum_fmeasure": 0.06504921478523501, "rougeLsum_fmeasure_stderr": 0.002272098654651633, "rougeLsum_precision": 0.0930444540425215, "rougeLsum_precision_stderr": 0.003479952193050714, "rougeLsum_recall": 0.06690767452071034, "rougeLsum_recall_stderr": 0.0025201480735759655}}, "5": {"tldr_en": {"bleu": 7.249514036066506e-16, "bleu_stderr": 3.339336428482265e-14, "rouge1_fmeasure": 0.012068612050969258, "rouge1_fmeasure_stderr": 0.001175572535963405, "rouge1_precision": 0.019085080359257173, "rouge1_precision_stderr": 0.0018850577941172406, "rouge1_recall": 0.011645843404748045, "rouge1_recall_stderr": 0.0012042546813690255, "rouge2_fmeasure": 0.004052279635441424, "rouge2_fmeasure_stderr": 0.0005200626541191212, "rouge2_precision": 0.006894559713384093, "rouge2_precision_stderr": 0.0010187138526863938, "rouge2_recall": 0.003894968708693124, "rouge2_recall_stderr": 0.000532555761738665, "rougeL_fmeasure": 0.009809956651925644, "rougeL_fmeasure_stderr": 0.0009599920054622314, "rougeL_precision": 0.016020963520094682, "rougeL_precision_stderr": 0.0016304077510915223, "rougeL_recall": 0.00943399649665596, "rougeL_recall_stderr": 0.000984307352962584, "rougeLsum_fmeasure": 0.011452198317124414, "rougeLsum_fmeasure_stderr": 0.0011152369987060954, "rougeLsum_precision": 0.018298677104570716, "rougeLsum_precision_stderr": 0.001820476829671769, "rougeLsum_recall": 0.011022020415497191, "rougeLsum_recall_stderr": 0.0011405019701246609}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.8008125605460465, "bleu_stderr": 0.08334728702857473, "rouge1_fmeasure": 0.2613118947613821, "rouge1_fmeasure_stderr": 0.002127912583794242, "rouge1_precision": 0.2090865590305055, "rouge1_precision_stderr": 
0.0018470177010718961, "rouge1_recall": 0.38188549726437987, "rouge1_recall_stderr": 0.003279728734456605, "rouge2_fmeasure": 0.07297635058372318, "rouge2_fmeasure_stderr": 0.001261248605220041, "rouge2_precision": 0.05673694028468428, "rouge2_precision_stderr": 0.0009833704090490367, "rouge2_recall": 0.1101409383371642, "rouge2_recall_stderr": 0.001995492794975982, "rougeL_fmeasure": 0.22836186321205557, "rougeL_fmeasure_stderr": 0.0015605125797972495, "rougeL_precision": 0.18275732889261928, "rougeL_precision_stderr": 0.0013921067012969746, "rougeL_recall": 0.3338079929800571, "rougeL_recall_stderr": 0.0024505755903694226, "rougeLsum_fmeasure": 0.21899459401932223, "rougeLsum_fmeasure_stderr": 0.002102657843438912, "rougeLsum_precision": 0.17512630400560542, "rougeLsum_precision_stderr": 0.0017783983400275266, "rougeLsum_recall": 0.32141092125405557, "rougeLsum_recall_stderr": 0.0032662985068743528}}, "1": {"generate_text_restaurant": {"bleu": 12.455563496168116, "bleu_stderr": 0.15428951986028144, "rouge1_fmeasure": 0.4844618589871223, "rouge1_fmeasure_stderr": 0.002388878380866762, "rouge1_precision": 0.5954981850070853, "rouge1_precision_stderr": 0.0032755377697093577, "rouge1_recall": 0.44832902186902923, "rouge1_recall_stderr": 0.0030688387452721486, "rouge2_fmeasure": 0.23224231794118463, "rouge2_fmeasure_stderr": 0.0020931392807158806, "rouge2_precision": 0.29040989982948295, "rouge2_precision_stderr": 0.0028204861869818784, "rouge2_recall": 0.21450317916657013, "rouge2_recall_stderr": 0.002222395267553394, "rougeL_fmeasure": 0.34990734371381305, "rougeL_fmeasure_stderr": 0.0021300316703662904, "rougeL_precision": 0.43368332329208353, "rougeL_precision_stderr": 0.003056248156608264, "rougeL_recall": 0.32270327842717356, "rougeL_recall_stderr": 0.0024894724353540653, "rougeLsum_fmeasure": 0.39499604706438096, "rougeLsum_fmeasure_stderr": 0.0023754095690034817, "rougeLsum_precision": 0.48684291022612447, "rougeLsum_precision_stderr": 0.0032370809243618716, "rougeLsum_recall": 0.3651475528395549, "rougeLsum_recall_stderr": 0.002807567486848305}}, "2": {"generate_text_restaurant": {"bleu": 15.187891510159636, "bleu_stderr": 0.2005340847158173, "rouge1_fmeasure": 0.5149400564373232, "rouge1_fmeasure_stderr": 0.002281960188799717, "rouge1_precision": 0.6089225605030864, "rouge1_precision_stderr": 0.0031749506024181815, "rouge1_recall": 0.4850253270309617, "rouge1_recall_stderr": 0.0029712295443402523, "rouge2_fmeasure": 0.26064964037026866, "rouge2_fmeasure_stderr": 0.002149918786285521, "rouge2_precision": 0.3125612676733449, "rouge2_precision_stderr": 0.002807570058211188, "rouge2_recall": 0.24520830668256238, "rouge2_recall_stderr": 0.0023166169808109976, "rougeL_fmeasure": 0.3791251783681887, "rougeL_fmeasure_stderr": 0.0021638392150190966, "rougeL_precision": 0.45032557438452553, "rougeL_precision_stderr": 0.002995769757075173, "rougeL_recall": 0.3564750276333542, "rougeL_recall_stderr": 0.002545550505071048, "rougeLsum_fmeasure": 0.4311876269636124, "rougeLsum_fmeasure_stderr": 0.0024222285980316075, "rougeLsum_precision": 0.5099476065779874, "rougeLsum_precision_stderr": 0.003210910234266457, "rougeLsum_recall": 0.40596954377109, "rougeLsum_recall_stderr": 0.002856919494329858}}, "3": {"generate_text_restaurant": {"bleu": 16.060659394975822, "bleu_stderr": 0.17284642655126536, "rouge1_fmeasure": 0.5239878618101873, "rouge1_fmeasure_stderr": 0.0022891503962730515, "rouge1_precision": 0.6148474805598093, "rouge1_precision_stderr": 0.003175147795291498, "rouge1_recall": 
0.4938729679272436, "rouge1_recall_stderr": 0.0029612490025592896, "rouge2_fmeasure": 0.27038735747786175, "rouge2_fmeasure_stderr": 0.002196480015942097, "rouge2_precision": 0.32141615003361074, "rouge2_precision_stderr": 0.00284369777288176, "rouge2_recall": 0.2546854150197406, "rouge2_recall_stderr": 0.0023748524797935944, "rougeL_fmeasure": 0.3858853477225852, "rougeL_fmeasure_stderr": 0.002195201839278644, "rougeL_precision": 0.45482796385524316, "rougeL_precision_stderr": 0.0030285934039142345, "rougeL_recall": 0.3632799719766207, "rougeL_recall_stderr": 0.0025747603467030653, "rougeLsum_fmeasure": 0.44212298577697895, "rougeLsum_fmeasure_stderr": 0.0024314611034434628, "rougeLsum_precision": 0.5189202264105984, "rougeLsum_precision_stderr": 0.0032211525451001028, "rougeLsum_recall": 0.41656979202046523, "rougeLsum_recall_stderr": 0.0028606302793073527}}, "4": {"generate_text_restaurant": {"bleu": 16.35064977606228, "bleu_stderr": 0.15318703610951076, "rouge1_fmeasure": 0.5272948463681918, "rouge1_fmeasure_stderr": 0.002276632828321617, "rouge1_precision": 0.6141579219291876, "rouge1_precision_stderr": 0.0031375276423873367, "rouge1_recall": 0.49642790420160215, "rouge1_recall_stderr": 0.002890279129158977, "rouge2_fmeasure": 0.2730606307438099, "rouge2_fmeasure_stderr": 0.002234438055141524, "rouge2_precision": 0.3216796674918701, "rouge2_precision_stderr": 0.0028452669990831196, "rouge2_recall": 0.2567141692534647, "rouge2_recall_stderr": 0.002372105255220152, "rougeL_fmeasure": 0.38874867977872807, "rougeL_fmeasure_stderr": 0.0021952914125996665, "rougeL_precision": 0.45388329842280073, "rougeL_precision_stderr": 0.0029409250663779253, "rougeL_recall": 0.36571295934350256, "rougeL_recall_stderr": 0.0025294206995360793, "rougeLsum_fmeasure": 0.44624941166081916, "rougeLsum_fmeasure_stderr": 0.0024399419515776116, "rougeLsum_precision": 0.5193905306564236, "rougeLsum_precision_stderr": 0.0031731739172339655, "rougeLsum_recall": 0.4201701863764282, "rougeLsum_recall_stderr": 0.0028340856821376906}}, "5": {"generate_text_restaurant": {"bleu": 16.33989554218624, "bleu_stderr": 0.17192498787094718, "rouge1_fmeasure": 0.5272012483217712, "rouge1_fmeasure_stderr": 0.0022727033449929193, "rouge1_precision": 0.6138887802372733, "rouge1_precision_stderr": 0.003148762043973577, "rouge1_recall": 0.49579908715969256, "rouge1_recall_stderr": 0.002855704257905023, "rouge2_fmeasure": 0.2739732304449431, "rouge2_fmeasure_stderr": 0.0022083117263179994, "rouge2_precision": 0.32296239014683903, "rouge2_precision_stderr": 0.002836651143474931, "rouge2_recall": 0.25721295886814427, "rouge2_recall_stderr": 0.0023323863742251865, "rougeL_fmeasure": 0.39053942113540757, "rougeL_fmeasure_stderr": 0.0022084680095602566, "rougeL_precision": 0.4557756119232901, "rougeL_precision_stderr": 0.002957529253678058, "rougeL_recall": 0.36723275796263094, "rougeL_recall_stderr": 0.002545583277155397, "rougeLsum_fmeasure": 0.44667560043167237, "rougeLsum_fmeasure_stderr": 0.002425231007898886, "rougeLsum_precision": 0.520413988660044, "rougeLsum_precision_stderr": 0.003202124211985598, "rougeLsum_recall": 0.41985936033161986, "rougeLsum_recall_stderr": 0.002793790259009851}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9172897233734647, "bleu_stderr": 0.07612979258741162, "rouge1_fmeasure": 0.19497689185235167, "rouge1_fmeasure_stderr": 0.003037459759774, "rouge1_precision": 0.14704474756783373, "rouge1_precision_stderr": 0.00260243994472115, "rouge1_recall": 0.3179426585120001, "rouge1_recall_stderr": 
0.005091359061361595, "rouge2_fmeasure": 0.04545175235793978, "rouge2_fmeasure_stderr": 0.0016613895167432173, "rouge2_precision": 0.033133611529725375, "rouge2_precision_stderr": 0.0012222134099385885, "rouge2_recall": 0.07643755617885226, "rouge2_recall_stderr": 0.002852925969928893, "rougeL_fmeasure": 0.14329622002037548, "rougeL_fmeasure_stderr": 0.002251083734524946, "rougeL_precision": 0.10872152825582168, "rougeL_precision_stderr": 0.0020985136406316504, "rougeL_recall": 0.23420658227277613, "rougeL_recall_stderr": 0.0038579727721169913, "rougeLsum_fmeasure": 0.15671768300152755, "rougeLsum_fmeasure_stderr": 0.0024938518539829066, "rougeLsum_precision": 0.11878580563146314, "rougeLsum_precision_stderr": 0.0022791200374934033, "rougeLsum_recall": 0.2561567987942721, "rougeLsum_recall_stderr": 0.004250537243559008}}, "1": {"article_DOC_summary": {"bleu": 2.9617243214867686, "bleu_stderr": 0.2121625238447871, "rouge1_fmeasure": 0.2429048922678839, "rouge1_fmeasure_stderr": 0.0035470294629619853, "rouge1_precision": 0.2506117820029391, "rouge1_precision_stderr": 0.004197193925421806, "rouge1_recall": 0.265051427357207, "rouge1_recall_stderr": 0.004113838412416427, "rouge2_fmeasure": 0.05697362472735911, "rouge2_fmeasure_stderr": 0.002319852424027265, "rouge2_precision": 0.059355725931601024, "rouge2_precision_stderr": 0.0026146656506255265, "rouge2_recall": 0.06240115907434781, "rouge2_recall_stderr": 0.002528707836330792, "rougeL_fmeasure": 0.1806939148235656, "rougeL_fmeasure_stderr": 0.002897868234740968, "rougeL_precision": 0.18675397233452354, "rougeL_precision_stderr": 0.003437969762757092, "rougeL_recall": 0.19744938072178414, "rougeL_recall_stderr": 0.0033460143453592026, "rougeLsum_fmeasure": 0.18340626109421165, "rougeLsum_fmeasure_stderr": 0.002921137289513508, "rougeLsum_precision": 0.18896297297256576, "rougeLsum_precision_stderr": 0.003436743475620793, "rougeLsum_recall": 0.20147068575801652, "rougeLsum_recall_stderr": 0.0034574795296360844}}, "2": {"article_DOC_summary": {"bleu": 3.790180948775114, "bleu_stderr": 0.21949698226608796, "rouge1_fmeasure": 0.2684969037756941, "rouge1_fmeasure_stderr": 0.003538515661007479, "rouge1_precision": 0.2862450933885503, "rouge1_precision_stderr": 0.0042447208222623255, "rouge1_recall": 0.2741637614630613, "rouge1_recall_stderr": 0.0038690227267741645, "rouge2_fmeasure": 0.06850101702956525, "rouge2_fmeasure_stderr": 0.002469575816155894, "rouge2_precision": 0.07370429393444561, "rouge2_precision_stderr": 0.002782856999759695, "rouge2_recall": 0.07014627514180007, "rouge2_recall_stderr": 0.002596637645417416, "rougeL_fmeasure": 0.20209449252255896, "rougeL_fmeasure_stderr": 0.003025686120342655, "rougeL_precision": 0.21544246954500543, "rougeL_precision_stderr": 0.003580176542199536, "rougeL_recall": 0.20674126511922747, "rougeL_recall_stderr": 0.0032887383484612206, "rougeLsum_fmeasure": 0.20333325134792413, "rougeLsum_fmeasure_stderr": 0.0030149542501899957, "rougeLsum_precision": 0.21658328451330233, "rougeLsum_precision_stderr": 0.003567997453456091, "rougeLsum_recall": 0.2083566653493878, "rougeLsum_recall_stderr": 0.0032948366397107273}}, "3": {"article_DOC_summary": {"bleu": 3.892045553878533, "bleu_stderr": 0.15338529710936155, "rouge1_fmeasure": 0.25539660890379623, "rouge1_fmeasure_stderr": 0.00408774094209769, "rouge1_precision": 0.27838096040655913, "rouge1_precision_stderr": 0.004710736944579292, "rouge1_recall": 0.256517363989891, "rouge1_recall_stderr": 0.0043630766599847375, "rouge2_fmeasure": 0.06904105989761854, 
"rouge2_fmeasure_stderr": 0.0026844346914174943, "rouge2_precision": 0.07429604893692207, "rouge2_precision_stderr": 0.0029381253193927104, "rouge2_recall": 0.06987562130359767, "rouge2_recall_stderr": 0.002769991072408768, "rougeL_fmeasure": 0.19320917420626382, "rougeL_fmeasure_stderr": 0.0033971194258629626, "rougeL_precision": 0.21060638862089187, "rougeL_precision_stderr": 0.003887329749618103, "rougeL_recall": 0.19474922081143328, "rougeL_recall_stderr": 0.0036536236006364338, "rougeLsum_fmeasure": 0.19420752036343683, "rougeLsum_fmeasure_stderr": 0.0034107307485400193, "rougeLsum_precision": 0.21158267461706523, "rougeLsum_precision_stderr": 0.003892248180779862, "rougeLsum_recall": 0.1959669205905632, "rougeLsum_recall_stderr": 0.0036938624037444975}}, "4": {"article_DOC_summary": {"bleu": 0.0793183178082136, "bleu_stderr": 0.033794375862044715, "rouge1_fmeasure": 0.05924716020491812, "rouge1_fmeasure_stderr": 0.0036326176527612343, "rouge1_precision": 0.06908582074094831, "rouge1_precision_stderr": 0.004346958400381964, "rouge1_recall": 0.0576659953376829, "rouge1_recall_stderr": 0.0036525448833407675, "rouge2_fmeasure": 0.015889553201926813, "rouge2_fmeasure_stderr": 0.001483207722000358, "rouge2_precision": 0.018055976337857795, "rouge2_precision_stderr": 0.001722423057966192, "rouge2_recall": 0.015572542649221232, "rouge2_recall_stderr": 0.0014824376881666455, "rougeL_fmeasure": 0.04481653988120624, "rougeL_fmeasure_stderr": 0.0028149978857180923, "rougeL_precision": 0.0526906303370273, "rougeL_precision_stderr": 0.00345962655866441, "rougeL_recall": 0.04371274927728761, "rougeL_recall_stderr": 0.00285083893180292, "rougeLsum_fmeasure": 0.044863049752252554, "rougeLsum_fmeasure_stderr": 0.002827990480856111, "rougeLsum_precision": 0.05278495458510948, "rougeLsum_precision_stderr": 0.0034751372117411933, "rougeLsum_recall": 0.04367091698209838, "rougeLsum_recall_stderr": 0.002855118245232499}}, "5": {"article_DOC_summary": {"bleu": 4.872051358237048e-56, "bleu_stderr": 1.0994911419227211e-38, "rouge1_fmeasure": 0.002561173851810196, "rouge1_fmeasure_stderr": 0.0008604007460830072, "rouge1_precision": 0.002843906337499315, "rouge1_precision_stderr": 0.0009956544746623187, "rouge1_recall": 0.0024927455220140884, "rouge1_recall_stderr": 0.0008268054962235649, "rouge2_fmeasure": 0.00047911010108716033, "rouge2_fmeasure_stderr": 0.00025331584786903913, "rouge2_precision": 0.0005943099210680686, "rouge2_precision_stderr": 0.00031949865564033644, "rouge2_recall": 0.0004189368714895907, "rouge2_recall_stderr": 0.00021998461769511408, "rougeL_fmeasure": 0.0017324699968285832, "rougeL_fmeasure_stderr": 0.0005921585827612698, "rougeL_precision": 0.0019474370851628444, "rougeL_precision_stderr": 0.000688363115056785, "rougeL_recall": 0.0016525178188876207, "rougeL_recall_stderr": 0.0005530966506647648, "rougeLsum_fmeasure": 0.0017324699968285832, "rougeLsum_fmeasure_stderr": 0.0005921585827612698, "rougeLsum_precision": 0.0019474370851628444, "rougeLsum_precision_stderr": 0.000688363115056785, "rougeLsum_recall": 0.0016525178188876207, "rougeLsum_recall_stderr": 0.0005530966506647648}}}}
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.31947645525912327,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0038030155885502574
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.26784034634508286,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.003035447513908994
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.24919698955059236,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.002338885547952402
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.10032559893960638,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0024161327250120574
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.07957621599352918,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0017828844980089247
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.07449487544541136,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0015020810496653879
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.2468115494253994,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0032025411700306672
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.20411776076410448,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0024277091219038756
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.18970698519609924,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0018593336285269075
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.3009096197940564,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.0036645356865323267
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.251048984299023,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.002859798214873002
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.23380521600569548,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0022238011394774266
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 4.480739832822098,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.11578330856151871
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 2,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
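
Every `slim.*.json` added in this commit follows the layout above: a `results` list in which each record carries one metric value (or its stderr) alongside repeated task/prompt/dataset fields, plus a `config` block describing the run. A minimal sketch for flattening one of these files into a metric dict (the path is taken from this commit; the helper name is illustrative):

```python
import json

META_KEYS = {"task_name", "prompt_name", "dataset_path", "dataset_name", "subset"}

# Collapse a slim results file into {metric_name: value} plus its run config.
def load_slim(path: str):
    with open(path) as f:
        data = json.load(f)
    metrics = {}
    for record in data["results"]:
        for key, value in record.items():
            if key not in META_KEYS:  # the metric itself or its *_stderr
                metrics[key] = value
    return metrics, data["config"]

metrics, config = load_slim(
    "8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_2.json")
print(metrics["rouge2_fmeasure"], config["num_fewshot"])  # 0.07449487544541136 2
```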
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.2871095429771904,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.004185534949827048
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.22207861869609608,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0033297349416556445
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.21276268379590219,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.002745499539353653
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.08919737395825947,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.00234436562476581
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.06668770358204071,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0017001378203776699
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.06395005883547085,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0014814659874998457
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.22428029391025195,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0034506563669751745
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.17117251055926017,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.002654839665946902
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.16381135583162096,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0021664607071098315
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.27186005782650335,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.004024882931968101
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.2092333223784394,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0031542921648456132
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.20067896546125685,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0026118338406594167
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 3.416486588059767,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.05871038450450248
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.09854206688714345,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0036437541450359593
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.07142922091930162,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0026886044555192072
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.06934635599486277,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.002419160037746626
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.03200012684242646,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.001809559832121984
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.02183618792959388,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.0011694185534509838
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.021348512828022314,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.001055380041694426
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.07858578853182602,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.003017215712567151
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.055804214936503324,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.0021264630991137314
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.05429349387264579,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.001919877615338037
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.0930444540425215,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.003479952193050714
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.06690767452071034,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0025201480735759655
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.06504921478523501,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.002272098654651633
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 0.08573923963165347,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 0.01281159560771889
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_precision": 0.019085080359257173,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_precision_stderr": 0.0018850577941172406
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_recall": 0.011645843404748045,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_recall_stderr": 0.0012042546813690255
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge1_fmeasure": 0.012068612050969258,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge1_fmeasure_stderr": 0.001175572535963405
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_precision": 0.006894559713384093,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_precision_stderr": 0.0010187138526863938
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_recall": 0.003894968708693124,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_recall_stderr": 0.000532555761738665
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rouge2_fmeasure": 0.004052279635441424,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rouge2_fmeasure_stderr": 0.0005200626541191212
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_precision": 0.016020963520094682,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_precision_stderr": 0.0016304077510915223
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_recall": 0.00943399649665596,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_recall_stderr": 0.000984307352962584
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeL_fmeasure": 0.009809956651925644,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeL_fmeasure_stderr": 0.0009599920054622314
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_precision": 0.018298677104570716,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_precision_stderr": 0.001820476829671769
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_recall": 0.011022020415497191,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_recall_stderr": 0.0011405019701246609
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "rougeLsum_fmeasure": 0.011452198317124414,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "rougeLsum_fmeasure_stderr": 0.0011152369987060954
+ },
+ {
+ "task_name": "GEM/wiki_lingua_en",
+ "prompt_name": "tldr_en",
+ "bleu": 7.249514036066506e-16,
+ "dataset_path": "GEM/wiki_lingua",
+ "dataset_name": "en",
+ "subset": null,
+ "bleu_stderr": 3.339336428482265e-14
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.27838096040655913,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.004710736944579292
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.256517363989891,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0043630766599847375
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.25539660890379623,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.00408774094209769
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.07429604893692207,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.0029381253193927104
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.06987562130359767,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.002769991072408768
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.06904105989761854,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.0026844346914174943
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.21060638862089187,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.003887329749618103
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.19474922081143328,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0036536236006364338
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.19320917420626382,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0033971194258629626
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.21158267461706523,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.003892248180779862
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.1959669205905632,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.0036938624037444975
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.19420752036343683,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.0034107307485400193
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 3.892045553878533,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.15338529710936155
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 3,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.06908582074094831,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.004346958400381964
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.0576659953376829,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0036525448833407675
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.05924716020491812,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.0036326176527612343
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.018055976337857795,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.001722423057966192
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.015572542649221232,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.0014824376881666455
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.015889553201926813,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.001483207722000358
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.0526906303370273,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.00345962655866441
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.04371274927728761,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.00285083893180292
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.04481653988120624,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0028149978857180923
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.05278495458510948,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.0034751372117411933
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.04367091698209838,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.002855118245232499
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.044863049752252554,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.002827990480856111
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 0.0793183178082136,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 0.033794375862044715
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 4,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b35b/evaluation/generation/slim.8b7178b35b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
+ {
+ "results": [
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_precision": 0.002843906337499315,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_precision_stderr": 0.0009956544746623187
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_recall": 0.0024927455220140884,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_recall_stderr": 0.0008268054962235649
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge1_fmeasure": 0.002561173851810196,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge1_fmeasure_stderr": 0.0008604007460830072
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_precision": 0.0005943099210680686,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_precision_stderr": 0.00031949865564033644
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_recall": 0.0004189368714895907,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_recall_stderr": 0.00021998461769511408
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rouge2_fmeasure": 0.00047911010108716033,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rouge2_fmeasure_stderr": 0.00025331584786903913
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_precision": 0.0019474370851628444,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_precision_stderr": 0.000688363115056785
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_recall": 0.0016525178188876207,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_recall_stderr": 0.0005530966506647648
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeL_fmeasure": 0.0017324699968285832,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeL_fmeasure_stderr": 0.0005921585827612698
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_precision": 0.0019474370851628444,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_precision_stderr": 0.000688363115056785
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_recall": 0.0016525178188876207,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_recall_stderr": 0.0005530966506647648
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "rougeLsum_fmeasure": 0.0017324699968285832,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "rougeLsum_fmeasure_stderr": 0.0005921585827612698
+ },
+ {
+ "task_name": "gem_xsum",
+ "prompt_name": "article_DOC_summary",
+ "bleu": 4.872051358237048e-56,
+ "dataset_path": "GEM/xsum",
+ "dataset_name": null,
+ "subset": "",
+ "bleu_stderr": 1.0994911419227211e-38
+ }
+ ],
+ "config": {
+ "model": "hf-causal",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b35b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+ "task_args": "",
+ "num_fewshot": 5,
+ "batch_size": 4,
+ "device": "cuda",
+ "use_cache": false,
+ "limit": 3000,
+ "bootstrap_iters": 10,
+ "seed": 1234
+ }
+ }
8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.26704682686051745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003346061664825914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.28723900360273236, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030648378055716376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.24048738248155943, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0022924559211728}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07760883007326022, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018992689472675252}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.07930488429722847, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016958561414342646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0668489557832933, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013397924519285302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2005421334534283, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00272610264193461}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.2147121574085488, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023921943267692113}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.17873946616791977, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017622551083101064}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.25120585518065286, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0032054522314792226}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2689555721836676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002872035212266304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22548796825222664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002166294099617983}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.287770006656117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11835519853257852}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.24693572848760356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0036808375259794837}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.24354376203183603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034150456640924024}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.21120410580427312, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002645585016214612}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07085834748957563, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001874517454430376}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0691245743627346, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017265463033494647}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05941725493999427, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013446993130627589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.18619149092636542, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029566758253007693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.18275407900501345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002678372152846199}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1574847757182755, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020235834993420653}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.23179629616820094, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003517652101595812}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.22762811767303537, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031986554975203343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19767422939876575, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00249381697530238}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.8253545690458486, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11601169899524703}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b44b/evaluation/generation/agg.8b7178b44b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.08491288146025691, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003092982485525366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.08011242516269225, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028916714775573983}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.069564585739879, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002354877693129134}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02565888234744232, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014806953806721635}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.023735577183813514, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012482652822662537}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.02020024245182335, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009720348342506413}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06624573735046896, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0025139925050591855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06166301644336901, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002275982977075628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0532771556440879, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001820427400127109}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07976639037062659, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0029385774245711256}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07471788462506215, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002705709953549525}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06493333871562736, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022026091321757347}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.2364078795997809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.024391601699964672}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b44b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 4, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}