Muennighoff committed on
Commit
71f6ad1
1 Parent(s): 2e5ca64
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. 8b7178b13b/evaluation/8b7178b13b_1_babi.json +22 -0
  2. 8b7178b13b/evaluation/8b7178b13b_2_babi.json +22 -0
  3. 8b7178b13b/evaluation/8b7178b13b_3_babi.json +22 -0
  4. 8b7178b13b/evaluation/8b7178b13b_4_babi.json +22 -0
  5. 8b7178b13b/evaluation/8b7178b13b_5_babi.json +22 -0
  6. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  7. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  8. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  9. 8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +1 -0
  10. 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json +1 -0
  11. 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json +1 -0
  12. 8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json +1 -0
  13. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl +3 -0
  14. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl +3 -0
  15. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl +3 -0
  16. 8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl +3 -0
  17. 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl +3 -0
  18. 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl +3 -0
  19. 8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl +3 -0
  20. 8b7178b13b/evaluation/generation/merged.csv +53 -0
  21. 8b7178b13b/evaluation/generation/merged.json +1 -0
  22. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json +133 -0
  23. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json +133 -0
  24. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json +133 -0
  25. 8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json +133 -0
  26. 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json +133 -0
  27. 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json +133 -0
  28. 8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json +133 -0
  29. 8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv +21 -0
  30. 8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json +0 -87
  31. 8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv +21 -0
  32. 8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json +0 -87
  33. 8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv +21 -0
  34. 8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json +0 -87
  35. 8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv +21 -0
  36. 8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json +0 -87
  37. 8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv +21 -0
  38. 8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-07-32_4shots_backup.json +0 -87
  39. 8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv +21 -0
  40. 8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-37_5shots_backup.json +0 -87
  41. 8b7178b178b/evaluation/8b7178b178b_1_babi.json +22 -0
  42. 8b7178b178b/evaluation/8b7178b178b_2_babi.json +22 -0
  43. 8b7178b178b/evaluation/8b7178b178b_3_babi.json +22 -0
  44. 8b7178b178b/evaluation/8b7178b178b_4_babi.json +22 -0
  45. 8b7178b178b/evaluation/8b7178b178b_5_babi.json +22 -0
  46. 8b7178b25b/evaluation/8b7178b25b_0_babi.json +22 -0
  47. 8b7178b25b/evaluation/8b7178b25b_1_babi.json +22 -0
  48. 8b7178b25b/evaluation/8b7178b25b_2_babi.json +22 -0
  49. 8b7178b25b/evaluation/8b7178b25b_3_babi.json +22 -0
  50. 8b7178b25b/evaluation/8b7178b25b_4_babi.json +22 -0
8b7178b13b/evaluation/8b7178b13b_1_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "results": {
+     "babi": {
+       "em": 0.10966666666666666,
+       "em_stderr": 0.005705916414010263
+     }
+   },
+   "versions": {
+     "babi": 0
+   },
+   "config": {
+     "model": "gpt2",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
+     "num_fewshot": 1,
+     "batch_size": null,
+     "device": null,
+     "no_cache": true,
+     "limit": 3000,
+     "bootstrap_iters": 100000,
+     "description_dict": {}
+   }
+ }
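
The per-shot bAbI files in this commit all share the same lm-evaluation-harness result layout: an exact-match score under "results" -> "babi" -> "em" (plus "em_stderr") and the run settings under "config". As a purely illustrative sketch, assuming the files sit under the directory layout shown above (the helper name and path are hypothetical), the EM score per shot count could be collected like this:

import json
from pathlib import Path

# Hypothetical helper: gather exact-match scores from the per-shot bAbI
# result files shown in this commit, keyed by num_fewshot.
def collect_babi_em(eval_dir, model="8b7178b13b"):
    scores = {}
    for path in sorted(Path(eval_dir).glob(f"{model}_*_babi.json")):
        data = json.loads(path.read_text())
        shots = data["config"]["num_fewshot"]
        scores[shots] = data["results"]["babi"]["em"]
    return scores

# e.g. {1: 0.1097, 2: 0.2053, 3: 0.258, 4: 0.2843, 5: 0.3093}
print(collect_babi_em("8b7178b13b/evaluation"))
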
8b7178b13b/evaluation/8b7178b13b_2_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "results": {
+     "babi": {
+       "em": 0.20533333333333334,
+       "em_stderr": 0.007376222253753254
+     }
+   },
+   "versions": {
+     "babi": 0
+   },
+   "config": {
+     "model": "gpt2",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
+     "num_fewshot": 2,
+     "batch_size": null,
+     "device": null,
+     "no_cache": true,
+     "limit": 3000,
+     "bootstrap_iters": 100000,
+     "description_dict": {}
+   }
+ }
8b7178b13b/evaluation/8b7178b13b_3_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "results": {
+     "babi": {
+       "em": 0.258,
+       "em_stderr": 0.007989573064892506
+     }
+   },
+   "versions": {
+     "babi": 0
+   },
+   "config": {
+     "model": "gpt2",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
+     "num_fewshot": 3,
+     "batch_size": null,
+     "device": null,
+     "no_cache": true,
+     "limit": 3000,
+     "bootstrap_iters": 100000,
+     "description_dict": {}
+   }
+ }
8b7178b13b/evaluation/8b7178b13b_4_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "results": {
+     "babi": {
+       "em": 0.2843333333333333,
+       "em_stderr": 0.008237227300544015
+     }
+   },
+   "versions": {
+     "babi": 0
+   },
+   "config": {
+     "model": "gpt2",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
+     "num_fewshot": 4,
+     "batch_size": null,
+     "device": null,
+     "no_cache": true,
+     "limit": 3000,
+     "bootstrap_iters": 100000,
+     "description_dict": {}
+   }
+ }
8b7178b13b/evaluation/8b7178b13b_5_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "results": {
+     "babi": {
+       "em": 0.30933333333333335,
+       "em_stderr": 0.008440329009701236
+     }
+   },
+   "versions": {
+     "babi": 0
+   },
+   "config": {
+     "model": "gpt2",
+     "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers",
+     "num_fewshot": 5,
+     "batch_size": null,
+     "device": null,
+     "no_cache": true,
+     "limit": 3000,
+     "bootstrap_iters": 100000,
+     "description_dict": {}
+   }
+ }
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.29175676576014364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0034471150363075184}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2607907068748222, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002898135289732422}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.23794097030152866, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002188642411213601}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.08536913371644045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020895235460299017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0713925019948092, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001635476891768791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.06574644928335649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013625725128389179}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.22160256648226004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0027745204374263316}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19691795695851566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002270661716223799}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1789951285071888, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001673666200127008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.27421900185303183, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003293382529360642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24426464874342324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002735802329870702}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.22300315725329103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020678687486032563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 4.071985313483549, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07738970826466988}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2571754504445173, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0037874180373320154}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22046706474081984, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0032483649489955437}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20409132873325006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026059526618525918}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.07716310931244513, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0021423793043653955}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06255207307386669, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001665836586847627}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05803958336813696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013905246714186119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.19917050763282695, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031058368371087975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.16863998232504207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025563859107073333}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1557150280292743, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020175919818759087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.24286851428671166, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003627579238772214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.2070808578907387, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003069864948055298}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.19198507442281348, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002473123474513908}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.256877193820952, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07385158775345343}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.08608412704276916, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031073010197710128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.07306085845324418, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026739004311174368}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.06754366655458334, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00231821642571142}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.02477632495929693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013952242546163309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.020963360092888926, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011677531292808372}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01891360756998624, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009547131510927643}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06705402166283986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0024891810743964266}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05633331544121819, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021123430958068772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.05180637434259422, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017937325092342497}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08098856968063572, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002946288914451725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06822240197436494, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002506023867308553}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06316340092042942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021760431723604587}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.11803096947131637, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.018616113127493047}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.014929032882940911, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001437252981426829}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.012605454696432882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0012515184538477262}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.01167038909001005, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010944129890059014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0041010832248453595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005815077800226312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0034469110258534192, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004725161504235943}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.003203820995720566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00042103771194584703}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.011742235186401492, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011661092630625284}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009919184883424567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010141555758155579}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.009059396220908112, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008608509358819988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.014255925532109718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001380098431266019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.012021925711305772, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001202720217240228}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.011096298465596519, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010440702662319144}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.860100637257344e-12, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.0205550894986597e-10}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.21592444335846092, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004246983520225218}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.27255737663479357, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00447437166346409}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.22044943741927525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003500198780987625}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.05330214126583765, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0025012370800294183}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.06515904722605939, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0025535061286592504}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.05351615652078515, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022197484992897607}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.16172792138960806, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003474612361597086}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.20248110828398325, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0034801799909718704}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.16421475353571632, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028136325952485735}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.16634670992387487, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0034675307885691244}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.21304752670955557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003802772338811888}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.17052032174118803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002883605135295133}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.4283658434676942, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1084231496409544}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06147088600466444, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.003807688121301648}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.06919835565602203, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004080878740872531}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05798370899016291, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003309673469340258}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.014228533929974868, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014047860954449383}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0166314025722418, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0014602317545186565}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013641962174467587, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012106481387976049}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.04649724897774529, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0029768758298026627}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.05168291535676844, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003094715210052394}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.043392440237132235, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002526920635642375}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.048035895047127686, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003028118969042768}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.05464177271239397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0033104672738008553}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.04537985187674532, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026361408489140024}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.40228156399700404, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0795066489605694}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/agg.8b7178b13b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1 @@
+ {"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002305114359249611, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006855365150103617}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002688750909485673, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008419301936781596}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0022410464211047414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006379815102748624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00023662681974673803, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00010805221445763836}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0003191847290509596, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00013624996945101362}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0002428157678792438, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00010121603253679161}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0014967981731443785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00044282917196946015}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0018089244853048499, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005955143349649119}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.001480383954453164, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004335775279500401}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0014611359517881007, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004268628859757995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0017987055420995693, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005977174922462606}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0014592568247182058, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00042833093506174205}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.2194816597556972e-29, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.9673150803997765e-21}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 8, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d50b98ce786c24110901e7da0bfb8f2f0b1a8997df2194229acd5bc122dc22d
+ size 18605434
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6fcc32b2a7a9a4ec85ad6b084bb458f744025984db325f5f2f95bc51178c1ef9
+ size 24070805
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff0281a58876590084570bea0b8feb099efce7b6bd5f1dded593af2c7f2d0146
+ size 29380490
8b7178b13b/evaluation/generation/examples.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f9209c03a21b26044ca2675e69b17b1e5cd47f660be0226fba1abdd49d99233
+ size 34786842
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_3.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d38077208e1f7fd684e78d720c69585bf6441502b4381400558f457020508dfa
+ size 9525984
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_4.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ebf648f5afd69404494ab6936ba7266cd146373c249af58da4c8975a49b6c660
+ size 11646229
8b7178b13b/evaluation/generation/examples.8b7178b13b_gem_xsum_article_DOC_summary_5.jsonl CHANGED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd422c7e3808f695d266dfee392525cea2dbb5f620848c74d4478f52bf23f26f
+ size 13898127
8b7178b13b/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
+ dataset,fewshots,prompt,metric,value
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.13957033665597848
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.13957033665597848
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21971297989413593
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21971297989413593
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2436824998963185
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2436824998963185
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2526618416523279
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2526618416523279
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2559926229244319
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2559926229244319
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.259556048619835
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.259556048619835
+ e2e_nlg_cleaned,5,average,multiple,0.22852938827383795
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.021176795907655737
+ gem_xsum,0,median,rouge2_fmeasure,0.021176795907655737
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.050641256552544464
+ gem_xsum,1,median,rouge2_fmeasure,0.050641256552544464
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.05359901469303583
+ gem_xsum,2,median,rouge2_fmeasure,0.05359901469303583
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05351615652078515
+ gem_xsum,3,median,rouge2_fmeasure,0.05351615652078515
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013641962174467587
+ gem_xsum,4,median,rouge2_fmeasure,0.013641962174467587
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002428157678792438
+ gem_xsum,5,median,rouge2_fmeasure,0.0002428157678792438
+ gem_xsum,5,average,multiple,0.032136333602728
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.047608492095606206
+ web_nlg_en,0,median,rouge2_fmeasure,0.047608492095606206
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.08120001213349398
+ web_nlg_en,1,median,rouge2_fmeasure,0.08120001213349398
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.11056055329882854
+ web_nlg_en,2,median,rouge2_fmeasure,0.11056055329882854
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.12608234413935157
+ web_nlg_en,3,median,rouge2_fmeasure,0.12608234413935157
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.1354322190391961
+ web_nlg_en,4,median,rouge2_fmeasure,0.1354322190391961
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.1419903076188946
+ web_nlg_en,5,median,rouge2_fmeasure,0.1419903076188946
+ web_nlg_en,5,average,multiple,0.10714565472089517
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04284409748743667
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.04284409748743667
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05107550928141819
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05107550928141819
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06574644928335649
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.06574644928335649
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05803958336813696
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05803958336813696
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01891360756998624
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01891360756998624
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.003203820995720566
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.003203820995720566
+ wiki_lingua_en,5,average,multiple,0.039970511331009186
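
merged.csv above is long-format (dataset, fewshots, prompt, metric, value), with per-shot "median" rows and one cross-shot "average" row per dataset. A minimal sketch, assuming pandas is installed and the file path matches this commit's layout, for viewing the median rouge2_fmeasure per dataset and shot count as a table:

import pandas as pd

# Sketch: pivot the long-format merged.csv into datasets x fewshots,
# keeping only the per-shot "median" rows (the "average" rows span shots).
df = pd.read_csv("8b7178b13b/evaluation/generation/merged.csv")
medians = df[df["prompt"] == "median"]
table = medians.pivot(index="dataset", columns="fewshots", values="value")
print(table.round(3))
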
8b7178b13b/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4522375218061915, "bleu_stderr": 0.04842537846111568, "rouge1_fmeasure": 0.10639641446726286, "rouge1_fmeasure_stderr": 0.002372783014156511, "rouge1_precision": 0.09405156373772382, "rouge1_precision_stderr": 0.0036589357455526954, "rouge1_recall": 0.2623187491640316, "rouge1_recall_stderr": 0.0054853417994958405, "rouge2_fmeasure": 0.047608492095606206, "rouge2_fmeasure_stderr": 0.001384256918874843, "rouge2_precision": 0.04001471903798784, "rouge2_precision_stderr": 0.002167193193860632, "rouge2_recall": 0.12482367655474384, "rouge2_recall_stderr": 0.003329638916496446, "rougeL_fmeasure": 0.09811489729413486, "rougeL_fmeasure_stderr": 0.002141247906215953, "rougeL_precision": 0.08735958257324784, "rougeL_precision_stderr": 0.0035013157983101876, "rougeL_recall": 0.2450465857818092, "rougeL_recall_stderr": 0.005156857986010396, "rougeLsum_fmeasure": 0.0992851030641096, "rougeLsum_fmeasure_stderr": 0.0022026560740801633, "rougeLsum_precision": 0.08882938411931994, "rougeLsum_precision_stderr": 0.0035668764760974714, "rougeLsum_recall": 0.24452754279908495, "rougeLsum_recall_stderr": 0.005065510424462396}}, "1": {"PALM_prompt": {"bleu": 0.650629434960658, "bleu_stderr": 0.04811417898690189, "rouge1_fmeasure": 0.16281032775122675, "rouge1_fmeasure_stderr": 0.003905335709899557, "rouge1_precision": 0.15916782201345867, "rouge1_precision_stderr": 0.005131870879954661, "rouge1_recall": 0.29665725707782414, "rouge1_recall_stderr": 0.005228319001095806, "rouge2_fmeasure": 0.08120001213349398, "rouge2_fmeasure_stderr": 0.0026664765201647343, "rouge2_precision": 0.07819648278455865, "rouge2_precision_stderr": 0.0034063589997107026, "rouge2_recall": 0.15070155918572603, "rouge2_recall_stderr": 0.003623729048520002, "rougeL_fmeasure": 0.14574992383080743, "rougeL_fmeasure_stderr": 0.003397730356308293, "rougeL_precision": 0.14191196636841233, "rougeL_precision_stderr": 0.004635280920281775, "rougeL_recall": 0.2736224046058516, "rougeL_recall_stderr": 0.004816339822450663, "rougeLsum_fmeasure": 0.14966591828719222, "rougeLsum_fmeasure_stderr": 0.003504137587849123, "rougeLsum_precision": 0.14648404402072937, "rougeLsum_precision_stderr": 0.004776844963861229, "rougeLsum_recall": 0.2774966849512831, "rougeLsum_recall_stderr": 0.004834496186187873}}, "2": {"PALM_prompt": {"bleu": 0.9046528025555703, "bleu_stderr": 0.04317473166983863, "rouge1_fmeasure": 0.2126025964717363, "rouge1_fmeasure_stderr": 0.004487229116514054, "rouge1_precision": 0.21014915270499354, "rouge1_precision_stderr": 0.005813053226743566, "rouge1_recall": 0.35653672883727167, "rouge1_recall_stderr": 0.0051494656409290275, "rouge2_fmeasure": 0.11056055329882854, "rouge2_fmeasure_stderr": 0.0031590233571128605, "rouge2_precision": 0.11091162929767316, "rouge2_precision_stderr": 0.003989043309470051, "rouge2_recall": 0.18668399640135247, "rouge2_recall_stderr": 0.0039177512048306565, "rougeL_fmeasure": 0.18634690018126043, "rougeL_fmeasure_stderr": 0.0038448807973693925, "rougeL_precision": 0.1825110333764981, "rougeL_precision_stderr": 0.005070346564582974, "rougeL_recall": 0.32304322748014636, "rougeL_recall_stderr": 0.004678142532303707, "rougeLsum_fmeasure": 0.19202573601368517, "rougeLsum_fmeasure_stderr": 0.003952591052036519, "rougeLsum_precision": 0.18939634311269352, "rougeLsum_precision_stderr": 0.005258445068941222, "rougeLsum_recall": 0.32966626591290826, "rougeLsum_recall_stderr": 0.004724502434373412}}, "3": {"PALM_prompt": {"bleu": 1.1442800703272336, "bleu_stderr": 
0.05161996592570338, "rouge1_fmeasure": 0.23508991254207284, "rouge1_fmeasure_stderr": 0.004629614568550998, "rouge1_precision": 0.23213625362665477, "rouge1_precision_stderr": 0.0059610834558063595, "rouge1_recall": 0.38062292481677, "rouge1_recall_stderr": 0.005078282219671234, "rouge2_fmeasure": 0.12608234413935157, "rouge2_fmeasure_stderr": 0.003256601130342381, "rouge2_precision": 0.12811335412630678, "rouge2_precision_stderr": 0.004200144761611546, "rouge2_recall": 0.20379204157806646, "rouge2_recall_stderr": 0.00386170912683119, "rougeL_fmeasure": 0.20320672155464048, "rougeL_fmeasure_stderr": 0.003882337416765662, "rougeL_precision": 0.1997394107463207, "rougeL_precision_stderr": 0.005153079376868987, "rougeL_recall": 0.3397942315323993, "rougeL_recall_stderr": 0.004538527029075319, "rougeLsum_fmeasure": 0.21069756332821474, "rougeLsum_fmeasure_stderr": 0.004028196190821567, "rougeLsum_precision": 0.2083043346123531, "rougeLsum_precision_stderr": 0.005363455263356501, "rougeLsum_recall": 0.347929987856493, "rougeLsum_recall_stderr": 0.0045764471921679985}}, "4": {"PALM_prompt": {"bleu": 1.3793783786539846, "bleu_stderr": 0.08292725693632953, "rouge1_fmeasure": 0.25107613979636906, "rouge1_fmeasure_stderr": 0.00476323540590602, "rouge1_precision": 0.25146617993410997, "rouge1_precision_stderr": 0.006199020027337637, "rouge1_recall": 0.3925026283739189, "rouge1_recall_stderr": 0.005116605241451321, "rouge2_fmeasure": 0.1354322190391961, "rouge2_fmeasure_stderr": 0.003385535533668589, "rouge2_precision": 0.1385226844845104, "rouge2_precision_stderr": 0.004282956786141916, "rouge2_recall": 0.21131194271802842, "rouge2_recall_stderr": 0.0040385627196912076, "rougeL_fmeasure": 0.2160535699510597, "rougeL_fmeasure_stderr": 0.003987483818543123, "rougeL_precision": 0.21425855000222013, "rougeL_precision_stderr": 0.005261027090142587, "rougeL_recall": 0.34988090714764264, "rougeL_recall_stderr": 0.004590444333954342, "rougeLsum_fmeasure": 0.22537108240785111, "rougeLsum_fmeasure_stderr": 0.004170782290413772, "rougeLsum_precision": 0.22545655976991655, "rougeLsum_precision_stderr": 0.0055589913005229, "rougeLsum_recall": 0.36010176303537716, "rougeLsum_recall_stderr": 0.0046603281582664625}}, "5": {"PALM_prompt": {"bleu": 1.4145829791141218, "bleu_stderr": 0.06756850302725374, "rouge1_fmeasure": 0.25726246836968625, "rouge1_fmeasure_stderr": 0.004880864665323261, "rouge1_precision": 0.25559388874117145, "rouge1_precision_stderr": 0.006265188037282814, "rouge1_recall": 0.4014517195417289, "rouge1_recall_stderr": 0.0052097160639095145, "rouge2_fmeasure": 0.1419903076188946, "rouge2_fmeasure_stderr": 0.003491421839159055, "rouge2_precision": 0.1462902529413211, "rouge2_precision_stderr": 0.004502888503688928, "rouge2_recall": 0.22067604236758182, "rouge2_recall_stderr": 0.0041554431177298085, "rougeL_fmeasure": 0.22145748379950564, "rougeL_fmeasure_stderr": 0.004104691248611666, "rougeL_precision": 0.21838534521953784, "rougeL_precision_stderr": 0.005386484846358847, "rougeL_recall": 0.35807492888309944, "rougeL_recall_stderr": 0.004713215605602484, "rougeLsum_fmeasure": 0.2308946484376556, "rougeLsum_fmeasure_stderr": 0.004281312884008834, "rougeLsum_precision": 0.22909884605448813, "rougeLsum_precision_stderr": 0.00564116547513353, "rougeLsum_recall": 0.3686456415754311, "rougeLsum_recall_stderr": 0.004784333349134369}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 2.5768645983512664, "bleu_stderr": 0.10390732312462625, "rouge1_fmeasure": 0.1810278543145764, "rouge1_fmeasure_stderr": 
0.002359129908964426, "rouge1_precision": 0.1631004447259747, "rouge1_precision_stderr": 0.0025140049334374266, "rouge1_recall": 0.24869580468026348, "rouge1_recall_stderr": 0.0032277820635021014, "rouge2_fmeasure": 0.04284409748743667, "rouge2_fmeasure_stderr": 0.001036773633130634, "rouge2_precision": 0.0385111191751266, "rouge2_precision_stderr": 0.0009986244610694906, "rouge2_recall": 0.05950986614858929, "rouge2_recall_stderr": 0.0015515657908224004, "rougeL_fmeasure": 0.1323060407446597, "rougeL_fmeasure_stderr": 0.001666744596513707, "rougeL_precision": 0.11812150163422086, "rougeL_precision_stderr": 0.0017909292945280444, "rougeL_recall": 0.18668011105548613, "rougeL_recall_stderr": 0.002526858332113489, "rougeLsum_fmeasure": 0.16822692441451978, "rougeLsum_fmeasure_stderr": 0.0021965424695315023, "rougeLsum_precision": 0.15149118025181801, "rougeLsum_precision_stderr": 0.0023436532932745767, "rougeLsum_recall": 0.23193053314919995, "rougeLsum_recall_stderr": 0.003047509010443981}}, "1": {"tldr_en": {"bleu": 3.2805418289148096, "bleu_stderr": 0.06930736886640082, "rouge1_fmeasure": 0.20446918503496525, "rouge1_fmeasure_stderr": 0.0022069372036890365, "rouge1_precision": 0.24381809168774637, "rouge1_precision_stderr": 0.0033022582237546634, "rouge1_recall": 0.23017687741596388, "rouge1_recall_stderr": 0.0028624354435880044, "rouge2_fmeasure": 0.05107550928141819, "rouge2_fmeasure_stderr": 0.0012372719079850197, "rouge2_precision": 0.064059693409422, "rouge2_precision_stderr": 0.0018318331544901892, "rouge2_recall": 0.05717654707897076, "rouge2_recall_stderr": 0.0015139726468610508, "rougeL_fmeasure": 0.15417240714255895, "rougeL_fmeasure_stderr": 0.001662532523275427, "rougeL_precision": 0.18545125118813593, "rougeL_precision_stderr": 0.0026173569724847225, "rougeL_recall": 0.1747030421481033, "rougeL_recall_stderr": 0.0022396644779822898, "rougeLsum_fmeasure": 0.19083537889844354, "rougeLsum_fmeasure_stderr": 0.002054035558945731, "rougeLsum_precision": 0.2280337645476899, "rougeLsum_precision_stderr": 0.003112015382305472, "rougeLsum_recall": 0.21469429192649783, "rougeLsum_recall_stderr": 0.0026546326180329014}}, "2": {"tldr_en": {"bleu": 4.071985313483549, "bleu_stderr": 0.07738970826466988, "rouge1_fmeasure": 0.23794097030152866, "rouge1_fmeasure_stderr": 0.002188642411213601, "rouge1_precision": 0.29175676576014364, "rouge1_precision_stderr": 0.0034471150363075184, "rouge1_recall": 0.2607907068748222, "rouge1_recall_stderr": 0.002898135289732422, "rouge2_fmeasure": 0.06574644928335649, "rouge2_fmeasure_stderr": 0.0013625725128389179, "rouge2_precision": 0.08536913371644045, "rouge2_precision_stderr": 0.0020895235460299017, "rouge2_recall": 0.0713925019948092, "rouge2_recall_stderr": 0.001635476891768791, "rougeL_fmeasure": 0.1789951285071888, "rougeL_fmeasure_stderr": 0.001673666200127008, "rougeL_precision": 0.22160256648226004, "rougeL_precision_stderr": 0.0027745204374263316, "rougeL_recall": 0.19691795695851566, "rougeL_recall_stderr": 0.002270661716223799, "rougeLsum_fmeasure": 0.22300315725329103, "rougeLsum_fmeasure_stderr": 0.0020678687486032563, "rougeLsum_precision": 0.27421900185303183, "rougeLsum_precision_stderr": 0.003293382529360642, "rougeLsum_recall": 0.24426464874342324, "rougeLsum_recall_stderr": 0.002735802329870702}}, "3": {"tldr_en": {"bleu": 3.256877193820952, "bleu_stderr": 0.07385158775345343, "rouge1_fmeasure": 0.20409132873325006, "rouge1_fmeasure_stderr": 0.0026059526618525918, "rouge1_precision": 0.2571754504445173, "rouge1_precision_stderr": 
0.0037874180373320154, "rouge1_recall": 0.22046706474081984, "rouge1_recall_stderr": 0.0032483649489955437, "rouge2_fmeasure": 0.05803958336813696, "rouge2_fmeasure_stderr": 0.0013905246714186119, "rouge2_precision": 0.07716310931244513, "rouge2_precision_stderr": 0.0021423793043653955, "rouge2_recall": 0.06255207307386669, "rouge2_recall_stderr": 0.001665836586847627, "rougeL_fmeasure": 0.1557150280292743, "rougeL_fmeasure_stderr": 0.0020175919818759087, "rougeL_precision": 0.19917050763282695, "rougeL_precision_stderr": 0.0031058368371087975, "rougeL_recall": 0.16863998232504207, "rougeL_recall_stderr": 0.0025563859107073333, "rougeLsum_fmeasure": 0.19198507442281348, "rougeLsum_fmeasure_stderr": 0.002473123474513908, "rougeLsum_precision": 0.24286851428671166, "rougeLsum_precision_stderr": 0.003627579238772214, "rougeLsum_recall": 0.2070808578907387, "rougeLsum_recall_stderr": 0.003069864948055298}}, "4": {"tldr_en": {"bleu": 0.11803096947131637, "bleu_stderr": 0.018616113127493047, "rouge1_fmeasure": 0.06754366655458334, "rouge1_fmeasure_stderr": 0.00231821642571142, "rouge1_precision": 0.08608412704276916, "rouge1_precision_stderr": 0.0031073010197710128, "rouge1_recall": 0.07306085845324418, "rouge1_recall_stderr": 0.0026739004311174368, "rouge2_fmeasure": 0.01891360756998624, "rouge2_fmeasure_stderr": 0.0009547131510927643, "rouge2_precision": 0.02477632495929693, "rouge2_precision_stderr": 0.0013952242546163309, "rouge2_recall": 0.020963360092888926, "rouge2_recall_stderr": 0.0011677531292808372, "rougeL_fmeasure": 0.05180637434259422, "rougeL_fmeasure_stderr": 0.0017937325092342497, "rougeL_precision": 0.06705402166283986, "rougeL_precision_stderr": 0.0024891810743964266, "rougeL_recall": 0.05633331544121819, "rougeL_recall_stderr": 0.0021123430958068772, "rougeLsum_fmeasure": 0.06316340092042942, "rougeLsum_fmeasure_stderr": 0.0021760431723604587, "rougeLsum_precision": 0.08098856968063572, "rougeLsum_precision_stderr": 0.002946288914451725, "rougeLsum_recall": 0.06822240197436494, "rougeLsum_recall_stderr": 0.002506023867308553}}, "5": {"tldr_en": {"bleu": 5.860100637257344e-12, "bleu_stderr": 1.0205550894986597e-10, "rouge1_fmeasure": 0.01167038909001005, "rouge1_fmeasure_stderr": 0.0010944129890059014, "rouge1_precision": 0.014929032882940911, "rouge1_precision_stderr": 0.001437252981426829, "rouge1_recall": 0.012605454696432882, "rouge1_recall_stderr": 0.0012515184538477262, "rouge2_fmeasure": 0.003203820995720566, "rouge2_fmeasure_stderr": 0.00042103771194584703, "rouge2_precision": 0.0041010832248453595, "rouge2_precision_stderr": 0.0005815077800226312, "rouge2_recall": 0.0034469110258534192, "rouge2_recall_stderr": 0.0004725161504235943, "rougeL_fmeasure": 0.009059396220908112, "rougeL_fmeasure_stderr": 0.0008608509358819988, "rougeL_precision": 0.011742235186401492, "rougeL_precision_stderr": 0.0011661092630625284, "rougeL_recall": 0.009919184883424567, "rougeL_recall_stderr": 0.0010141555758155579, "rougeLsum_fmeasure": 0.011096298465596519, "rougeLsum_fmeasure_stderr": 0.0010440702662319144, "rougeLsum_precision": 0.014255925532109718, "rougeLsum_precision_stderr": 0.001380098431266019, "rougeLsum_recall": 0.012021925711305772, "rougeLsum_recall_stderr": 0.001202720217240228}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 7.518228772851459, "bleu_stderr": 0.07988589142118296, "rouge1_fmeasure": 0.30892622395914193, "rouge1_fmeasure_stderr": 0.002362726614809891, "rouge1_precision": 0.2527585937479932, "rouge1_precision_stderr": 0.0019586691047001085, 
"rouge1_recall": 0.4532685157753863, "rouge1_recall_stderr": 0.0036911339160002263, "rouge2_fmeasure": 0.13957033665597848, "rouge2_fmeasure_stderr": 0.001587978757027655, "rouge2_precision": 0.10827294752994317, "rouge2_precision_stderr": 0.0013193546727602984, "rouge2_recall": 0.21206177532278597, "rouge2_recall_stderr": 0.0024697648114498094, "rougeL_fmeasure": 0.25201104517465034, "rougeL_fmeasure_stderr": 0.0018344182759923726, "rougeL_precision": 0.2074761734719538, "rougeL_precision_stderr": 0.0015484435087844478, "rougeL_recall": 0.3707049758227277, "rougeL_recall_stderr": 0.002992917144452656, "rougeLsum_fmeasure": 0.2721367746591264, "rougeLsum_fmeasure_stderr": 0.002175228967316158, "rougeLsum_precision": 0.22367271169207667, "rougeLsum_precision_stderr": 0.0018161195853035582, "rougeLsum_recall": 0.3994503148641263, "rougeLsum_recall_stderr": 0.0034245642845668594}}, "1": {"generate_text_restaurant": {"bleu": 11.762713013802438, "bleu_stderr": 0.09382576543199773, "rouge1_fmeasure": 0.4698359144705449, "rouge1_fmeasure_stderr": 0.0023428195562317463, "rouge1_precision": 0.5758411505584643, "rouge1_precision_stderr": 0.0032641614095776606, "rouge1_recall": 0.43699974124480134, "rouge1_recall_stderr": 0.0030360517355224794, "rouge2_fmeasure": 0.21971297989413593, "rouge2_fmeasure_stderr": 0.0020517231278393165, "rouge2_precision": 0.27339290904893143, "rouge2_precision_stderr": 0.0027333405330301377, "rouge2_recall": 0.20425825992656632, "rouge2_recall_stderr": 0.0021962926617281136, "rougeL_fmeasure": 0.3397129377482095, "rougeL_fmeasure_stderr": 0.0020808579737579473, "rougeL_precision": 0.4200242586753886, "rougeL_precision_stderr": 0.0030278428531837543, "rougeL_recall": 0.31470569352158173, "rougeL_recall_stderr": 0.002441278894908242, "rougeLsum_fmeasure": 0.3826629288347234, "rougeLsum_fmeasure_stderr": 0.0023388359390624446, "rougeLsum_precision": 0.4702024862705025, "rougeLsum_precision_stderr": 0.0032151475190549403, "rougeLsum_recall": 0.3555458059380285, "rougeLsum_recall_stderr": 0.0027838026083691135}}, "2": {"generate_text_restaurant": {"bleu": 14.028619911508963, "bleu_stderr": 0.17673898243267264, "rouge1_fmeasure": 0.4982696315856735, "rouge1_fmeasure_stderr": 0.002277178664730148, "rouge1_precision": 0.5880328986485642, "rouge1_precision_stderr": 0.0031633644314478966, "rouge1_recall": 0.4720615710167169, "rouge1_recall_stderr": 0.0030042579734499103, "rouge2_fmeasure": 0.2436824998963185, "rouge2_fmeasure_stderr": 0.0021479952999434505, "rouge2_precision": 0.2905416018050732, "rouge2_precision_stderr": 0.0027458331588556565, "rouge2_recall": 0.2313719045642725, "rouge2_recall_stderr": 0.002349650773654754, "rougeL_fmeasure": 0.36315580245879386, "rougeL_fmeasure_stderr": 0.0021103667472393136, "rougeL_precision": 0.4304826242102334, "rougeL_precision_stderr": 0.002921537898047091, "rougeL_recall": 0.34359634632916913, "rougeL_recall_stderr": 0.002538014950574236, "rougeLsum_fmeasure": 0.41230819320152157, "rougeLsum_fmeasure_stderr": 0.0023628359694983633, "rougeLsum_precision": 0.4866663175226494, "rougeLsum_precision_stderr": 0.003134734940667762, "rougeLsum_recall": 0.39070560600420967, "rougeLsum_recall_stderr": 0.002859660656155989}}, "3": {"generate_text_restaurant": {"bleu": 14.824429124321526, "bleu_stderr": 0.1257638219512965, "rouge1_fmeasure": 0.5070937995389966, "rouge1_fmeasure_stderr": 0.0021993659376666874, "rouge1_precision": 0.5903839264246126, "rouge1_precision_stderr": 0.003096593178519806, "rouge1_recall": 0.48303754455261133, 
"rouge1_recall_stderr": 0.0029313776925495253, "rouge2_fmeasure": 0.2526618416523279, "rouge2_fmeasure_stderr": 0.0021298129637362085, "rouge2_precision": 0.2971819596009762, "rouge2_precision_stderr": 0.0027204840509740583, "rouge2_recall": 0.24110048435208156, "rouge2_recall_stderr": 0.002348630487706421, "rougeL_fmeasure": 0.37097982222923104, "rougeL_fmeasure_stderr": 0.00214061071940383, "rougeL_precision": 0.43347382731719464, "rougeL_precision_stderr": 0.0029257357889719074, "rougeL_recall": 0.3529852168172252, "rougeL_recall_stderr": 0.002550242635255893, "rougeLsum_fmeasure": 0.42203281188344766, "rougeLsum_fmeasure_stderr": 0.0023652228936698657, "rougeLsum_precision": 0.49123394352656125, "rougeLsum_precision_stderr": 0.0031248395828196904, "rougeLsum_recall": 0.4020764458847546, "rougeLsum_recall_stderr": 0.0028446825862533704}}, "4": {"generate_text_restaurant": {"bleu": 15.1456115809235, "bleu_stderr": 0.1628753687758124, "rouge1_fmeasure": 0.5145793651843434, "rouge1_fmeasure_stderr": 0.002244489356919281, "rouge1_precision": 0.5945998345487046, "rouge1_precision_stderr": 0.003074613754297493, "rouge1_recall": 0.48854055206810637, "rouge1_recall_stderr": 0.0028822641580179367, "rouge2_fmeasure": 0.2559926229244319, "rouge2_fmeasure_stderr": 0.002186356460577112, "rouge2_precision": 0.29784097926829245, "rouge2_precision_stderr": 0.0026941678409656135, "rouge2_recall": 0.2436337287979087, "rouge2_recall_stderr": 0.0023716503152493335, "rougeL_fmeasure": 0.3747326640922719, "rougeL_fmeasure_stderr": 0.002148508706418628, "rougeL_precision": 0.433653082379793, "rougeL_precision_stderr": 0.002833488672483008, "rougeL_recall": 0.3558030676746678, "rougeL_recall_stderr": 0.0025159494686373797, "rougeLsum_fmeasure": 0.42740717670424816, "rougeLsum_fmeasure_stderr": 0.0023775373903584217, "rougeLsum_precision": 0.49357503577481815, "rougeLsum_precision_stderr": 0.0030663612183832903, "rougeLsum_recall": 0.4058018094790397, "rougeLsum_recall_stderr": 0.002791634703514705}}, "5": {"generate_text_restaurant": {"bleu": 15.184590176560544, "bleu_stderr": 0.17570671649096134, "rouge1_fmeasure": 0.5172310679190623, "rouge1_fmeasure_stderr": 0.002187662941341786, "rouge1_precision": 0.5980619294865127, "rouge1_precision_stderr": 0.003055147702862821, "rouge1_recall": 0.4899430329979205, "rouge1_recall_stderr": 0.002838541142993511, "rouge2_fmeasure": 0.259556048619835, "rouge2_fmeasure_stderr": 0.002143271876534031, "rouge2_precision": 0.3029600136699186, "rouge2_precision_stderr": 0.002703515351522382, "rouge2_recall": 0.24620276831802643, "rouge2_recall_stderr": 0.0023264378571411816, "rougeL_fmeasure": 0.37807698267399703, "rougeL_fmeasure_stderr": 0.002126725638562394, "rougeL_precision": 0.4380103729999666, "rougeL_precision_stderr": 0.0028424300466825983, "rougeL_recall": 0.35795640195755224, "rougeL_recall_stderr": 0.0024826420662082673, "rougeLsum_fmeasure": 0.4311732922825964, "rougeLsum_fmeasure_stderr": 0.0023482937755900097, "rougeLsum_precision": 0.4982942932710782, "rougeLsum_precision_stderr": 0.0030594319184573462, "rougeLsum_recall": 0.40852131422574206, "rougeLsum_recall_stderr": 0.0027745302682206834}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0840616150366835, "bleu_stderr": 0.10220605780455636, "rouge1_fmeasure": 0.12067679782231505, "rouge1_fmeasure_stderr": 0.003015872334492621, "rouge1_precision": 0.09160247708817207, "rouge1_precision_stderr": 0.002375172469912284, "rouge1_recall": 0.19613476599918667, "rouge1_recall_stderr": 0.005073318605110471, 
"rouge2_fmeasure": 0.021176795907655737, "rouge2_fmeasure_stderr": 0.0011603757977784665, "rouge2_precision": 0.015487063309988504, "rouge2_precision_stderr": 0.000848981732321504, "rouge2_recall": 0.0363837944571209, "rouge2_recall_stderr": 0.0020790710888273895, "rougeL_fmeasure": 0.09086726074181874, "rougeL_fmeasure_stderr": 0.0022256142858678965, "rougeL_precision": 0.06937634282139736, "rougeL_precision_stderr": 0.001813605739492327, "rougeL_recall": 0.1479963053053532, "rougeL_recall_stderr": 0.0038099944087791725, "rougeLsum_fmeasure": 0.09880097411831448, "rougeLsum_fmeasure_stderr": 0.0024614730448775535, "rougeLsum_precision": 0.07512656050190324, "rougeLsum_precision_stderr": 0.001959507720128778, "rougeLsum_recall": 0.1612082948452722, "rougeLsum_recall_stderr": 0.004219489865399336}}, "1": {"article_DOC_summary": {"bleu": 2.1144830791719307, "bleu_stderr": 0.11445499213864452, "rouge1_fmeasure": 0.22223912621847974, "rouge1_fmeasure_stderr": 0.003038126692025604, "rouge1_precision": 0.2003179694226108, "rouge1_precision_stderr": 0.003646294986793581, "rouge1_recall": 0.3082326819925303, "rouge1_recall_stderr": 0.004421626418186149, "rouge2_fmeasure": 0.050641256552544464, "rouge2_fmeasure_stderr": 0.002016315458850625, "rouge2_precision": 0.04617082546967428, "rouge2_precision_stderr": 0.0021567164563758494, "rouge2_recall": 0.07074171325045796, "rouge2_recall_stderr": 0.0026782882522819163, "rougeL_fmeasure": 0.16589957489115423, "rougeL_fmeasure_stderr": 0.0024349153804753905, "rougeL_precision": 0.15020466828774898, "rougeL_precision_stderr": 0.002974138334891651, "rougeL_recall": 0.23037723268498672, "rougeL_recall_stderr": 0.0034903634923284257, "rougeLsum_fmeasure": 0.17413975005815832, "rougeLsum_fmeasure_stderr": 0.0025334465295397523, "rougeLsum_precision": 0.156190697593295, "rougeLsum_precision_stderr": 0.0029737943854797597, "rougeLsum_recall": 0.2443326358730322, "rougeLsum_recall_stderr": 0.003851152935036917}}, "2": {"article_DOC_summary": {"bleu": 2.2306956500254795, "bleu_stderr": 0.1534426529740081, "rouge1_fmeasure": 0.22874247828238384, "rouge1_fmeasure_stderr": 0.003191868132841587, "rouge1_precision": 0.2167785998186487, "rouge1_precision_stderr": 0.0038703703581663036, "rouge1_recall": 0.29079354607238944, "rouge1_recall_stderr": 0.0042086230091149205, "rouge2_fmeasure": 0.05359901469303583, "rouge2_fmeasure_stderr": 0.002063273466751458, "rouge2_precision": 0.05175962394844965, "rouge2_precision_stderr": 0.002262457698273439, "rouge2_recall": 0.06767519675126271, "rouge2_recall_stderr": 0.002531468931801855, "rougeL_fmeasure": 0.17116883513373068, "rougeL_fmeasure_stderr": 0.002601631095778569, "rougeL_precision": 0.1626685190790419, "rougeL_precision_stderr": 0.003181540362928691, "rougeL_recall": 0.2176434055764335, "rougeL_recall_stderr": 0.0033468188991687773, "rougeLsum_fmeasure": 0.17781283216267912, "rougeLsum_fmeasure_stderr": 0.0026319963165001175, "rougeLsum_precision": 0.16752103628195428, "rougeLsum_precision_stderr": 0.0031476246457933605, "rougeLsum_recall": 0.2286412427632142, "rougeLsum_recall_stderr": 0.003582394216085152}}, "3": {"article_DOC_summary": {"bleu": 2.4283658434676942, "bleu_stderr": 0.1084231496409544, "rouge1_fmeasure": 0.22044943741927525, "rouge1_fmeasure_stderr": 0.003500198780987625, "rouge1_precision": 0.21592444335846092, "rouge1_precision_stderr": 0.004246983520225218, "rouge1_recall": 0.27255737663479357, "rouge1_recall_stderr": 0.00447437166346409, "rouge2_fmeasure": 0.05351615652078515, 
"rouge2_fmeasure_stderr": 0.0022197484992897607, "rouge2_precision": 0.05330214126583765, "rouge2_precision_stderr": 0.0025012370800294183, "rouge2_recall": 0.06515904722605939, "rouge2_recall_stderr": 0.0025535061286592504, "rougeL_fmeasure": 0.16421475353571632, "rougeL_fmeasure_stderr": 0.0028136325952485735, "rougeL_precision": 0.16172792138960806, "rougeL_precision_stderr": 0.003474612361597086, "rougeL_recall": 0.20248110828398325, "rougeL_recall_stderr": 0.0034801799909718704, "rougeLsum_fmeasure": 0.17052032174118803, "rougeLsum_fmeasure_stderr": 0.002883605135295133, "rougeLsum_precision": 0.16634670992387487, "rougeLsum_precision_stderr": 0.0034675307885691244, "rougeLsum_recall": 0.21304752670955557, "rougeLsum_recall_stderr": 0.003802772338811888}}, "4": {"article_DOC_summary": {"bleu": 0.40228156399700404, "bleu_stderr": 0.0795066489605694, "rouge1_fmeasure": 0.05798370899016291, "rouge1_fmeasure_stderr": 0.003309673469340258, "rouge1_precision": 0.06147088600466444, "rouge1_precision_stderr": 0.003807688121301648, "rouge1_recall": 0.06919835565602203, "rouge1_recall_stderr": 0.004080878740872531, "rouge2_fmeasure": 0.013641962174467587, "rouge2_fmeasure_stderr": 0.0012106481387976049, "rouge2_precision": 0.014228533929974868, "rouge2_precision_stderr": 0.0014047860954449383, "rouge2_recall": 0.0166314025722418, "rouge2_recall_stderr": 0.0014602317545186565, "rougeL_fmeasure": 0.043392440237132235, "rougeL_fmeasure_stderr": 0.002526920635642375, "rougeL_precision": 0.04649724897774529, "rougeL_precision_stderr": 0.0029768758298026627, "rougeL_recall": 0.05168291535676844, "rougeL_recall_stderr": 0.003094715210052394, "rougeLsum_fmeasure": 0.04537985187674532, "rougeLsum_fmeasure_stderr": 0.0026361408489140024, "rougeLsum_precision": 0.048035895047127686, "rougeLsum_precision_stderr": 0.003028118969042768, "rougeLsum_recall": 0.05464177271239397, "rougeLsum_recall_stderr": 0.0033104672738008553}}, "5": {"article_DOC_summary": {"bleu": 1.2194816597556972e-29, "bleu_stderr": 1.9673150803997765e-21, "rouge1_fmeasure": 0.0022410464211047414, "rouge1_fmeasure_stderr": 0.0006379815102748624, "rouge1_precision": 0.002305114359249611, "rouge1_precision_stderr": 0.0006855365150103617, "rouge1_recall": 0.002688750909485673, "rouge1_recall_stderr": 0.0008419301936781596, "rouge2_fmeasure": 0.0002428157678792438, "rouge2_fmeasure_stderr": 0.00010121603253679161, "rouge2_precision": 0.00023662681974673803, "rouge2_precision_stderr": 0.00010805221445763836, "rouge2_recall": 0.0003191847290509596, "rouge2_recall_stderr": 0.00013624996945101362, "rougeL_fmeasure": 0.001480383954453164, "rougeL_fmeasure_stderr": 0.0004335775279500401, "rougeL_precision": 0.0014967981731443785, "rougeL_precision_stderr": 0.00044282917196946015, "rougeL_recall": 0.0018089244853048499, "rougeL_recall_stderr": 0.0005955143349649119, "rougeLsum_fmeasure": 0.0014592568247182058, "rougeLsum_fmeasure_stderr": 0.00042833093506174205, "rougeLsum_precision": 0.0014611359517881007, "rougeLsum_precision_stderr": 0.0004268628859757995, "rougeLsum_recall": 0.0017987055420995693, "rougeLsum_recall_stderr": 0.0005977174922462606}}}}
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.29175676576014364,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0034471150363075184
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.2607907068748222,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.002898135289732422
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.23794097030152866,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.002188642411213601
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.08536913371644045,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0020895235460299017
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.0713925019948092,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.001635476891768791
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.06574644928335649,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0013625725128389179
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.22160256648226004,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0027745204374263316
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.19691795695851566,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.002270661716223799
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.1789951285071888,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.001673666200127008
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.27421900185303183,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.003293382529360642
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.24426464874342324,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.002735802329870702
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.22300315725329103,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0020678687486032563
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 4.071985313483549,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.07738970826466988
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 2,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
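Each slim.*.json file in this commit carries one metric per entry in its "results" list (the metric value plus its *_stderr companion), alongside the eval config used for that run. A minimal Python sketch, assuming the repository is checked out locally under the paths shown above, of flattening one of these files into a plain metric dict:

import json

# Assumed checkout-relative path to the 2-shot slim results file shown above.
path = "8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_2.json"
with open(path) as f:
    slim = json.load(f)

# Each "results" entry holds exactly one metric plus its *_stderr field,
# so dropping the shared metadata keys leaves only metric/value pairs.
meta_keys = {"task_name", "prompt_name", "dataset_path", "dataset_name", "subset"}
metrics = {}
for entry in slim["results"]:
    for key, value in entry.items():
        if key not in meta_keys:
            metrics[key] = value

print(slim["config"]["num_fewshot"], metrics["rouge2_fmeasure"])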
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.2571754504445173,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0037874180373320154
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.22046706474081984,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0032483649489955437
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.20409132873325006,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0026059526618525918
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.07716310931244513,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0021423793043653955
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.06255207307386669,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.001665836586847627
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.05803958336813696,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0013905246714186119
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.19917050763282695,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0031058368371087975
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.16863998232504207,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0025563859107073333
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.1557150280292743,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0020175919818759087
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.24286851428671166,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.003627579238772214
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.2070808578907387,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.003069864948055298
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.19198507442281348,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.002473123474513908
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 3.256877193820952,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.07385158775345343
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.08608412704276916,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.0031073010197710128
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.07306085845324418,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0026739004311174368
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.06754366655458334,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.00231821642571142
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.02477632495929693,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0013952242546163309
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.020963360092888926,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0011677531292808372
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.01891360756998624,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.0009547131510927643
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.06705402166283986,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0024891810743964266
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.05633331544121819,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0021123430958068772
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.05180637434259422,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0017937325092342497
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.08098856968063572,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.002946288914451725
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.06822240197436494,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.002506023867308553
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.06316340092042942,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0021760431723604587
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 0.11803096947131637,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 0.018616113127493047
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_GEM-wiki_lingua_en_tldr_en_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "GEM/wiki_lingua_en",
5
+ "prompt_name": "tldr_en",
6
+ "rouge1_precision": 0.014929032882940911,
7
+ "dataset_path": "GEM/wiki_lingua",
8
+ "dataset_name": "en",
9
+ "subset": null,
10
+ "rouge1_precision_stderr": 0.001437252981426829
11
+ },
12
+ {
13
+ "task_name": "GEM/wiki_lingua_en",
14
+ "prompt_name": "tldr_en",
15
+ "rouge1_recall": 0.012605454696432882,
16
+ "dataset_path": "GEM/wiki_lingua",
17
+ "dataset_name": "en",
18
+ "subset": null,
19
+ "rouge1_recall_stderr": 0.0012515184538477262
20
+ },
21
+ {
22
+ "task_name": "GEM/wiki_lingua_en",
23
+ "prompt_name": "tldr_en",
24
+ "rouge1_fmeasure": 0.01167038909001005,
25
+ "dataset_path": "GEM/wiki_lingua",
26
+ "dataset_name": "en",
27
+ "subset": null,
28
+ "rouge1_fmeasure_stderr": 0.0010944129890059014
29
+ },
30
+ {
31
+ "task_name": "GEM/wiki_lingua_en",
32
+ "prompt_name": "tldr_en",
33
+ "rouge2_precision": 0.0041010832248453595,
34
+ "dataset_path": "GEM/wiki_lingua",
35
+ "dataset_name": "en",
36
+ "subset": null,
37
+ "rouge2_precision_stderr": 0.0005815077800226312
38
+ },
39
+ {
40
+ "task_name": "GEM/wiki_lingua_en",
41
+ "prompt_name": "tldr_en",
42
+ "rouge2_recall": 0.0034469110258534192,
43
+ "dataset_path": "GEM/wiki_lingua",
44
+ "dataset_name": "en",
45
+ "subset": null,
46
+ "rouge2_recall_stderr": 0.0004725161504235943
47
+ },
48
+ {
49
+ "task_name": "GEM/wiki_lingua_en",
50
+ "prompt_name": "tldr_en",
51
+ "rouge2_fmeasure": 0.003203820995720566,
52
+ "dataset_path": "GEM/wiki_lingua",
53
+ "dataset_name": "en",
54
+ "subset": null,
55
+ "rouge2_fmeasure_stderr": 0.00042103771194584703
56
+ },
57
+ {
58
+ "task_name": "GEM/wiki_lingua_en",
59
+ "prompt_name": "tldr_en",
60
+ "rougeL_precision": 0.011742235186401492,
61
+ "dataset_path": "GEM/wiki_lingua",
62
+ "dataset_name": "en",
63
+ "subset": null,
64
+ "rougeL_precision_stderr": 0.0011661092630625284
65
+ },
66
+ {
67
+ "task_name": "GEM/wiki_lingua_en",
68
+ "prompt_name": "tldr_en",
69
+ "rougeL_recall": 0.009919184883424567,
70
+ "dataset_path": "GEM/wiki_lingua",
71
+ "dataset_name": "en",
72
+ "subset": null,
73
+ "rougeL_recall_stderr": 0.0010141555758155579
74
+ },
75
+ {
76
+ "task_name": "GEM/wiki_lingua_en",
77
+ "prompt_name": "tldr_en",
78
+ "rougeL_fmeasure": 0.009059396220908112,
79
+ "dataset_path": "GEM/wiki_lingua",
80
+ "dataset_name": "en",
81
+ "subset": null,
82
+ "rougeL_fmeasure_stderr": 0.0008608509358819988
83
+ },
84
+ {
85
+ "task_name": "GEM/wiki_lingua_en",
86
+ "prompt_name": "tldr_en",
87
+ "rougeLsum_precision": 0.014255925532109718,
88
+ "dataset_path": "GEM/wiki_lingua",
89
+ "dataset_name": "en",
90
+ "subset": null,
91
+ "rougeLsum_precision_stderr": 0.001380098431266019
92
+ },
93
+ {
94
+ "task_name": "GEM/wiki_lingua_en",
95
+ "prompt_name": "tldr_en",
96
+ "rougeLsum_recall": 0.012021925711305772,
97
+ "dataset_path": "GEM/wiki_lingua",
98
+ "dataset_name": "en",
99
+ "subset": null,
100
+ "rougeLsum_recall_stderr": 0.001202720217240228
101
+ },
102
+ {
103
+ "task_name": "GEM/wiki_lingua_en",
104
+ "prompt_name": "tldr_en",
105
+ "rougeLsum_fmeasure": 0.011096298465596519,
106
+ "dataset_path": "GEM/wiki_lingua",
107
+ "dataset_name": "en",
108
+ "subset": null,
109
+ "rougeLsum_fmeasure_stderr": 0.0010440702662319144
110
+ },
111
+ {
112
+ "task_name": "GEM/wiki_lingua_en",
113
+ "prompt_name": "tldr_en",
114
+ "bleu": 5.860100637257344e-12,
115
+ "dataset_path": "GEM/wiki_lingua",
116
+ "dataset_name": "en",
117
+ "subset": null,
118
+ "bleu_stderr": 1.0205550894986597e-10
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_3.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.21592444335846092,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.004246983520225218
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.27255737663479357,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.00447437166346409
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.22044943741927525,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.003500198780987625
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.05330214126583765,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0025012370800294183
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.06515904722605939,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0025535061286592504
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.05351615652078515,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0022197484992897607
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.16172792138960806,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.003474612361597086
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.20248110828398325,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0034801799909718704
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.16421475353571632,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0028136325952485735
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.16634670992387487,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0034675307885691244
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.21304752670955557,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.003802772338811888
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.17052032174118803,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.002883605135295133
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 2.4283658434676942,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.1084231496409544
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 3,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_4.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.06147088600466444,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.003807688121301648
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.06919835565602203,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.004080878740872531
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.05798370899016291,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.003309673469340258
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.014228533929974868,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.0014047860954449383
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.0166314025722418,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.0014602317545186565
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.013641962174467587,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.0012106481387976049
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.04649724897774529,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.0029768758298026627
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.05168291535676844,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.003094715210052394
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.043392440237132235,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.002526920635642375
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.048035895047127686,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.003028118969042768
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.05464177271239397,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0033104672738008553
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.04537985187674532,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.0026361408489140024
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 0.40228156399700404,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 0.0795066489605694
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 4,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/generation/slim.8b7178b13b_gem_xsum_article_DOC_summary_5.json ADDED
@@ -0,0 +1,133 @@
1
+ {
2
+ "results": [
3
+ {
4
+ "task_name": "gem_xsum",
5
+ "prompt_name": "article_DOC_summary",
6
+ "rouge1_precision": 0.002305114359249611,
7
+ "dataset_path": "GEM/xsum",
8
+ "dataset_name": null,
9
+ "subset": "",
10
+ "rouge1_precision_stderr": 0.0006855365150103617
11
+ },
12
+ {
13
+ "task_name": "gem_xsum",
14
+ "prompt_name": "article_DOC_summary",
15
+ "rouge1_recall": 0.002688750909485673,
16
+ "dataset_path": "GEM/xsum",
17
+ "dataset_name": null,
18
+ "subset": "",
19
+ "rouge1_recall_stderr": 0.0008419301936781596
20
+ },
21
+ {
22
+ "task_name": "gem_xsum",
23
+ "prompt_name": "article_DOC_summary",
24
+ "rouge1_fmeasure": 0.0022410464211047414,
25
+ "dataset_path": "GEM/xsum",
26
+ "dataset_name": null,
27
+ "subset": "",
28
+ "rouge1_fmeasure_stderr": 0.0006379815102748624
29
+ },
30
+ {
31
+ "task_name": "gem_xsum",
32
+ "prompt_name": "article_DOC_summary",
33
+ "rouge2_precision": 0.00023662681974673803,
34
+ "dataset_path": "GEM/xsum",
35
+ "dataset_name": null,
36
+ "subset": "",
37
+ "rouge2_precision_stderr": 0.00010805221445763836
38
+ },
39
+ {
40
+ "task_name": "gem_xsum",
41
+ "prompt_name": "article_DOC_summary",
42
+ "rouge2_recall": 0.0003191847290509596,
43
+ "dataset_path": "GEM/xsum",
44
+ "dataset_name": null,
45
+ "subset": "",
46
+ "rouge2_recall_stderr": 0.00013624996945101362
47
+ },
48
+ {
49
+ "task_name": "gem_xsum",
50
+ "prompt_name": "article_DOC_summary",
51
+ "rouge2_fmeasure": 0.0002428157678792438,
52
+ "dataset_path": "GEM/xsum",
53
+ "dataset_name": null,
54
+ "subset": "",
55
+ "rouge2_fmeasure_stderr": 0.00010121603253679161
56
+ },
57
+ {
58
+ "task_name": "gem_xsum",
59
+ "prompt_name": "article_DOC_summary",
60
+ "rougeL_precision": 0.0014967981731443785,
61
+ "dataset_path": "GEM/xsum",
62
+ "dataset_name": null,
63
+ "subset": "",
64
+ "rougeL_precision_stderr": 0.00044282917196946015
65
+ },
66
+ {
67
+ "task_name": "gem_xsum",
68
+ "prompt_name": "article_DOC_summary",
69
+ "rougeL_recall": 0.0018089244853048499,
70
+ "dataset_path": "GEM/xsum",
71
+ "dataset_name": null,
72
+ "subset": "",
73
+ "rougeL_recall_stderr": 0.0005955143349649119
74
+ },
75
+ {
76
+ "task_name": "gem_xsum",
77
+ "prompt_name": "article_DOC_summary",
78
+ "rougeL_fmeasure": 0.001480383954453164,
79
+ "dataset_path": "GEM/xsum",
80
+ "dataset_name": null,
81
+ "subset": "",
82
+ "rougeL_fmeasure_stderr": 0.0004335775279500401
83
+ },
84
+ {
85
+ "task_name": "gem_xsum",
86
+ "prompt_name": "article_DOC_summary",
87
+ "rougeLsum_precision": 0.0014611359517881007,
88
+ "dataset_path": "GEM/xsum",
89
+ "dataset_name": null,
90
+ "subset": "",
91
+ "rougeLsum_precision_stderr": 0.0004268628859757995
92
+ },
93
+ {
94
+ "task_name": "gem_xsum",
95
+ "prompt_name": "article_DOC_summary",
96
+ "rougeLsum_recall": 0.0017987055420995693,
97
+ "dataset_path": "GEM/xsum",
98
+ "dataset_name": null,
99
+ "subset": "",
100
+ "rougeLsum_recall_stderr": 0.0005977174922462606
101
+ },
102
+ {
103
+ "task_name": "gem_xsum",
104
+ "prompt_name": "article_DOC_summary",
105
+ "rougeLsum_fmeasure": 0.0014592568247182058,
106
+ "dataset_path": "GEM/xsum",
107
+ "dataset_name": null,
108
+ "subset": "",
109
+ "rougeLsum_fmeasure_stderr": 0.00042833093506174205
110
+ },
111
+ {
112
+ "task_name": "gem_xsum",
113
+ "prompt_name": "article_DOC_summary",
114
+ "bleu": 1.2194816597556972e-29,
115
+ "dataset_path": "GEM/xsum",
116
+ "dataset_name": null,
117
+ "subset": "",
118
+ "bleu_stderr": 1.9673150803997765e-21
119
+ }
120
+ ],
121
+ "config": {
122
+ "model": "hf-causal",
123
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b13b/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
124
+ "task_args": "",
125
+ "num_fewshot": 5,
126
+ "batch_size": 8,
127
+ "device": "cuda",
128
+ "use_cache": false,
129
+ "limit": 3000,
130
+ "bootstrap_iters": 10,
131
+ "seed": 1234
132
+ }
133
+ }
8b7178b13b/evaluation/rankeval/8b7178b13b_0.csv ADDED
@@ -0,0 +1,21 @@
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.354,0.015129868238451773,0
3
+ anli_r2,acc,0.331,0.01488827258820394,0
4
+ anli_r3,acc,0.3458333333333333,0.01373624534231101,0
5
+ arc_challenge,acc,0.2721843003412969,0.013006600406423707,0
6
+ arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0
7
+ arc_easy,acc,0.5707070707070707,0.010156678075911087,0
8
+ arc_easy,acc_norm,0.5172558922558923,0.010253671674754631,0
9
+ boolq,acc,0.5501529051987768,0.008700950643028801,1
10
+ cb,acc,0.2857142857142857,0.06091449038731724,1
11
+ cb,f1,0.30952380952380953,,1
12
+ copa,acc,0.7,0.046056618647183814,0
13
+ hellaswag,acc,0.4360685122485561,0.004948824501355485,0
14
+ hellaswag,acc_norm,0.5632344154550887,0.004949716368890496,0
15
+ piqa,acc,0.7225244831338411,0.010446818281039959,0
16
+ piqa,acc_norm,0.7317736670293797,0.010336761992404485,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.848,0.011358918303475282,0
19
+ sciq,acc_norm,0.758,0.013550631705555958,0
20
+ storycloze_2016,acc,0.6969535008017104,0.010627613073376715,0
21
+ winogrande,acc,0.5666929755327546,0.013926915052757347,0
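The rankeval CSVs in this commit all share the header task,metric,value,err,version, with one file per shot count (8b7178b13b_0.csv through 8b7178b13b_5.csv per the file list). A minimal sketch, assuming that directory layout and using only the standard library, of collecting them into one list of rows so scores can be compared across shot counts:

import csv
from pathlib import Path

# Assumed checkout-relative directory containing the per-shot CSVs.
rankeval_dir = Path("8b7178b13b/evaluation/rankeval")

rows = []
for path in sorted(rankeval_dir.glob("8b7178b13b_*.csv")):
    shots = path.stem.rsplit("_", 1)[-1]  # "0" ... "5" taken from the filename
    with path.open(newline="") as f:
        for row in csv.DictReader(f):
            row["shots"] = shots
            rows.append(row)

# Example: arc_easy accuracy as a function of shot count.
for row in rows:
    if row["task"] == "arc_easy" and row["metric"] == "acc":
        print(row["shots"], row["value"])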
8b7178b13b/evaluation/rankeval/8b7178b13b_0_lm-eval_global_step84877_2023-05-15-10-06-37_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.354,
- "acc_stderr": 0.015129868238451773
- },
- "anli_r2": {
- "acc": 0.331,
- "acc_stderr": 0.01488827258820394
- },
- "anli_r3": {
- "acc": 0.3458333333333333,
- "acc_stderr": 0.01373624534231101
- },
- "cb": {
- "acc": 0.2857142857142857,
- "acc_stderr": 0.06091449038731724,
- "f1": 0.30952380952380953
- },
- "copa": {
- "acc": 0.7,
- "acc_stderr": 0.046056618647183814
- },
- "hellaswag": {
- "acc": 0.4360685122485561,
- "acc_stderr": 0.004948824501355485,
- "acc_norm": 0.5632344154550887,
- "acc_norm_stderr": 0.004949716368890496
- },
- "rte": {
- "acc": 0.5306859205776173,
- "acc_stderr": 0.03003973059219781
- },
- "winogrande": {
- "acc": 0.5666929755327546,
- "acc_stderr": 0.013926915052757347
- },
- "storycloze_2016": {
- "acc": 0.6969535008017104,
- "acc_stderr": 0.010627613073376715
- },
- "boolq": {
- "acc": 0.5501529051987768,
- "acc_stderr": 0.008700950643028801
- },
- "arc_easy": {
- "acc": 0.5707070707070707,
- "acc_stderr": 0.010156678075911087,
- "acc_norm": 0.5172558922558923,
- "acc_norm_stderr": 0.010253671674754631
- },
- "arc_challenge": {
- "acc": 0.2721843003412969,
- "acc_stderr": 0.013006600406423707,
- "acc_norm": 0.2832764505119454,
- "acc_norm_stderr": 0.013167478735134575
- },
- "sciq": {
- "acc": 0.848,
- "acc_stderr": 0.011358918303475282,
- "acc_norm": 0.758,
- "acc_norm_stderr": 0.013550631705555958
- },
- "piqa": {
- "acc": 0.7225244831338411,
- "acc_stderr": 0.010446818281039959,
- "acc_norm": 0.7317736670293797,
- "acc_norm_stderr": 0.010336761992404485
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_1.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.34,0.014987482264363937,0
+ anli_r2,acc,0.326,0.014830507204541028,0
+ anli_r3,acc,0.3541666666666667,0.01381193349957096,0
+ arc_challenge,acc,0.27474402730375425,0.013044617212771227,0
+ arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0
+ arc_easy,acc,0.5968013468013468,0.010065668576794803,0
+ arc_easy,acc_norm,0.5913299663299664,0.01008717449876288,0
+ boolq,acc,0.5562691131498471,0.008689501105367413,1
+ cb,acc,0.42857142857142855,0.06672848092813058,1
+ cb,f1,0.36324786324786323,,1
+ copa,acc,0.75,0.04351941398892446,0
+ hellaswag,acc,0.4411471818362876,0.004955095096264714,0
+ hellaswag,acc_norm,0.5774746066520613,0.004929517011508216,0
+ piqa,acc,0.7295973884657236,0.010363167031620784,0
+ piqa,acc_norm,0.7334058759521219,0.010316749863541365,0
+ rte,acc,0.5234657039711191,0.030063300411902652,0
+ sciq,acc,0.887,0.010016552866696846,0
+ sciq,acc_norm,0.882,0.01020686926438179,0
+ storycloze_2016,acc,0.6830571886691609,0.010759650951452121,0
+ winogrande,acc,0.5595895816890292,0.013952330311915603,0
8b7178b13b/evaluation/rankeval/8b7178b13b_1_lm-eval_global_step84877_2023-05-15-10-06-37_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.34,
- "acc_stderr": 0.014987482264363937
- },
- "anli_r2": {
- "acc": 0.326,
- "acc_stderr": 0.014830507204541028
- },
- "anli_r3": {
- "acc": 0.3541666666666667,
- "acc_stderr": 0.01381193349957096
- },
- "cb": {
- "acc": 0.42857142857142855,
- "acc_stderr": 0.06672848092813058,
- "f1": 0.36324786324786323
- },
- "copa": {
- "acc": 0.75,
- "acc_stderr": 0.04351941398892446
- },
- "hellaswag": {
- "acc": 0.4411471818362876,
- "acc_stderr": 0.004955095096264714,
- "acc_norm": 0.5774746066520613,
- "acc_norm_stderr": 0.004929517011508216
- },
- "rte": {
- "acc": 0.5234657039711191,
- "acc_stderr": 0.030063300411902652
- },
- "winogrande": {
- "acc": 0.5595895816890292,
- "acc_stderr": 0.013952330311915603
- },
- "storycloze_2016": {
- "acc": 0.6830571886691609,
- "acc_stderr": 0.010759650951452121
- },
- "boolq": {
- "acc": 0.5562691131498471,
- "acc_stderr": 0.008689501105367413
- },
- "arc_easy": {
- "acc": 0.5968013468013468,
- "acc_stderr": 0.010065668576794803,
- "acc_norm": 0.5913299663299664,
- "acc_norm_stderr": 0.01008717449876288
- },
- "arc_challenge": {
- "acc": 0.27474402730375425,
- "acc_stderr": 0.013044617212771227,
- "acc_norm": 0.3037542662116041,
- "acc_norm_stderr": 0.01343890918477876
- },
- "sciq": {
- "acc": 0.887,
- "acc_stderr": 0.010016552866696846,
- "acc_norm": 0.882,
- "acc_norm_stderr": 0.01020686926438179
- },
- "piqa": {
- "acc": 0.7295973884657236,
- "acc_stderr": 0.010363167031620784,
- "acc_norm": 0.7334058759521219,
- "acc_norm_stderr": 0.010316749863541365
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_2.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.343,0.015019206922356951,0
+ anli_r2,acc,0.339,0.014976758771620349,0
+ anli_r3,acc,0.33416666666666667,0.013622434813136774,0
+ arc_challenge,acc,0.28924914675767915,0.013250012579393443,0
+ arc_challenge,acc_norm,0.310580204778157,0.013522292098053059,0
+ arc_easy,acc,0.6077441077441077,0.010018744689650043,0
+ arc_easy,acc_norm,0.6026936026936027,0.010041053078884286,0
+ boolq,acc,0.5529051987767584,0.008695963064172717,1
+ cb,acc,0.4107142857142857,0.0663363415035954,1
+ cb,f1,0.30617283950617286,,1
+ copa,acc,0.75,0.04351941398892446,0
+ hellaswag,acc,0.4419438358892651,0.004956030970911519,0
+ hellaswag,acc_norm,0.5717984465245967,0.004938068627349502,0
+ piqa,acc,0.7295973884657236,0.010363167031620784,0
+ piqa,acc_norm,0.735038084874864,0.010296557993316042,0
+ rte,acc,0.4404332129963899,0.029882123363118726,0
+ sciq,acc,0.914,0.008870325962594766,0
+ sciq,acc_norm,0.908,0.009144376393151108,0
+ storycloze_2016,acc,0.6862640299305185,0.01073017911931762,0
+ winogrande,acc,0.5382794001578532,0.014011242594964115,0
8b7178b13b/evaluation/rankeval/8b7178b13b_2_lm-eval_global_step84877_2023-05-15-10-06-37_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.343,
- "acc_stderr": 0.015019206922356951
- },
- "anli_r2": {
- "acc": 0.339,
- "acc_stderr": 0.014976758771620349
- },
- "anli_r3": {
- "acc": 0.33416666666666667,
- "acc_stderr": 0.013622434813136774
- },
- "cb": {
- "acc": 0.4107142857142857,
- "acc_stderr": 0.0663363415035954,
- "f1": 0.30617283950617286
- },
- "copa": {
- "acc": 0.75,
- "acc_stderr": 0.04351941398892446
- },
- "hellaswag": {
- "acc": 0.4419438358892651,
- "acc_stderr": 0.004956030970911519,
- "acc_norm": 0.5717984465245967,
- "acc_norm_stderr": 0.004938068627349502
- },
- "rte": {
- "acc": 0.4404332129963899,
- "acc_stderr": 0.029882123363118726
- },
- "winogrande": {
- "acc": 0.5382794001578532,
- "acc_stderr": 0.014011242594964115
- },
- "storycloze_2016": {
- "acc": 0.6862640299305185,
- "acc_stderr": 0.01073017911931762
- },
- "boolq": {
- "acc": 0.5529051987767584,
- "acc_stderr": 0.008695963064172717
- },
- "arc_easy": {
- "acc": 0.6077441077441077,
- "acc_stderr": 0.010018744689650043,
- "acc_norm": 0.6026936026936027,
- "acc_norm_stderr": 0.010041053078884286
- },
- "arc_challenge": {
- "acc": 0.28924914675767915,
- "acc_stderr": 0.013250012579393443,
- "acc_norm": 0.310580204778157,
- "acc_norm_stderr": 0.013522292098053059
- },
- "sciq": {
- "acc": 0.914,
- "acc_stderr": 0.008870325962594766,
- "acc_norm": 0.908,
- "acc_norm_stderr": 0.009144376393151108
- },
- "piqa": {
- "acc": 0.7295973884657236,
- "acc_stderr": 0.010363167031620784,
- "acc_norm": 0.735038084874864,
- "acc_norm_stderr": 0.010296557993316042
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_3.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.344,0.015029633724408945,0
+ anli_r2,acc,0.365,0.0152317762262649,0
+ anli_r3,acc,0.3333333333333333,0.013613950010225612,0
+ arc_challenge,acc,0.2858361774744027,0.013203196088537369,0
+ arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0
+ arc_easy,acc,0.6077441077441077,0.010018744689650043,0
+ arc_easy,acc_norm,0.6022727272727273,0.010042861602178056,0
+ boolq,acc,0.5314984709480123,0.00872768484861531,1
+ cb,acc,0.44642857142857145,0.06703189227942398,1
+ cb,f1,0.428030303030303,,1
+ copa,acc,0.74,0.04408440022768079,0
+ hellaswag,acc,0.4431388169687313,0.004957410545559414,0
+ hellaswag,acc_norm,0.58105954989046,0.004923772581848488,0
+ piqa,acc,0.7323177366702938,0.010330111189370429,0
+ piqa,acc_norm,0.735038084874864,0.010296557993316044,0
+ rte,acc,0.48736462093862815,0.030086851767188564,0
+ sciq,acc,0.913,0.008916866630745923,0
+ sciq,acc_norm,0.911,0.009008893392651518,0
+ storycloze_2016,acc,0.6916087653661144,0.010679734445487801,0
+ winogrande,acc,0.5556432517758485,0.013965196769083555,0
8b7178b13b/evaluation/rankeval/8b7178b13b_3_lm-eval_global_step84877_2023-05-15-10-06-37_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.344,
- "acc_stderr": 0.015029633724408945
- },
- "anli_r2": {
- "acc": 0.365,
- "acc_stderr": 0.0152317762262649
- },
- "anli_r3": {
- "acc": 0.3333333333333333,
- "acc_stderr": 0.013613950010225612
- },
- "cb": {
- "acc": 0.44642857142857145,
- "acc_stderr": 0.06703189227942398,
- "f1": 0.428030303030303
- },
- "copa": {
- "acc": 0.74,
- "acc_stderr": 0.04408440022768079
- },
- "hellaswag": {
- "acc": 0.4431388169687313,
- "acc_stderr": 0.004957410545559414,
- "acc_norm": 0.58105954989046,
- "acc_norm_stderr": 0.004923772581848488
- },
- "rte": {
- "acc": 0.48736462093862815,
- "acc_stderr": 0.030086851767188564
- },
- "winogrande": {
- "acc": 0.5556432517758485,
- "acc_stderr": 0.013965196769083555
- },
- "storycloze_2016": {
- "acc": 0.6916087653661144,
- "acc_stderr": 0.010679734445487801
- },
- "boolq": {
- "acc": 0.5314984709480123,
- "acc_stderr": 0.00872768484861531
- },
- "arc_easy": {
- "acc": 0.6077441077441077,
- "acc_stderr": 0.010018744689650043,
- "acc_norm": 0.6022727272727273,
- "acc_norm_stderr": 0.010042861602178056
- },
- "arc_challenge": {
- "acc": 0.2858361774744027,
- "acc_stderr": 0.013203196088537369,
- "acc_norm": 0.3037542662116041,
- "acc_norm_stderr": 0.01343890918477876
- },
- "sciq": {
- "acc": 0.913,
- "acc_stderr": 0.008916866630745923,
- "acc_norm": 0.911,
- "acc_norm_stderr": 0.009008893392651518
- },
- "piqa": {
- "acc": 0.7323177366702938,
- "acc_stderr": 0.010330111189370429,
- "acc_norm": 0.735038084874864,
- "acc_norm_stderr": 0.010296557993316044
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_4.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.363,0.015213890444671283,0
+ anli_r2,acc,0.362,0.0152048409129195,0
+ anli_r3,acc,0.3516666666666667,0.013789711695404806,0
+ arc_challenge,acc,0.27559726962457337,0.013057169655761838,0
+ arc_challenge,acc_norm,0.31313993174061433,0.013552671543623501,0
+ arc_easy,acc,0.6203703703703703,0.009958037725468565,0
+ arc_easy,acc_norm,0.6085858585858586,0.010014917532627824,0
+ boolq,acc,0.5162079510703363,0.008740459157499082,1
+ cb,acc,0.39285714285714285,0.0658538889806635,1
+ cb,f1,0.3340305010893247,,1
+ copa,acc,0.74,0.04408440022768078,0
+ hellaswag,acc,0.44064927305317664,0.004954503606471609,0
+ hellaswag,acc_norm,0.5764787890858395,0.004931065434173691,0
+ piqa,acc,0.7285092491838956,0.010376251176596135,0
+ piqa,acc_norm,0.7393906420021763,0.010241826155811632,0
+ rte,acc,0.44765342960288806,0.029931070362939526,0
+ sciq,acc,0.91,0.009054390204866444,0
+ sciq,acc_norm,0.914,0.008870325962594766,0
+ storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0
+ winogrande,acc,0.5501183898973955,0.013981711904049732,0
8b7178b13b/evaluation/rankeval/8b7178b13b_4_lm-eval_global_step84877_2023-05-15-10-07-32_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.363,
- "acc_stderr": 0.015213890444671283
- },
- "anli_r2": {
- "acc": 0.362,
- "acc_stderr": 0.0152048409129195
- },
- "anli_r3": {
- "acc": 0.3516666666666667,
- "acc_stderr": 0.013789711695404806
- },
- "cb": {
- "acc": 0.39285714285714285,
- "acc_stderr": 0.0658538889806635,
- "f1": 0.3340305010893247
- },
- "copa": {
- "acc": 0.74,
- "acc_stderr": 0.04408440022768078
- },
- "hellaswag": {
- "acc": 0.44064927305317664,
- "acc_stderr": 0.004954503606471609,
- "acc_norm": 0.5764787890858395,
- "acc_norm_stderr": 0.004931065434173691
- },
- "rte": {
- "acc": 0.44765342960288806,
- "acc_stderr": 0.029931070362939526
- },
- "winogrande": {
- "acc": 0.5501183898973955,
- "acc_stderr": 0.013981711904049732
- },
- "storycloze_2016": {
- "acc": 0.6932121859967931,
- "acc_stderr": 0.010664275190473634
- },
- "boolq": {
- "acc": 0.5162079510703363,
- "acc_stderr": 0.008740459157499082
- },
- "arc_easy": {
- "acc": 0.6203703703703703,
- "acc_stderr": 0.009958037725468565,
- "acc_norm": 0.6085858585858586,
- "acc_norm_stderr": 0.010014917532627824
- },
- "arc_challenge": {
- "acc": 0.27559726962457337,
- "acc_stderr": 0.013057169655761838,
- "acc_norm": 0.31313993174061433,
- "acc_norm_stderr": 0.013552671543623501
- },
- "sciq": {
- "acc": 0.91,
- "acc_stderr": 0.009054390204866444,
- "acc_norm": 0.914,
- "acc_norm_stderr": 0.008870325962594766
- },
- "piqa": {
- "acc": 0.7285092491838956,
- "acc_stderr": 0.010376251176596135,
- "acc_norm": 0.7393906420021763,
- "acc_norm_stderr": 0.010241826155811632
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b13b/evaluation/rankeval/8b7178b13b_5.csv ADDED
@@ -0,0 +1,21 @@
+ task,metric,value,err,version
+ anli_r1,acc,0.359,0.015177264224798601,0
+ anli_r2,acc,0.335,0.014933117490932573,0
+ anli_r3,acc,0.3258333333333333,0.013535422043417454,0
+ arc_challenge,acc,0.2832764505119454,0.013167478735134575,0
+ arc_challenge,acc_norm,0.3165529010238908,0.01359243151906808,0
+ arc_easy,acc,0.6094276094276094,0.010011059112064243,0
+ arc_easy,acc_norm,0.6119528619528619,0.009999295905750666,0
+ boolq,acc,0.519571865443425,0.008738352682962235,1
+ cb,acc,0.42857142857142855,0.06672848092813058,1
+ cb,f1,0.38723751912112364,,1
+ copa,acc,0.78,0.04163331998932262,0
+ hellaswag,acc,0.44343756223859787,0.0049577508971529426,0
+ hellaswag,acc_norm,0.5806612228639714,0.004924424018073683,0
+ piqa,acc,0.7247007616974973,0.01042142927736953,0
+ piqa,acc_norm,0.7393906420021763,0.010241826155811632,0
+ rte,acc,0.48014440433212996,0.0300727231673172,0
+ sciq,acc,0.913,0.008916866630745925,0
+ sciq,acc_norm,0.917,0.00872852720607479,0
+ storycloze_2016,acc,0.6937466595403528,0.010659088460112754,0
+ winogrande,acc,0.5540647198105761,0.013970093482330697,0
8b7178b13b/evaluation/rankeval/8b7178b13b_5_lm-eval_global_step84877_2023-05-15-10-06-37_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
- {
- "results": {
- "anli_r1": {
- "acc": 0.359,
- "acc_stderr": 0.015177264224798601
- },
- "anli_r2": {
- "acc": 0.335,
- "acc_stderr": 0.014933117490932573
- },
- "anli_r3": {
- "acc": 0.3258333333333333,
- "acc_stderr": 0.013535422043417454
- },
- "cb": {
- "acc": 0.42857142857142855,
- "acc_stderr": 0.06672848092813058,
- "f1": 0.38723751912112364
- },
- "copa": {
- "acc": 0.78,
- "acc_stderr": 0.04163331998932262
- },
- "hellaswag": {
- "acc": 0.44343756223859787,
- "acc_stderr": 0.0049577508971529426,
- "acc_norm": 0.5806612228639714,
- "acc_norm_stderr": 0.004924424018073683
- },
- "rte": {
- "acc": 0.48014440433212996,
- "acc_stderr": 0.0300727231673172
- },
- "winogrande": {
- "acc": 0.5540647198105761,
- "acc_stderr": 0.013970093482330697
- },
- "storycloze_2016": {
- "acc": 0.6937466595403528,
- "acc_stderr": 0.010659088460112754
- },
- "boolq": {
- "acc": 0.519571865443425,
- "acc_stderr": 0.008738352682962235
- },
- "arc_easy": {
- "acc": 0.6094276094276094,
- "acc_stderr": 0.010011059112064243,
- "acc_norm": 0.6119528619528619,
- "acc_norm_stderr": 0.009999295905750666
- },
- "arc_challenge": {
- "acc": 0.2832764505119454,
- "acc_stderr": 0.013167478735134575,
- "acc_norm": 0.3165529010238908,
- "acc_norm_stderr": 0.01359243151906808
- },
- "sciq": {
- "acc": 0.913,
- "acc_stderr": 0.008916866630745925,
- "acc_norm": 0.917,
- "acc_norm_stderr": 0.00872852720607479
- },
- "piqa": {
- "acc": 0.7247007616974973,
- "acc_stderr": 0.01042142927736953,
- "acc_norm": 0.7393906420021763,
- "acc_norm_stderr": 0.010241826155811632
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
- }
8b7178b178b/evaluation/8b7178b178b_1_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.10766666666666666,
+ "em_stderr": 0.005659993848227298
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 1,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_2_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.208,
+ "em_stderr": 0.007411498505927842
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 2,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_3_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.2713333333333333,
+ "em_stderr": 0.008119472096605799
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 3,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_4_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.309,
+ "em_stderr": 0.008437815608561314
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 4,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b178b/evaluation/8b7178b178b_5_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.3273333333333333,
+ "em_stderr": 0.008568540173271721
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b178b/transformers",
+ "num_fewshot": 5,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b25b/evaluation/8b7178b25b_0_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.0,
+ "em_stderr": 0.0
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
+ "num_fewshot": 0,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b25b/evaluation/8b7178b25b_1_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.118,
+ "em_stderr": 0.005890973421765812
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
+ "num_fewshot": 1,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b25b/evaluation/8b7178b25b_2_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.24333333333333335,
+ "em_stderr": 0.007835466732772215
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
+ "num_fewshot": 2,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b25b/evaluation/8b7178b25b_3_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.2833333333333333,
+ "em_stderr": 0.008228472181192749
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
+ "num_fewshot": 3,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }
8b7178b25b/evaluation/8b7178b25b_4_babi.json ADDED
@@ -0,0 +1,22 @@
+ {
+ "results": {
+ "babi": {
+ "em": 0.30766666666666664,
+ "em_stderr": 0.008427710547037915
+ }
+ },
+ "versions": {
+ "babi": 0
+ },
+ "config": {
+ "model": "gpt2",
+ "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-8b7-178b-oscar-repetitions/8b7178b25b/transformers",
+ "num_fewshot": 4,
+ "batch_size": null,
+ "device": null,
+ "no_cache": true,
+ "limit": 3000,
+ "bootstrap_iters": 100000,
+ "description_dict": {}
+ }
+ }